/*****************************************************************************\
 *  gres.c - driver for gres plugin
 *****************************************************************************
 *  Copyright (C) 2010 Lawrence Livermore National Security.
 *  Copyright (C) SchedMD LLC.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include "config.h"

#define _GNU_SOURCE

#include <ctype.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/stat.h>
#include <sys/types.h>

#ifdef MAJOR_IN_MKDEV
#  include <sys/mkdev.h>
#endif
#ifdef MAJOR_IN_SYSMACROS
#  include <sys/sysmacros.h>
#endif

#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"
#include "src/common/assoc_mgr.h"
#include "src/common/bitstring.h"
#include "src/interfaces/cgroup.h"
#include "src/interfaces/gres.h"
#include "src/interfaces/gpu.h"
#include "src/common/job_resources.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/macros.h"
#include "src/common/node_conf.h"
#include "src/common/pack.h"
#include "src/common/parse_config.h"
#include "src/common/plugin.h"
#include "src/common/plugrack.h"
#include "src/common/read_config.h"
#include "src/interfaces/select.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_pack.h"
#include "src/common/strlcpy.h"
#include "src/common/xmalloc.h"
#include "src/common/xsched.h"
#include "src/common/xstring.h"

#define MAX_GRES_BITMAP 1024

strong_alias(gres_find_id, slurm_gres_find_id);
strong_alias(gres_find_job_by_key_exact_type,
	     slurm_gres_find_job_by_key_exact_type);
strong_alias(gres_find_sock_by_job_state, slurm_gres_find_sock_by_job_state);
strong_alias(gres_get_node_used, slurm_gres_get_node_used);
strong_alias(gres_get_system_cnt, slurm_gres_get_system_cnt);
strong_alias(gres_get_step_info, slurm_gres_get_step_info);
strong_alias(gres_sock_delete, slurm_gres_sock_delete);
strong_alias(gres_job_list_delete, slurm_gres_job_list_delete);
strong_alias(destroy_gres_device, slurm_destroy_gres_device);
strong_alias(destroy_gres_slurmd_conf, slurm_destroy_gres_slurmd_conf);

/*
 * Keys accepted on a GRES configuration line. Every value is declared as
 * S_P_STRING; the table is terminated by a {NULL} entry.
 * NOTE(review): the parser that consumes this table is not visible in this
 * section (presumably _parse_gres_config()) - confirm there how the
 * singular/plural pairs (File/Files, Link/Links) are reconciled.
 */
static s_p_options_t _gres_options[] = {
	{"AutoDetect", S_P_STRING},
	{"Count", S_P_STRING},	/* Number of Gres available */
	{"CPUs" , S_P_STRING},	/* CPUs to bind to Gres resource
				 * (deprecated, use Cores) */
	{"Cores", S_P_STRING},	/* Cores to bind to Gres resource */
	{"File",  S_P_STRING},	/* Path to Gres device */
	{"Files", S_P_STRING},	/* Path to Gres device */
	{"Flags", S_P_STRING},	/* GRES Flags */
	{"Link",  S_P_STRING},	/* Communication link IDs */
	{"Links", S_P_STRING},	/* Communication link IDs */
	{"MultipleFiles", S_P_STRING}, /* list of GRES device files */
	{"Name",  S_P_STRING},	/* Gres name */
	{"Type",  S_P_STRING},	/* Gres type (e.g. model name) */
	{NULL}
};

/*
 * Gres symbols provided by the plugin.
 *
 * The members are filled positionally from the syms[] name table in
 * _load_plugin(), so the order here must match that table entry for entry
 * ("gres_p_node_config_load" first, "gres_p_prep_set_env" last).
 */
typedef struct slurm_gres_ops {
	int		(*node_config_load)	( list_t *gres_conf_list,
						  node_config_load_t *node_conf);
	void		(*job_set_env)		( char ***job_env_ptr,
						  bitstr_t *gres_bit_alloc,
						  uint64_t gres_cnt,
						  gres_internal_flags_t flags);
	void		(*step_set_env)		( char ***step_env_ptr,
						  bitstr_t *gres_bit_alloc,
						  uint64_t gres_cnt,
						  gres_internal_flags_t flags);
	void		(*task_set_env)		( char ***task_env_ptr,
						  bitstr_t *gres_bit_alloc,
						  uint64_t gres_cnt,
						  bitstr_t *usable_gres,
						  gres_internal_flags_t flags);
	void		(*send_stepd)		( buf_t *buffer );
	void		(*recv_stepd)		( buf_t *buffer );
	list_t *(*get_devices)(void);
	void            (*step_hardware_init)	( bitstr_t *, char * );
	void            (*step_hardware_fini)	( void );
	gres_prep_t *(*prep_build_env)(gres_job_state_t *gres_js);
	void            (*prep_set_env)	( char ***prep_env_ptr,
					  gres_prep_t *gres_prep,
					  int node_inx );
} slurm_gres_ops_t;

/*
 * Gres plugin context, one for each gres type.
 * Add to gres_context through _add_gres_context().
 *
 * _add_gres_context() fills gres_name, gres_type, plugin_id and the shared
 * flag and resets cur_plugin/plugin_list; gres_name_colon and
 * gres_name_colon_len are filled in afterwards by gres_init().
 * The string members and np_gres_devices are released by _unload_plugin().
 */
typedef struct slurm_gres_context {
	plugin_handle_t	cur_plugin;	/* PLUGIN_INVALID_HANDLE until loaded */
	uint32_t	config_flags;		/* See GRES_CONF_* in gres.h */
	char *		gres_name;		/* name (e.g. "gpu") */
	char *		gres_name_colon;	/* name + colon (e.g. "gpu:") */
	int		gres_name_colon_len;	/* size of gres_name_colon */
	char *		gres_type;		/* plugin name (e.g. "gres/gpu") */
	list_t *np_gres_devices; /* list of devices when we don't have a plugin */
	slurm_gres_ops_t ops;			/* pointers to plugin symbols */
	uint32_t	plugin_id;		/* key for searches, see gres_build_id() */
	plugrack_t	*plugin_list;		/* plugrack info */
	uint64_t        total_cnt;		/* Total GRES across all nodes */
} slurm_gres_context_t;

/*
 * Private helper structures.
 *
 * NOTE(review): none of the functions using these types are visible in this
 * section of the file. The foreach_* names suggest argument/accumulator
 * bundles handed to list iteration callbacks through their void *arg
 * parameter - confirm against the callers before relying on that.
 */
typedef struct {
	uint32_t plugin_id;
	bool with_type;
	bool without_type;
	void *without_type_state; /* gres_[job|step]_state_t */
} overlap_check_t;

typedef struct {
	slurm_gres_context_t *gres_ctx;
	int new_has_file;
	int new_has_type;
	int rec_count;
} foreach_gres_conf_t;

typedef struct {
	bitstr_t **gres_bit_alloc;
	uint64_t gres_cnt;
	uint64_t **gres_per_bit;
	bool is_job;
	int node_inx;
	uint32_t plugin_id;
	bool sharing_gres_allocated;
} foreach_gres_accumulate_device_t;

typedef struct {
	node_config_load_t *config;
	list_t **gres_devices;
	int index;
	int max_dev_num;
	list_t *names_list;
	int rc;
} foreach_fill_in_gres_devices_t;

typedef struct {
	char *node_list;
	list_t *prep_gres_list;
} foreach_prep_build_env_t;

typedef struct {
	int node_inx;
	char ***prep_env_ptr;
} foreach_prep_set_env_t;

typedef struct {
	uint32_t core_cnt;
	int core_end_bit;
	int core_start_bit;
	uint32_t job_id;
	list_t *node_gres_list;
	char *node_name;
	bool use_total_gres;
} foreach_job_test_t;

typedef struct {
	void *data;
	enum gres_step_data_type data_type;
	uint32_t node_inx;
	uint32_t plugin_id;
	int rc;
} foreach_step_info_t;

typedef struct {
	char *gres_str;
	char *sep;
	int sock_inx;
} foreach_sock_str_t;

typedef struct {
	list_t *device_list;
	bitstr_t *gres_bit_alloc;
	bitstr_t *usable_gres;
} foreach_alloc_gres_device_t;

typedef struct {
	bool filter_type;
	uint64_t gres_cnt;
	char *gres_type;
	bool is_job;
	uint32_t plugin_id;
} foreach_gres_list_cnt_t;

typedef struct {
	int job_node_index;
	list_t *new_gres_list;
} foreach_state_list_dup_t;

typedef struct {
	int bitmap_size;
	int gres_inx;
	uint32_t plugin_id;
	bitstr_t *task_cpus_bitmap;
	bitstr_t *usable_gres;
} foreach_closest_usable_gres_t;

typedef struct {
	int best_slot;
	int gres_inx;
	bitstr_t *gres_slots;
	int ntasks_per_gres;
	bool overlap;
	uint32_t plugin_id;
	bitstr_t *task_cpus_bitmap;
} foreach_gres_to_task_t;

typedef struct {
	int array_len;
	uint32_t *gres_count_ids;
	uint64_t *gres_count_vals;
	int index;
	int val_type;
} foreach_node_count_t;

/*
 * Pointers to functions in src/slurmd/common/xcpuinfo.h that we may use.
 * xcpuinfo_ops is a file-scope object with external linkage (it is not
 * declared static).
 * NOTE(review): who assigns xcpuinfo_ops.xcpuinfo_abs_to_mac is not visible
 * in this section - confirm it is set before any use.
 */
typedef struct xcpuinfo_funcs {
	int (*xcpuinfo_abs_to_mac) (char *abs, char **mac);
} xcpuinfo_funcs_t;
xcpuinfo_funcs_t xcpuinfo_ops;

/*
 * More private helper structures (configuration parsing, node validation,
 * job validation and state packing, judging by the type names).
 * NOTE(review): their users are outside this section of the file, so the
 * meaning of individual members must be confirmed against those functions.
 */
typedef struct {
	uint32_t flags;
	uint32_t name_hash;
	bool no_gpu_env;
} prev_gres_flags_t;

typedef struct {
	uint32_t config_flags;
	int config_type_cnt;
	uint32_t cpu_set_cnt;
	uint64_t gres_cnt;
	uint32_t plugin_id;
	uint32_t rec_cnt;
	uint64_t topo_cnt;
} tot_from_slurmd_conf_t;

typedef struct {
	int core_cnt;
	int cores_per_sock;
	bool cpu_config_err;
	int cpus_config;
	uint64_t dev_cnt;
	slurm_gres_context_t *gres_ctx;
	gres_node_state_t *gres_ns;
	int gres_inx;
	int topo_cnt;
	bool has_file;
	char *node_name;
	int rc;
	char **reason_down;
	int sock_cnt;
	uint64_t tot_gres_cnt;
} rebuild_topo_t;

typedef struct {
	slurm_gres_context_t *gres_ctx;
	gres_node_state_t *gres_ns;
} add_gres_info_t;

typedef struct {
	uint64_t count;
	slurm_gres_context_t *gres_ctx;
	char *type_name;
} conf_cnt_t;

typedef struct {
	list_t *gres_conf_list;
	slurm_gres_context_t *gres_ctx;
} check_conf_t;

typedef struct {
	uint64_t cpu_cnt;
	list_t *gres_conf_list;
	slurm_gres_context_t *gres_ctx;
	list_t *new_list;
} merge_gres_t;

typedef struct {
	void *generic_gres_data;
	bool is_job;
	uint32_t plugin_id;
} merge_generic_t;

typedef struct {
	uint32_t cpus_per_gres;
	gres_job_state_validate_t *gres_js_val;
	bool have_gres_shared;
	bool have_gres_sharing;
	bool is_job;
	bool overlap_merge;
	int over_count;
	overlap_check_t *over_array;
	int rc;
	uint32_t tmp_min_cpus;
} job_validate_t;

typedef struct {
	uint32_t job_id;
	list_t *node_gres_list;
	int node_inx;
	char *node_name;
} validate_job_gres_cnt_t;

typedef struct {
	int job_node_index;
	list_t *new_list;
} job_state_extract_t;

typedef struct {
	buf_t *buffer;
	bool details;
	uint32_t magic;
	uint16_t protocol_version;
} pack_state_t;

/* Local variables */
/* Number of entries in gres_context; -1 until gres_init() has run and again
 * after gres_fini() */
static int gres_context_cnt = -1;
static uint32_t gres_cpu_cnt = 0;
/* Array of gres_context_cnt contexts, grown by _add_gres_context() */
static slurm_gres_context_t *gres_context = NULL;
/* Released by gres_fini() */
static char *gres_node_name = NULL;
/* Copy of slurm_conf.gres_plugins taken by gres_init(); gres_reconfig()
 * compares against it to detect a changed plugin list */
static char *local_plugins_str = NULL;
/* Held by the functions below while they read or modify gres_context and
 * gres_context_cnt */
static pthread_mutex_t gres_context_lock = PTHREAD_MUTEX_INITIALIZER;
/* Released by gres_fini() */
static list_t *gres_conf_list = NULL;
/* Set to gres_build_id("gpu") by gres_init() when "gpu" is configured */
static uint32_t gpu_plugin_id = NO_VAL;
static volatile uint32_t autodetect_flags = GRES_AUTODETECT_UNSET;
/* Both buffers are released by gres_fini() */
static buf_t *gres_context_buf = NULL;
static buf_t *gres_conf_buf = NULL;
/* Re-armed (set true) by gres_reconfig(); consumer not visible here */
static bool reset_prev = true;
/* Cached answer of gres_use_local_device_index() and whether it has been
 * computed yet */
static bool use_local_index = false;
static bool dev_index_mode_set = false;

/* Local functions */
static void _accumulate_job_gres_alloc(gres_job_state_t *gres_js,
				       int node_inx,
				       bitstr_t **gres_bit_alloc,
				       uint64_t *gres_cnt);
static void _accumulate_step_gres_alloc(gres_state_t *gres_state_step,
					bitstr_t **gres_bit_alloc,
					uint64_t *gres_cnt,
					uint64_t **gres_per_bit);
static void _add_gres_context(char *gres_name);
static gres_node_state_t *_build_gres_node_state(void);
static void	_build_node_gres_str(list_t **gres_list, char **gres_str,
				     int cores_per_sock, int sock_per_node);
static bitstr_t *_core_bitmap_rebuild(bitstr_t *old_core_bitmap, int new_size);
static void	_prep_list_del(void *x);
static void	_get_gres_cnt(gres_node_state_t *gres_ns, char *orig_config,
			      char *gres_name, char *gres_name_colon,
			      int gres_name_colon_len);
static uint64_t _get_job_gres_list_cnt(list_t *gres_list, char *gres_name,
				       char *gres_type);
static void *	_job_state_dup2(gres_job_state_t *gres_js, int job_node_index);
static int	_load_plugin(slurm_gres_context_t *gres_ctx);
static int	_log_gres_slurmd_conf(void *x, void *arg);
static void	_my_stat(char *file_name);
static void	_node_config_init(char *orig_config,
				  slurm_gres_context_t *gres_ctx,
				  gres_state_t *gres_state_node);
static char *	_node_gres_used(gres_node_state_t *gres_ns, char *gres_name);
static int	_node_reconfig(char *node_name, char *new_gres, char **gres_str,
			       gres_state_t *gres_state_node,
			       bool config_overrides,
			       slurm_gres_context_t *gres_ctx,
			       bool *updated_gpu_cnt);
static int	_node_reconfig_test(char *node_name, char *new_gres,
				    gres_state_t *gres_state_node,
				    slurm_gres_context_t *gres_ctx);
static void *	_node_state_dup(gres_node_state_t *gres_ns);
static int	_parse_gres_config(void **dest, slurm_parser_enum_t type,
				   const char *key, const char *value,
				   const char *line, char **leftover);
static int	_parse_gres_config_node(void **dest, slurm_parser_enum_t type,
					const char *key, const char *value,
					const char *line, char **leftover);
static int	_post_plugin_gres_conf(void *x, void *arg);
static void *	_step_state_dup(gres_step_state_t *gres_ss);
static void *	_step_state_dup2(gres_step_state_t *gres_ss,
				 int job_node_index);
static int	_unload_plugin(slurm_gres_context_t *gres_ctx);
static void	_validate_slurm_conf(list_t *slurm_conf_list,
				     slurm_gres_context_t *gres_ctx);
static void	_validate_gres_conf(list_t *gres_conf_list,
				    slurm_gres_context_t *gres_ctx);
static int	_validate_file(char *path_name, char *gres_name);
static int	_valid_gres_type(char *gres_name, gres_node_state_t *gres_ns,
				 bool config_overrides, char **reason_down);
static void _parse_accel_bind_type(uint16_t accel_bind_type,
				   char *tres_bind_str);
static int _get_usable_gres(int context_inx, int proc_id,
			    char *tres_bind_str, bitstr_t **usable_gres_ptr,
			    bitstr_t *gres_bit_alloc,  bool get_devices,
			    stepd_step_rec_t *step, uint64_t *gres_per_bit,
			    gres_internal_flags_t *flags);

/*
 * Build the numeric plugin id used as a search key for a GRES name.
 *
 * Each character of the name is added into the id at a bit offset that
 * cycles through 0, 8, 16 and 24, so the result depends on the characters
 * and their positions. A NULL or empty name yields 0.
 *
 * IN name - GRES name (e.g. "gpu"), may be NULL
 * RET the id, 0 if name is NULL or empty
 */
extern uint32_t gres_build_id(char *name)
{
	uint32_t id = 0;
	unsigned int shift = 0;

	if (!name)
		return id;

	for (size_t i = 0; name[i]; i++) {
		/*
		 * Convert to uint32_t before shifting: left-shifting the
		 * promoted (possibly negative) char as a signed int is
		 * undefined when the value is negative or the result
		 * overflows. Converting via int first keeps the value equal
		 * (modulo 2^32) to what the signed shift produced in
		 * practice, so existing ids are unchanged.
		 */
		id += ((uint32_t) (int) name[i]) << shift;
		shift = (shift + 8) % 32;
	}

	return id;
}

/*
 * List search callback: match a gres_state_t by plugin id.
 * IN x - gres_state_t to test
 * IN key - pointer to the uint32_t plugin id being searched for
 * RET 1 if the ids are equal, otherwise 0
 */
extern int gres_find_id(void *x, void *key)
{
	gres_state_t *gres_state = x;
	uint32_t wanted_id = *(uint32_t *) key;

	return (gres_state->plugin_id == wanted_id) ? 1 : 0;
}

/*
 * List search callback: match a gres_state_t having any of the given flags.
 * IN x - gres_state_t to test
 * IN key - pointer to a uint32_t bit mask of config flags
 * RET 1 if at least one bit of the mask is set in config_flags, otherwise 0
 */
extern int gres_find_flags(void *x, void *key)
{
	gres_state_t *gres_state = x;
	uint32_t *mask_ptr = key;

	return (gres_state->config_flags & *mask_ptr) ? 1 : 0;
}

/* Find job record with matching name and type */
extern int gres_find_job_by_key_exact_type(void *x, void *key)
{
	gres_state_t *gres_state_job = (gres_state_t *) x;
	gres_key_t *job_key = (gres_key_t *) key;
	gres_job_state_t *gres_js;
	gres_js = (gres_job_state_t *)gres_state_job->gres_data;

	if ((gres_state_job->plugin_id == job_key->plugin_id) &&
	    (gres_js->type_id == job_key->type_id))
		return 1;
	return 0;
}

/* Find job record with matching name and type */
extern int gres_find_job_by_key(void *x, void *key)
{
	gres_state_t *gres_state_job = (gres_state_t *) x;
	gres_key_t *job_key = (gres_key_t *) key;
	gres_job_state_t *gres_js;
	gres_js = (gres_job_state_t *)gres_state_job->gres_data;

	if ((gres_state_job->plugin_id == job_key->plugin_id) &&
	    ((job_key->type_id == NO_VAL) ||
	     (gres_js->type_id == job_key->type_id)))
		return 1;
	return 0;
}

/* Find job record with matching name and type */
extern int gres_find_job_by_key_with_cnt(void *x, void *key)
{
	gres_state_t *gres_state_job = (gres_state_t *) x;
	gres_key_t *job_key = (gres_key_t *) key;
	gres_job_state_t *gres_js;
	gres_js = (gres_job_state_t *)gres_state_job->gres_data;

	if (!gres_find_job_by_key(x, key))
		return 0;

	/* This gres has been allocated on this node */
	if (!gres_js->node_cnt ||
	    ((job_key->node_offset < gres_js->node_cnt) &&
	     gres_js->gres_cnt_node_alloc[job_key->node_offset]))
		return 1;

	return 0;
}

extern int gres_find_step_by_key(void *x, void *key)
{
	gres_state_t *state_ptr = (gres_state_t *) x;
	gres_key_t *step_key = (gres_key_t *) key;
	gres_step_state_t *gres_ss = (gres_step_state_t *)state_ptr->gres_data;

	if ((state_ptr->plugin_id == step_key->plugin_id) &&
	    (gres_ss->type_id == step_key->type_id))
		return 1;
	return 0;
}


/*
 * Report whether local device indexes should be used.
 *
 * The answer is computed on the first call and cached in use_local_index:
 * it becomes true only when slurm_conf.task_plugin contains "cgroup" and the
 * cgroup configuration has constrain_devices set.
 * RET cached/computed value of use_local_index
 */
extern bool gres_use_local_device_index(void)
{
	/* Already decided by an earlier call */
	if (dev_index_mode_set)
		return use_local_index;
	dev_index_mode_set = true;

	/* Without a cgroup task plugin the default (false) stands */
	if (!slurm_conf.task_plugin ||
	    !xstrstr(slurm_conf.task_plugin, "cgroup"))
		return use_local_index;

	cgroup_conf_init();
	if (slurm_cgroup_conf.constrain_devices)
		use_local_index = true;

	return use_local_index;
}

/*
 * Allocate a gres_state_t and seed it from one of three kinds of source.
 *
 * IN src_ptr - source object; its real type is selected by state_src:
 *	GRES_STATE_SRC_STATE_PTR   - gres_state_t (flags, id and name copied)
 *	GRES_STATE_SRC_CONTEXT_PTR - slurm_gres_context_t (flags, id, name)
 *	GRES_STATE_SRC_KEY_PTR     - gres_key_t (flags and id only)
 * IN state_src - which of the above src_ptr is
 * IN state_type - stored in the new record
 * IN gres_data - stored in the new record as is (not copied)
 * RET the new record; NULL (after logging an error) for an unknown state_src
 */
extern gres_state_t *gres_create_state(void *src_ptr,
				       gres_state_src_t state_src,
				       gres_state_type_enum_t state_type,
				       void *gres_data)
{
	gres_state_t *state = xmalloc(sizeof(gres_state_t));

	state->state_type = state_type;
	state->gres_data = gres_data;

	if (state_src == GRES_STATE_SRC_STATE_PTR) {
		gres_state_t *src_state = src_ptr;

		state->plugin_id = src_state->plugin_id;
		state->config_flags = src_state->config_flags;
		state->gres_name = xstrdup(src_state->gres_name);
	} else if (state_src == GRES_STATE_SRC_CONTEXT_PTR) {
		slurm_gres_context_t *src_ctx = src_ptr;

		state->plugin_id = src_ctx->plugin_id;
		state->config_flags = src_ctx->config_flags;
		state->gres_name = xstrdup(src_ctx->gres_name);
	} else if (state_src == GRES_STATE_SRC_KEY_PTR) {
		gres_key_t *src_key = src_ptr;

		/*
		 * A key carries no name; the caller has to set gres_name on
		 * the returned record afterwards.
		 */
		state->plugin_id = src_key->plugin_id;
		state->config_flags = src_key->config_flags;
	} else {
		error("%s: No way to create gres_state given", __func__);
		xfree(state);
	}

	return state;
}

/*
 * Find a gres_context by plugin_id
 * Must hold gres_context_lock before calling.
 */
static slurm_gres_context_t *_find_context_by_id(uint32_t plugin_id)
{
	for (int j = 0; j < gres_context_cnt; j++)
		if (gres_context[j].plugin_id == plugin_id)
			return &gres_context[j];
	return NULL;
}

/*
 * Load the plugin named by gres_ctx->gres_type and resolve its symbols into
 * gres_ctx->ops.
 *
 * Steps, in order:
 *  1. Contexts flagged GRES_CONF_COUNT_ONLY need no plugin: return success.
 *  2. Try plugin_load_and_link() on the exact type name.
 *  3. If that failed for a reason other than ESLURM_PLUGIN_NOTFOUND, give up.
 *  4. Otherwise scan slurm_conf.plugindir with a plugrack and look the type
 *     up there. If it is still missing, flag the context
 *     GRES_CONF_COUNT_ONLY (note: SLURM_ERROR is still returned).
 *  5. Resolve the symbol table from the plugin found by the plugrack.
 *
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
static int _load_plugin(slurm_gres_context_t *gres_ctx)
{
	/*
	 * Must be synchronized with slurm_gres_ops_t above.
	 */
	static const char *syms[] = {
		"gres_p_node_config_load",
		"gres_p_job_set_env",
		"gres_p_step_set_env",
		"gres_p_task_set_env",
		"gres_p_send_stepd",
		"gres_p_recv_stepd",
		"gres_p_get_devices",
		"gres_p_step_hardware_init",
		"gres_p_step_hardware_fini",
		"gres_p_prep_build_env",
		"gres_p_prep_set_env"
	};
	int n_syms = sizeof(syms) / sizeof(char *);

	/* Find the correct plugin */
	if (gres_ctx->config_flags & GRES_CONF_COUNT_ONLY) {
		debug("Plugin of type %s only tracks gres counts",
		      gres_ctx->gres_type);
		return SLURM_SUCCESS;
	}

	gres_ctx->cur_plugin = plugin_load_and_link(
		gres_ctx->gres_type,
		n_syms, syms,
		(void **) &gres_ctx->ops);
	if (gres_ctx->cur_plugin != PLUGIN_INVALID_HANDLE)
		return SLURM_SUCCESS;

	/* errno is read right after the failed load, before anything else
	 * gets a chance to change it */
	if (errno != ESLURM_PLUGIN_NOTFOUND) {
		error("Couldn't load specified plugin name for %s: %s",
		      gres_ctx->gres_type, slurm_strerror(errno));
		return SLURM_ERROR;
	}

	debug("gres: Couldn't find the specified plugin name for %s looking "
	      "at all files", gres_ctx->gres_type);

	/* Get plugin list */
	if (gres_ctx->plugin_list == NULL) {
		gres_ctx->plugin_list = plugrack_create("gres");
		plugrack_read_dir(gres_ctx->plugin_list,
				  slurm_conf.plugindir);
	}

	gres_ctx->cur_plugin = plugrack_use_by_type(
		gres_ctx->plugin_list,
		gres_ctx->gres_type );
	if (gres_ctx->cur_plugin == PLUGIN_INVALID_HANDLE) {
		/* No plugin at all: remember to only track counts from now on */
		debug("Cannot find plugin of type %s, just track gres counts",
		      gres_ctx->gres_type);
		gres_ctx->config_flags |= GRES_CONF_COUNT_ONLY;
		return SLURM_ERROR;
	}

	/* Dereference the API. */
	if (plugin_get_syms(gres_ctx->cur_plugin,
			    n_syms, syms,
			    (void **) &gres_ctx->ops ) < n_syms ) {
		error("Incomplete %s plugin detected",
		      gres_ctx->gres_type);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}

/*
 * Release the plugin and the strings/lists owned by one GRES context.
 *
 * When the context owns a plugrack it is destroyed and its result is
 * returned (plugins might still be loaded and active, so the result
 * matters); otherwise the single plugin handle is unloaded.
 * RET result of plugrack_destroy(), or SLURM_SUCCESS when no plugrack exists
 */
static int _unload_plugin(slurm_gres_context_t *gres_ctx)
{
	int rc = SLURM_SUCCESS;

	if (!gres_ctx->plugin_list)
		plugin_unload(gres_ctx->cur_plugin);
	else
		rc = plugrack_destroy(gres_ctx->plugin_list);

	FREE_NULL_LIST(gres_ctx->np_gres_devices);
	xfree(gres_ctx->gres_type);
	xfree(gres_ctx->gres_name_colon);
	xfree(gres_ctx->gres_name);

	return rc;
}

/*
 * Test whether a GRES name is one of the shared GRES names.
 * IN name - GRES name to test
 * RET true for "mps" or "shard", otherwise false
 */
extern bool gres_is_shared_name(char *name)
{
	bool is_mps = !xstrcmp(name, "mps");

	return is_mps || !xstrcmp(name, "shard");
}

/*
 * Add GRES_CONF_SHARED to *config_flags when name is a shared GRES name
 * (see gres_is_shared_name()); leave the flags untouched otherwise.
 */
static void _set_shared_flag(char *name, uint32_t *config_flags)
{
	if (!gres_is_shared_name(name))
		return;

	*config_flags |= GRES_CONF_SHARED;
}

/*
 * Add new gres context to gres_context array and load the plugin.
 * Must hold gres_context_lock before calling.
 */
static void _add_gres_context(char *gres_name)
{
	slurm_gres_context_t *gres_ctx;

	if (!gres_name || !gres_name[0])
		fatal("%s: invalid empty gres_name", __func__);

	xrecalloc(gres_context, (gres_context_cnt + 1),
		  sizeof(slurm_gres_context_t));

	gres_ctx = &gres_context[gres_context_cnt];
	_set_shared_flag(gres_name, &gres_ctx->config_flags);
	gres_ctx->gres_name = xstrdup(gres_name);
	gres_ctx->plugin_id = gres_build_id(gres_name);
	gres_ctx->gres_type = xstrdup_printf("gres/%s", gres_name);
	gres_ctx->plugin_list = NULL;
	gres_ctx->cur_plugin = PLUGIN_INVALID_HANDLE;

	gres_context_cnt++;
}

/*
 * Initialize the GRES plugins.
 *
 * Builds the gres_context array from slurm_conf.gres_plugins the first time
 * it is called; later calls (gres_context_cnt >= 0) change nothing. Shared
 * GRES names ("mps", "shard") listed before "gpu" are moved to the end of
 * the list so that they always follow "gpu".
 *
 * Terminates through fatal() when a shared GRES is configured without "gpu",
 * when two names produce the same plugin_id, or when a shared GRES is used
 * in slurmctld without select/cons_tres.
 *
 * Returns a Slurm errno.
 */
extern int gres_init(void)
{
	int i, j, rc = SLURM_SUCCESS;
	char *last = NULL, *names, *one_name, *full_name;
	char *sorted_names = NULL, *sep = "", *shared_names = NULL;
	bool have_gpu = false, have_shared = false;
	char *shared_sep = "";

	slurm_mutex_lock(&gres_context_lock);

	/* Already initialized */
	if (gres_context_cnt >= 0)
		goto fini;

	local_plugins_str = xstrdup(slurm_conf.gres_plugins);
	gres_context_cnt = 0;
	if ((local_plugins_str == NULL) || (local_plugins_str[0] == '\0'))
		goto fini;

	/* Ensure that "gres/'shared'" follows "gres/gpu" */
	have_gpu = false;
	have_shared = false;
	/* strtok_r() modifies its input, so tokenize a scratch copy */
	names = xstrdup(local_plugins_str);
	one_name = strtok_r(names, ",", &last);
	while (one_name) {
		bool skip_name = false;
		if (gres_is_shared_name(one_name)) {
			have_shared = true;
			if (!have_gpu) {
				/* "shared" must follow "gpu" */
				skip_name = true;
				xstrfmtcat(shared_names, "%s%s",
					   shared_sep, one_name);
				shared_sep = ",";
			}
		} else if (!xstrcmp(one_name, "gpu")) {
			have_gpu = true;
			gpu_plugin_id = gres_build_id("gpu");
		}
		if (!skip_name) {
			xstrfmtcat(sorted_names, "%s%s", sep, one_name);
			sep = ",";
		}
		one_name = strtok_r(NULL, ",", &last);
	}
	/* Append the shared names that were held back to the end of the list */
	if (shared_names) {
		if (!have_gpu)
			fatal("GresTypes: gres/'shared' requires that gres/gpu also be configured");
		xstrfmtcat(sorted_names, "%s%s", sep, shared_names);
		xfree(shared_names);
	}
	xfree(names);

	/* Create one context per distinct name, in the reordered sequence */
	gres_context_cnt = 0;
	one_name = strtok_r(sorted_names, ",", &last);
	while (one_name) {
		full_name = xstrdup("gres/");
		xstrcat(full_name, one_name);
		for (i = 0; i < gres_context_cnt; i++) {
			if (!xstrcmp(full_name, gres_context[i].gres_type))
				break;
		}
		xfree(full_name);
		if (i < gres_context_cnt) {
			error("Duplicate plugin %s ignored",
			      gres_context[i].gres_type);
		} else {
			_add_gres_context(one_name);
		}
		one_name = strtok_r(NULL, ",", &last);
	}
	xfree(sorted_names);

	/* Ensure that plugin_id is valid and unique */
	for (i = 0; i < gres_context_cnt; i++) {
		for (j = i + 1; j < gres_context_cnt; j++) {
			if (gres_context[i].plugin_id !=
			    gres_context[j].plugin_id)
				continue;
			fatal("Gres: Duplicate plugin_id %u for %s and %s, "
			      "change gres name for one of them",
			      gres_context[i].plugin_id,
			      gres_context[i].gres_type,
			      gres_context[j].gres_type);
		}
		xassert(gres_context[i].gres_name);

		/* Precompute "name:" and its length for this context */
		gres_context[i].gres_name_colon =
			xstrdup_printf("%s:", gres_context[i].gres_name);
		gres_context[i].gres_name_colon_len =
			strlen(gres_context[i].gres_name_colon);
	}

fini:
	/* have_shared is only ever true on the pass that built the contexts */
	if (have_shared && running_in_slurmctld() && !running_cons_tres()) {
		fatal("Use of shared gres requires the use of select/cons_tres");
	}

	slurm_mutex_unlock(&gres_context_lock);

	return rc;
}

/*
 * Return the number of GRES contexts.
 *
 * The value is read from gres_context_cnt (under gres_context_lock) on the
 * first call and served from a function-local static on every later call.
 */
extern int gres_get_gres_cnt(void)
{
	static int cnt = -1;

	if (cnt == -1) {
		xassert(gres_context_cnt >= 0);

		slurm_mutex_lock(&gres_context_lock);
		cnt = gres_context_cnt;
		slurm_mutex_unlock(&gres_context_lock);
	}

	return cnt;
}

/*
 * Add a GRES record. This is used by the node_features plugin after the
 * slurm.conf file is read and the initial GRES records are built by
 * gres_init().
 */
extern void gres_add(char *gres_name)
{
	int i;

	slurm_mutex_lock(&gres_context_lock);
	for (i = 0; i < gres_context_cnt; i++) {
		if (!xstrcmp(gres_context[i].gres_name, gres_name))
			goto fini;
	}

	_add_gres_context(gres_name);
fini:	slurm_mutex_unlock(&gres_context_lock);
}

/*
 * Given a gres_name, return the index of the first gres_context entry with
 * that name, or -1 if there is none.
 */
static int _gres_name_context(char *gres_name)
{
	int inx = 0;

	while (inx < gres_context_cnt) {
		if (xstrcmp(gres_context[inx].gres_name, gres_name) == 0)
			return inx;
		inx++;
	}

	return -1;
}

/*
 * Takes a GRES config line (typically from slurm.conf) and remove any
 * records for GRES which are not defined in GresTypes.
 *
 * Each comma separated record is looked up by the part of it that precedes
 * the first ':'. Known records are copied unchanged to the result, unknown
 * ones are reported with error() and dropped.
 *
 * IN orig_gres - comma separated GRES records, may be NULL or empty
 * IN nodes - node name(s), only used in the error message
 * RET string of valid GRES, Release memory using xfree(). NULL if orig_gres
 *     is NULL/empty, no GRES context exists or no record is valid.
 */
extern char *gres_name_filter(char *orig_gres, char *nodes)
{
	char *filtered = NULL, *scratch, *rec, *state = NULL;
	char *delim = "";

	slurm_mutex_lock(&gres_context_lock);
	if (!orig_gres || !orig_gres[0] || !gres_context_cnt) {
		slurm_mutex_unlock(&gres_context_lock);
		return filtered;
	}

	/* strtok_r() writes into its input, so work on a copy */
	scratch = xstrdup(orig_gres);
	for (rec = strtok_r(scratch, ",", &state); rec;
	     rec = strtok_r(NULL, ",", &state)) {
		char *base_name = xstrdup(rec);
		char *colon_pos = strchr(base_name, ':');

		/* Keep only the GRES name for the lookup */
		if (colon_pos)
			colon_pos[0] = '\0';

		if (_gres_name_context(base_name) == -1) {
			/* Logging may not be initialized at this point */
			error("Invalid GRES configured on node %s: %s", nodes,
			      rec);
		} else {
			xstrfmtcat(filtered, "%s%s", delim, rec);
			delim = ",";
		}
		xfree(base_name);
	}
	slurm_mutex_unlock(&gres_context_lock);
	xfree(scratch);

	return filtered;
}

/*
 * Terminate the gres plugin. Free memory.
 *
 * gres_node_name is always released. Everything else (contexts, plugin
 * string, configuration list and buffers) is only torn down when the
 * contexts were initialized (gres_context_cnt >= 0), after which
 * gres_context_cnt is reset to -1.
 *
 * Returns a Slurm errno: SLURM_SUCCESS, or the result of the last
 * _unload_plugin() call that failed.
 */
extern int gres_fini(void)
{
	int rc = SLURM_SUCCESS;

	slurm_mutex_lock(&gres_context_lock);
	xfree(gres_node_name);

	if (gres_context_cnt >= 0) {
		for (int inx = 0; inx < gres_context_cnt; inx++) {
			int unload_rc = _unload_plugin(&gres_context[inx]);

			if (unload_rc != SLURM_SUCCESS)
				rc = unload_rc;
		}
		xfree(gres_context);
		xfree(local_plugins_str);
		FREE_NULL_LIST(gres_conf_list);
		FREE_NULL_BUFFER(gres_context_buf);
		FREE_NULL_BUFFER(gres_conf_buf);
		gres_context_cnt = -1;
	}

	slurm_mutex_unlock(&gres_context_lock);
	return rc;
}

/*
 * ************************************************************************
 *                          P L U G I N   C A L L S                       *
 * ************************************************************************
 */

/*
 * Return a plugin-specific help message for salloc, sbatch and srun
 * Result must be xfree()'d.
 *
 * The message is a fixed header line followed by one
 * "<name>[[:type]:count]" line per configured GRES.
 *
 * NOTE: GRES "type" (e.g. model) information is only available from slurmctld
 * after slurmd registers. It is not readily available from srun (as used here).
 */
extern char *gres_help_msg(void)
{
	char *help = xstrdup("Valid gres options are:\n");
	int inx = 0;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	while (inx < gres_context_cnt) {
		xstrcat(help, gres_context[inx].gres_name);
		xstrcat(help, "[[:type]:count]\n");
		inx++;
	}
	slurm_mutex_unlock(&gres_context_lock);

	return help;
}

/*
 * Perform reconfig, re-read any configuration files
 * OUT did_change - set if gres configuration changed
 */
extern int gres_reconfig(void)
{
	int rc = SLURM_SUCCESS;
	bool plugin_change;

	slurm_mutex_lock(&gres_context_lock);

	if (xstrcmp(slurm_conf.gres_plugins, local_plugins_str))
		plugin_change = true;
	else
		plugin_change = false;

	reset_prev = true;

	/* Reset the flags so when the node checks in we believe that */
	for (int i = 0; i < gres_context_cnt; i++)
		gres_context[i].config_flags |= GRES_CONF_FROM_STATE;

	slurm_mutex_unlock(&gres_context_lock);

	if (plugin_change) {
		error("GresPlugins changed from %s to %s ignored",
		      local_plugins_str, slurm_conf.gres_plugins);
		error("Restart the slurmctld daemon to change GresPlugins");
#if 0
		/* This logic would load new plugins, but we need the old
		 * plugins to persist in order to process old state
		 * information. */
		rc = gres_fini();
		if (rc == SLURM_SUCCESS)
			rc = gres_init();
#endif
	}

	return rc;
}

/*
 * List search callback.
 * IN x - gres_slurmd_conf_t record to test
 * IN arg - pointer to the uint32_t plugin_id of interest
 * RET 1 (after logging a warning) if the record has that plugin_id and no
 *     file, otherwise 0
 */
static int _find_fileless_gres(void *x, void *arg)
{
	gres_slurmd_conf_t *conf_rec = x;
	uint32_t *wanted_id = arg;

	if (conf_rec->plugin_id != *wanted_id)
		return 0;
	if (conf_rec->file)
		return 0;

	warning("Ignoring file-less GPU %s:%s from final GRES list",
		conf_rec->name, conf_rec->type_name);
	return 1;
}

/*
 * Log the contents of a gres_slurmd_conf_t record
 *
 * Without DEBUG_FLAG_GRES a single verbose() summary line is written. With
 * it, an info() line is written whose fields depend on whether the record
 * has a file and a cpus string; "Index" is the decimal number formed by the
 * trailing digits of the file name.
 *
 * IN x - gres_slurmd_conf_t record
 * IN arg - unused
 * RET always 0
 */
static int _log_gres_slurmd_conf(void *x, void *arg)
{
	gres_slurmd_conf_t *p;
	int index = -1, offset, mult = 1;

	p = (gres_slurmd_conf_t *) x;
	xassert(p);

	if (!(slurm_conf.debug_flags & DEBUG_FLAG_GRES)) {
		verbose("Gres Name=%s Type=%s Count=%"PRIu64" Flags=%s",
			p->name, p->type_name, p->count,
			gres_flags2str(p->config_flags));
		return 0;
	}

	if (p->file) {
		/*
		 * Walk the file name backwards, accumulating trailing decimal
		 * digits (least significant first) until a non-digit is met.
		 * A name without trailing digits leaves index at 0.
		 */
		index = 0;
		offset = strlen(p->file);
		while (offset > 0) {
			offset--;
			if ((p->file[offset] < '0') || (p->file[offset] > '9'))
				break;
			index += (p->file[offset] - '0') * mult;
			mult *= 10;
		}
	}

	/*
	 * NOTE(review): index is -1 only when p->file is NULL, so the
	 * "else if (p->file)" branch below looks unreachable - confirm
	 * whether files without trailing digits were meant to land there.
	 */
	if (p->cpus && (index != -1)) {
		info("Gres Name=%s Type=%s Count=%"PRIu64" Index=%d ID=%u File=%s Cores=%s CoreCnt=%u Links=%s Flags=%s",
		     p->name,
		     p->type_name,
		     p->count,
		     index,
		     p->plugin_id,
		     p->file,
		     p->cpus,
		     p->cpu_cnt,
		     p->links,
		     gres_flags2str(p->config_flags));
	} else if (index != -1) {
		info("Gres Name=%s Type=%s Count=%"PRIu64" Index=%d ID=%u File=%s Links=%s Flags=%s",
		     p->name,
		     p->type_name,
		     p->count,
		     index,
		     p->plugin_id,
		     p->file,
		     p->links,
		     gres_flags2str(p->config_flags));
	} else if (p->file) {
		info("Gres Name=%s Type=%s Count=%"PRIu64" ID=%u File=%s Links=%s Flags=%s",
		     p->name,
		     p->type_name,
		     p->count,
		     p->plugin_id,
		     p->file,
		     p->links,
		     gres_flags2str(p->config_flags));
	} else {
		info("Gres Name=%s Type=%s Count=%"PRIu64" ID=%u Links=%s Flags=%s",
		     p->name,
		     p->type_name,
		     p->count,
		     p->plugin_id,
		     p->links,
		     gres_flags2str(p->config_flags));
	}

	return 0;
}


/*
 * List callback: propagate GRES_CONF_GLOBAL_INDEX from a configuration
 * record to the context it belongs to.
 * IN x - gres_slurmd_conf_t record
 * IN arg - slurm_gres_context_t to update
 * RET 0 if the record belongs to another plugin_id, otherwise 1
 */
static int _post_plugin_gres_conf(void *x, void *arg)
{
	slurm_gres_context_t *gres_ctx = arg;
	gres_slurmd_conf_t *conf_rec = x;

	if (conf_rec->plugin_id == gres_ctx->plugin_id) {
		if (conf_rec->config_flags & GRES_CONF_GLOBAL_INDEX)
			gres_ctx->config_flags |= GRES_CONF_GLOBAL_INDEX;
		return 1;
	}

	return 0;
}

/*
 * Make sure that specified file name exists. Only checked when running in
 * slurmd or slurmstepd (per running_in_slurmd_stepd()); otherwise returns
 * at once.
 *
 * stat() is attempted up to 20 times, sleeping one second before every
 * attempt but the first. A failure other than ENOENT ends the wait early.
 * If the file never becomes available a fatal error is generated and the
 * program exits.
 */
static void _my_stat(char *file_name)
{
	struct stat stat_buf;
	bool waiting_logged = false;
	int attempt = 0;

	if (!running_in_slurmd_stepd())
		return;

	while (attempt < 20) {
		if (attempt > 0)
			sleep(1);

		if (!stat(file_name, &stat_buf)) {
			if (waiting_logged)
				info("gres.conf file %s now exists", file_name);
			return;
		}

		/* Only a missing file is worth waiting for */
		if (errno != ENOENT)
			break;

		if (!waiting_logged) {
			error("Waiting for gres.conf file %s", file_name);
			waiting_logged = true;
		}
		attempt++;
	}

	fatal("can't stat gres.conf file %s: %m", file_name);
}

/*
 * Expand a File= hostlist-style expression and run _my_stat() on every entry.
 *
 * filenames - (in) expression as given in gres.conf
 * gres_name - (in) not referenced by this function
 *
 * Returns the number of entries in the expanded expression. Calls fatal() if
 * the expression cannot be parsed.
 */
static int _validate_file(char *filenames, char *gres_name)
{
	int cnt;
	char *fname;
	hostlist_t *files = hostlist_create(filenames);

	if (!files)
		fatal("can't parse File=%s", filenames);

	for (cnt = 0; (fname = hostlist_shift(files)); cnt++) {
		_my_stat(fname);
		free(fname);
	}
	hostlist_destroy(files);

	return cnt;
}

/*
 * Build a comma-separated links string of device_count entries, all 0 except
 * for a -1 at position index. Caller must xfree() the returned string.
 *
 * Used to record the enumeration order (PCI bus ID order) of GPUs for sorting,
 * even when the GPU does not support nvlinks. E.g. for three total GPUs, their
 * links strings would look like this:
 *
 * GPU at index 0: -1,0,0
 * GPU at index 1: 0,-1,0
 * GPU at index 2: 0,0,-1
 *
 * Returns NULL when device_count is 0.
 */
extern char *gres_links_create_empty(unsigned int index,
				     unsigned int device_count)
{
	char *out = NULL;
	unsigned int pos = 0;

	while (pos < device_count) {
		const char *sep = pos ? "," : "";
		int link = (pos == index) ? -1 : 0;

		xstrfmtcat(out, "%s%d", sep, link);
		pos++;
	}

	return out;
}

/*
 * Check that links is a comma-delimited list of integers and locate the entry
 * identifying the GPU itself (-1).
 *
 * Return values:
 * 0+: zero-based position of the -1 entry in the links string
 * -1: links string is NULL
 * -2: links string is not NULL, but is invalid (an error is logged):
 *     * links string is an empty string
 *     * a token is not a whole number, is below -2 or above GRES_MAX_LINK
 *     * no -1 entry is present
 *     * more than one -1 entry is present
 */
extern int gres_links_validate(char *links)
{
	char *copy, *token, *state = NULL, *end = NULL;
	int self_idx = -1;
	int pos = 0;

	if (!links)
		return -1;
	if (!links[0]) {
		error("%s: Links is an empty string", __func__);
		return -2;
	}

	/* strtok_r() modifies its input, so tokenize a private copy */
	copy = xstrdup(links);
	for (token = strtok_r(copy, ",", &state); token;
	     token = strtok_r(NULL, ",", &state), pos++) {
		long int link = strtol(token, &end, 10);
		bool out_of_range = (link < -2) || (link > GRES_MAX_LINK) ||
				    (link == LONG_MIN);

		if (out_of_range || (end[0] != '\0')) {
			error("%s: Failed to parse token '%s' in links string '%s'",
			      __func__, token, links);
			self_idx = -2;
			break;
		}

		if (link != -1)
			continue;

		/* A second -1 makes the string ambiguous */
		if (self_idx != -1) {
			error("%s: links string '%s' has more than one -1",
			      __func__, links);
			self_idx = -2;
			break;
		}
		self_idx = pos;
	}
	xfree(copy);

	/* Still -1 here means no token identified the GPU itself */
	if (self_idx == -1) {
		error("%s: -1 wasn't found in links string '%s'", __func__,
		      links);
		self_idx = -2;
	}

	return self_idx;
}

/*
 * Return a newly allocated string naming the GPU AutoDetect option currently
 * held in autodetect_flags: "unset" when none of GRES_AUTODETECT_GPU_FLAGS is
 * set, otherwise the first match of nvml, rsmi, oneapi, nrt, nvidia, off.
 * Returns NULL if GPU flags are set but none of those options matches.
 * The caller xfree()s the result.
 */
static char *_get_autodetect_flags_str(void)
{
	char *flags = NULL;
	const char *name = NULL;

	if (!(autodetect_flags & GRES_AUTODETECT_GPU_FLAGS))
		name = "unset";
	else if (autodetect_flags & GRES_AUTODETECT_GPU_NVML)
		name = "nvml";
	else if (autodetect_flags & GRES_AUTODETECT_GPU_RSMI)
		name = "rsmi";
	else if (autodetect_flags & GRES_AUTODETECT_GPU_ONEAPI)
		name = "oneapi";
	else if (autodetect_flags & GRES_AUTODETECT_GPU_NRT)
		name = "nrt";
	else if (autodetect_flags & GRES_AUTODETECT_GPU_NVIDIA)
		name = "nvidia";
	else if (autodetect_flags & GRES_AUTODETECT_GPU_OFF)
		name = "off";

	if (name)
		xstrfmtcat(flags, "%s", name);

	return flags;
}

/*
 * Translate an AutoDetect option string into its GRES_AUTODETECT_GPU_* flag.
 *
 * The plugin names (nvml, rsmi, oneapi, nrt, nvidia) are matched as
 * case-insensitive substrings, checked in that order; "off" must match the
 * whole string (case-insensitive). An unrecognized string logs an error and
 * yields 0.
 */
static uint32_t _handle_autodetect_flags(char *str)
{
	if (xstrcasestr(str, "nvml"))
		return GRES_AUTODETECT_GPU_NVML;
	if (xstrcasestr(str, "rsmi"))
		return GRES_AUTODETECT_GPU_RSMI;
	if (xstrcasestr(str, "oneapi"))
		return GRES_AUTODETECT_GPU_ONEAPI;
	if (xstrcasestr(str, "nrt"))
		return GRES_AUTODETECT_GPU_NRT;
	if (xstrcasestr(str, "nvidia"))
		return GRES_AUTODETECT_GPU_NVIDIA;
	if (!xstrcasecmp(str, "off"))
		return GRES_AUTODETECT_GPU_OFF;

	error("unknown autodetect flag '%s'", str);
	return 0;
}

static void _handle_local_autodetect(char *str)
{
	uint32_t autodetect_flags_local = _handle_autodetect_flags(str);

	/* Only set autodetect_flags once locally, unless it's the same val */
	if ((autodetect_flags != GRES_AUTODETECT_UNSET) &&
	    (autodetect_flags != autodetect_flags_local)) {
		fatal("gres.conf: duplicate node-local AutoDetect specification does not match the first");
		return;
	}

	/* Set the node-local gpus value of autodetect_flags */
	autodetect_flags |= autodetect_flags_local;

	if (slurm_conf.debug_flags & DEBUG_FLAG_GRES) {
		char *flags = _get_autodetect_flags_str();
		log_flag(GRES, "Using node-local AutoDetect=%s(%d)",
			 flags, autodetect_flags);
		xfree(flags);
	}
}

/*
 * Apply a global AutoDetect specification to autodetect_flags. It is ignored
 * if any GPU autodetect flag is already set, since that means a node-local
 * value was specified earlier.
 */
static void _handle_global_autodetect(char *str)
{
	bool local_already_set = (autodetect_flags & GRES_AUTODETECT_GPU_FLAGS);

	if (!local_already_set)
		autodetect_flags |= _handle_autodetect_flags(str);
	else
		debug2("gres.conf: AutoDetect GPU flags were locally set, so ignoring global flags");

	if (slurm_conf.debug_flags & DEBUG_FLAG_GRES) {
		char *printable = _get_autodetect_flags_str();

		log_flag(GRES, "Global AutoDetect=%s(%d)",
			 printable, autodetect_flags);
		xfree(printable);
	}
}

/*
 * Match callback comparing two gres_slurmd_conf_t records by type name only
 * (case-sensitive).
 *
 * Returns 1 if both type names are NULL or both are equal strings, else 0.
 */
static int _get_match(void *x, void *arg)
{
	gres_slurmd_conf_t *lhs = x;
	gres_slurmd_conf_t *rhs = arg;

	/* With at least one NULL, they match only if both are NULL */
	if (!lhs->type_name || !rhs->type_name)
		return (!lhs->type_name && !rhs->type_name);

	return !xstrcmp(lhs->type_name, rhs->type_name);
}

/*
 * Callback folding one gres_slurmd_conf_t record into a list that holds one
 * record per type name.
 *
 * x   - (in) record to merge
 * arg - (in/out) list of merged records
 *
 * If a record with the same type is already in the list its count is
 * incremented by one; otherwise x itself (not a copy) is appended. Files and
 * links are not considered.
 */
static int _merge_by_type(void *x, void *arg)
{
	list_t *merged_list = arg;
	gres_slurmd_conf_t *rec = x;
	gres_slurmd_conf_t *found =
		list_find_first(merged_list, _get_match, rec);

	if (!found) {
		list_append(merged_list, rec);
		return SLURM_SUCCESS;
	}

	found->count++;
	return SLURM_SUCCESS;
}

/*
 * Callback appending one GRES record to a comma-separated string in the form
 * "name:[type:]count".
 *
 * x   - (in) gres_slurmd_conf_t record; skipped if NULL or unnamed
 * arg - (in/out) char ** to the string being built; *arg may be NULL on the
 *       first call
 *
 * Always returns SLURM_SUCCESS.
 */
static int _slurm_conf_gres_str(void *x, void *arg)
{
	gres_slurmd_conf_t *gres_slurmd_conf = x;
	char **gres_str = arg;
	bool has_type;

	if (!gres_slurmd_conf || !gres_slurmd_conf->name)
		return SLURM_SUCCESS;

	has_type = gres_slurmd_conf->type_name &&
		   gres_slurmd_conf->type_name[0];

	/*
	 * count is a uint64_t, so it is printed with PRIu64. A separator is
	 * needed only once the string already exists.
	 */
	xstrfmtcat(*gres_str, "%s%s:%s%s%"PRIu64,
		   *gres_str ? "," : "",
		   gres_slurmd_conf->name,
		   has_type ? gres_slurmd_conf->type_name : "",
		   has_type ? ":" : "",
		   gres_slurmd_conf->count);

	return SLURM_SUCCESS;
}

/*
 * Try each GPU AutoDetect mechanism in turn and report what it finds.
 *
 * node_conf       - (in) passed to gpu_g_get_system_gpu_list()
 * first_gres_str  - (out) receives the GRES string of the first mechanism
 *                   that found GPUs, if *first_gres_str is not already set
 * autodetect_str  - (in/out) one "Found ..." line is appended per mechanism
 *                   that found GPUs, newline-separated
 *
 * autodetect_flags is overwritten with each mechanism tried.
 */
extern void gres_get_autodetected_gpus(node_config_load_t node_conf,
				       char **first_gres_str,
				       char **autodetect_str)
{
	/* Probe order; the UNSET entry terminates the list */
	int probe_order[] = {
		GRES_AUTODETECT_GPU_NVML,
		GRES_AUTODETECT_GPU_NVIDIA,
		GRES_AUTODETECT_GPU_RSMI,
		GRES_AUTODETECT_GPU_ONEAPI,
		GRES_AUTODETECT_GPU_NRT,
		GRES_AUTODETECT_UNSET
	};
	list_t *system_gpus = NULL, *by_type = NULL;
	char *found = NULL;
	char *option_name = NULL;

	for (int i = 0; probe_order[i] != GRES_AUTODETECT_UNSET; i++) {
		autodetect_flags = probe_order[i];
		if (gpu_plugin_init() != SLURM_SUCCESS)
			continue;

		system_gpus = gpu_g_get_system_gpu_list(&node_conf);
		if (system_gpus) {
			/* Merge per type, then render as a GRES string */
			by_type = list_create(NULL);
			list_for_each(system_gpus, _merge_by_type, by_type);
			list_for_each(by_type, _slurm_conf_gres_str, &found);
		}
		FREE_NULL_LIST(by_type);
		FREE_NULL_LIST(system_gpus);
		gpu_plugin_fini();

		if (!found)
			continue;

		/* Skip NVIDIA if NVML finds gpus */
		if (autodetect_flags == GRES_AUTODETECT_GPU_NVML)
			i++;

		option_name = _get_autodetect_flags_str();
		xstrfmtcat(*autodetect_str, "%sFound %s with Autodetect=%s (Substring of gpu name may be used instead)",
			   (*autodetect_str ? "\n" : ""),
			   found,
			   option_name);
		xfree(option_name);

		/* The first result is handed to the caller, later ones freed */
		if (*first_gres_str) {
			xfree(found);
		} else {
			*first_gres_str = found;
			found = NULL;
		}
	}
}

/*
 * Return true if the GRES record p has the same name (compared through
 * gres_build_id()) as the GRES remembered in prev_gres.
 */
static bool _same_gres_name_as_prev(prev_gres_flags_t *prev_gres,
				    gres_slurmd_conf_t *p)
{
	return (gres_build_id(p->name) == prev_gres->name_hash);
}

/*
 * Remember the flags, the GRES name (as gres_build_id() of p->name) and the
 * no_gpu_env setting, so the next gres.conf line can inherit them or be
 * checked against them.
 */
static void _set_prev_gres_flags(prev_gres_flags_t *prev_gres,
				 gres_slurmd_conf_t *p, uint32_t env_flags,
				 bool no_gpu_env)
{
	prev_gres->name_hash = gres_build_id(p->name);
	prev_gres->no_gpu_env = no_gpu_env;
	prev_gres->flags = env_flags;
}

/*
 * Parse a gres.conf Flags string.
 *
 * input             - (in) Flags value; keywords are matched as
 *                     case-insensitive substrings
 * no_gpu_env        - (out, optional) whether "no_gpu_env" appears in input
 * sharing_mentioned - (out, optional) set to true if "one_sharing" or
 *                     "all_sharing" appears; left untouched otherwise
 *
 * Returns the OR of the GRES_CONF_* flags for every keyword found.
 */
extern uint32_t gres_flags_parse(char *input, bool *no_gpu_env,
				 bool *sharing_mentioned)
{
	const struct {
		const char *keyword;
		uint32_t flag;
	} known[] = {
		{ "CountOnly", GRES_CONF_COUNT_ONLY },
		{ "nvidia_gpu_env", GRES_CONF_ENV_NVML },
		{ "amd_gpu_env", GRES_CONF_ENV_RSMI },
		{ "intel_gpu_env", GRES_CONF_ENV_ONEAPI },
		{ "opencl_env", GRES_CONF_ENV_OPENCL },
		{ "one_sharing", GRES_CONF_ONE_SHARING },
		{ "explicit", GRES_CONF_EXPLICIT },
	};
	uint32_t flags = 0;

	for (size_t i = 0; i < (sizeof(known) / sizeof(known[0])); i++) {
		if (xstrcasestr(input, known[i].keyword))
			flags |= known[i].flag;
	}

	/* String 'no_gpu_env' will clear all GPU env vars */
	if (no_gpu_env)
		*no_gpu_env = (xstrcasestr(input, "no_gpu_env") != NULL);

	if (sharing_mentioned &&
	    ((flags & GRES_CONF_ONE_SHARING) ||
	     xstrcasestr(input, "all_sharing")))
		*sharing_mentioned = true;

	return flags;
}

/*
 * Build gres_slurmd_conf_t record based upon a line from the gres.conf file
 *
 * dest     - (out) set to the newly allocated record when 1 is returned
 * type     - not referenced by this function
 * key      - not referenced by this function
 * value    - (in) if non-NULL it is used as the GRES name; if NULL the name
 *            is taken from the line's Name= option
 * line     - (in) the full line, only used in an error message
 * leftover - (in/out) remaining text to parse, advanced by s_p_parse_line()
 *
 * Returns 1 if a record was stored in *dest, or 0 if the line produced no
 * record (no Name= given, or the name matches no entry in gres_context).
 * Several kinds of invalid input end in fatal().
 *
 * NOTE: The Flags of the most recent line are remembered in a function-static
 * variable so a following line for the same GRES name can inherit them or be
 * checked against them.
 */
static int _parse_gres_config(void **dest, slurm_parser_enum_t type,
			      const char *key, const char *value,
			      const char *line, char **leftover)
{
	int i;
	s_p_hashtbl_t *tbl;
	gres_slurmd_conf_t *p;
	uint64_t tmp_uint64, mult;
	char *tmp_str, *last;
	bool cores_flag = false, cpus_flag = false;
	char *type_str = NULL;
	char *autodetect_string = NULL;
	bool autodetect = false, set_default_envs = true;
	/* Remember the last-set Flags value */
	static prev_gres_flags_t prev_gres = { 0 };

	/* reset_prev is set outside this function to forget remembered flags */
	if (reset_prev) {
		memset(&prev_gres, 0, sizeof(prev_gres));
		reset_prev = false;
	}

	/* Parse the remaining options of this line into a lookup table */
	tbl = s_p_hashtbl_create(_gres_options);
	s_p_parse_line(tbl, *leftover, leftover);

	p = xmalloc(sizeof(gres_slurmd_conf_t));

	/*
	 * Detect and set the node-local AutoDetect option only if
	 * NodeName is specified.
	 */
	if (s_p_get_string(&autodetect_string, "AutoDetect", tbl)) {
		if (value)
			error("gres.conf: In-line AutoDetect requires NodeName to take effect");
		else {
			_handle_local_autodetect(autodetect_string);
			/* AutoDetect was specified w/ NodeName */
			autodetect = true;
		}
		xfree(autodetect_string);
	}

	if (!value) {
		if (!s_p_get_string(&p->name, "Name", tbl)) {
			/* A line holding only AutoDetect is not an error */
			if (!autodetect)
				error("Invalid GRES data, no type name (%s)",
				      line);
			xfree(p);
			s_p_hashtbl_destroy(tbl);
			return 0;
		}
	} else {
		p->name = xstrdup(value);
	}

	if (s_p_get_string(&p->type_name, "Type", tbl)) {
		p->config_flags |= GRES_CONF_HAS_TYPE;
	}

	/* Cores= takes precedence over CPUs= when both are present */
	p->cpu_cnt = gres_cpu_cnt;
	if (s_p_get_string(&p->cpus, "Cores", tbl)) {
		cores_flag = true;
		type_str = "Cores";
	} else if (s_p_get_string(&p->cpus, "CPUs", tbl)) {
		cpus_flag = true;
		type_str = "CPUs";
	}
	if (cores_flag || cpus_flag) {
		char *local_cpus = NULL;
		if (xcpuinfo_ops.xcpuinfo_abs_to_mac) {
			i = (xcpuinfo_ops.xcpuinfo_abs_to_mac)
				(p->cpus, &local_cpus);
			if (i != SLURM_SUCCESS) {
				error("Invalid GRES data for %s, %s=%s",
				      p->name, type_str, p->cpus);
			}
		} else {
			/*
			 * Not converting Cores into machine format is only for
			 * testing or if we don't care about cpus_bitmap. The
			 * slurmd should always convert to machine format.
			 */
			debug("%s: %s=%s is not being converted to machine-local format",
			      __func__, type_str, p->cpus);
			local_cpus = xstrdup(p->cpus);
			i = SLURM_SUCCESS;
		}
		/* cpus_bitmap is only built if the conversion succeeded */
		if (i == SLURM_SUCCESS) {
			p->cpus_bitmap = bit_alloc(gres_cpu_cnt);
			if (!bit_size(p->cpus_bitmap) ||
			    bit_unfmt(p->cpus_bitmap, local_cpus)) {
				fatal("Invalid GRES data for %s, %s=%s (only %u CPUs are available)",
				      p->name, type_str, p->cpus, gres_cpu_cnt);
			}
		}
		xfree(local_cpus);
	}

	/* With File=/Files=, count starts as the number of listed files */
	if (s_p_get_string(&p->file, "File", tbl) ||
	    s_p_get_string(&p->file, "Files", tbl)) {
		p->count = _validate_file(p->file, p->name);
		p->config_flags |= GRES_CONF_HAS_FILE;
	}

	/* MultipleFiles: at least two files that together count as one GRES */
	if (s_p_get_string(&p->file, "MultipleFiles", tbl)) {
		int file_count = 0;
		if (p->config_flags & GRES_CONF_HAS_FILE)
			fatal("File and MultipleFiles options are mutually exclusive");
		p->count = 1;
		file_count = _validate_file(p->file, p->name);
		if (file_count < 2)
			fatal("MultipleFiles does not contain multiple files. Use File instead");
		p->config_flags |= GRES_CONF_HAS_FILE;
		p->config_flags |= GRES_CONF_HAS_MULT;
	}

	if (s_p_get_string(&tmp_str, "Flags", tbl)) {
		uint32_t env_flags = 0;
		bool no_gpu_env = false;
		bool sharing_mentioned = false;
		uint32_t flags = gres_flags_parse(tmp_str, &no_gpu_env,
						  &sharing_mentioned);

		/* The default for MPS is to have only one gpu sharing */
		if (!sharing_mentioned && !xstrcasecmp(p->name, "mps"))
			flags |= GRES_CONF_ONE_SHARING;

		/* Break out flags into env flags and non-env flags */
		env_flags = flags & GRES_CONF_ENV_SET;
		p->config_flags |= flags;

		if (env_flags && no_gpu_env)
			fatal("Invalid GRES record name=%s type=%s: Flags (%s) contains \"no_gpu_env\", which must be mutually exclusive to all other GRES env flags of same node and name",
			      p->name, p->type_name, tmp_str);

		/* Explicit Flags= suppresses the GPU default env flags below */
		set_default_envs = false;
		/*
		 * Make sure that Flags are consistent with each other
		 * if set for multiple lines of the same GRES.
		 */
		if (prev_gres.name_hash &&
		    _same_gres_name_as_prev(&prev_gres, p) &&
		    ((prev_gres.flags != flags) ||
		     (prev_gres.no_gpu_env != no_gpu_env)))
			fatal("Invalid GRES record name=%s type=%s: Flags (%s) does not match env flags for previous GRES of same node and name",
			      p->name, p->type_name, tmp_str);

		_set_prev_gres_flags(&prev_gres, p, flags,
				     no_gpu_env);

		xfree(tmp_str);
	} else if ((prev_gres.flags || prev_gres.no_gpu_env) &&
		   _same_gres_name_as_prev(&prev_gres, p)) {
		/* Inherit flags from previous GRES line with same name */
		set_default_envs = false;
		p->config_flags |= prev_gres.flags;
	} else {
		/* No Flags= and nothing to inherit: apply the MPS default */
		if (!xstrcasecmp(p->name, "mps"))
			p->config_flags |= GRES_CONF_ONE_SHARING;
	}

	/* Flags not set. By default, all env vars are set for GPUs */
	if (set_default_envs && !xstrcasecmp(p->name, "gpu")) {
		uint32_t env_flags = GRES_CONF_ENV_SET | GRES_CONF_ENV_DEF;
		p->config_flags |= env_flags;
		_set_prev_gres_flags(&prev_gres, p, env_flags, false);
	}

	/* An invalid Links value is dropped; the record itself is kept */
	if (s_p_get_string(&p->links, "Link",  tbl) ||
	    s_p_get_string(&p->links, "Links", tbl)) {
		if (gres_links_validate(p->links) < -1) {
			error("gres.conf: Ignoring invalid Links=%s for Name=%s",
			      p->links, p->name);
			xfree(p->links);
		}

	}

	_set_shared_flag(p->name, &p->config_flags);

	/* Explicit Count=: a number plus a suffix accepted by suffix_mult() */
	if (s_p_get_string(&tmp_str, "Count", tbl)) {
		tmp_uint64 = strtoll(tmp_str, &last, 10);
		if ((tmp_uint64 == LONG_MIN) || (tmp_uint64 == LONG_MAX)) {
			fatal("Invalid GRES record for %s, invalid count %s",
			      p->name, tmp_str);
		}
		if ((mult = suffix_mult(last)) != NO_VAL64) {
			tmp_uint64 *= mult;
		} else {
			fatal("Invalid GRES record for %s, invalid count %s",
			      p->name, tmp_str);
		}
		/*
		 * Some GRES can have count > 1 for a given file. For example,
		 * each GPU can have arbitrary count of MPS elements.
		 */
		if (p->count && (p->count != tmp_uint64) &&
		    !gres_id_shared(p->config_flags)) {
			fatal("Invalid GRES record for %s, count does not match File value",
			      p->name);
		}
		if (tmp_uint64 >= NO_VAL64) {
			fatal("GRES %s has invalid count value %"PRIu64,
			      p->name, tmp_uint64);
		}
		p->count = tmp_uint64;
		xfree(tmp_str);
	} else if (p->count == 0)
		/* No Count= and no count derived from files: default to 1 */
		p->count = 1;

	s_p_hashtbl_destroy(tbl);

	/* Find the GRES context whose name matches (case-insensitive) */
	for (i = 0; i < gres_context_cnt; i++) {
		if (xstrcasecmp(p->name, gres_context[i].gres_name) == 0)
			break;
	}
	if (i >= gres_context_cnt) {
		error("Ignoring gres.conf record, invalid name: %s", p->name);
		destroy_gres_slurmd_conf(p);
		return 0;
	}
	p->plugin_id = gres_context[i].plugin_id;
	*dest = (void *)p;
	return 1;
}
/*
 * Parse a gres.conf line that is scoped by NodeName.
 *
 * If gres_node_name is set and value (the NodeName expression) does not
 * contain it, the rest of the line is consumed and discarded and 0 is
 * returned. Otherwise the line is handed to _parse_gres_config() with a NULL
 * value and its result is returned.
 */
static int _parse_gres_config_node(void **dest, slurm_parser_enum_t type,
				   const char *key, const char *value,
				   const char *line, char **leftover)
{
	s_p_hashtbl_t *discard_tbl;
	bool skip = false;

	if (gres_node_name && value) {
		hostlist_t *node_hl = hostlist_create(value);

		/* An unparsable NodeName expression counts as no match */
		skip = !node_hl ||
		       (hostlist_find(node_hl, gres_node_name) < 0);
		if (node_hl)
			hostlist_destroy(node_hl);
	}

	if (!skip)
		return _parse_gres_config(dest, type, key, NULL, line,
					  leftover);

	debug("skipping GRES for NodeName=%s %s", value, line);
	/* Parse the line anyway so that leftover is advanced past it */
	discard_tbl = s_p_hashtbl_create(_gres_options);
	s_p_parse_line(discard_tbl, *leftover, leftover);
	s_p_hashtbl_destroy(discard_tbl);
	return 0;
}

static int _foreach_slurm_conf(void *x, void *arg)
{
	gres_state_t *gres_state_node = (gres_state_t *)x;
	slurm_gres_context_t *gres_ctx = (slurm_gres_context_t *)arg;
	gres_node_state_t *gres_ns;
	uint64_t tmp_count = 0;

	/* Only look at the GRES under the current plugin (same name) */
	if (gres_state_node->plugin_id != gres_ctx->plugin_id)
		return 0;

	gres_ns = (gres_node_state_t *)gres_state_node->gres_data;

	/*
	 * gres_cnt_config should equal the combined count from
	 * type_cnt_avail if there are no untyped GRES
	 */
	for (uint16_t i = 0; i < gres_ns->type_cnt; i++)
		tmp_count += gres_ns->type_cnt_avail[i];

	/* Forbid mixing typed and untyped GRES under the same name */
	if (gres_ns->type_cnt &&
	    gres_ns->gres_cnt_config > tmp_count)
		fatal("%s: Some %s GRES in slurm.conf have a type while others do not (gres_ns->gres_cnt_config (%"PRIu64") > tmp_count (%"PRIu64"))",
		      __func__, gres_ctx->gres_name,
		      gres_ns->gres_cnt_config, tmp_count);
	return 1;
}

/*
 * Run _foreach_slurm_conf() over every record of the slurm.conf GRES list for
 * the given plugin context. A NULL list is accepted and ignored.
 */
static void _validate_slurm_conf(list_t *slurm_conf_list,
				 slurm_gres_context_t *gres_ctx)
{
	if (slurm_conf_list)
		(void) list_for_each_nobreak(slurm_conf_list,
					     _foreach_slurm_conf, gres_ctx);
}

static int _foreach_gres_conf(void *x, void *arg)
{
	gres_slurmd_conf_t *gres_slurmd_conf = (gres_slurmd_conf_t *)x;
	foreach_gres_conf_t *foreach_gres_conf = (foreach_gres_conf_t *)arg;
	slurm_gres_context_t *gres_ctx = foreach_gres_conf->gres_ctx;
	bool orig_has_file, orig_has_type;

	/* Only look at the GRES under the current plugin (same name) */
	if (gres_slurmd_conf->plugin_id != gres_ctx->plugin_id)
		return 0;

	/*
	 * If any plugin of this type has this set it will virally set
	 * any other to be the same as we use the gres_ctx from here
	 * on out.
	 */
	if (gres_slurmd_conf->config_flags & GRES_CONF_EXPLICIT)
		gres_ctx->config_flags |= GRES_CONF_EXPLICIT;

	if (gres_slurmd_conf->config_flags & GRES_CONF_COUNT_ONLY)
		gres_ctx->config_flags |= GRES_CONF_COUNT_ONLY;

	if (gres_slurmd_conf->config_flags & GRES_CONF_HAS_FILE)
		gres_ctx->config_flags |= GRES_CONF_HAS_FILE;

	if (gres_slurmd_conf->config_flags & GRES_CONF_ONE_SHARING)
		gres_ctx->config_flags |= GRES_CONF_ONE_SHARING;
	/*
	 * Since there could be multiple types of the same plugin we
	 * need to only make sure we load it once.
	 */
	if (!(gres_ctx->config_flags & GRES_CONF_LOADED)) {
		/*
		 * Ignore return code, as we will still support the gres
		 * with or without the plugin.
		 */
		if (_load_plugin(gres_ctx) == SLURM_SUCCESS)
			gres_ctx->config_flags |= GRES_CONF_LOADED;
	}

	foreach_gres_conf->rec_count++;
	orig_has_file = gres_slurmd_conf->config_flags & GRES_CONF_HAS_FILE;
	if (foreach_gres_conf->new_has_file == -1) {
		if (gres_slurmd_conf->config_flags & GRES_CONF_HAS_FILE)
			foreach_gres_conf->new_has_file = 1;
		else
			foreach_gres_conf->new_has_file = 0;
	} else if ((foreach_gres_conf->new_has_file && !orig_has_file) ||
		   (!foreach_gres_conf->new_has_file && orig_has_file)) {
		fatal("gres.conf for %s, some records have \"File\" specification while others do not",
		      gres_ctx->gres_name);
	}
	orig_has_type = gres_slurmd_conf->config_flags &
		GRES_CONF_HAS_TYPE;
	if (foreach_gres_conf->new_has_type == -1) {
		if (gres_slurmd_conf->config_flags &
		    GRES_CONF_HAS_TYPE) {
			foreach_gres_conf->new_has_type = 1;
		} else
			foreach_gres_conf->new_has_type = 0;
	} else if ((foreach_gres_conf->new_has_type && !orig_has_type) ||
		   (!foreach_gres_conf->new_has_type && orig_has_type)) {
		fatal("gres.conf for %s, some records have \"Type=\" specification while others do not",
		      gres_ctx->gres_name);
	}

	if (!foreach_gres_conf->new_has_file &&
	    !foreach_gres_conf->new_has_type &&
	    (foreach_gres_conf->rec_count > 1)) {
		fatal("gres.conf duplicate records for %s",
		      gres_ctx->gres_name);
	}

	if (foreach_gres_conf->new_has_file)
		gres_ctx->config_flags |= GRES_CONF_HAS_FILE;

	return 0;
}

/*
 * Validate all gres.conf records of one plugin and make sure a load of the
 * plugin has been attempted.
 *
 * gres_conf_list - (in) the gres.conf record list
 * gres_ctx       - (in/out) plugin context; config_flags is updated
 */
static void _validate_gres_conf(list_t *gres_conf_list,
				slurm_gres_context_t *gres_ctx)
{
	foreach_gres_conf_t scan = {
		.rec_count = 0,
		.new_has_type = -1,
		.new_has_file = -1,
		.gres_ctx = gres_ctx,
	};

	(void) list_for_each_nobreak(gres_conf_list, _foreach_gres_conf,
				     &scan);

	if (gres_ctx->config_flags & GRES_CONF_LOADED) {
		/* Remove as this is only really used locally */
		gres_ctx->config_flags &= (~GRES_CONF_LOADED);
		return;
	}

	/*
	 * No gres.conf line loaded this plugin. Try to load it anyway for
	 * AutoDetect's sake; on failure treat it as a count only GRES, since
	 * the stepd would otherwise try to load it.
	 */
	if (_load_plugin(gres_ctx) != SLURM_SUCCESS)
		gres_ctx->config_flags |= GRES_CONF_COUNT_ONLY;
}

/*
 * Callback consuming the count of one slurm.conf GRES specification against
 * one gres.conf record. Whatever count remains on a gres.conf record after
 * all slurm.conf specifications were applied is the amount slurm.conf did not
 * account for.
 *
 * x   - (in/out) gres_slurmd_conf_t; should be a temporary copy, since its
 *       count is reduced
 * arg - (in/out) conf_cnt_t holding the type_name and the remaining count of
 *       the current slurm.conf GRES specification
 *
 * Returns -1 once the slurm.conf specification is used up by this record,
 * otherwise 0.
 */
static int _foreach_compare_conf_counts(void *x, void *arg)
{
	gres_slurmd_conf_t *rec = x;
	conf_cnt_t *budget = arg;

	/* Note: plugin type filter already applied */
	/* A typed gres.conf record only counts against the same type */
	if (rec->type_name &&
	    xstrcasecmp(rec->type_name, budget->type_name))
		return 0;

	if (rec->count <= budget->count) {
		/* Record fully covered; keep the rest for later records */
		budget->count -= rec->count;
		rec->count = 0;
		return 0;
	}

	/* This slurm.conf GRES specification is now used up */
	rec->count -= budget->count;
	return -1;
}

/*
 * Callback appending a reduced copy (name, type_name and count only) of a
 * gres.conf record to check_conf->gres_conf_list, provided the record belongs
 * to the plugin in check_conf->gres_ctx.
 *
 * Always returns 0.
 */
static int _lite_copy_gres_slurmd_conf(void *x, void *arg)
{
	check_conf_t *check = arg;
	gres_slurmd_conf_t *src = x;
	gres_slurmd_conf_t *copy;

	if (src->plugin_id == check->gres_ctx->plugin_id) {
		copy = xmalloc(sizeof(*copy));
		copy->count = src->count;
		copy->name = xstrdup(src->name);
		copy->type_name = xstrdup(src->type_name);
		list_append(check->gres_conf_list, copy);
	}

	return 0;
}

/*
 * Callback applying the counts of one slurm.conf GRES state record to the
 * temporary gres.conf list in check_conf, through
 * _foreach_compare_conf_counts().
 *
 * An untyped record is applied once with its configured total; a typed record
 * is applied once per type with that type's available count.
 *
 * Always returns 0.
 */
static int _foreach_slurm_conf_mismatch_comp(void *x, void *arg)
{
	gres_state_t *state = x;
	check_conf_t *check = arg;
	gres_node_state_t *ns;
	conf_cnt_t budget = { 0 };

	if (state->plugin_id != check->gres_ctx->plugin_id)
		return 0;

	ns = state->gres_data;
	if (ns->type_name) {
		/* Typed: one pass per type */
		for (int t = 0; t < ns->type_cnt; ++t) {
			budget.count = ns->type_cnt_avail[t];
			budget.type_name = ns->type_name[t];
			(void) list_for_each(check->gres_conf_list,
					     _foreach_compare_conf_counts,
					     &budget);
		}
	} else {
		/* Untyped: a single pass with the configured total */
		budget.count = ns->gres_cnt_config;
		budget.type_name = NULL;
		(void) list_for_each(check->gres_conf_list,
				     _foreach_compare_conf_counts,
				     &budget);
	}

	return 0;
}

/*
 * Callback emitting a warning for a gres.conf record whose count is still
 * non-zero, i.e. that has more configured than slurm.conf expected.
 *
 * x   - (in) gres_slurmd_conf_t record
 * arg - not referenced
 *
 * Always returns 0.
 */
int _print_slurm_conf_mismatch(void *x, void *arg)
{
	gres_slurmd_conf_t *rec = x;
	const char *sep = "";
	const char *type = "";

	if (!rec->count)
		return 0;

	/* Print ":type" after the name only for typed records */
	if (rec->type_name) {
		sep = ":";
		type = rec->type_name;
	}

	warning("A line in gres.conf for GRES %s%s%s has %"PRIu64" more configured than expected in slurm.conf. Ignoring extra GRES.",
		rec->name, sep, type, rec->count);

	return 0;
}

/*
 * Compare the gres.conf records of one plugin against the slurm.conf GRES and
 * warn about every gres.conf record that has more configured than slurm.conf
 * accounts for.
 *
 * slurm_conf_list - (in) The slurm.conf GRES list.
 * gres_conf_list  - (in) The gres.conf GRES list. It is not modified.
 * gres_ctx        - (in) Which GRES plugin we are currently working in.
 */
static void _check_conf_mismatch(list_t *slurm_conf_list, list_t *gres_conf_list,
				 slurm_gres_context_t *gres_ctx)
{
	check_conf_t check = {
		.gres_ctx = gres_ctx,
	};

	/* E.g. slurm_conf_list will be NULL in the case of --gpu-bind */
	if (!gres_conf_list || !slurm_conf_list)
		return;

	/*
	 * Step 1: make reduced copies of this plugin's gres.conf records, so
	 * that their counts can be consumed without touching the originals.
	 */
	check.gres_conf_list = list_create(destroy_gres_slurmd_conf);
	(void) list_for_each(gres_conf_list, _lite_copy_gres_slurmd_conf,
			     &check);

	/* Step 2: subtract what slurm.conf expects from the copies. */
	(void) list_for_each(slurm_conf_list,
			     _foreach_slurm_conf_mismatch_comp, &check);

	/* Step 3: anything left on a copy was not accounted for. */
	(void) list_for_each(check.gres_conf_list,
			     _print_slurm_conf_mismatch, NULL);

	FREE_NULL_LIST(check.gres_conf_list);
}

/*
 * Match callback selecting a gres.conf record for a slurm.conf GRES.
 *
 * x   - (in/out) gres_slurmd_conf_t record from gres.conf
 * key - (in) conf_cnt_t holding the plugin context and the type name of the
 *       slurm.conf GRES; a NULL type name means an untyped GRES
 *
 * Returns 1 if the record belongs to the same plugin and either the
 * slurm.conf GRES is untyped or the type names are equal (case-insensitive),
 * otherwise 0.
 *
 * NOTE: When the slurm.conf GRES is untyped, the matched record has its type
 * name freed and GRES_CONF_HAS_TYPE cleared (18.08 stylings).
 */
static int _match_type(void *x, void *key)
{
	gres_slurmd_conf_t *rec = x;
	conf_cnt_t *want = key;

	if (rec->plugin_id != want->gres_ctx->plugin_id)
		return 0;

	if (want->type_name)
		return !xstrcasecmp(rec->type_name, want->type_name);

	/* Untyped request: take this record and strip its type */
	xfree(rec->type_name);
	rec->config_flags &= ~GRES_CONF_HAS_TYPE;

	return 1;
}

/*
 * Append a new GRES conf record to merge_gres->new_list. Only the name,
 * plugin_id and cpu_cnt are filled in, taken from merge_gres.
 *
 * merge_gres - (in/out) supplies gres_ctx and cpu_cnt; new_list is appended to
 */
static void _add_gres_config_empty(merge_gres_t *merge_gres)
{
	slurm_gres_context_t *ctx = merge_gres->gres_ctx;
	gres_slurmd_conf_t *empty_rec = xmalloc(sizeof(*empty_rec));

	empty_rec->plugin_id = ctx->plugin_id;
	empty_rec->name = xstrdup(ctx->gres_name);
	empty_rec->cpu_cnt = merge_gres->cpu_cnt;

	list_append(merge_gres->new_list, empty_rec);
}

/*
 * Truncate the File hostrange string of a GRES record to be at most
 * new_count entries. Entries are removed from the end.
 *
 * gres_slurmd_conf - (in/out) The GRES record to modify.
 * new_count        - (in) The new number of entries in File
 *
 * The record is left untouched if File cannot be parsed or if it already has
 * new_count or fewer entries.
 */
static void _set_file_subset(gres_slurmd_conf_t *gres_slurmd_conf,
			     uint64_t new_count)
{
	/* Convert file to hostrange */
	hostlist_t *hl = hostlist_create(gres_slurmd_conf->file);
	unsigned long old_count;

	/*
	 * hostlist_create() is treated as fallible by its other callers in
	 * this file, so do not count or pop from a list that was not created.
	 */
	if (!hl) {
		error("%s: can't parse File=%s", __func__,
		      gres_slurmd_conf->file);
		return;
	}

	old_count = hostlist_count(hl);
	if (new_count >= old_count) {
		hostlist_destroy(hl);
		/* Nothing to do */
		return;
	}

	/*
	 * Drop entries from the end until only new_count remain. The index
	 * is unsigned so it compares cleanly against new_count.
	 */
	for (unsigned long i = old_count; i > new_count; --i)
		free(hostlist_pop(hl));

	/* old_count is an unsigned long, hence %lu */
	debug3("%s: Truncating %s:%s File from (%lu) %s", __func__,
	       gres_slurmd_conf->name, gres_slurmd_conf->type_name, old_count,
	       gres_slurmd_conf->file);

	/* Set file to the new subset */
	xfree(gres_slurmd_conf->file);
	gres_slurmd_conf->file = hostlist_ranged_string_xmalloc(hl);

	debug3("%s: to (%"PRIu64") %s", __func__, new_count,
	       gres_slurmd_conf->file);
	hostlist_destroy(hl);
}

/*
 * A continuation of _merge_gres(), invoked once per slurm.conf GRES record
 * (or once per type of that record when it is typed).
 *
 * merge_gres - (in/out) Merge state. gres_conf_list is consumed from,
 *		new_list is appended to, and gres_ctx/cpu_cnt describe the
 *		plugin and node being processed.
 * count      - (in) The count of the slurm.conf GRES record.
 * type_name  - (in) The type of the slurm.conf GRES record, or NULL.
 */
static void _merge_gres2(merge_gres_t *merge_gres,
			 uint64_t count, char *type_name)
{
	gres_slurmd_conf_t *match;
	gres_slurmd_conf_t gres_slurmd_conf = {
		.cpu_cnt = merge_gres->cpu_cnt,
		.name = merge_gres->gres_ctx->gres_name,
		.type_name = type_name,
	};
	conf_cnt_t conf_cnt = {
		.count = count,
		.gres_ctx = merge_gres->gres_ctx,
		.type_name = type_name,
	};

	/* A slurm.conf count of 0 needs no merging at all */
	if (!count)
		return;

	/*
	 * Several gres.conf lines may together make up one slurm.conf GRES
	 * line (they can differ in Cores and Links). Keep pulling matching
	 * gres.conf records until the slurm.conf count is used up or no more
	 * records match.
	 */
	while (count &&
	       (match = list_remove_first(merge_gres->gres_conf_list,
					  _match_type, &conf_cnt))) {
		list_append(merge_gres->new_list, match);

		debug3("%s: From gres.conf, using %s:%s:%"PRIu64":%s", __func__,
		       match->name, match->type_name, match->count,
		       match->file);

		if (match->count <= count) {
			/* Entire gres.conf line fits; deduct it and go on */
			count -= match->count;
			continue;
		}

		/*
		 * The gres.conf line holds more than slurm.conf still allows:
		 * clamp its count, and its File range when it has one.
		 * NOTE: a MultipleFiles GRES does not get here according to
		 * the original author, since its count is 1 and count >= 1.
		 */
		match->count = count;
		if (match->file)
			_set_file_subset(match, count);

		/* Everything from slurm.conf is now accounted for */
		count = 0;
	}

	if (!count)
		return;

	/*
	 * Some of the slurm.conf count has no gres.conf counterpart. Build a
	 * record for the remainder from the slurm.conf data alone.
	 */

	/* Set default env flags, and allow AutoDetect to override */
	if (!xstrcasecmp(merge_gres->gres_ctx->gres_name, "gpu"))
		gres_slurmd_conf.config_flags |=
			(GRES_CONF_ENV_SET | GRES_CONF_ENV_DEF);
	if (merge_gres->gres_ctx->config_flags & GRES_CONF_COUNT_ONLY)
		gres_slurmd_conf.config_flags |= GRES_CONF_COUNT_ONLY;

	gres_slurmd_conf.count = count;

	add_gres_to_list(merge_gres->new_list, &gres_slurmd_conf);
}

/*
 * list_for_each() callback: merge a single slurm.conf GRES specification with
 * any relevant gres.conf records and append the result to the new list.
 *
 * x   - (in) A slurm.conf GRES record (gres_state_t).
 * arg - (in/out) Merge state (merge_gres_t) holding the gres.conf list, the
 *	 new merged list, the GRES plugin context and the node CPU count.
 *
 * Always returns 0. Records belonging to another plugin are ignored.
 */
static int _merge_gres(void *x, void *arg)
{
	merge_gres_t *merge_gres = arg;
	gres_state_t *gres_state_node = x;
	gres_node_state_t *gres_ns;

	/* Only handle records of the plugin currently being merged */
	if (gres_state_node->plugin_id != merge_gres->gres_ctx->plugin_id)
		return 0;

	gres_ns = gres_state_node->gres_data;

	if (!gres_ns->type_cnt) {
		/* Untyped GRES: one merge using the configured total */
		_merge_gres2(merge_gres, gres_ns->gres_cnt_config, NULL);
	} else {
		/* Typed GRES: one merge per type */
		for (int type_inx = 0; type_inx < gres_ns->type_cnt;
		     type_inx++)
			_merge_gres2(merge_gres,
				     gres_ns->type_cnt_avail[type_inx],
				     gres_ns->type_name[type_inx]);
	}

	return 0;
}

/*
 * Merge slurm.conf and gres.conf GRES configuration.
 * gres.conf can only work within what is outlined in slurm.conf. Every
 * gres.conf device that does not match up to a device in slurm.conf is
 * discarded when the old list contents are flushed at the end.
 *
 * node_conf       - (in) node configuration info (cpu count).
 * gres_conf_list  - (in/out) GRES data from gres.conf. This becomes the new
 *		     merged slurm.conf/gres.conf list.
 * slurm_conf_list - (in) GRES data from slurm.conf. May be NULL.
 *
 * NOTE(review): the placeholder record is added only when list_for_each()
 * does not return a positive value. _merge_gres() always returns 0, so
 * presumably that means "slurm_conf_list is NULL or empty" rather than
 * "no record matched this plugin" - confirm against list_for_each()'s
 * contract.
 */
static void _merge_config(node_config_load_t *node_conf, list_t *gres_conf_list,
			  list_t *slurm_conf_list)
{
	merge_gres_t merge_gres = {
		.cpu_cnt = node_conf->cpu_cnt,
		.gres_conf_list = gres_conf_list,
		.new_list = list_create(destroy_gres_slurmd_conf),
	};
	int ctx_inx;

	for (ctx_inx = 0; ctx_inx < gres_context_cnt; ctx_inx++) {
		merge_gres.gres_ctx = &gres_context[ctx_inx];

		/* Copy GRES configuration from slurm.conf when available */
		if (slurm_conf_list &&
		    (list_for_each(slurm_conf_list, _merge_gres,
				   &merge_gres) > 0))
			continue;

		/* Otherwise add a placeholder record for this plugin */
		_add_gres_config_empty(&merge_gres);
	}

	/* Replace the contents of gres_conf_list with the merged records */
	list_flush(gres_conf_list);
	list_transfer(gres_conf_list, merge_gres.new_list);
	FREE_NULL_LIST(merge_gres.new_list);
}

/*
 * Pack one GRES plugin context into buffer.
 *
 * gres_ctx - (in) The context to serialize.
 * buffer   - (in/out) Buffer the fields are appended to.
 *
 * The fields are written in the same order in which _unpack_gres_context()
 * reads them; the two functions must be changed together.
 */
static void _pack_gres_context(slurm_gres_context_t *gres_ctx, buf_t *buffer)
{
	/* gres_ctx->cur_plugin: DON'T PACK will be filled in on the other
	 * side */
	pack32(gres_ctx->config_flags, buffer);
	packstr(gres_ctx->gres_name, buffer);
	packstr(gres_ctx->gres_name_colon, buffer);
	/* The length is stored as an int; it travels as an unsigned 32-bit */
	pack32((uint32_t)gres_ctx->gres_name_colon_len, buffer);
	packstr(gres_ctx->gres_type, buffer);
	/* Device list of this context is serialized by its own helper */
	gres_send_stepd(buffer, gres_ctx->np_gres_devices);
	/* gres_ctx->ops: DON'T PACK will be filled in on the other side */
	pack32(gres_ctx->plugin_id, buffer);
	/* gres_ctx->plugin_list: DON'T PACK will be filled in on the other
	 * side */
	pack64(gres_ctx->total_cnt, buffer);
}

/*
 * Unpack one GRES plugin context from buffer, mirroring the field order of
 * _pack_gres_context().
 *
 * gres_ctx - (out) The context to fill in.
 * buffer   - (in/out) Buffer to read from.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR (after logging) when one of the
 * safe_unpack* calls jumps to unpack_error.
 */
static int _unpack_gres_context(slurm_gres_context_t *gres_ctx, buf_t *buffer)
{
	uint32_t uint32_tmp;

	/* gres_ctx->cur_plugin: filled in later with _load_plugin() */
	safe_unpack32(&gres_ctx->config_flags, buffer);
	safe_unpackstr(&gres_ctx->gres_name, buffer);
	safe_unpackstr(&gres_ctx->gres_name_colon, buffer);
	/* Length was packed as uint32_t; convert back to the int field */
	safe_unpack32(&uint32_tmp, buffer);
	gres_ctx->gres_name_colon_len = (int)uint32_tmp;
	safe_unpackstr(&gres_ctx->gres_type, buffer);
	/* NOTE(review): the outcome of gres_recv_stepd() is not checked here */
	gres_recv_stepd(buffer, &gres_ctx->np_gres_devices);
	/* gres_ctx->ops: filled in later with _load_plugin() */
	safe_unpack32(&gres_ctx->plugin_id, buffer);
	/* gres_ctx->plugin_list: filled in later with _load_plugin() */
	safe_unpack64(&gres_ctx->total_cnt, buffer);
	return SLURM_SUCCESS;

unpack_error:
	error("%s: unpack_error", __func__);
	return SLURM_ERROR;
}

/*
 * Pack one gres_slurmd_conf_t record into buffer. Used as the per-item
 * callback handed to slurm_pack_list().
 *
 * in               - (in) The gres_slurmd_conf_t record to serialize.
 * protocol_version - (in) Unused, see the comment in the body.
 * buffer           - (in/out) Buffer the fields are appended to.
 *
 * Field order must match _unpack_gres_slurmd_conf().
 */
static void _pack_gres_slurmd_conf(void *in, uint16_t protocol_version,
				   buf_t *buffer)
{
	gres_slurmd_conf_t *gres_slurmd_conf = (gres_slurmd_conf_t *)in;

	/*
	 * Ignore protocol_version at the time of writing this only deals with
	 * communication from the slurmd to a new stepd which should always be
	 * the same version.  This function is called from slurm_pack_list which
	 * requires protocol_version.
	 */

	/* Pack gres_slurmd_conf_t */
	pack32(gres_slurmd_conf->config_flags, buffer);
	pack64(gres_slurmd_conf->count, buffer);
	pack32(gres_slurmd_conf->cpu_cnt, buffer);
	packstr(gres_slurmd_conf->cpus, buffer);
	pack_bit_str_hex(gres_slurmd_conf->cpus_bitmap, buffer);
	packstr(gres_slurmd_conf->file, buffer);
	packstr(gres_slurmd_conf->links, buffer);
	packstr(gres_slurmd_conf->name, buffer);
	packstr(gres_slurmd_conf->type_name, buffer);
	packstr(gres_slurmd_conf->unique_id, buffer);
	pack32(gres_slurmd_conf->plugin_id, buffer);
}

/*
 * Unpack one gres_slurmd_conf_t record from buffer. Used as the per-item
 * callback handed to slurm_unpack_list().
 *
 * object           - (out) Set to the newly allocated record on success and
 *		      to NULL on failure.
 * protocol_version - (in) Unused, see the comment in the body.
 * buffer           - (in/out) Buffer to read from.
 *
 * Returns SLURM_SUCCESS or SLURM_ERROR. On error the partially filled record
 * is released with destroy_gres_slurmd_conf().
 * Field order must match _pack_gres_slurmd_conf().
 */
static int _unpack_gres_slurmd_conf(void **object, uint16_t protocol_version,
				    buf_t *buffer)
{
	gres_slurmd_conf_t *gres_slurmd_conf =
		xmalloc(sizeof(*gres_slurmd_conf));

	/*
	 * Ignore protocol_version at the time of writing this only deals with
	 * communication from the slurmd to a new stepd which should always be
	 * the same version.  This function is called from slurm_unpack_list
	 * which requires protocol_version.
	 */

	/* Unpack gres_slurmd_conf_t */
	safe_unpack32(&gres_slurmd_conf->config_flags, buffer);
	safe_unpack64(&gres_slurmd_conf->count, buffer);
	safe_unpack32(&gres_slurmd_conf->cpu_cnt, buffer);
	safe_unpackstr(&gres_slurmd_conf->cpus, buffer);
	unpack_bit_str_hex(&gres_slurmd_conf->cpus_bitmap, buffer);
	safe_unpackstr(&gres_slurmd_conf->file, buffer);
	safe_unpackstr(&gres_slurmd_conf->links, buffer);
	safe_unpackstr(&gres_slurmd_conf->name, buffer);
	safe_unpackstr(&gres_slurmd_conf->type_name, buffer);
	safe_unpackstr(&gres_slurmd_conf->unique_id, buffer);
	safe_unpack32(&gres_slurmd_conf->plugin_id, buffer);

	/* Hand ownership of the record to the caller */
	*object = gres_slurmd_conf;
	return SLURM_SUCCESS;

unpack_error:
	destroy_gres_slurmd_conf(gres_slurmd_conf);
	*object = NULL;
	return SLURM_ERROR;
}

/* gres_context_lock should be locked before this */
static void _pack_context_buf(void)
{
	FREE_NULL_BUFFER(gres_context_buf);

	gres_context_buf = init_buf(0);
	pack32(gres_context_cnt, gres_context_buf);
	if (gres_context_cnt <= 0) {
		debug3("%s: No GRES context count sent to stepd", __func__);
		return;
	}

	for (int i = 0; i < gres_context_cnt; i++) {
		slurm_gres_context_t *gres_ctx = &gres_context[i];
		_pack_gres_context(gres_ctx, gres_context_buf);
		if (gres_ctx->ops.send_stepd)
			(*(gres_ctx->ops.send_stepd))(gres_context_buf);
	}
}

/*
 * Rebuild the gres_context array from buffer: read the context count, size
 * the array, then for every context unpack it, load its plugin and let the
 * plugin's recv_stepd hook (if any) read its own data.
 *
 * Returns SLURM_SUCCESS or SLURM_ERROR.
 */
static int _unpack_context_buf(buf_t *buffer)
{
	uint32_t ctx_cnt;

	safe_unpack32(&ctx_cnt, buffer);
	gres_context_cnt = ctx_cnt;

	/* Nothing else was packed when there are no contexts */
	if (gres_context_cnt == 0)
		return SLURM_SUCCESS;

	xrecalloc(gres_context, gres_context_cnt, sizeof(slurm_gres_context_t));

	for (int ctx_inx = 0; ctx_inx < gres_context_cnt; ctx_inx++) {
		if (_unpack_gres_context(&gres_context[ctx_inx], buffer) !=
		    SLURM_SUCCESS)
			goto unpack_error;

		/* ops is only usable once the plugin has been loaded */
		(void)_load_plugin(&gres_context[ctx_inx]);

		if (!gres_context[ctx_inx].ops.recv_stepd)
			continue;
		(*(gres_context[ctx_inx].ops.recv_stepd))(buffer);
	}

	return SLURM_SUCCESS;

unpack_error:
	error("%s: failed", __func__);
	return SLURM_ERROR;
}

/*
 * Rebuild gres_conf_buf: autodetect_flags, then the number of records in
 * gres_conf_list, then (when non-zero) the list itself.
 * gres_context_lock should be locked before this
 */
static void _pack_gres_conf(void)
{
	int len = 0;

	FREE_NULL_BUFFER(gres_conf_buf);
	gres_conf_buf = init_buf(0);

	pack32(autodetect_flags, gres_conf_buf);

	/* A missing list is reported to the stepd as a count of zero */
	if (gres_conf_list)
		len = list_count(gres_conf_list);
	pack32(len, gres_conf_buf);

	if (!len)
		return;

	if (slurm_pack_list(gres_conf_list, _pack_gres_slurmd_conf,
			    gres_conf_buf, SLURM_PROTOCOL_VERSION) !=
	    SLURM_SUCCESS)
		error("%s: Failed to pack gres_conf_list", __func__);
}

/*
 * Read back what _pack_gres_conf() wrote: autodetect_flags, the record count
 * and, when that count is non-zero, gres_conf_list.
 *
 * Returns SLURM_SUCCESS or SLURM_ERROR.
 */
static int _unpack_gres_conf(buf_t *buffer)
{
	uint32_t flags;
	uint32_t rec_cnt;

	safe_unpack32(&flags, buffer);
	autodetect_flags = flags;

	safe_unpack32(&rec_cnt, buffer);
	if (rec_cnt == 0)
		return SLURM_SUCCESS;

	if (slurm_unpack_list(&gres_conf_list, _unpack_gres_slurmd_conf,
			      destroy_gres_slurmd_conf, buffer,
			      SLURM_PROTOCOL_VERSION) == SLURM_SUCCESS)
		return SLURM_SUCCESS;

unpack_error:
	error("%s: failed", __func__);
	return SLURM_ERROR;
}

/*
 * List helper function for gres_node_config_load.
 * Destructor for entries of its names_list; releases the entry with free()
 * rather than xfree().
 */
static void _free_name_list(void *x)
{
	free(x);
}

/*
 * Fill in dev->dev_desc (type, major, minor) from a stat() of dev->path.
 *
 * Returns SLURM_SUCCESS when the path is a block or character device.
 * Returns SLURM_ERROR when stat() fails or the path is some other file kind;
 * in the latter case major/minor have been set but type stays DEV_TYPE_NONE.
 */
static int _set_gres_device_desc(gres_device_t *dev)
{
	struct stat dev_stat;

	/* Start from "unknown" so failures leave a well defined descriptor */
	dev->dev_desc.type = DEV_TYPE_NONE;
	dev->dev_desc.major = NO_VAL;
	dev->dev_desc.minor = NO_VAL;

	if (stat(dev->path, &dev_stat) < 0) {
		error("%s: stat(%s): %m", __func__, dev->path);
		return SLURM_ERROR;
	}

	dev->dev_desc.major = major(dev_stat.st_rdev);
	dev->dev_desc.minor = minor(dev_stat.st_rdev);
	log_flag(GRES, "%s : %s major %d, minor %d", __func__, dev->path,
		 dev->dev_desc.major, dev->dev_desc.minor);

	if (S_ISBLK(dev_stat.st_mode)) {
		dev->dev_desc.type = DEV_TYPE_BLOCK;
		return SLURM_SUCCESS;
	}

	if (S_ISCHR(dev_stat.st_mode)) {
		dev->dev_desc.type = DEV_TYPE_CHAR;
		return SLURM_SUCCESS;
	}

	error("%s is not a valid character or block device, fix your gres.conf",
	      dev->path);
	return SLURM_ERROR;
}


/*
 * Creates and initializes a gres_device_t from a path, a bitmap index and a
 * unique_id.
 *
 * index     - (in) Device bitmap index stored in the new device.
 * one_name  - (in) Device file path; copied, the caller keeps ownership.
 * unique_id - (in) Unique id string; copied.
 *
 * dev_num is taken from the run of decimal digits at the very end of
 * one_name, or left at -1 when the name does not end in a digit.
 *
 * Returns the new device, or NULL when _set_gres_device_desc() fails. On
 * failure everything allocated here is released.
 */
static gres_device_t *_init_gres_device(int index, char *one_name,
					char *unique_id)
{
	int tmp, digit = -1;
	gres_device_t *gres_device = xmalloc(sizeof(gres_device_t));

	gres_device->dev_num = -1;
	gres_device->index = index;
	gres_device->path = xstrdup(one_name);
	gres_device->unique_id = xstrdup(unique_id);

	if (_set_gres_device_desc(gres_device) != SLURM_SUCCESS) {
		/* Release the duplicated strings too, not just the struct */
		xfree(gres_device->path);
		xfree(gres_device->unique_id);
		xfree(gres_device);
		return NULL;
	}

	/* Walk backwards to find where the trailing digits start */
	tmp = strlen(one_name);
	for (int i = 1;  i <= tmp; i++) {
		/* isdigit() requires a value representable as unsigned char */
		if (!isdigit((unsigned char) one_name[tmp - i]))
			break;
		digit = tmp - i;
	}

	/* dev_num keeps its initial -1 when there is no numeric suffix */
	if (digit >= 0)
		gres_device->dev_num = atoi(one_name + digit);

	return gres_device;
}

/*
 * Load the specific GRES plugins here.
 * Currently only the gpu plugin is initialized; its result code is returned.
 */
static int _load_specific_gres_plugins(void)
{
	return gpu_plugin_init();
}

/*
 * list_for_each() callback over gres.conf records: expand the File range of
 * a record belonging to config->gres_name into individual device names.
 *
 * x   - (in) A gres_slurmd_conf_t record.
 * arg - (in/out) foreach_fill_in_gres_devices_t carrying the running device
 *	 bitmap index, the highest device number seen, the list of names seen
 *	 so far, the output device list and the overall result code.
 *
 * Records without GRES_CONF_HAS_FILE, without a file string, or with another
 * GRES name are skipped. Always returns 0; a duplicate file name is reported
 * through fill_in_gres_devices->rc instead.
 */
static int _foreach_fill_in_gres_devices(void *x, void *arg)
{
	gres_slurmd_conf_t *gres_slurmd_conf = x;
	foreach_fill_in_gres_devices_t *fill_in_gres_devices = arg;
	node_config_load_t *config = fill_in_gres_devices->config;
	hostlist_t *hl;
	char *one_name;

	if (!(gres_slurmd_conf->config_flags & GRES_CONF_HAS_FILE) ||
	    !gres_slurmd_conf->file ||
	    xstrcmp(gres_slurmd_conf->name, config->gres_name))
		return 0;

	if (!(hl = hostlist_create(gres_slurmd_conf->file))) {
		error("can't parse gres.conf file record (%s)",
		      gres_slurmd_conf->file);
		return 0;
	}

	/* Each shifted name is owned here until freed or put on names_list */
	while ((one_name = hostlist_shift(hl))) {
		/* We don't care about gres_devices in slurmctld */
		if (config->in_slurmd) {
			gres_device_t *gres_device;
			/* The output list is created on first use */
			if (!*fill_in_gres_devices->gres_devices)
				*fill_in_gres_devices->gres_devices =
					list_create(destroy_gres_device);

			/*
			 * A device that cannot be initialized is dropped; the
			 * bitmap index is not advanced for it.
			 */
			if (!(gres_device = _init_gres_device(
				      fill_in_gres_devices->index, one_name,
				      gres_slurmd_conf->unique_id))) {
				free(one_name);
				continue;
			}

			/* Track the largest device number parsed from a path */
			if (gres_device->dev_num >
			    fill_in_gres_devices->max_dev_num)
				fill_in_gres_devices->max_dev_num =
					gres_device->dev_num;

			list_append(*fill_in_gres_devices->gres_devices,
				    gres_device);
		}

		/*
		 * Don't check for file duplicates or increment the
		 * device bitmap index if this is a MultipleFiles GRES
		 */
		if (gres_slurmd_conf->config_flags & GRES_CONF_HAS_MULT) {
			free(one_name);
			continue;
		}

		/* Only the first duplicate is reported (rc is then an error) */
		if ((fill_in_gres_devices->rc == SLURM_SUCCESS) &&
		    list_find_first(fill_in_gres_devices->names_list,
				    slurm_find_char_exact_in_list,
				    one_name)) {
			error("%s duplicate device file name (%s)",
			      config->gres_name, one_name);
			fill_in_gres_devices->rc = SLURM_ERROR;
		}

		/* names_list takes over ownership of one_name */
		list_append(fill_in_gres_devices->names_list, one_name);

		/* Increment device bitmap index */
		fill_in_gres_devices->index++;
	}
	hostlist_destroy(hl);

	/* All files of a MultipleFiles record share one bitmap index */
	if (gres_slurmd_conf->config_flags & GRES_CONF_HAS_MULT)
		fill_in_gres_devices->index++;

	return 0;
}

/*
 * list_for_each() callback over gres_device_t entries: give every device
 * that still has dev_num == -1 the next number above the current
 * max_dev_num, and log the device when the GRES debug flag is set.
 *
 * Always returns 0.
 */
static int _foreach_fill_in_gres_devices_dev_id(void *x, void *arg)
{
	foreach_fill_in_gres_devices_t *fill_args = arg;
	gres_device_t *dev = x;
	char *dev_id_str;

	/* No number could be parsed from the path: hand out a fresh one */
	if (dev->dev_num == -1) {
		fill_args->max_dev_num++;
		dev->dev_num = fill_args->max_dev_num;
	}

	/* Skip building the id string unless it is going to be logged */
	if (!(slurm_conf.debug_flags & DEBUG_FLAG_GRES))
		return 0;

	dev_id_str = gres_device_id2str(&dev->dev_desc);
	log_flag(GRES, "%s device number %d(%s):%s",
		 fill_args->config->gres_name,
		 dev->dev_num,
		 dev->path,
		 dev_id_str);
	xfree(dev_id_str);

	return 0;
}

/*
 * Generic node_config_load: expand the File entries of the gres.conf records
 * matching config->gres_name into *gres_devices, then number every device
 * that did not get a number from its path.
 *
 * gres_conf_list - (in) gres.conf records to scan.
 * config         - (in) Node configuration (GRES name, in_slurmd, ...).
 * gres_devices   - (in/out) Device list; created on demand by the callback.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when a duplicate device file name
 * was found.
 */
extern int gres_node_config_load(list_t *gres_conf_list,
				 node_config_load_t *config,
				 list_t **gres_devices)
{
	foreach_fill_in_gres_devices_t fill_args = {
		.config = config,
		.gres_devices = gres_devices,
		.index = 0,
		.max_dev_num = -1,
		.names_list = list_create(_free_name_list),
		.rc = SLURM_SUCCESS,
	};

	xassert(gres_conf_list);
	xassert(gres_devices);

	/* Pass 1: build devices and detect duplicate file names */
	(void) list_for_each(gres_conf_list, _foreach_fill_in_gres_devices,
			     &fill_args);

	/* The names were only needed for duplicate detection */
	FREE_NULL_LIST(fill_args.names_list);

	/* Pass 2: assign numbers to devices that still lack one */
	if (*gres_devices) {
		(void) list_for_each(*gres_devices,
				     _foreach_fill_in_gres_devices_dev_id,
				     &fill_args);
	}

	return fill_args.rc;
}

/*
 * Load this node's configuration (how many resources it has, topology, etc.)
 * IN cpu_cnt - Number of CPUs configured for node node_name. Must not be 0.
 * IN node_name - Name of the node to load the GRES config for.
 * IN gres_list - Node's GRES information as loaded from slurm.conf by slurmd
 * IN xcpuinfo_abs_to_mac - Pointer to xcpuinfo_abs_to_mac() funct. If
 *	specified, Slurm will convert gres_slurmd_conf->cpus_bitmap (a bitmap
 *	derived from gres.conf's "Cores" range string) into machine format
 *	(normal slurmd/stepd operation). If not, it will remain unconverted (for
 *	testing purposes or when unused).
 * IN xcpuinfo_mac_to_abs - Pointer to xcpuinfo_mac_to_abs() funct. Used to
 *	convert CPU affinities from machine format (as collected from NVML and
 *	others) into abstract format, for sanity checking purposes.
 * RET SLURM_SUCCESS, ESLURM_INVALID_CPU_COUNT when cpu_cnt is 0,
 *	ESLURM_UNSUPPORTED_GRES when AutoDetect is requested while running in
 *	slurmctld, or the first error reported while loading plugins/devices.
 * NOTE: Called from slurmd (and from slurmctld for each cloud node)
 * NOTE: Takes gres_context_lock, replaces the global gres_conf_list and
 *	rebuilds the buffers later sent to the stepd.
 */
extern int gres_g_node_config_load(uint32_t cpu_cnt, char *node_name,
				   list_t *gres_list,
				   void *xcpuinfo_abs_to_mac,
				   void *xcpuinfo_mac_to_abs)
{
	/* gres.conf keywords handled by the parser */
	static s_p_options_t _gres_conf_options[] = {
		{"AutoDetect", S_P_STRING},
		{"Name",     S_P_ARRAY, _parse_gres_config,  NULL},
		{"NodeName", S_P_ARRAY, _parse_gres_config_node, NULL},
		{NULL}
	};
	list_t *tmp_gres_conf_list = NULL;

	int count = 0, i, rc, rc2;
	struct stat config_stat;
	s_p_hashtbl_t *tbl;
	gres_slurmd_conf_t **gres_array;
	char *gres_conf_file = NULL;
	char *autodetect_string = NULL;
	bool in_slurmd = running_in_slurmd();

	node_config_load_t node_conf = {
		.cpu_cnt = cpu_cnt,
		.in_slurmd = in_slurmd,
		.xcpuinfo_mac_to_abs = xcpuinfo_mac_to_abs
	};

	/* Rejected before the lock is taken, so no cleanup is needed */
	if (cpu_cnt == 0) {
		error("%s: Invalid cpu_cnt of 0 for node %s",
		      __func__, node_name);
		return ESLURM_INVALID_CPU_COUNT;
	}

	if (xcpuinfo_abs_to_mac)
		xcpuinfo_ops.xcpuinfo_abs_to_mac = xcpuinfo_abs_to_mac;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);

	/* No GRES plugins configured: only the (empty) buffers get rebuilt */
	if (gres_context_cnt == 0) {
		rc = SLURM_SUCCESS;
		goto fini;
	}

	tmp_gres_conf_list = list_create(destroy_gres_slurmd_conf);
	gres_conf_file = get_extra_conf_path("gres.conf");
	if (stat(gres_conf_file, &config_stat) < 0) {
		/* A missing gres.conf is not an error */
		info("Can not stat gres.conf file (%s), using slurm.conf data",
		     gres_conf_file);
	} else {
		/* Globals read while the file is being parsed */
		if (xstrcmp(gres_node_name, node_name)) {
			xfree(gres_node_name);
			gres_node_name = xstrdup(node_name);
		}

		gres_cpu_cnt = cpu_cnt;
		tbl = s_p_hashtbl_create(_gres_conf_options);
		if (s_p_parse_file(tbl, NULL, gres_conf_file, 0, NULL) ==
		    SLURM_ERROR)
			fatal("error opening/reading %s", gres_conf_file);

		/* Overwrite unspecified local AutoDetect with global default */
		if (s_p_get_string(&autodetect_string, "Autodetect", tbl)) {
			_handle_global_autodetect(autodetect_string);
			xfree(autodetect_string);
		}

		/* AutoDetect cannot run on the slurmctld node */
		if (running_in_slurmctld() &&
		    autodetect_flags &&
		    !((autodetect_flags & GRES_AUTODETECT_GPU_FLAGS) &
		      GRES_AUTODETECT_GPU_OFF)) {
			rc = ESLURM_UNSUPPORTED_GRES;
			error("Cannot use AutoDetect on cloud/dynamic node \"%s\"",
			      gres_node_name);
			s_p_hashtbl_destroy(tbl);
			goto fini;
		}

		/*
		 * Move the parsed records onto the temporary list. The array
		 * slots are cleared so each record has a single owner.
		 */
		if (s_p_get_array((void ***) &gres_array,
				  &count, "Name", tbl)) {
			for (i = 0; i < count; i++) {
				list_append(tmp_gres_conf_list, gres_array[i]);
				gres_array[i] = NULL;
			}
		}
		if (s_p_get_array((void ***) &gres_array,
				  &count, "NodeName", tbl)) {
			for (i = 0; i < count; i++) {
				list_append(tmp_gres_conf_list, gres_array[i]);
				gres_array[i] = NULL;
			}
		}
		s_p_hashtbl_destroy(tbl);
	}
	/* Swap the freshly parsed list in as the global gres_conf_list */
	FREE_NULL_LIST(gres_conf_list);
	gres_conf_list = tmp_gres_conf_list;
	tmp_gres_conf_list = NULL;

	/* Validate gres.conf and slurm.conf somewhat before merging */
	for (i = 0; i < gres_context_cnt; i++) {
		_validate_slurm_conf(gres_list, &gres_context[i]);
		_validate_gres_conf(gres_conf_list, &gres_context[i]);
		_check_conf_mismatch(gres_list, gres_conf_list,
				     &gres_context[i]);
	}

	/* Merge slurm.conf and gres.conf together into gres_conf_list */
	_merge_config(&node_conf, gres_conf_list, gres_list);

	if ((rc = _load_specific_gres_plugins()) != SLURM_SUCCESS) {
		goto fini;
	}

	/*
	 * Let each plugin load its node configuration. Plugins without a
	 * node_config_load hook fall back to the generic loader when the
	 * context has File entries; others are skipped.
	 */
	for (i = 0; i < gres_context_cnt; i++) {
		node_conf.gres_name = gres_context[i].gres_name;
		if (gres_context[i].ops.node_config_load)
			rc2 = (*(gres_context[i].ops.node_config_load))(
				gres_conf_list, &node_conf);
		else if (gres_context[i].config_flags & GRES_CONF_HAS_FILE) {
			rc2 = gres_node_config_load(
				gres_conf_list, &node_conf,
				&gres_context[i].np_gres_devices);
		} else
			continue;

		/* Keep the first failure; later ones do not overwrite it */
		if (rc == SLURM_SUCCESS)
			rc = rc2;
	}

	/* Postprocess gres_conf_list after all plugins' node_config_load */

	/* Remove every GPU with an empty File */
	(void) list_delete_all(gres_conf_list, _find_fileless_gres,
			       &gpu_plugin_id);

	list_for_each(gres_conf_list, _log_gres_slurmd_conf, NULL);

	for (i = 0; i < gres_context_cnt; i++) {
		list_for_each(gres_conf_list, _post_plugin_gres_conf,
			      &gres_context[i]);
	}

fini:
	/*
	 * We no longer need the gpu plugin unless this option is set:
	 * AcctGatherEnergyType=acct_gather_energy/gpu
	 * Note: slurmstepds may still load gpu plugin for gpu_g_usage_read()
	 * unless JobAcctGatherParams=DisableGPUAcct is set
	 */
	if (!in_slurmd || !xstrstr(slurm_conf.acct_gather_energy_type, "gpu"))
		gpu_plugin_fini();
	xfree(gres_conf_file);
	/* Non-NULL only when an error path skipped the list swap above */
	FREE_NULL_LIST(tmp_gres_conf_list);
	/* Both buffers are rebuilt on every path, including failures */
	_pack_context_buf();
	_pack_gres_conf();
	slurm_mutex_unlock(&gres_context_lock);

	return rc;
}

/*
 * Pack this node's gres configuration into a buffer
 * IN/OUT buffer - message buffer to pack
 * RET SLURM_SUCCESS (no failure is detected here)
 *
 * Layout: protocol version, record count, then per record a magic value
 * followed by its fields.
 */
extern int gres_node_config_pack(buf_t *buffer)
{
	uint16_t version = SLURM_PROTOCOL_VERSION;
	uint16_t rec_cnt = 0;
	uint32_t magic = GRES_MAGIC;
	gres_slurmd_conf_t *rec;
	list_itr_t *itr;

	pack16(version, buffer);

	if (gres_conf_list)
		rec_cnt = list_count(gres_conf_list);
	pack16(rec_cnt, buffer);

	/* Header only when there is nothing to send */
	if (!rec_cnt)
		return SLURM_SUCCESS;

	/*
	 * slurm_pack_list() is deliberately not used: the matching unpack
	 * function needs 'node_name' throughout, which slurm_unpack_list()
	 * cannot be given. This runs rarely (only when the slurmd registers),
	 * so a hand written loop is good enough.
	 */
	itr = list_iterator_create(gres_conf_list);
	while ((rec = (gres_slurmd_conf_t *) list_next(itr))) {
		pack32(magic, buffer);
		pack64(rec->count, buffer);
		pack32(rec->cpu_cnt, buffer);
		pack32(rec->config_flags, buffer);
		pack32(rec->plugin_id, buffer);
		packstr(rec->cpus, buffer);
		packstr(rec->links, buffer);
		packstr(rec->name, buffer);
		packstr(rec->type_name, buffer);
		packstr(rec->unique_id, buffer);
	}
	list_iterator_destroy(itr);

	return SLURM_SUCCESS;
}

/*
 * Unpack this node's configuration from a buffer (built/packed by slurmd)
 * IN/OUT buffer - message buffer to unpack
 * IN node_name - name of node whose data is being unpacked
 * RET SLURM_SUCCESS or SLURM_ERROR
 *
 * The global gres_conf_list is replaced by a new list holding one record per
 * successfully processed entry. Records whose plugin is not configured, or
 * whose name does not match the plugin found by id, are logged and skipped.
 * Every temporary string is either handed over to a list record or freed,
 * on every path.
 */
extern int gres_node_config_unpack(buf_t *buffer, char *node_name)
{
	int i, rc = SLURM_SUCCESS;
	uint32_t cpu_cnt = 0, magic = 0, plugin_id = 0;
	uint64_t count64 = 0;
	uint16_t rec_cnt = 0, protocol_version = 0;
	uint32_t config_flags = 0;
	char *tmp_cpus = NULL, *tmp_links = NULL, *tmp_name = NULL;
	char *tmp_type = NULL;
	char *tmp_unique_id = NULL;
	gres_slurmd_conf_t *p;
	bool locked = false;
	slurm_gres_context_t *gres_ctx;

	xassert(gres_context_cnt >= 0);

	FREE_NULL_LIST(gres_conf_list);
	gres_conf_list = list_create(destroy_gres_slurmd_conf);

	safe_unpack16(&protocol_version, buffer);

	safe_unpack16(&rec_cnt, buffer);
	if (rec_cnt == 0)
		return SLURM_SUCCESS;
	if (rec_cnt > NO_VAL16)
		goto unpack_error;

	slurm_mutex_lock(&gres_context_lock);
	locked = true;
	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
		error("%s: protocol_version %hu not supported",
		      __func__, protocol_version);
		goto unpack_error;
	}
	for (i = 0; i < rec_cnt; i++) {
		bool new_has_file;
		bool orig_has_file;
		if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;

			safe_unpack64(&count64, buffer);
			safe_unpack32(&cpu_cnt, buffer);
			safe_unpack32(&config_flags, buffer);
			safe_unpack32(&plugin_id, buffer);
			safe_unpackstr(&tmp_cpus, buffer);
			safe_unpackstr(&tmp_links, buffer);
			safe_unpackstr(&tmp_name, buffer);
			safe_unpackstr(&tmp_type, buffer);
			safe_unpackstr(&tmp_unique_id, buffer);
		}

		/* Zero-count records are stored without further checks */
		if (!count64)
			goto empty;

		log_flag(GRES, "Node:%s Gres:%s Type:%s UniqueId:%s Flags:%s CPU_IDs:%s CPU#:%u Count:%"PRIu64" Links:%s",
			 node_name, tmp_name, tmp_type, tmp_unique_id,
			 gres_flags2str(config_flags), tmp_cpus, cpu_cnt,
			 count64, tmp_links);

		if (!(gres_ctx = _find_context_by_id(plugin_id))) {
			/*
			 * GresPlugins is inconsistently configured.
			 * Not a fatal error, but skip this data.
			 */
			error("%s: No plugin configured to process GRES data from node %s (Name:%s Type:%s PluginID:%u Count:%"PRIu64")",
			      __func__, node_name, tmp_name, tmp_type,
			      plugin_id, count64);
			xfree(tmp_cpus);
			xfree(tmp_links);
			xfree(tmp_name);
			xfree(tmp_type);
			xfree(tmp_unique_id);
			continue;
		}

		if (xstrcmp(gres_ctx->gres_name, tmp_name)) {
			/*
			 * Should have been caught in
			 * gres_init()
			 */
			error("%s: gres/%s duplicate plugin ID with %s, unable to process",
			      __func__, tmp_name,
			      gres_ctx->gres_name);
			/*
			 * The record is skipped, so release its strings here;
			 * the next iteration would otherwise overwrite the
			 * pointers and leak them.
			 */
			xfree(tmp_cpus);
			xfree(tmp_links);
			xfree(tmp_name);
			xfree(tmp_type);
			xfree(tmp_unique_id);
			continue;
		}
		new_has_file = config_flags & GRES_CONF_HAS_FILE;
		orig_has_file = gres_ctx->config_flags &
			GRES_CONF_HAS_FILE;
		if (orig_has_file && !new_has_file && count64) {
			error("%s: gres/%s lacks \"File=\" parameter for node %s",
			      __func__, tmp_name, node_name);
			config_flags |= GRES_CONF_HAS_FILE;
		}
		if (new_has_file && (count64 > MAX_GRES_BITMAP) &&
		    !gres_id_shared(config_flags)) {
			/*
			 * Avoid over-subscribing memory with
			 * huge bitmaps
			 */
			error("%s: gres/%s has \"File=\" plus very large "
			      "\"Count\" (%"PRIu64") for node %s, "
			      "resetting value to %d",
			      __func__, tmp_name, count64,
			      node_name, MAX_GRES_BITMAP);
			count64 = MAX_GRES_BITMAP;
		}

		/*
		 * If one node in the bunch said a gres has removed
		 * GRES_CONF_ONE_SHARING then remove it from the
		 * context.
		 */
		if ((gres_ctx->config_flags & GRES_CONF_LOADED) &&
		    gres_id_shared(config_flags))  {
			bool gc_one_sharing =
				gres_ctx->config_flags &
				GRES_CONF_ONE_SHARING;
			bool got_one_sharing =
				config_flags & GRES_CONF_ONE_SHARING;
			if (gc_one_sharing == got_one_sharing) {
			} else if (!gc_one_sharing && got_one_sharing) {
				log_flag(GRES, "gres/%s was already set up to share all ignoring one_sharing from %s",
					 tmp_name, node_name);
				config_flags &= ~GRES_CONF_ONE_SHARING;
			} else if (!got_one_sharing) {
				log_flag(GRES, "gres/%s was already set up to only share one, but we just found the opposite from %s. Removing flag.",
					 tmp_name, node_name);
				gres_ctx->config_flags &=
					~GRES_CONF_ONE_SHARING;
			}
		}

		/*
		 * If we read in from state we want to take the slurmd's view
		 * over our state.
		 */
		if (gres_ctx->config_flags & GRES_CONF_FROM_STATE)
			gres_ctx->config_flags = config_flags;
		else
			gres_ctx->config_flags |= config_flags;

		/*
		 * On the slurmctld we need to load the plugins to
		 * correctly set env vars.  We want to call this only
		 * after we have the config_flags so we can tell if we
		 * are CountOnly or not.
		 */
		if (!(gres_ctx->config_flags &
		      GRES_CONF_LOADED)) {
			(void)_load_plugin(gres_ctx);
			gres_ctx->config_flags |=
				GRES_CONF_LOADED;
		}
	empty:
		/*
		 * Hand every temporary string over to the new record and
		 * clear the temporary, so that a later unpack error cannot
		 * free memory that is owned by a record already on the list.
		 */
		p = xmalloc(sizeof(gres_slurmd_conf_t));
		p->config_flags = config_flags;
		p->count = count64;
		p->cpu_cnt = cpu_cnt;
		p->cpus = tmp_cpus;
		tmp_cpus = NULL;	/* Nothing left to xfree */
		p->links = tmp_links;
		tmp_links = NULL;	/* Nothing left to xfree */
		p->name = tmp_name;     /* Preserve for accounting! */
		tmp_name = NULL;	/* Nothing left to xfree */
		p->type_name = tmp_type;
		tmp_type = NULL;	/* Nothing left to xfree */
		p->plugin_id = plugin_id;
		p->unique_id = tmp_unique_id;
		tmp_unique_id = NULL;	/* Nothing left to xfree */
		if (gres_links_validate(p->links) < -1) {
			error("%s: Ignoring invalid Links=%s for Name=%s",
			      __func__, p->links, p->name);
			xfree(p->links);
		}
		list_append(gres_conf_list, p);
	}

	slurm_mutex_unlock(&gres_context_lock);
	return rc;

unpack_error:
	error("%s: unpack error from node %s", __func__, node_name);
	xfree(tmp_cpus);
	xfree(tmp_links);
	xfree(tmp_name);
	xfree(tmp_type);
	xfree(tmp_unique_id);
	if (locked)
		slurm_mutex_unlock(&gres_context_lock);
	return SLURM_ERROR;
}

/*
 * Free a gres_state_t and its gres_name. The gres_data member must already
 * have been released (and cleared) by the caller; this is asserted.
 * A NULL argument is ignored.
 */
static void _gres_state_delete_members(void *x)
{
	gres_state_t *gres_state = x;

	if (gres_state) {
		xfree(gres_state->gres_name);
		/* This must be freed beforehand */
		xassert(!gres_state->gres_data);
		xfree(gres_state);
	}
}

/*
 * Release all topology data of a GRES node state: the per-topology bitmaps
 * and type names first, then the arrays that held them.
 *
 * Each per-topology array is checked for NULL before it is indexed, so a
 * node state whose topo_cnt is set while some arrays were never allocated
 * can still be torn down.
 * topo_cnt itself is not modified here.
 */
static void _gres_node_state_delete_topo(gres_node_state_t *gres_ns)
{
	int i;

	for (i = 0; i < gres_ns->topo_cnt; i++) {
		if (gres_ns->topo_gres_bitmap)
			FREE_NULL_BITMAP(gres_ns->topo_gres_bitmap[i]);
		if (gres_ns->topo_core_bitmap)
			FREE_NULL_BITMAP(gres_ns->topo_core_bitmap[i]);
		if (gres_ns->topo_res_core_bitmap)
			FREE_NULL_BITMAP(gres_ns->topo_res_core_bitmap[i]);
		/* Guarded like its sibling arrays above */
		if (gres_ns->topo_type_name)
			xfree(gres_ns->topo_type_name[i]);
	}
	xfree(gres_ns->topo_gres_bitmap);
	xfree(gres_ns->topo_core_bitmap);
	xfree(gres_ns->topo_gres_cnt_alloc);
	xfree(gres_ns->topo_gres_cnt_avail);
	xfree(gres_ns->topo_res_core_bitmap);
	xfree(gres_ns->topo_type_id);
	xfree(gres_ns->topo_type_name);
}

/*
 * Free a GRES node state together with everything it owns: the allocation
 * bitmap, the gres_used string, the links matrix, all topology data and the
 * per-type arrays.
 */
static void _gres_node_state_delete(gres_node_state_t *gres_ns)
{
	FREE_NULL_BITMAP(gres_ns->gres_bit_alloc);
	xfree(gres_ns->gres_used);

	/* links_cnt is an array of link_len rows */
	if (gres_ns->links_cnt) {
		for (int row = 0; row < gres_ns->link_len; row++)
			xfree(gres_ns->links_cnt[row]);
		xfree(gres_ns->links_cnt);
	}

	_gres_node_state_delete_topo(gres_ns);

	/* Type names are individually allocated strings */
	for (int type_inx = 0; type_inx < gres_ns->type_cnt; type_inx++)
		xfree(gres_ns->type_name[type_inx]);

	xfree(gres_ns->type_cnt_alloc);
	xfree(gres_ns->type_cnt_avail);
	xfree(gres_ns->type_id);
	xfree(gres_ns->type_name);

	xfree(gres_ns);
}

/*
 * List destructor for an element placed on gres_list by
 * _node_config_validate(): frees the node state hanging off the element,
 * then the element itself.
 */
static void _gres_node_list_delete(void *list_element)
{
	gres_state_t *gres_state_node = list_element;

	_gres_node_state_delete(gres_state_node->gres_data);

	/* _gres_state_delete_members() asserts that gres_data is cleared */
	gres_state_node->gres_data = NULL;
	_gres_state_delete_members(gres_state_node);
}

/*
 * Account tmp_gres_cnt GRES of the given type on a node state.
 *
 * type         - (in) Type name. The special name "no_consume" (any case)
 *		  only sets gres_ns->no_consume and adds no type.
 * gres_ns      - (in/out) Node state whose type arrays are updated.
 * tmp_gres_cnt - (in) Count added to the type's available count.
 *
 * A type that is already known (matched by its built id) just has its
 * available count increased; otherwise the type arrays grow by one entry.
 */
extern void gres_add_type(char *type, gres_node_state_t *gres_ns,
			  uint64_t tmp_gres_cnt)
{
	uint32_t type_id;
	int i;

	if (!xstrcasecmp(type, "no_consume")) {
		gres_ns->no_consume = true;
		return;
	}

	type_id = gres_build_id(type);

	/* Known type: bump its count and finish */
	for (i = 0; i < gres_ns->type_cnt; i++) {
		if (gres_ns->type_id[i] == type_id) {
			gres_ns->type_cnt_avail[i] += tmp_gres_cnt;
			return;
		}
	}

	/* New type: i is the slot that the enlarged arrays gain */
	gres_ns->type_cnt++;
	gres_ns->type_cnt_alloc = xrealloc(gres_ns->type_cnt_alloc,
					   sizeof(uint64_t) *
					   gres_ns->type_cnt);
	gres_ns->type_cnt_avail = xrealloc(gres_ns->type_cnt_avail,
					   sizeof(uint64_t) *
					   gres_ns->type_cnt);
	gres_ns->type_id = xrealloc(gres_ns->type_id,
				    sizeof(uint32_t) * gres_ns->type_cnt);
	gres_ns->type_name = xrealloc(gres_ns->type_name,
				      sizeof(char *) * gres_ns->type_cnt);

	gres_ns->type_cnt_avail[i] += tmp_gres_cnt;
	gres_ns->type_id[i] = type_id;
	gres_ns->type_name[i] = xstrdup(type);
}

/*
 * Compute the total GRES count for a particular gres_name.
 * Note that a given gres_name can appear multiple times in the orig_config
 * string for multiple types (e.g. "gres=gpu:kepler:1,gpu:tesla:2").
 * IN/OUT gres_ns - set gres_cnt_config field in this structure; the
 *	type_cnt_avail entries are reset and refilled via gres_add_type()
 * IN orig_config - gres configuration from slurm.conf; not modified, a
 *	private copy is tokenized instead
 * IN gres_name - name of the gres type (e.g. "gpu")
 * IN gres_name_colon - gres name with appended colon
 * IN gres_name_colon_len - size of gres_name_colon
 * No value is returned; the total is stored in gres_ns->gres_cnt_config.
 */
static void _get_gres_cnt(gres_node_state_t *gres_ns, char *orig_config,
			  char *gres_name, char *gres_name_colon,
			  int gres_name_colon_len)
{
	char *node_gres_config, *tok, *last_tok = NULL;
	char *sub_tok, *last_sub_tok = NULL;
	char *num, *paren, *last_num = NULL;
	uint64_t gres_config_cnt = 0, tmp_gres_cnt = 0, mult;
	int i;

	xassert(gres_ns);
	if (orig_config == NULL) {
		gres_ns->gres_cnt_config = 0;
		return;
	}

	/* Per-type counts are rebuilt from scratch below */
	for (i = 0; i < gres_ns->type_cnt; i++) {
		gres_ns->type_cnt_avail[i] = 0;
	}

	/* strtok_r() writes into its input, so work on a copy */
	node_gres_config = xstrdup(orig_config);
	tok = strtok_r(node_gres_config, ",", &last_tok);
	while (tok) {
		/* Bare GRES name: a count of 1, and parsing stops here */
		if (!xstrcmp(tok, gres_name)) {
			gres_config_cnt = 1;
			break;
		}
		if (!xstrncmp(tok, gres_name_colon, gres_name_colon_len)) {
			paren = strrchr(tok, '(');
			if (paren)	/* Ignore socket binding info */
				paren[0] = '\0';
			/* The count, if any, follows the last colon */
			num = strrchr(tok, ':');
			if (!num) {
				error("Bad GRES configuration: %s", tok);
				break;
			}
			tmp_gres_cnt = strtoll(num + 1, &last_num, 10);
			if ((num[1] < '0') || (num[1] > '9')) {
				/*
				 * Type name, no count (e.g. "gpu:tesla").
				 * assume count of 1.
				 */
				tmp_gres_cnt = 1;
			} else if ((mult = suffix_mult(last_num)) != NO_VAL64) {
				tmp_gres_cnt *= mult;
				/* Cut the count off so only names remain */
				num[0] = '\0';
			} else {
				error("Bad GRES configuration: %s", tok);
				break;
			}

			gres_config_cnt += tmp_gres_cnt;

			/* Every remaining colon separated field is a type */
			sub_tok = strtok_r(tok, ":", &last_sub_tok);
			if (sub_tok)	/* Skip GRES name */
				sub_tok = strtok_r(NULL, ":", &last_sub_tok);
			while (sub_tok) {
				gres_add_type(sub_tok, gres_ns,
					      tmp_gres_cnt);
				sub_tok = strtok_r(NULL, ":", &last_sub_tok);
			}
		}
		tok = strtok_r(NULL, ",", &last_tok);
	}
	xfree(node_gres_config);

	gres_ns->gres_cnt_config = gres_config_cnt;
}

/*
 * Look up a GRES type in a node's type table.
 * IN gres_ns - node GRES state to search
 * IN type_id - type ID to look for
 * RET index of the first matching entry of gres_ns->type_id[], -1 if absent
 */
static int _find_gres_type(gres_node_state_t *gres_ns, uint32_t type_id)
{
	for (int inx = 0; inx < gres_ns->type_cnt; inx++) {
		if (gres_ns->type_id[inx] == type_id)
			return inx;
	}

	return -1;
}

/*
 * Validate the per-type GRES counts of a node and drop types that have
 * nothing available.
 *
 * 1. For each type, sum the available count of every type record with the
 *    same type ID. With config_overrides the sum becomes the type's available
 *    count, otherwise a sum below the type's own count is an error.
 * 2. Remove type records whose available count is zero (compacting the
 *    type_* arrays).
 * 3. Every topology record must reference a type that is still configured.
 *
 * IN gres_name - GRES name, only used to build reason_down
 * IN/OUT gres_ns - node GRES state, type_* arrays may be rewritten
 * IN config_overrides - trust the configuration rather than failing
 * IN/OUT reason_down - if not NULL, gets a description of the failure
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
static int _valid_gres_type(char *gres_name, gres_node_state_t *gres_ns,
			    bool config_overrides, char **reason_down)
{
	int num_type_rem = 0;

	if (gres_ns->type_cnt == 0)
		return SLURM_SUCCESS;

	for (int i = 0; i < gres_ns->type_cnt; i++) {
		uint64_t model_cnt = 0;

		/*
		 * type_cnt is known to be non-zero here, so the counts always
		 * come from the type records. (A former topo based fallback
		 * was unreachable and has been removed.)
		 */
		for (int j = 0; j < gres_ns->type_cnt; j++) {
			if (gres_ns->type_id[i] == gres_ns->type_id[j])
				model_cnt += gres_ns->type_cnt_avail[j];
		}

		if (config_overrides) {
			gres_ns->type_cnt_avail[i] = model_cnt;
		} else if (model_cnt < gres_ns->type_cnt_avail[i]) {
			if (reason_down) {
				xstrfmtcat(*reason_down,
					   "%s:%s count too low "
					   "(%"PRIu64" < %"PRIu64")",
					   gres_name, gres_ns->type_name[i],
					   model_cnt,
					   gres_ns->type_cnt_avail[i]);
			}
			return SLURM_ERROR;
		}
	}

	/*
	 * Remove types with 0 available. This happens when updating the type
	 * of a gres in slurm.conf during a reconfig
	 */
	for (int i = 0; i < gres_ns->type_cnt; i++) {
		if (!gres_ns->type_cnt_avail[i])
			num_type_rem++;
	}

	if (num_type_rem) {
		int tmp_cnt = gres_ns->type_cnt - num_type_rem;
		int dst = 0;
		uint64_t *tmp_type_cnt_alloc, *tmp_type_cnt_avail;
		uint32_t *tmp_type_id;
		char **tmp_type_name;

		tmp_type_id = xcalloc(tmp_cnt, sizeof(*tmp_type_id));
		tmp_type_cnt_alloc =
			xcalloc(tmp_cnt, sizeof(*tmp_type_cnt_alloc));
		tmp_type_cnt_avail =
			xcalloc(tmp_cnt, sizeof(*tmp_type_cnt_avail));
		tmp_type_name =
			xcalloc(tmp_cnt, sizeof(*tmp_type_name));

		/* Copy the surviving records, release the names of the rest */
		for (int src = 0; src < gres_ns->type_cnt; src++) {
			if (!gres_ns->type_cnt_avail[src]) {
				xfree(gres_ns->type_name[src]);
				continue;
			}
			tmp_type_cnt_alloc[dst] = gres_ns->type_cnt_alloc[src];
			tmp_type_cnt_avail[dst] = gres_ns->type_cnt_avail[src];
			tmp_type_id[dst] = gres_ns->type_id[src];
			/* Ownership of the name string moves to the new array */
			tmp_type_name[dst] = gres_ns->type_name[src];
			dst++;
		}

		xfree(gres_ns->type_cnt_alloc);
		xfree(gres_ns->type_cnt_avail);
		xfree(gres_ns->type_id);
		xfree(gres_ns->type_name);

		gres_ns->type_cnt_alloc = tmp_type_cnt_alloc;
		gres_ns->type_cnt_avail = tmp_type_cnt_avail;
		gres_ns->type_id = tmp_type_id;
		gres_ns->type_name = tmp_type_name;
		gres_ns->type_cnt -= num_type_rem;
	}

	/* A topology record with an unknown type means a config mismatch */
	for (int i = 0; i < gres_ns->topo_cnt; i++) {
		if (_find_gres_type(gres_ns, gres_ns->topo_type_id[i]) >= 0)
			continue;

		if (reason_down && (*reason_down == NULL)) {
			xstrfmtcat(*reason_down,
				   "%s type (%s) reported but not configured",
				   gres_name,
				   gres_ns->topo_type_name[i]);
		}
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}

/*
 * Allocate a node GRES state record.
 * Both the configured and the found counts start out as NO_VAL64, i.e. not
 * yet determined.
 * RET the new record
 */
static gres_node_state_t *_build_gres_node_state(void)
{
	gres_node_state_t *new_ns = xmalloc(sizeof(*new_ns));

	new_ns->gres_cnt_found = NO_VAL64;
	new_ns->gres_cnt_config = NO_VAL64;

	return new_ns;
}

/*
 * Initialize one GRES record of a node from the slurm.conf contents only.
 * IN orig_config - Gres information supplied from slurm.conf
 * IN/OUT gres_ctx - GRES context, total_cnt is increased by the node's count
 * IN/OUT gres_state_node - node GRES record, gres_data is created if missing
 */
static void _node_config_init(char *orig_config, slurm_gres_context_t *gres_ctx,
			      gres_state_t *gres_state_node)
{
	gres_node_state_t *gres_ns = gres_state_node->gres_data;

	if (!gres_ns) {
		gres_ns = _build_gres_node_state();
		gres_state_node->gres_data = gres_ns;
	}

	/* Nothing configured for this node at all */
	if (!orig_config || (orig_config[0] == '\0')) {
		gres_ns->gres_cnt_config = 0;
		return;
	}

	_get_gres_cnt(gres_ns, orig_config, gres_ctx->gres_name,
		      gres_ctx->gres_name_colon,
		      gres_ctx->gres_name_colon_len);

	gres_ctx->total_cnt += gres_ns->gres_cnt_config;

	/* Keep the count from recovered state when it is the higher one */
	if (gres_ns->gres_cnt_config > gres_ns->gres_cnt_avail)
		gres_ns->gres_cnt_avail = gres_ns->gres_cnt_config;

	/* Grow (never shrink) the allocation bitmap of a non-shared GRES */
	if (gres_ns->gres_bit_alloc != NULL) {
		if ((gres_ns->gres_cnt_avail >
		     bit_size(gres_ns->gres_bit_alloc)) &&
		    !gres_id_shared(gres_ctx->config_flags)) {
			bit_realloc(gres_ns->gres_bit_alloc,
				    gres_ns->gres_cnt_avail);
		}
	}
}

/*
 * Cross link the shared and the sharing GRES records of a node through
 * their alt_gres pointers for easy look up later.
 * Does nothing without a shared GRES; logs an error if there is a shared
 * GRES but no sharing one.
 */
static void _set_alt_gres(gres_state_t *gres_state_node_shared,
			  gres_state_t *gres_state_node_sharing)
{
	gres_node_state_t *shared_ns, *sharing_ns;

	if (!gres_state_node_shared)
		return;

	if (!gres_state_node_sharing) {
		error("we have a shared gres of '%s' but no gres that is sharing",
		      gres_state_node_shared->gres_name);
		return;
	}

	shared_ns = gres_state_node_shared->gres_data;
	sharing_ns = gres_state_node_sharing->gres_data;

	sharing_ns->alt_gres = gres_state_node_shared;
	shared_ns->alt_gres = gres_state_node_sharing;
}

/*
 * Build a node's gres records based only upon the slurm.conf contents.
 * One record per GRES context is found or created on the list and then
 * initialized; the shared/sharing records, if both have a configured count,
 * are linked to each other at the end.
 * IN orig_config - Gres information supplied from slurm.conf
 * IN/OUT gres_list - List of Gres records for this node to track usage,
 *		      created here if NULL and at least one context exists
 */
extern void gres_init_node_config(char *orig_config, list_t **gres_list)
{
	gres_state_t *sharing_state = NULL, *shared_state = NULL;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	if (!(*gres_list) && (gres_context_cnt > 0))
		*gres_list = list_create(_gres_node_list_delete);

	for (int i = 0; i < gres_context_cnt; i++) {
		gres_node_state_t *gres_ns;
		/* Find or create gres_state entry on the list */
		gres_state_t *gres_state_node =
			list_find_first(*gres_list, gres_find_id,
					&gres_context[i].plugin_id);

		if (!gres_state_node) {
			gres_state_node = gres_create_state(
				&gres_context[i], GRES_STATE_SRC_CONTEXT_PTR,
				GRES_STATE_TYPE_NODE, _build_gres_node_state());
			list_append(*gres_list, gres_state_node);
		}

		_node_config_init(orig_config, &gres_context[i],
				  gres_state_node);

		/* Only GRES actually configured on the node get linked */
		gres_ns = gres_state_node->gres_data;
		if (!gres_ns || !gres_ns->gres_cnt_config)
			continue;

		if (gres_id_sharing(gres_state_node->plugin_id))
			sharing_state = gres_state_node;
		else if (gres_id_shared(gres_state_node->config_flags))
			shared_state = gres_state_node;
	}
	slurm_mutex_unlock(&gres_context_lock);

	_set_alt_gres(shared_state, sharing_state);
}

/*
 * list_for_each() callback of _get_tot_from_slurmd_conf().
 * Adds one gres.conf record to the running totals if it belongs to the
 * plugin being searched for.
 * IN x - gres_slurmd_conf_t record
 * IN/OUT arg - tot_from_slurmd_conf_t accumulator
 * RET always 0
 */
static int _foreach_get_tot_from_slurmd_conf(void *x, void *arg)
{
	gres_slurmd_conf_t *conf_rec = x;
	tot_from_slurmd_conf_t *tot = arg;

	if (conf_rec->plugin_id == tot->plugin_id) {
		tot->config_flags |= conf_rec->config_flags;
		tot->gres_cnt += conf_rec->count;
		tot->rec_cnt++;

		/* Records with cores or a type name count as topology */
		if (conf_rec->type_name || conf_rec->cpus)
			tot->cpu_set_cnt++;
	}

	return 0;
}

/*
 * Determine GRES availability on some node from the records slurmd sent
 * (gres_conf_list).
 *
 * tot_from_slurmd_conf_t:
 * plugin_id IN - plugin number to search for
 * config_flags OUT - config flags from slurmd (OR of all matching records)
 * cpu_set_cnt OUT - count of matching records having cores or a type name
 * topo_cnt OUT - only reset to 0 here, the caller decides its final value
 * config_type_cnt OUT - Count of records for this GRES found in configuration,
 *		  each of these represents a different Type of GRES with
 *		  this name (e.g. GPU model)
 * gres_cnt OUT - total number of GRES available of this ID on this node (sum
 *		  across all records of this ID)
 * rec_cnt OUT - count of gres.conf records of this ID
 */
static void _get_tot_from_slurmd_conf(tot_from_slurmd_conf_t *slurmd_conf_tot)
{
	xassert(slurmd_conf_tot);

	/* plugin_id is input and must survive the reset */
	slurmd_conf_tot->rec_cnt = 0;
	slurmd_conf_tot->gres_cnt = 0;
	slurmd_conf_tot->topo_cnt = 0;
	slurmd_conf_tot->config_type_cnt = 0;
	slurmd_conf_tot->cpu_set_cnt = 0;
	slurmd_conf_tot->config_flags = 0;

	if (gres_conf_list != NULL) {
		(void) list_for_each(gres_conf_list,
				     _foreach_get_tot_from_slurmd_conf,
				     slurmd_conf_tot);
		slurmd_conf_tot->config_type_cnt = slurmd_conf_tot->rec_cnt;
	}
}

/*
 * Convert comma-delimited array of link counts to an integer array.
 * The values are stored in gres_ns->links_cnt[gres_inx][]; at most
 * gres_ns->link_len values are accepted and none may be below -2.
 * On any error the message is logged, handed to *reason_down if that is
 * still unset, and the row of a valid gres_inx is zeroed out.
 *
 * IN links - "Links=" string from gres.conf, NULL if not specified
 * IN node_name - name of the node, used in messages only
 * IN/OUT gres_ns - node GRES state holding links_cnt and link_len
 * IN gres_inx - index of the GRES (row of links_cnt) being described
 * IN gres_cnt - total GRES count on the node (kept for interface
 *		 compatibility, the row length is gres_ns->link_len)
 * IN/OUT reason_down - if not NULL and unset, takes ownership of the message
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
static int _links_str2array(char *links, char *node_name,
			    gres_node_state_t *gres_ns,
			    int gres_inx, int gres_cnt,
			    char **reason_down)
{
	char *start_ptr, *end_ptr = NULL, *tmp = NULL;
	int i = 0, rc = SLURM_SUCCESS;

	if (!links)	/* No "Links=" data */
		return SLURM_SUCCESS;
	if (gres_inx >= gres_ns->link_len) {
		/* Report the limit that was actually tested */
		tmp = xstrdup_printf("Invalid GRES index (%d >= %d)",
				     gres_inx, gres_ns->link_len);
		rc = SLURM_ERROR;
		goto end_it;
	}

	start_ptr = links;
	while (1) {
		gres_ns->links_cnt[gres_inx][i] =
			strtol(start_ptr, &end_ptr, 10);
		if (gres_ns->links_cnt[gres_inx][i] < -2) {
			tmp = xstrdup_printf("Invalid GRES Links value (%s) on node %s: Link value '%d' < -2",
					     links, node_name,
					     gres_ns->links_cnt[gres_inx][i]);

			gres_ns->links_cnt[gres_inx][i] = 0;
			rc = SLURM_ERROR;
			goto end_it;
		}
		if (end_ptr[0] == '\0')
			return SLURM_SUCCESS;
		if (end_ptr[0] != ',') {
			tmp = xstrdup_printf("Invalid GRES Links value (%s) on node %s: end_ptr[0]='%c' != ','",
					     links, node_name, end_ptr[0]);
			rc = SLURM_ERROR;
			goto end_it;
		}
		if (++i >= gres_ns->link_len) {
			tmp = xstrdup_printf("Invalid GRES Links value (%s) on node %s: i=%d >= link_len=%d.",
					     links, node_name,
					     i, gres_ns->link_len);
			rc = SLURM_ERROR;
			goto end_it;
		}
		start_ptr = end_ptr + 1;
	}

end_it:
	if (rc) {
		error("%s: %s If using AutoDetect the amount of GPUs configured in slurm.conf does not match what was detected. If this is intentional, please turn off AutoDetect and manually specify them in gres.conf.",
		      __func__, tmp);
		if (reason_down && !(*reason_down)) {
			*reason_down = tmp;
			tmp = NULL;
		} else
			xfree(tmp);

		/*
		 * create zeroed-out links array (NVLINK_NONE == 0)
		 *
		 * Only touch the row if gres_inx really addresses one: the
		 * index check above also ends up here, and then there is no
		 * such row (links_cnt may not even be allocated). Each row
		 * holds link_len entries.
		 */
		if (gres_inx < gres_ns->link_len) {
			memset(gres_ns->links_cnt[gres_inx], 0,
			       gres_ns->link_len * sizeof(int));
		}
	}

	return rc;
}

/*
 * Check that the topology records of a node provide at least as many GRES
 * of each type as the type records claim to be available.
 * Topology counts in excess of a type's available count are trimmed in
 * place (gres_ns->topo_gres_cnt_avail[]).
 *
 * IN gres_name - GRES name, only used to build reason_down
 * IN/OUT gres_ns - node GRES state
 * IN/OUT reason_down - if not NULL and unset, describes the first short type
 * RET true if valid (or nothing to check), false if some type is short
 */
static bool _valid_gres_types(char *gres_name, gres_node_state_t *gres_ns,
			      char **reason_down)
{
	if (!gres_ns->type_cnt || !gres_ns->topo_cnt)
		return true;

	for (int type_inx = 0; type_inx < gres_ns->type_cnt; type_inx++) {
		uint64_t type_avail = gres_ns->type_cnt_avail[type_inx];
		uint64_t gres_cnt_found = 0;

		for (int topo_inx = 0; topo_inx < gres_ns->topo_cnt;
		     topo_inx++) {
			uint64_t gres_sum;

			if (gres_ns->topo_type_id[topo_inx] !=
			    gres_ns->type_id[type_inx])
				continue;

			/* Cap the running total at the type's own count */
			gres_sum = gres_cnt_found +
				gres_ns->topo_gres_cnt_avail[topo_inx];
			if (gres_sum > type_avail) {
				gres_ns->topo_gres_cnt_avail[topo_inx] -=
					(gres_sum - type_avail);
			}
			gres_cnt_found +=
				gres_ns->topo_gres_cnt_avail[topo_inx];
		}

		if (gres_cnt_found >= type_avail)
			continue;

		if (reason_down && (*reason_down == NULL)) {
			xstrfmtcat(*reason_down,
				   "%s:%s count too low (%"PRIu64" < %"PRIu64")",
				   gres_name, gres_ns->type_name[type_inx],
				   gres_cnt_found, type_avail);
		}
		return false;
	}

	return true;
}

/*
 * Make gres_ns->gres_bit_alloc exactly gres_bits bits long.
 * A size of zero releases the bitmap; a missing bitmap is created.
 * IN/OUT gres_ns - node GRES state owning the bitmap
 * IN gres_bits - wanted size of the bitmap in bits
 */
static void _gres_bit_alloc_resize(gres_node_state_t *gres_ns,
				   uint64_t gres_bits)
{
	if (gres_bits == 0) {
		FREE_NULL_BITMAP(gres_ns->gres_bit_alloc);
		return;
	}

	if (gres_ns->gres_bit_alloc) {
		if (bit_size(gres_ns->gres_bit_alloc) != gres_bits)
			bit_realloc(gres_ns->gres_bit_alloc, gres_bits);
	} else {
		gres_ns->gres_bit_alloc = bit_alloc(gres_bits);
	}
}

/*
 * Job scheduling handles gres affinity on a socket basis internally.
 * However, the interface for setting affinity is to specify cores. This can
 * lead to the faulty expectation that the core affinity will be respected by
 * the Slurm scheduler.
 *
 * Therefore this check was added to avoid users setting the cores limit and
 * expecting Slurm to respect it (which it doesn't and never has).
 *
 * In addition to misleading users, a bug can arise where steps and jobs don't
 * line up because steps do look at the cores rather than the sockets like the
 * jobs (i.e. the job allocates a core that the step then rejects). If we just
 * wanted to solve this bug we would just expand the cpu list to fill the
 * socket here instead of throwing an error.
 *
 * IN tmp_bitmap - core bitmap of the gres.conf record being checked
 * IN/OUT rebuild_topo - rebuild state; on failure rc is set to EINVAL,
 *			 *reason_down is filled if unset and the core bitmap
 *			 of the current topo record is released
 * IN gres_slurmd_conf - gres.conf record, selects the wording of the message
 * RET SLURM_SUCCESS if every socket is either unused or fully covered,
 *     else SLURM_ERROR
 */
static int _check_core_range_matches_sock(bitstr_t *tmp_bitmap,
					  rebuild_topo_t *rebuild_topo,
					  gres_slurmd_conf_t *gres_slurmd_conf)
{
	for (int sock = 0; sock < rebuild_topo->sock_cnt; sock++) {
		int first = sock * rebuild_topo->cores_per_sock;
		int last = (sock + 1) * rebuild_topo->cores_per_sock;
		int set_cnt = bit_set_count_range(tmp_bitmap, first, last);
		char *cores_str, *msg;

		/* Untouched or completely covered socket is fine */
		if (!set_cnt || (set_cnt == rebuild_topo->cores_per_sock))
			continue;

		cores_str = bit_fmt_full(tmp_bitmap);
		if (gres_slurmd_conf->config_flags & GRES_CONF_AUTODETECT) {
			msg = xstrdup_printf(
				"%s GRES autodetected core affinity %s on node %s doesn't match socket boundaries. (Socket %d is cores %d-%d). "
				"Consider setting SlurmdParameters=l3cache_as_socket (recommended) or override this by manually specifying core affinity in gres.conf.",
				rebuild_topo->gres_ctx->gres_type, cores_str,
				rebuild_topo->node_name, sock, first,
				(last - 1));
		} else {
			msg = xstrdup_printf(
				"%s GRES core specification %s for node %s doesn't match socket boundaries. (Socket %d is cores %d-%d)",
				rebuild_topo->gres_ctx->gres_type, cores_str,
				rebuild_topo->node_name, sock, first,
				(last - 1));
		}
		xfree(cores_str);

		/* Drop the rejected core bitmap of the current topo record */
		FREE_NULL_BITMAP(rebuild_topo->gres_ns->topo_core_bitmap[
					 rebuild_topo->topo_cnt]);
		rebuild_topo->rc = EINVAL;

		error("%s: %s", __func__, msg);
		if (rebuild_topo->reason_down &&
		    !(*rebuild_topo->reason_down))
			xstrfmtcat(*rebuild_topo->reason_down, "%s", msg);
		xfree(msg);

		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}

/*
 * list_for_each() callback used by _node_config_validate() over
 * gres_conf_list: fill topology record number rebuild_topo->topo_cnt of the
 * node from one gres.conf record. Records of other plugins are skipped.
 *
 * IN x - gres_slurmd_conf_t record
 * IN/OUT arg - rebuild_topo_t state; topo_cnt and gres_inx advance, rc,
 *		has_file, cpus_config and cpu_config_err may be updated
 * RET 0 to go on, -1 on a core specification error or once all
 *     gres_ns->topo_cnt records are filled (presumably ends the iteration)
 */
static int _foreach_rebuild_topo(void *x, void *arg)
{
	gres_slurmd_conf_t *gres_slurmd_conf = x;
	rebuild_topo_t *rebuild_topo = arg;
	slurm_gres_context_t *gres_ctx = rebuild_topo->gres_ctx;
	gres_node_state_t *gres_ns = rebuild_topo->gres_ns;
	/* Index of the topology record filled by this call */
	int topo_cnt = rebuild_topo->topo_cnt;

	if (gres_slurmd_conf->plugin_id != gres_ctx->plugin_id)
		return 0;

	/*
	 * Non-shared GRES with an allocation bitmap: restart the allocated
	 * count, it is recounted from gres_bit_alloc further down.
	 */
	if (gres_ns->gres_bit_alloc && !gres_id_shared(gres_ctx->config_flags))
		gres_ns->topo_gres_cnt_alloc[topo_cnt] = 0;
	gres_ns->topo_gres_cnt_avail[topo_cnt] = gres_slurmd_conf->count;
	if (gres_slurmd_conf->cpus) {
		/* NOTE: gres_slurmd_conf->cpus is cores */
		bitstr_t *tmp_bitmap = bit_alloc(rebuild_topo->core_cnt);
		int ret = bit_unfmt(tmp_bitmap, gres_slurmd_conf->cpus);
		if (ret != SLURM_SUCCESS) {
			error("%s: %s: invalid GRES core specification (%s) on node %s",
			      __func__, gres_ctx->gres_type,
			      gres_slurmd_conf->cpus,
			      rebuild_topo->node_name);
			FREE_NULL_BITMAP(tmp_bitmap);
			rebuild_topo->rc = ESLURM_INVALID_GRES;
			return -1;
		} else {
			/* The topo record takes over the new bitmap */
			FREE_NULL_BITMAP(
				gres_ns->topo_core_bitmap[topo_cnt]);
			gres_ns->topo_core_bitmap[topo_cnt] = tmp_bitmap;
		}
		/* On failure the bitmap stored above is released again */
		if (_check_core_range_matches_sock(tmp_bitmap, rebuild_topo,
						   gres_slurmd_conf))
			return -1;

		rebuild_topo->cpus_config = rebuild_topo->core_cnt;
	} else if (rebuild_topo->cpus_config && !rebuild_topo->cpu_config_err) {
		/* An earlier record had cores, this one has none: log once */
		rebuild_topo->cpu_config_err = true;
		error("%s: %s: has CPUs configured for only some of the records on node %s",
		      __func__, gres_ctx->gres_type, rebuild_topo->node_name);
	}

	if (gres_slurmd_conf->links) {
		if (gres_ns->links_cnt &&
		    (gres_ns->link_len != rebuild_topo->tot_gres_cnt)) {
			/* Size changed, need to rebuild */
			for (int j = 0; j < gres_ns->link_len; j++)
				xfree(gres_ns->links_cnt[j]);
			xfree(gres_ns->links_cnt);
		}
		if (!gres_ns->links_cnt) {
			/* Square tot_gres_cnt x tot_gres_cnt matrix of zeros */
			gres_ns->link_len = rebuild_topo->tot_gres_cnt;
			gres_ns->links_cnt = xcalloc(rebuild_topo->tot_gres_cnt,
						     sizeof(int *));
			for (int j = 0; j < rebuild_topo->tot_gres_cnt; j++) {
				gres_ns->links_cnt[j] =
					xcalloc(rebuild_topo->tot_gres_cnt,
						sizeof(int));
			}
		}
	}
	if (gres_id_shared(gres_slurmd_conf->config_flags)) {
		/* If running jobs recovered then already set */
		if (!gres_ns->topo_gres_bitmap[topo_cnt]) {
			gres_ns->topo_gres_bitmap[topo_cnt] =
				bit_alloc(rebuild_topo->dev_cnt);
			bit_set(gres_ns->topo_gres_bitmap[topo_cnt],
				rebuild_topo->gres_inx);
		}
		/* A shared record takes a single bit whatever its count */
		rebuild_topo->gres_inx++;
	} else if (!rebuild_topo->dev_cnt) {
		/*
		 * Slurmd found GRES, but slurmctld can't use
		 * them. Avoid creating zero-size bitmaps.
		 */
		rebuild_topo->has_file = false;
	} else {
		FREE_NULL_BITMAP(gres_ns->topo_gres_bitmap[topo_cnt]);
		gres_ns->topo_gres_bitmap[topo_cnt] =
			bit_alloc(rebuild_topo->dev_cnt);
		/* One bit (and one links row) per device of this record */
		for (int j = 0; j < gres_slurmd_conf->count; j++) {
			if (rebuild_topo->gres_inx >= rebuild_topo->dev_cnt) {
				/* Ignore excess GRES on node */
				break;
			}
			bit_set(gres_ns->topo_gres_bitmap[topo_cnt],
				rebuild_topo->gres_inx);
			if (gres_ns->gres_bit_alloc &&
			    bit_test(gres_ns->gres_bit_alloc,
				     rebuild_topo->gres_inx)) {
				/* Set by recovered job */
				gres_ns->topo_gres_cnt_alloc[topo_cnt]++;
			}
			/* A bad Links value is recorded but does not stop us */
			if (_links_str2array(
				    gres_slurmd_conf->links,
				    rebuild_topo->node_name, gres_ns,
				    rebuild_topo->gres_inx,
				    rebuild_topo->tot_gres_cnt,
				    rebuild_topo->reason_down) != SLURM_SUCCESS)
				rebuild_topo->rc = EINVAL;

			rebuild_topo->gres_inx++;
		}
	}
	gres_ns->topo_type_id[topo_cnt] =
		gres_build_id(gres_slurmd_conf->type_name);
	xfree(gres_ns->topo_type_name[topo_cnt]);
	gres_ns->topo_type_name[topo_cnt] =
		xstrdup(gres_slurmd_conf->type_name);
	rebuild_topo->topo_cnt++;
	/* Never write past the topo arrays sized by gres_ns->topo_cnt */
	if (rebuild_topo->topo_cnt >= gres_ns->topo_cnt)
		return -1;

	return 0;
}

/*
 * list_for_each() callback used by _node_config_validate() when only some
 * gres.conf records of a GRES specify cores: every rebuilt topology record
 * still lacking a core bitmap gets one with all core_cnt bits set.
 * Records of other plugins are skipped.
 * IN x - gres_slurmd_conf_t record
 * IN/OUT arg - rebuild_topo_t state
 * RET always 0
 */
static int _foreach_rebuild_topo_no_cpus(void *x, void *arg)
{
	gres_slurmd_conf_t *conf_rec = x;
	rebuild_topo_t *rebuild_topo = arg;
	gres_node_state_t *gres_ns = rebuild_topo->gres_ns;

	if (conf_rec->plugin_id != rebuild_topo->gres_ctx->plugin_id)
		return 0;

	for (int topo_inx = 0; topo_inx < rebuild_topo->topo_cnt; topo_inx++) {
		if (!gres_ns->topo_core_bitmap[topo_inx]) {
			gres_ns->topo_core_bitmap[topo_inx] =
				bit_alloc(rebuild_topo->core_cnt);
			bit_set_all(gres_ns->topo_core_bitmap[topo_inx]);
		}
	}

	return 0;
}

/*
 * list_for_each() callback used by _node_config_validate() for a GRES with
 * types but without files: copy the type count of one gres.conf record into
 * the node's type table, adding the type if it is not known yet.
 * Records of other plugins are skipped.
 * IN x - gres_slurmd_conf_t record
 * IN/OUT arg - add_gres_info_t naming the GRES context and node GRES state
 * RET always 0
 */
static int _foreach_add_gres_info(void *x, void *arg)
{
	gres_slurmd_conf_t *conf_rec = x;
	add_gres_info_t *add_gres_info = arg;
	gres_node_state_t *gres_ns = add_gres_info->gres_ns;
	uint32_t type_id;

	if (conf_rec->plugin_id != add_gres_info->gres_ctx->plugin_id)
		return 0;

	type_id = gres_build_id(conf_rec->type_name);

	for (int inx = 0; inx < gres_ns->type_cnt; inx++) {
		if (gres_ns->type_id[inx] == type_id) {
			/* Update count as needed */
			gres_ns->type_cnt_avail[inx] = conf_rec->count;
			return 0;
		}
	}

	/* Type not seen before on this node */
	gres_add_type(conf_rec->type_name, gres_ns, conf_rec->count);

	return 0;
}

/*
 * Validate one GRES of a node against what slurmd reported (gres_conf_list)
 * and update the node's GRES state accordingly.
 *
 * The counts are compared first; unless something changed (or the node is
 * flagged as having an invalid registration) the function returns early.
 * Otherwise the topology and type information is rebuilt from the slurmd
 * records, the allocation bitmap is resized and the result is validated.
 *
 * IN node_ptr - node being validated (name and slurm.conf gres string)
 * IN/OUT gres_state_node - node GRES record, gres_data is created if missing
 * IN cpu_cnt - not used by this function
 * IN core_cnt - core count of the node, must not be 0
 * IN sock_cnt - socket count of the node
 * IN cores_per_sock - cores per socket of the node
 * IN config_overrides - trust slurm.conf over what slurmd found
 * IN/OUT reason_down - if not NULL and unset, describes the first failure
 * IN gres_ctx - context of the GRES being validated
 * RET SLURM_SUCCESS, EINVAL, or the error set while rebuilding the topology
 */
static int _node_config_validate(node_record_t *node_ptr,
				 gres_state_t *gres_state_node, int cpu_cnt,
				 int core_cnt, int sock_cnt, int cores_per_sock,
				 bool config_overrides, char **reason_down,
				 slurm_gres_context_t *gres_ctx)
{
	int i, rc = SLURM_SUCCESS;
	uint64_t dev_cnt;
	bool updated_config = false;
	gres_node_state_t *gres_ns;
	bool has_file, has_type, first_time = false, rebuild_topo = false;
	tot_from_slurmd_conf_t slurmd_conf_tot = {
		.plugin_id = gres_ctx->plugin_id,
	};
	char *orig_config = node_ptr->config_ptr->gres;
	char *node_name = node_ptr->name;
	xassert(core_cnt);
	if (gres_state_node->gres_data == NULL)
		gres_state_node->gres_data = _build_gres_node_state();
	gres_ns = (gres_node_state_t *) gres_state_node->gres_data;
	if (gres_ns->node_feature)
		return rc;

	_get_tot_from_slurmd_conf(&slurmd_conf_tot);

	/* If the gres is sharing we need to have topo configured. */
	if (slurmd_conf_tot.cpu_set_cnt ||
	    (gres_id_sharing(slurmd_conf_tot.plugin_id) && gres_ns->alt_gres))
		slurmd_conf_tot.topo_cnt = slurmd_conf_tot.rec_cnt;

	/*
	 * Check existing config_flags before overriding from
	 * slurmd_conf_tot.config_flags.
	 */
	if (gres_state_node->config_flags & GRES_CONF_UPDATE_CONFIG)
		updated_config = true;

	/* Make sure these are insync after we get it from the slurmd */
	gres_state_node->config_flags = slurmd_conf_tot.config_flags;

	if (gres_ns->gres_cnt_config > slurmd_conf_tot.gres_cnt) {
		if (reason_down && (*reason_down == NULL)) {
			xstrfmtcat(*reason_down,
				   "%s count reported lower than configured "
				   "(%"PRIu64" < %"PRIu64")",
				   gres_ctx->gres_type,
				   slurmd_conf_tot.gres_cnt,
				   gres_ns->gres_cnt_config);
		}
		rc = EINVAL;
	}
	/*
	 * Excess GRES are ignored from here on: the reported count can no
	 * longer exceed the configured one for the rest of this function.
	 */
	if ((slurmd_conf_tot.gres_cnt > gres_ns->gres_cnt_config)) {
		debug("%s: %s: Ignoring excess count on node %s (%"
		      PRIu64" > %"PRIu64")",
		      __func__, gres_ctx->gres_type, node_name,
		      slurmd_conf_tot.gres_cnt,
		      gres_ns->gres_cnt_config);
		slurmd_conf_tot.gres_cnt = gres_ns->gres_cnt_config;
	}
	if (gres_ns->gres_cnt_found != slurmd_conf_tot.gres_cnt) {
		if (gres_ns->gres_cnt_found != NO_VAL64) {
			info("%s: %s: Count changed on node %s (%"PRIu64" != %"PRIu64")",
			     __func__, gres_ctx->gres_type, node_name,
			     gres_ns->gres_cnt_found,
			     slurmd_conf_tot.gres_cnt);
		}
		if ((gres_ns->gres_cnt_found != NO_VAL64) &&
		    (gres_ns->gres_cnt_alloc != 0)) {
			if (reason_down && (*reason_down == NULL)) {
				xstrfmtcat(*reason_down,
					   "%s count changed and jobs are using them "
					   "(%"PRIu64" != %"PRIu64")",
					   gres_ctx->gres_type,
					   gres_ns->gres_cnt_found,
					   slurmd_conf_tot.gres_cnt);
			}
			rc = EINVAL;
		} else {
			gres_ns->gres_cnt_found = slurmd_conf_tot.gres_cnt;
			updated_config = true;
			first_time = true;
		}
	}
	if (!updated_config && gres_ns->type_cnt) {
		/*
		 * This is needed to address the GRES specification in
		 * gres.conf having a Type option, while the GRES specification
		 * in slurm.conf does not.
		 */
		for (i = 0; i < gres_ns->type_cnt; i++) {
			if (gres_ns->type_cnt_avail[i])
				continue;
			updated_config = true;
			break;
		}
	}

	if (!first_time && gres_ns->type_cnt && gres_ns->topo_cnt) {
		for (i = 0; i < gres_ns->topo_cnt; i++) {
			int type_index = _find_gres_type(gres_ns,
						gres_ns->topo_type_id[i]);
			/*
			 * On a reconfig if a type was removed from slurm.conf
			 * its type_cnt_avail will be set to 0. If the type is
			 * not found then the topo is from a previous invalid
			 * registration.
			 */
			if ((type_index < 0) ||
			    (gres_ns->type_cnt_avail[type_index] == 0 &&
			     gres_ns->topo_gres_cnt_avail[i])) {
				if (gres_ns->gres_cnt_alloc != 0) {
					if (reason_down &&
					    (*reason_down == NULL)) {
						xstrfmtcat(*reason_down,
							   "%s type changed and jobs are using them",
							   gres_ctx->gres_type);
					}
					rc = EINVAL;
					updated_config = false;
				} else {
					updated_config = true;
				}
			}

		}
	}

	if (!updated_config && !(IS_NODE_INVALID_REG(node_ptr)))
		return rc;

	if (gres_id_sharing(slurmd_conf_tot.plugin_id) && gres_ns->alt_gres) {
		/*
		 * Tell the shared gres to update itself if the sharing gres is
		 * updated -- which will happen in a subsequent call to
		 * _node_config_validate() since gres_node_config_validate() is
		 * looping on all gres_contexts.
		 */
		gres_ns->alt_gres->config_flags |= GRES_CONF_UPDATE_CONFIG;
	}

	if ((slurmd_conf_tot.topo_cnt == 0) &&
	    (slurmd_conf_tot.topo_cnt != gres_ns->topo_cnt)) {
		/* Need to clear topology info */
		_gres_node_state_delete_topo(gres_ns);

		gres_ns->topo_cnt = slurmd_conf_tot.topo_cnt;
	}

	has_file = gres_ctx->config_flags & GRES_CONF_HAS_FILE;
	has_type = gres_ctx->config_flags & GRES_CONF_HAS_TYPE;
	if (gres_id_shared(gres_ctx->config_flags))
		dev_cnt = slurmd_conf_tot.topo_cnt;
	else
		dev_cnt = slurmd_conf_tot.gres_cnt;
	if (has_file && (slurmd_conf_tot.topo_cnt != gres_ns->topo_cnt) &&
	    (dev_cnt == 0)) {
		/*
		 * Clear any vestigial GRES node state info.
		 */
		_gres_node_state_delete_topo(gres_ns);

		/* It is a bitmap, release it the way bitmaps are released */
		FREE_NULL_BITMAP(gres_ns->gres_bit_alloc);

		gres_ns->topo_cnt = 0;
	} else if (has_file &&
		   (slurmd_conf_tot.topo_cnt != gres_ns->topo_cnt)) {
		/*
		 * Need to rebuild topology info.
		 * Resize the data structures here.
		 */
		rebuild_topo = true;
		/*
		 * Clear any vestigial GRES node state info.
		 */
		_gres_node_state_delete_topo(gres_ns);

		gres_ns->topo_gres_cnt_alloc =
			xrealloc(gres_ns->topo_gres_cnt_alloc,
				 slurmd_conf_tot.topo_cnt * sizeof(uint64_t));
		gres_ns->topo_gres_cnt_avail =
			xrealloc(gres_ns->topo_gres_cnt_avail,
				 slurmd_conf_tot.topo_cnt * sizeof(uint64_t));
		gres_ns->topo_gres_bitmap =
			xrealloc(gres_ns->topo_gres_bitmap,
				 slurmd_conf_tot.topo_cnt *
				 sizeof(bitstr_t *));
		gres_ns->topo_core_bitmap =
			xrealloc(gres_ns->topo_core_bitmap,
				 slurmd_conf_tot.topo_cnt *
				 sizeof(bitstr_t *));
		gres_ns->topo_res_core_bitmap =
			xrealloc(gres_ns->topo_res_core_bitmap,
				 slurmd_conf_tot.topo_cnt *
				 sizeof(bitstr_t *));
		gres_ns->topo_type_id = xrealloc(gres_ns->topo_type_id,
						 slurmd_conf_tot.topo_cnt *
						 sizeof(uint32_t));
		gres_ns->topo_type_name = xrealloc(gres_ns->topo_type_name,
						   slurmd_conf_tot.topo_cnt *
						   sizeof(char *));
		if (gres_ns->gres_bit_alloc)
			bit_realloc(gres_ns->gres_bit_alloc, dev_cnt);
		gres_ns->topo_cnt = slurmd_conf_tot.topo_cnt;
	} else if (gres_ns->topo_cnt) {
		/*
		 * Need to rebuild topology info to recover state after
		 * slurmctld restart with running jobs. The number of gpus,
		 * cores, and type might have changed in slurm.conf
		 */
		rebuild_topo = true;
	}

	if (rebuild_topo) {
		/* State shared by the list_for_each() callbacks below */
		rebuild_topo_t topo_args = {
			.core_cnt = core_cnt,
			.cores_per_sock = cores_per_sock,
			.dev_cnt = dev_cnt,
			.gres_ctx = gres_ctx,
			.gres_ns = gres_ns,
			.has_file = has_file,
			.node_name = node_name,
			.rc = rc,
			.reason_down = reason_down,
			.sock_cnt = sock_cnt,
			.tot_gres_cnt = slurmd_conf_tot.gres_cnt,
		};
		(void) list_for_each(gres_conf_list, _foreach_rebuild_topo,
				     &topo_args);
		rc = topo_args.rc;
		has_file = topo_args.has_file;

		if (topo_args.cpu_config_err) {
			/*
			 * Some GRES of this type have "CPUs" configured. Set
			 * topo_core_bitmap for all others with all bits set.
			 */
			(void) list_for_each(gres_conf_list,
					     _foreach_rebuild_topo_no_cpus,
					     &topo_args);
		}
	} else if (!has_file && has_type) {
		add_gres_info_t add_gres_info = {
			.gres_ctx = gres_ctx,
			.gres_ns = gres_ns,
		};
		/* Add GRES Type information as needed */
		(void) list_for_each(gres_conf_list,
				     _foreach_add_gres_info,
				     &add_gres_info);
	}

	if ((orig_config == NULL) || (orig_config[0] == '\0'))
		gres_ns->gres_cnt_config = 0;
	else if (gres_ns->gres_cnt_config == NO_VAL64) {
		/* This should have been filled in by _node_config_init() */
		_get_gres_cnt(gres_ns, orig_config,
			      gres_ctx->gres_name,
			      gres_ctx->gres_name_colon,
			      gres_ctx->gres_name_colon_len);
	}

	gres_ns->gres_cnt_avail = gres_ns->gres_cnt_config;

	if (has_file) {
		uint64_t gres_bits;
		if (gres_id_shared(gres_ctx->config_flags)) {
			gres_bits = slurmd_conf_tot.topo_cnt;
		} else {
			if (gres_ns->gres_cnt_avail > MAX_GRES_BITMAP) {
				error("%s: %s has \"File\" plus very large \"Count\" "
				      "(%"PRIu64") for node %s, resetting value to %u",
				      __func__, gres_ctx->gres_type,
				      gres_ns->gres_cnt_avail, node_name,
				      MAX_GRES_BITMAP);
				gres_ns->gres_cnt_avail = MAX_GRES_BITMAP;
				gres_ns->gres_cnt_found = MAX_GRES_BITMAP;
			}
			gres_bits = gres_ns->gres_cnt_avail;
		}

		_gres_bit_alloc_resize(gres_ns, gres_bits);
	}

	gres_validate_node_cores(gres_ns, core_cnt, node_name);

	if ((slurmd_conf_tot.config_type_cnt > 1) &&
	    !_valid_gres_types(gres_ctx->gres_type, gres_ns, reason_down)){
		rc = EINVAL;
	} else if (!config_overrides &&
		   (gres_ns->gres_cnt_found < gres_ns->gres_cnt_config)) {
		if (reason_down && (*reason_down == NULL)) {
			xstrfmtcat(*reason_down,
				   "%s count too low (%"PRIu64" < %"PRIu64")",
				   gres_ctx->gres_type,
				   gres_ns->gres_cnt_found,
				   gres_ns->gres_cnt_config);
		}
		rc = EINVAL;
	} else if (_valid_gres_type(gres_ctx->gres_type, gres_ns,
				    config_overrides, reason_down)) {
		rc = EINVAL;
	} else if (config_overrides && gres_ns->topo_cnt &&
		   (gres_ns->gres_cnt_found != gres_ns->gres_cnt_config)) {
		error("%s on node %s configured for %"PRIu64" resources but "
		      "%"PRIu64" found, ignoring topology support",
		      gres_ctx->gres_type, node_name,
		      gres_ns->gres_cnt_config, gres_ns->gres_cnt_found);
		if (gres_ns->topo_core_bitmap) {
			for (i = 0; i < gres_ns->topo_cnt; i++) {
				FREE_NULL_BITMAP(gres_ns->topo_core_bitmap[i]);
				/*
				 * The restricted core bitmaps belong to the
				 * topology as well and must not be leaked.
				 */
				if (gres_ns->topo_res_core_bitmap) {
					FREE_NULL_BITMAP(gres_ns->
						topo_res_core_bitmap[i]);
				}
				if (gres_ns->topo_gres_bitmap) {
					FREE_NULL_BITMAP(gres_ns->
							 topo_gres_bitmap[i]);
				}
				xfree(gres_ns->topo_type_name[i]);
			}
			xfree(gres_ns->topo_core_bitmap);
			xfree(gres_ns->topo_res_core_bitmap);
			xfree(gres_ns->topo_gres_bitmap);
			xfree(gres_ns->topo_gres_cnt_alloc);
			xfree(gres_ns->topo_gres_cnt_avail);
			xfree(gres_ns->topo_type_id);
			xfree(gres_ns->topo_type_name);
		}
		gres_ns->topo_cnt = 0;
	}

	return rc;
}

/*
 * The GPU (sharing GRES) count on a node changed. Update the gres/'shared'
 * data structures to match: resize the shared allocation bitmap and keep
 * exactly one topology record per sharing device.
 * IN sharing_gres_state_node - node state of the sharing GRES, may be NULL.
 *	Nothing is done if it is NULL or if it has no alt_gres link to a
 *	shared GRES record.
 */
static void _sync_node_shared_to_sharing(gres_state_t *sharing_gres_state_node)
{
	gres_node_state_t *sharing_gres_ns, *shared_gres_ns;
	uint64_t sharing_cnt, shared_alloc = 0, shared_rem;
	int i;

	if (!sharing_gres_state_node)
		return;

	sharing_gres_ns = sharing_gres_state_node->gres_data;

	if (!sharing_gres_ns->alt_gres)
		return;

	shared_gres_ns = sharing_gres_ns->alt_gres->gres_data;

	sharing_cnt = sharing_gres_ns->gres_cnt_avail;
	if (shared_gres_ns->gres_bit_alloc) {
		/* Bitmap size and topo record count already match */
		if ((sharing_cnt == bit_size(shared_gres_ns->gres_bit_alloc)) &&
		    (sharing_cnt == shared_gres_ns->topo_cnt)) {
			debug3("No change for gres/'shared'");
			return;
		}
	}

	if (sharing_cnt == 0)
		return;			/* Still no SHARINGs */

	/* Free any excess gres/'shared' topo records */
	for (i = sharing_cnt; i < shared_gres_ns->topo_cnt; i++) {
		if (shared_gres_ns->topo_core_bitmap)
			FREE_NULL_BITMAP(shared_gres_ns->topo_core_bitmap[i]);
		/*
		 * topo_res_core_bitmap is resized together with the other
		 * per-record arrays below, so its excess entries must be
		 * released here as well or they leak when the array shrinks.
		 */
		if (shared_gres_ns->topo_res_core_bitmap)
			FREE_NULL_BITMAP(shared_gres_ns->
					 topo_res_core_bitmap[i]);
		if (shared_gres_ns->topo_gres_bitmap)
			FREE_NULL_BITMAP(shared_gres_ns->topo_gres_bitmap[i]);
		xfree(shared_gres_ns->topo_type_name[i]);
	}

	if (shared_gres_ns->gres_cnt_avail == 0) {
		/* No gres/'shared' on this node */
		shared_gres_ns->topo_cnt = 0;
		return;
	}

	/* One allocation bit per sharing device */
	if (!shared_gres_ns->gres_bit_alloc) {
		shared_gres_ns->gres_bit_alloc = bit_alloc(sharing_cnt);
	} else {
		bit_realloc(shared_gres_ns->gres_bit_alloc, sharing_cnt);
	}

	/* Add any additional required gres/'shared' topo records */
	if (shared_gres_ns->topo_cnt) {
		shared_gres_ns->topo_core_bitmap =
			xrealloc(shared_gres_ns->topo_core_bitmap,
				 sizeof(bitstr_t *) * sharing_cnt);
		shared_gres_ns->topo_res_core_bitmap =
			xrealloc(shared_gres_ns->topo_res_core_bitmap,
				 sizeof(bitstr_t *) * sharing_cnt);
		shared_gres_ns->topo_gres_bitmap =
			xrealloc(shared_gres_ns->topo_gres_bitmap,
				 sizeof(bitstr_t *) * sharing_cnt);
		shared_gres_ns->topo_gres_cnt_alloc =
			xrealloc(shared_gres_ns->topo_gres_cnt_alloc,
				 sizeof(uint64_t) * sharing_cnt);
		shared_gres_ns->topo_gres_cnt_avail =
			xrealloc(shared_gres_ns->topo_gres_cnt_avail,
				 sizeof(uint64_t) * sharing_cnt);
		shared_gres_ns->topo_type_id =
			xrealloc(shared_gres_ns->topo_type_id,
				 sizeof(uint32_t) * sharing_cnt);
		shared_gres_ns->topo_type_name =
			xrealloc(shared_gres_ns->topo_type_name,
				 sizeof(char *) * sharing_cnt);
	} else {
		shared_gres_ns->topo_core_bitmap =
			xcalloc(sharing_cnt, sizeof(bitstr_t *));
		shared_gres_ns->topo_res_core_bitmap =
			xcalloc(sharing_cnt, sizeof(bitstr_t *));
		shared_gres_ns->topo_gres_bitmap =
			xcalloc(sharing_cnt, sizeof(bitstr_t *));
		shared_gres_ns->topo_gres_cnt_alloc =
			xcalloc(sharing_cnt, sizeof(uint64_t));
		shared_gres_ns->topo_gres_cnt_avail =
			xcalloc(sharing_cnt, sizeof(uint64_t));
		shared_gres_ns->topo_type_id =
			xcalloc(sharing_cnt, sizeof(uint32_t));
		shared_gres_ns->topo_type_name =
			xcalloc(sharing_cnt, sizeof(char *));
	}

	/*
	 * Evenly distribute any remaining SHARED counts.
	 * Counts get reset as needed when the node registers.
	 *
	 * Only sum the records that survived the resize: when the record
	 * count shrank, topo_gres_cnt_avail now holds just sharing_cnt
	 * entries, so walking up to the old topo_cnt would read past the end
	 * of the array.
	 */
	for (i = 0; (i < shared_gres_ns->topo_cnt) && (i < sharing_cnt); i++)
		shared_alloc += shared_gres_ns->topo_gres_cnt_avail[i];
	if (shared_alloc >= shared_gres_ns->gres_cnt_avail)
		shared_rem = 0;
	else
		shared_rem = shared_gres_ns->gres_cnt_avail - shared_alloc;
	/* New records: each maps to sharing device i and gets an even share */
	for (i = shared_gres_ns->topo_cnt; i < sharing_cnt; i++) {
		shared_gres_ns->topo_gres_bitmap[i] = bit_alloc(sharing_cnt);
		bit_set(shared_gres_ns->topo_gres_bitmap[i], i);
		shared_alloc = shared_rem / (sharing_cnt - i);
		shared_gres_ns->topo_gres_cnt_avail[i] = shared_alloc;
		shared_rem -= shared_alloc;
	}
	shared_gres_ns->topo_cnt = sharing_cnt;

	/* Bring every pre-existing per-record GRES bitmap to the new size */
	for (i = 0; i < shared_gres_ns->topo_cnt; i++) {
		if (shared_gres_ns->topo_gres_bitmap &&
		    shared_gres_ns->topo_gres_bitmap[i] &&
		    (sharing_cnt !=
		     bit_size(shared_gres_ns->topo_gres_bitmap[i]))) {
			bit_realloc(shared_gres_ns->topo_gres_bitmap[i],
				    sharing_cnt);
		}
	}
}

/*
 * Validate a node's configuration and put a gres record onto a list
 * Called immediately after gres_node_config_unpack().
 * IN node_ptr - With the relevant attributes for this function being:
 *	->name - name of the node for which the gres information applies
 *	->config_ptr->gres - Gres information supplied from merged
 *			     slurm.conf/gres.conf
 *	->gres - Updated gres info from slurm.conf
 *	->gres_list - List of Gres records for this node to track usage
 * IN threads_per_core - Count of CPUs (threads) per core on this node
 * IN cores_per_sock - Count of cores per socket on this node
 * IN sock_cnt - Count of sockets on this node
 * IN config_overrides - true: Don't validate hardware, use slurm.conf
 *                             configuration
 *			 false: Validate hardware config, but use slurm.conf
 *                              config
 * OUT reason_down - set to an explanation of failure, if any, don't set if NULL
 * RET the largest return code produced by any per-GRES validation,
 *     SLURM_SUCCESS if none reported a problem
 */
extern int gres_node_config_validate(node_record_t *node_ptr,
				     int threads_per_core, int cores_per_sock,
				     int sock_cnt, bool config_overrides,
				     char **reason_down)
{
	int worst_rc = SLURM_SUCCESS;
	gres_state_t *sharing_state = NULL;
	int total_cores = sock_cnt * cores_per_sock;
	int total_cpus = total_cores * threads_per_core;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	if (!node_ptr->gres_list && (gres_context_cnt > 0))
		node_ptr->gres_list = list_create(_gres_node_list_delete);

	for (int ctx = 0; ctx < gres_context_cnt; ctx++) {
		int ctx_rc;
		gres_state_t *state =
			list_find_first(node_ptr->gres_list, gres_find_id,
					&gres_context[ctx].plugin_id);

		/* First time this GRES is seen on the node: add a record */
		if (!state) {
			state = gres_create_state(
				&gres_context[ctx], GRES_STATE_SRC_CONTEXT_PTR,
				GRES_STATE_TYPE_NODE, _build_gres_node_state());
			list_append(node_ptr->gres_list, state);
		}

		ctx_rc = _node_config_validate(node_ptr, state, total_cpus,
					       total_cores, sock_cnt,
					       cores_per_sock,
					       config_overrides, reason_down,
					       &gres_context[ctx]);
		if (ctx_rc > worst_rc)
			worst_rc = ctx_rc;

		/* Remember the (last) sharing GRES for the sync below */
		if (gres_id_sharing(state->plugin_id))
			sharing_state = state;
	}

	_sync_node_shared_to_sharing(sharing_state);
	_build_node_gres_str(&node_ptr->gres_list, &node_ptr->gres,
			     cores_per_sock, sock_cnt);
	slurm_mutex_unlock(&gres_context_lock);

	return worst_rc;
}

/*
 * Scale a count down by powers of 1024 and report the matching unit suffix
 * (e.g. 2048 -> 2 with suffix "K").
 * IN gres_size - raw count
 * OUT gres_scaled - count after dividing out exact factors of 1024, at most
 *	four of them; a count that is zero or not a multiple of 1024 is
 *	returned unchanged
 * OUT suffix - "", "K", "M", "G" or "T" matching the number of factors
 *	removed (string constant, do not free)
 */
static void _gres_scale_value(uint64_t gres_size, uint64_t *gres_scaled,
			      char **suffix)
{
	static char *const unit_suffix[] = { "", "K", "M", "G", "T" };
	uint64_t scaled = gres_size;
	int steps = 0;

	while ((steps < 4) && (scaled != 0) && ((scaled % 1024) == 0)) {
		scaled /= 1024;
		steps++;
	}

	*gres_scaled = scaled;
	*suffix = unit_suffix[steps];
}

/*
 * Add a GRES from node_feature plugin
 * IN node_name - name of the node for which the gres information applies
 *	(not referenced by this function)
 * IN gres_name - name of the GRES being added or updated from the plugin
 * IN gres_size - count of this GRES on this node
 * IN/OUT new_config - Updated GRES info from slurm.conf. Any existing entry
 *	for gres_name is dropped and a fresh "name:count[suffix]" entry is
 *	appended; the old string is freed and replaced.
 * IN/OUT gres_list - List of GRES records for this node to track usage,
 *	created if NULL and at least one GRES context exists
 */
extern void gres_node_feature(char *node_name,
			      char *gres_name, uint64_t gres_size,
			      char **new_config, list_t **gres_list)
{
	char *rebuilt = NULL, *entry, *tok_state = NULL;
	char *delim = "", *unit = "";
	gres_state_t *gres_state_node;
	gres_node_state_t *gres_ns;
	uint32_t plugin_id;
	uint64_t scaled_cnt = 0;
	int name_len;

	xassert(gres_name);
	name_len = strlen(gres_name);
	plugin_id = gres_build_id(gres_name);

	/* Carry over every existing entry except those for this GRES */
	if (*new_config) {
		for (entry = strtok_r(*new_config, ",", &tok_state); entry;
		     entry = strtok_r(NULL, ",", &tok_state)) {
			if (!strncmp(entry, gres_name, name_len) &&
			    ((entry[name_len] == ':') ||
			     (entry[name_len] == '\0')))
				continue;	/* Skip this record */
			xstrfmtcat(rebuilt, "%s%s", delim, entry);
			delim = ",";
		}
	}

	/* Append the new record for this GRES */
	_gres_scale_value(gres_size, &scaled_cnt, &unit);
	xstrfmtcat(rebuilt, "%s%s:%"PRIu64"%s",
		   delim, gres_name, scaled_cnt, unit);
	xfree(*new_config);
	*new_config = rebuilt;

	slurm_mutex_lock(&gres_context_lock);
	if (gres_context_cnt <= 0) {
		slurm_mutex_unlock(&gres_context_lock);
		return;
	}

	if (!*gres_list)
		*gres_list = list_create(_gres_node_list_delete);

	gres_state_node = list_find_first(*gres_list, gres_find_id,
					  &plugin_id);
	if (!gres_state_node) {
		gres_state_node = xmalloc(sizeof(*gres_state_node));
		/* FIXME: no config_flags known at this moment */
		/* gres_state_node->config_flags = ; */
		gres_state_node->state_type = GRES_STATE_TYPE_NODE;
		gres_state_node->plugin_id = plugin_id;
		gres_state_node->gres_name = xstrdup(gres_name);
		gres_state_node->gres_data = _build_gres_node_state();
		list_append(*gres_list, gres_state_node);
	}

	gres_ns = gres_state_node->gres_data;
	if (gres_size < gres_ns->gres_cnt_alloc) {
		/* More is already allocated than the new size provides */
		error("%s: Changed size count of GRES %s from %"PRIu64
		      " to %"PRIu64", resource over allocated",
		      __func__, gres_name,
		      gres_ns->gres_cnt_avail, gres_size);
		gres_ns->gres_cnt_avail = 0;
	} else {
		gres_ns->gres_cnt_avail = gres_size -
			gres_ns->gres_cnt_alloc;
	}
	gres_ns->gres_cnt_found = gres_size;
	gres_ns->gres_cnt_config = gres_size;
	gres_ns->node_feature = true;

	slurm_mutex_unlock(&gres_context_lock);
}

/*
 * Check validity of a GRES change. Specifically if a GRES has "File"
 * configured (GRES_CONF_HAS_FILE) then the only valid new counts are the
 * currently configured count or zero.
 * IN node_name - name of the node, used for logging only
 * IN new_gres - requested GRES specification
 * IN gres_state_node - the node's current record for this GRES
 * IN gres_ctx - context of the GRES being checked
 * RET SLURM_SUCCESS if the requested change is valid, otherwise
 *     ESLURM_INVALID_GRES
 */
static int _node_reconfig_test(char *node_name, char *new_gres,
			       gres_state_t *gres_state_node,
			       slurm_gres_context_t *gres_ctx)
{
	gres_node_state_t *cur_ns, *probe_ns;
	uint64_t requested_cnt;
	int rc = SLURM_SUCCESS;

	xassert(gres_state_node);

	/* Counts may change freely when no File is configured */
	if (!(gres_ctx->config_flags & GRES_CONF_HAS_FILE))
		return SLURM_SUCCESS;

	cur_ns = gres_state_node->gres_data;

	/* Parse the request into a scratch record to learn the new count */
	probe_ns = _build_gres_node_state();
	_get_gres_cnt(probe_ns, new_gres,
		      gres_ctx->gres_name,
		      gres_ctx->gres_name_colon,
		      gres_ctx->gres_name_colon_len);
	requested_cnt = probe_ns->gres_cnt_config;

	if (requested_cnt && (requested_cnt != cur_ns->gres_cnt_config)) {
		error("Attempt to change gres/%s Count on node %s from %"
		      PRIu64" to %"PRIu64" invalid with File configuration",
		      gres_ctx->gres_name, node_name,
		      cur_ns->gres_cnt_config,
		      requested_cnt);
		rc = ESLURM_INVALID_GRES;
	}
	_gres_node_state_delete(probe_ns);

	return rc;
}

/*
 * Apply a new GRES specification to one GRES record of a node.
 * IN node_name - name of the node, used for logging only
 * IN new_gres - requested GRES specification
 * IN gres_str - not referenced by this function
 * IN/OUT gres_state_node - node's record for this GRES; its gres_data is
 *	created if missing and its counts and bitmaps are updated
 * IN config_overrides - not referenced by this function
 * IN/OUT gres_ctx - context of the GRES; total_cnt is adjusted by the
 *	change in configured count
 * OUT updated_gpu_cnt - set true only when the allocation bitmap of a
 *	sharing GRES was resized in the no-File path, otherwise false
 * RET SLURM_SUCCESS
 */
static int _node_reconfig(char *node_name, char *new_gres, char **gres_str,
			  gres_state_t *gres_state_node, bool config_overrides,
			  slurm_gres_context_t *gres_ctx,
			  bool *updated_gpu_cnt)
{
	gres_node_state_t *gres_ns;
	uint64_t new_bits, prev_cnt;
	int t;

	xassert(gres_state_node);
	xassert(updated_gpu_cnt);
	*updated_gpu_cnt = false;

	if (!gres_state_node->gres_data)
		gres_state_node->gres_data = _build_gres_node_state();
	gres_ns = gres_state_node->gres_data;
	prev_cnt = gres_ns->gres_cnt_config;

	_get_gres_cnt(gres_ns, new_gres,
		      gres_ctx->gres_name,
		      gres_ctx->gres_name_colon,
		      gres_ctx->gres_name_colon_len);

	/* No change in count */
	if (prev_cnt == gres_ns->gres_cnt_config)
		return SLURM_SUCCESS;

	/* Update count */
	gres_ctx->total_cnt -= prev_cnt;
	gres_ctx->total_cnt += gres_ns->gres_cnt_config;

	gres_ns->gres_cnt_avail = gres_ns->gres_cnt_config;

	if (gres_ctx->config_flags & GRES_CONF_HAS_FILE) {
		/* Shared GRES track one bit per topo record */
		new_bits = gres_id_shared(gres_ctx->config_flags) ?
			gres_ns->topo_cnt : gres_ns->gres_cnt_avail;
		_gres_bit_alloc_resize(gres_ns, new_bits);
		return SLURM_SUCCESS;
	}

	if (!gres_ns->gres_bit_alloc ||
	    gres_id_shared(gres_ctx->config_flags))
		return SLURM_SUCCESS;

	/*
	 * If GRES count changed in configuration between reboots,
	 * update bitmap sizes as needed.
	 */
	new_bits = gres_ns->gres_cnt_avail;
	if (new_bits == bit_size(gres_ns->gres_bit_alloc))
		return SLURM_SUCCESS;

	info("gres/%s count changed on node %s to %"PRIu64,
	     gres_ctx->gres_name, node_name, new_bits);
	if (gres_id_sharing(gres_ctx->plugin_id))
		*updated_gpu_cnt = true;
	bit_realloc(gres_ns->gres_bit_alloc, new_bits);

	for (t = 0; t < gres_ns->topo_cnt; t++) {
		if (!gres_ns->topo_gres_bitmap ||
		    !gres_ns->topo_gres_bitmap[t])
			continue;
		if (new_bits == bit_size(gres_ns->topo_gres_bitmap[t]))
			continue;
		bit_realloc(gres_ns->topo_gres_bitmap[t], new_bits);
	}

	return SLURM_SUCCESS;
}

/*
 * Convert core bitmap into socket string, xfree return value
 * IN core_map - bitmap of cores, must not be NULL
 * IN cores_per_sock - number of consecutive cores belonging to each socket
 * IN sock_per_node - number of sockets to examine
 * RET "(S:<sockets>)" listing every socket with at least one core set in
 *     core_map, or an empty string if no such socket exists
 */
static char *_core_bitmap2str(bitstr_t *core_map, int cores_per_sock,
			      int sock_per_node)
{
	char *result = NULL, sock_list[256];
	bitstr_t *sock_map;
	int sock, core, last_core;
	bool found = false;

	xassert(core_map);
	last_core = bit_size(core_map) - 1;
	sock_map = bit_alloc(sock_per_node);

	for (sock = 0; sock < sock_per_node; sock++) {
		int first = sock * cores_per_sock;
		int end = first + cores_per_sock;

		/* A socket is flagged as soon as one of its cores is set */
		for (core = first; core < end; core++) {
			if (core > last_core) {
				error("%s: bad core offset (%d >= %d)",
				      __func__, core, last_core);
				break;
			}
			if (!bit_test(core_map, core))
				continue;
			bit_set(sock_map, sock);
			found = true;
			break;
		}
	}

	if (!found) {
		/* We have a core bitmap with no bits set */
		result = xstrdup("");
	} else {
		bit_fmt(sock_list, sizeof(sock_list), sock_map);
		xstrfmtcat(result, "(S:%s)", sock_list);
	}
	FREE_NULL_BITMAP(sock_map);

	return result;
}

/*
 * Given a count, modify it as needed and return suffix (e.g. "M" for mega).
 * The largest power of 1024 (up to 1024^5, "P") that divides the count
 * exactly is divided out of it.
 * IN/OUT count - value to scale; left unchanged if zero or not a multiple
 *	of 1024
 * RET "P", "T", "G", "M", "K" or "" (string constant, do not free)
 */
static char *_get_suffix(uint64_t *count)
{
	static char *const unit_name[] = { "P", "T", "G", "M", "K" };
	uint64_t divisor = (uint64_t) 1 << 50;	/* 1024^5 */

	if (*count == 0)
		return "";

	/* Try the largest unit first, stepping down by 1024 each time */
	for (int u = 0; u < 5; u++, divisor >>= 10) {
		if ((*count % divisor) == 0) {
			*count /= divisor;
			return unit_name[u];
		}
	}

	return "";
}

/*
 * Build node's GRES string based upon data in that node's GRES list
 * IN gres_list - pointer to the node's list of GRES records
 * IN/OUT gres_str - any previous string is freed; rebuilt as a
 *	comma-separated list with one or more entries per GRES context that
 *	has a non-zero gres_cnt_avail on this node. Left NULL if nothing is
 *	appended.
 * IN cores_per_sock - cores per socket, used to convert core bitmaps into
 *	socket info
 * IN sock_per_node - sockets on the node, used for the same conversion
 */
static void _build_node_gres_str(list_t **gres_list, char **gres_str,
				 int cores_per_sock, int sock_per_node)
{
	gres_state_t *gres_state_node;
	gres_node_state_t *gres_ns;
	bitstr_t *done_topo, *core_map;
	uint64_t gres_sum;
	char *sep = "", *suffix, *sock_info = NULL, *sock_str, *no_consume_str;
	int c, i, j;

	xassert(gres_str);
	xfree(*gres_str);
	for (c = 0; c < gres_context_cnt; c++) {
		/* Find gres_state entry on the list */
		gres_state_node = list_find_first(*gres_list, gres_find_id,
						  &gres_context[c].plugin_id);
		if (gres_state_node == NULL)
			continue;	/* Node has none of this GRES */

		gres_ns = (gres_node_state_t *) gres_state_node->gres_data;
		no_consume_str = gres_ns->no_consume ? ":no_consume" : "";
		if (gres_ns->topo_cnt &&
		    gres_ns->gres_cnt_avail) {
			/*
			 * Topology records exist: emit one entry per distinct
			 * topo_type_id. done_topo marks records already folded
			 * into an entry.
			 */
			done_topo = bit_alloc(gres_ns->topo_cnt);
			for (i = 0; i < gres_ns->topo_cnt; i++) {
				if (bit_test(done_topo, i))
					continue;
				bit_set(done_topo, i);
				gres_sum = gres_ns->
					topo_gres_cnt_avail[i];
				if (gres_ns->topo_core_bitmap[i]) {
					core_map = bit_copy(
						gres_ns->
						topo_core_bitmap[i]);
				} else
					core_map = NULL;
				/*
				 * Merge every other record of the same type:
				 * add its count and OR in its core bitmap.
				 */
				for (j = 0; j < gres_ns->topo_cnt; j++){
					if (gres_ns->topo_type_id[i] !=
					    gres_ns->topo_type_id[j])
						continue;
					if (bit_test(done_topo, j))
						continue;
					bit_set(done_topo, j);
					gres_sum += gres_ns->
						topo_gres_cnt_avail[j];
					if (core_map &&
					    gres_ns->
					    topo_core_bitmap[j]) {
						bit_or(core_map,
						       gres_ns->
						       topo_core_bitmap[j]);
					} else if (gres_ns->
						   topo_core_bitmap[j]) {
						core_map = bit_copy(
							gres_ns->
							topo_core_bitmap[j]);
					}
				}
				/* Socket info is only added with a core map */
				if (core_map) {
					sock_info = _core_bitmap2str(
						core_map,
						cores_per_sock,
						sock_per_node);
					FREE_NULL_BITMAP(core_map);
					sock_str = sock_info;
				} else
					sock_str = "";
				suffix = _get_suffix(&gres_sum);
				if (gres_ns->topo_type_name[i]) {
					xstrfmtcat(*gres_str,
						   "%s%s:%s%s:%"PRIu64"%s%s", sep,
						   gres_context[c].gres_name,
						   gres_ns->
						   topo_type_name[i],
						   no_consume_str, gres_sum,
						   suffix, sock_str);
				} else {
					xstrfmtcat(*gres_str,
						   "%s%s%s:%"PRIu64"%s%s", sep,
						   gres_context[c].gres_name,
						   no_consume_str, gres_sum,
						   suffix, sock_str);
				}
				xfree(sock_info);
				sep = ",";
			}
			FREE_NULL_BITMAP(done_topo);
		} else if (gres_ns->type_cnt &&
			   gres_ns->gres_cnt_avail) {
			/* No topology, but typed counts: one entry per type */
			for (i = 0; i < gres_ns->type_cnt; i++) {
				gres_sum = gres_ns->type_cnt_avail[i];
				suffix = _get_suffix(&gres_sum);
				xstrfmtcat(*gres_str, "%s%s:%s%s:%"PRIu64"%s",
					   sep, gres_context[c].gres_name,
					   gres_ns->type_name[i],
					   no_consume_str, gres_sum, suffix);
				sep = ",";
			}
		} else if (gres_ns->gres_cnt_avail) {
			/* Plain count without type or topology information */
			gres_sum = gres_ns->gres_cnt_avail;
			suffix = _get_suffix(&gres_sum);
			xstrfmtcat(*gres_str, "%s%s%s:%"PRIu64"%s",
				   sep, gres_context[c].gres_name,
				   no_consume_str, gres_sum, suffix);
			sep = ",";
		}
	}
}

static int _foreach_node_state_pack(void *x, void *arg)
{
	gres_state_t *gres_state_node = x;
	pack_state_t *pack_state = arg;
	gres_node_state_t *gres_ns = gres_state_node->gres_data;
	uint16_t gres_bitmap_size;

	if (pack_state->protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		pack32(pack_state->magic, pack_state->buffer);
		pack32(gres_state_node->plugin_id, pack_state->buffer);
		pack32(gres_state_node->config_flags, pack_state->buffer);
		pack64(gres_ns->gres_cnt_avail, pack_state->buffer);
		/*
		 * Just note if gres_bit_alloc exists.
		 * Rebuild it based upon the state of recovered jobs
		 */
		if (gres_ns->gres_bit_alloc)
			gres_bitmap_size = bit_size(gres_ns->gres_bit_alloc);
		else
			gres_bitmap_size = 0;
		pack16(gres_bitmap_size, pack_state->buffer);

		pack16(gres_ns->topo_cnt, pack_state->buffer);
		for (int i = 0; i < gres_ns->topo_cnt; i++) {
			pack_bit_str_hex(gres_ns->topo_core_bitmap[i],
					 pack_state->buffer);
			pack_bit_str_hex(gres_ns->topo_gres_bitmap[i],
					 pack_state->buffer);
			pack_bit_str_hex(gres_ns->topo_res_core_bitmap[i],
					 pack_state->buffer);
		}
		pack64_array(gres_ns->topo_gres_cnt_alloc, gres_ns->topo_cnt,
			     pack_state->buffer);
		pack64_array(gres_ns->topo_gres_cnt_avail, gres_ns->topo_cnt,
			     pack_state->buffer);
		pack32_array(gres_ns->topo_type_id, gres_ns->topo_cnt,
			     pack_state->buffer);
		packstr_array(gres_ns->topo_type_name, gres_ns->topo_cnt,
			      pack_state->buffer);
	} else {
		error("%s: protocol_version %hu not supported",
		      __func__, pack_state->protocol_version);
		return -1;
	}

	return 0;
}

static int _foreach_job_state_pack(void *x, void *arg)
{
	gres_state_t *gres_state_job = x;
	pack_state_t *pack_state = arg;
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	int i;

	if (pack_state->protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		pack32(pack_state->magic, pack_state->buffer);
		pack32(gres_state_job->plugin_id, pack_state->buffer);
		pack16(gres_js->cpus_per_gres, pack_state->buffer);
		pack16(gres_js->flags, pack_state->buffer);
		pack64(gres_js->gres_per_job, pack_state->buffer);
		pack64(gres_js->gres_per_node, pack_state->buffer);
		pack64(gres_js->gres_per_socket, pack_state->buffer);
		pack64(gres_js->gres_per_task, pack_state->buffer);
		pack64(gres_js->mem_per_gres, pack_state->buffer);
		pack16(gres_js->ntasks_per_gres, pack_state->buffer);
		pack64(gres_js->total_gres, pack_state->buffer);
		packstr(gres_js->type_name, pack_state->buffer);
		pack32(gres_js->node_cnt, pack_state->buffer);

		if (gres_js->gres_cnt_node_alloc) {
			pack8((uint8_t) 1, pack_state->buffer);
			pack64_array(gres_js->gres_cnt_node_alloc,
				     gres_js->node_cnt, pack_state->buffer);
		} else {
			pack8((uint8_t) 0, pack_state->buffer);
		}

		if (gres_js->gres_bit_alloc) {
			pack8((uint8_t) 1, pack_state->buffer);
			for (i = 0; i < gres_js->node_cnt; i++) {
				pack_bit_str_hex(gres_js->
						 gres_bit_alloc[i],
						 pack_state->buffer);
			}
		} else {
			pack8((uint8_t) 0, pack_state->buffer);
		}
		for (i = 0; i < gres_js->node_cnt; i++) {
			if (!gres_js->gres_per_bit_alloc ||
			    !gres_js->gres_per_bit_alloc[i] ||
			    !gres_js->gres_bit_alloc ||
			    !gres_js->gres_bit_alloc[i]) {
				pack8((uint8_t)0, pack_state->buffer);
				continue;
			}
			pack8((uint8_t)1, pack_state->buffer);
			pack64_array(
				gres_js->gres_per_bit_alloc[i],
				bit_size(gres_js->gres_bit_alloc[i]),
				pack_state->buffer);
		}
		if (pack_state->details && gres_js->gres_bit_step_alloc) {
			pack8((uint8_t) 1, pack_state->buffer);
			for (i = 0; i < gres_js->node_cnt; i++) {
				pack_bit_str_hex(gres_js->
						 gres_bit_step_alloc[i],
						 pack_state->buffer);
			}
		} else {
			pack8((uint8_t) 0, pack_state->buffer);
		}
		if (pack_state->details && gres_js->gres_cnt_step_alloc) {
			pack8((uint8_t) 1, pack_state->buffer);
			for (i = 0; i < gres_js->node_cnt; i++) {
				pack64(gres_js->
				       gres_cnt_step_alloc[i],
				       pack_state->buffer);
			}
		} else {
			pack8((uint8_t) 0, pack_state->buffer);
		}
		for (i = 0; i < gres_js->node_cnt; i++) {
			if (!pack_state->details ||
			    !gres_js->gres_per_bit_step_alloc ||
			    !gres_js->gres_per_bit_step_alloc[i] ||
			    !gres_js->gres_bit_step_alloc ||
			    !gres_js->gres_bit_step_alloc[i]) {
				pack8((uint8_t)0, pack_state->buffer);
				continue;
			}
			pack8((uint8_t)1, pack_state->buffer);
			pack64_array(
				gres_js->gres_per_bit_step_alloc[i],
				bit_size(gres_js->gres_bit_step_alloc[i]),
				pack_state->buffer);
		}
	} else {
		error("%s: protocol_version %hu not supported",
		      __func__, pack_state->protocol_version);
		return -1;
	}

	return 0;
}

static int _foreach_step_state_pack(void *x, void *arg)
{
	gres_state_t *gres_state_step = x;
	pack_state_t *pack_state = arg;
	gres_step_state_t *gres_ss = gres_state_step->gres_data;
	int i;

	if (pack_state->protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		pack32(pack_state->magic, pack_state->buffer);
		pack32(gres_state_step->plugin_id, pack_state->buffer);
		pack16(gres_ss->cpus_per_gres, pack_state->buffer);
		pack16(gres_ss->flags, pack_state->buffer);
		pack64(gres_ss->gres_per_step, pack_state->buffer);
		pack64(gres_ss->gres_per_node, pack_state->buffer);
		pack64(gres_ss->gres_per_socket, pack_state->buffer);
		pack64(gres_ss->gres_per_task, pack_state->buffer);
		pack64(gres_ss->mem_per_gres, pack_state->buffer);
		pack64(gres_ss->total_gres, pack_state->buffer);
		packstr(gres_ss->type_name, pack_state->buffer);
		pack32(gres_ss->node_cnt, pack_state->buffer);
		pack_bit_str_hex(gres_ss->node_in_use, pack_state->buffer);
		if (gres_ss->gres_cnt_node_alloc) {
			pack8((uint8_t) 1, pack_state->buffer);
			pack64_array(gres_ss->gres_cnt_node_alloc,
				     gres_ss->node_cnt, pack_state->buffer);
		} else {
			pack8((uint8_t) 0, pack_state->buffer);
		}
		if (gres_ss->gres_bit_alloc) {
			pack8((uint8_t) 1, pack_state->buffer);
			for (i = 0; i < gres_ss->node_cnt; i++)
				pack_bit_str_hex(gres_ss->gres_bit_alloc[i],
						 pack_state->buffer);
		} else {
			pack8((uint8_t) 0, pack_state->buffer);
		}
		for (i = 0; i < gres_ss->node_cnt; i++) {
			if (!gres_ss->gres_per_bit_alloc ||
			    !gres_ss->gres_per_bit_alloc[i] ||
			    !gres_ss->gres_bit_alloc ||
			    !gres_ss->gres_bit_alloc[i]) {
				pack8((uint8_t)0, pack_state->buffer);
				continue;
			}
			pack8((uint8_t)1, pack_state->buffer);
			pack64_array(gres_ss->gres_per_bit_alloc[i],
				     bit_size(gres_ss->gres_bit_alloc[i]),
				     pack_state->buffer);
		}
	} else {
		error("%s: protocol_version %hu not supported",
		      __func__, pack_state->protocol_version);
		return -1;
	}
	return 0;
}

/*
 * Pack every record of a GRES list, preceded by a 16 bit record count.
 * IN gres_list - list to pack, may be NULL (only a zero count is packed)
 * IN/OUT pack_state - buffer and settings handed to pack_function
 * IN pack_function - callback run on each list entry via list_for_each()
 * RET SLURM_SUCCESS
 */
static int _pack_state(list_t *gres_list, pack_state_t *pack_state,
		       int (*pack_function) (void *x, void *key))
{
	uint32_t count_offset, end_offset;
	uint16_t rec_cnt = 0;

	/* Reserve the count slot now; it is rewritten once the count is known */
	count_offset = get_buf_offset(pack_state->buffer);
	pack16(rec_cnt, pack_state->buffer);

	if (!gres_list)
		return SLURM_SUCCESS;

	rec_cnt = list_for_each(gres_list, pack_function, pack_state);
	if (rec_cnt == 0)
		return SLURM_SUCCESS;

	/* Jump back, overwrite the placeholder, then restore the position */
	end_offset = get_buf_offset(pack_state->buffer);
	set_buf_offset(pack_state->buffer, count_offset);
	pack16(rec_cnt, pack_state->buffer);
	set_buf_offset(pack_state->buffer, end_offset);

	return SLURM_SUCCESS;
}

/*
 * Note that a node's configuration has been modified (e.g. "scontrol update ..")
 * IN node_name - name of the node for which the gres information applies
 * IN new_gres - Updated GRES information supplied from slurm.conf or scontrol
 * IN/OUT gres_str - Node's current GRES string, updated as needed
 * IN/OUT gres_list - List of Gres records for this node to track usage
 * IN config_overrides - true: Don't validate hardware, use slurm.conf
 *                             configuration
 *			 false: Validate hardware config, but use slurm.conf
 *                              config
 * IN cores_per_sock - Number of cores per socket on this node
 * IN sock_per_node - Total count of sockets on this node (on any board)
 * RET SLURM_SUCCESS, or the first error reported while validating or applying
 *     the change. No counts are modified if validation fails, but gres_str
 *     is rebuilt from the GRES list in every case.
 */
extern int gres_node_reconfig(char *node_name,
			      char *new_gres,
			      char **gres_str,
			      list_t **gres_list,
			      bool config_overrides,
			      int cores_per_sock,
			      int sock_per_node)
{
	int i, rc = SLURM_SUCCESS;
	gres_state_t *gres_state_node = NULL, **gres_state_node_array;
	gres_state_t *gpu_gres_state_node = NULL;

	xassert(gres_context_cnt >= 0);
	slurm_mutex_lock(&gres_context_lock);
	/* Per-context cache of this node's records, NULL where it has none */
	gres_state_node_array = xcalloc(gres_context_cnt,
					sizeof(gres_state_t *));
	if ((gres_context_cnt > 0) && (*gres_list == NULL))
		*gres_list = list_create(_gres_node_list_delete);

	/* First validate all of the requested GRES changes */
	for (i = 0; (rc == SLURM_SUCCESS) && (i < gres_context_cnt); i++) {
		/* Find gres_state entry on the list */
		gres_state_node = list_find_first(*gres_list, gres_find_id,
						  &gres_context[i].plugin_id);
		if (gres_state_node == NULL)
			continue;
		gres_state_node_array[i] = gres_state_node;
		rc = _node_reconfig_test(node_name, new_gres, gres_state_node,
					 &gres_context[i]);
	}

	/* Now update the GRES counts */
	for (i = 0; (rc == SLURM_SUCCESS) && (i < gres_context_cnt); i++) {
		bool updated_gpu_cnt = false;
		if (gres_state_node_array[i] == NULL)
			continue;
		rc = _node_reconfig(node_name, new_gres, gres_str,
				    gres_state_node_array[i], config_overrides,
				    &gres_context[i], &updated_gpu_cnt);
		/*
		 * Record the entry that was just reconfigured. The scratch
		 * pointer gres_state_node still holds whatever the validation
		 * loop looked up last (possibly NULL or another GRES), so it
		 * must not be used here.
		 */
		if (updated_gpu_cnt)
			gpu_gres_state_node = gres_state_node_array[i];
	}

	/* Now synchronize gres/gpu and gres/'shared' state */
	if (gpu_gres_state_node) {
		/* Update gres/'shared' counts and bitmaps to match gres/gpu */
		_sync_node_shared_to_sharing(gpu_gres_state_node);
	}

	/* Build new per-node gres_str */
	_build_node_gres_str(gres_list, gres_str, cores_per_sock,sock_per_node);
	slurm_mutex_unlock(&gres_context_lock);
	xfree(gres_state_node_array);

	return rc;
}

/*
 * Note the removal of a node: subtract the node's configured count of each
 * GRES from the matching context's total_cnt.
 * IN node_ptr - node being removed; nothing is done if it has no gres_list
 */
extern void gres_node_remove(node_record_t *node_ptr)
{
	if (!node_ptr->gres_list)
		return;

	slurm_mutex_lock(&gres_context_lock);
	for (int ctx = 0; ctx < gres_context_cnt; ctx++) {
		gres_state_t *state =
			list_find_first(node_ptr->gres_list, gres_find_id,
					&gres_context[ctx].plugin_id);
		gres_node_state_t *node_state = state ? state->gres_data : NULL;

		/* Node lacks this GRES, or its record carries no data */
		if (!node_state)
			continue;

		gres_context[ctx].total_cnt -= node_state->gres_cnt_config;
	}
	slurm_mutex_unlock(&gres_context_lock);
}

/*
 * Pack a node's current gres status, called from slurmctld for save/restore
 * IN gres_list - generated by gres_node_config_validate()
 * IN/OUT buffer - location to write state to
 * IN protocol_version - version of the packed format to produce
 * RET value returned by _pack_state()
 */
extern int gres_node_state_pack(list_t *gres_list, buf_t *buffer,
				uint16_t protocol_version)
{
	/* All members not assigned below stay zero */
	pack_state_t pack_state = { 0 };

	pack_state.buffer = buffer;
	pack_state.magic = GRES_MAGIC;
	pack_state.protocol_version = protocol_version;

	return _pack_state(gres_list, &pack_state, _foreach_node_state_pack);
}

/*
 * Unpack a node's current gres status, called from slurmctld for save/restore
 * OUT gres_list - restored state stored by gres_node_state_pack()
 * IN/OUT buffer - location to read state from
 * IN node_name - name of the node for which the gres information applies
 * IN protocol_version - version of the packed format being read
 * RET SLURM_SUCCESS, or SLURM_ERROR on an unpack error or unsupported
 *     protocol version
 *
 * NOTE(review): the safe_unpack*() macros are defined outside this chunk;
 * presumably they jump to the unpack_error label on failure - confirm.
 */
extern int gres_node_state_unpack(list_t **gres_list, buf_t *buffer,
				  char *node_name,
				  uint16_t protocol_version)
{
	int rc = SLURM_SUCCESS;
	uint32_t magic = 0, plugin_id = 0, config_flags = 0;
	uint16_t gres_bitmap_size = 0, rec_cnt = 0;
	gres_state_t *gres_state_node;
	gres_node_state_t *gres_ns = NULL;
	/* Tells the error path whether gres_context_lock must be released */
	bool locked = false;

	/* Record count is read before the lock is taken */
	safe_unpack16(&rec_cnt, buffer);
	if (rec_cnt == 0)
		return SLURM_SUCCESS;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	locked = true;
	if ((gres_context_cnt > 0) && (*gres_list == NULL))
		*gres_list = list_create(_gres_node_list_delete);

	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
		/* Receives the element count of each unpacked array; unused */
		uint32_t tmp_uint32;
		uint32_t full_config_flags = 0;
		slurm_gres_context_t *gres_ctx;
		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
			break;
		rec_cnt--;

		/* Fresh record for this entry; freed on error or skip below */
		gres_ns = _build_gres_node_state();

		if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			/* Field order mirrors _foreach_node_state_pack() */
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;
			safe_unpack32(&plugin_id, buffer);
			safe_unpack32(&config_flags, buffer);
			safe_unpack64(&gres_ns->gres_cnt_avail, buffer);
			safe_unpack16(&gres_bitmap_size, buffer);

			safe_unpack16(&gres_ns->topo_cnt, buffer);
			if (gres_ns->topo_cnt) {
				gres_ns->topo_core_bitmap =
					xcalloc(gres_ns->topo_cnt,
						sizeof(bitstr_t *));
				gres_ns->topo_gres_bitmap =
					xcalloc(gres_ns->topo_cnt,
						sizeof(bitstr_t *));
				gres_ns->topo_res_core_bitmap =
					xcalloc(gres_ns->topo_cnt,
						sizeof(bitstr_t *));
				for (int i = 0; i < gres_ns->topo_cnt; i++) {
					unpack_bit_str_hex(
						&gres_ns->topo_core_bitmap[i],
						buffer);
					unpack_bit_str_hex(
						&gres_ns->topo_gres_bitmap[i],
						buffer);
					unpack_bit_str_hex(
						&gres_ns->
						topo_res_core_bitmap[i],
						buffer);
				}
			}
			/*
			 * NOTE(review): the unpacked array lengths are not
			 * compared against topo_cnt here.
			 */
			safe_unpack64_array(&gres_ns->topo_gres_cnt_alloc,
					    &tmp_uint32, buffer);
			safe_unpack64_array(&gres_ns->topo_gres_cnt_avail,
					    &tmp_uint32, buffer);
			safe_unpack32_array(&gres_ns->topo_type_id, &tmp_uint32,
					    buffer);
			safe_unpackstr_array(&gres_ns->topo_type_name,
					     &tmp_uint32, buffer);
		} else {
			error("%s: protocol_version %hu not supported",
			      __func__, protocol_version);
			goto unpack_error;
		}

		if (!(gres_ctx = _find_context_by_id(plugin_id))) {
			error("%s: no plugin configured to unpack data type %u from node %s",
			      __func__, plugin_id, node_name);
			/*
			 * A likely sign that GresPlugins has changed.
			 * Not a fatal error, skip over the data.
			 */
			_gres_node_state_delete(gres_ns);
			continue;
		}

		/* Only the bitmap size was saved; create it with that size */
		if (gres_bitmap_size) {
			gres_ns->gres_bit_alloc =
				bit_alloc(gres_bitmap_size);
		}

		/* We don't want to lose flags from gres_ctx */
		full_config_flags = gres_ctx->config_flags;

		/*
		 * Flag this as flags read from state so we only use them until
		 * the node checks in.
		 */
		gres_ctx->config_flags = config_flags | GRES_CONF_FROM_STATE;

		/*
		 * The context temporarily carries the saved flags while the
		 * record is created (presumably gres_create_state() copies
		 * them - confirm). Afterwards the context's original flags
		 * are OR'ed back in, so the saved bits remain set as well.
		 */
		gres_state_node = gres_create_state(
			gres_ctx, GRES_STATE_SRC_CONTEXT_PTR,
			GRES_STATE_TYPE_NODE, gres_ns);
		list_append(*gres_list, gres_state_node);
		gres_ctx->config_flags |= full_config_flags;
	}
	slurm_mutex_unlock(&gres_context_lock);
	return rc;

unpack_error:
	error("%s: unpack error from node %s", __func__, node_name);
	/* gres_ns is NULL if the failure happened before the first record */
	_gres_node_state_delete(gres_ns);
	if (locked)
		slurm_mutex_unlock(&gres_context_lock);
	return SLURM_ERROR;
}

/*
 * Make a deep copy of a node's GRES state record.
 * IN gres_ns - record to copy (may be NULL)
 * RET newly allocated copy (returned as void *), or NULL if gres_ns is NULL
 *
 * Only the fields assigned below are copied; in particular the gres_used
 * string is not carried over to the copy.
 */
static void *_node_state_dup(gres_node_state_t *gres_ns)
{
	gres_node_state_t *copy;
	int inx;

	if (!gres_ns)
		return NULL;

	copy = xmalloc(sizeof(gres_node_state_t));

	/* Scalar counters and flags */
	copy->gres_cnt_found = gres_ns->gres_cnt_found;
	copy->gres_cnt_config = gres_ns->gres_cnt_config;
	copy->gres_cnt_avail = gres_ns->gres_cnt_avail;
	copy->gres_cnt_alloc = gres_ns->gres_cnt_alloc;
	copy->no_consume = gres_ns->no_consume;

	if (gres_ns->gres_bit_alloc)
		copy->gres_bit_alloc = bit_copy(gres_ns->gres_bit_alloc);

	/* links_cnt holds link_len rows of link_len ints each */
	if (gres_ns->links_cnt && gres_ns->link_len) {
		int row_bytes = sizeof(int) * gres_ns->link_len;

		copy->links_cnt = xcalloc(gres_ns->link_len, sizeof(int *));
		for (inx = 0; inx < gres_ns->link_len; inx++) {
			copy->links_cnt[inx] = xmalloc(row_bytes);
			memcpy(copy->links_cnt[inx], gres_ns->links_cnt[inx],
			       row_bytes);
		}
		copy->link_len = gres_ns->link_len;
	}

	/* Topology arrays, all of them topo_cnt entries long */
	if (gres_ns->topo_cnt) {
		copy->topo_cnt = gres_ns->topo_cnt;
		copy->topo_core_bitmap =
			xcalloc(gres_ns->topo_cnt, sizeof(bitstr_t *));
		copy->topo_gres_bitmap =
			xcalloc(gres_ns->topo_cnt, sizeof(bitstr_t *));
		copy->topo_res_core_bitmap =
			xcalloc(gres_ns->topo_cnt, sizeof(bitstr_t *));
		copy->topo_gres_cnt_alloc =
			xcalloc(gres_ns->topo_cnt, sizeof(uint64_t));
		copy->topo_gres_cnt_avail =
			xcalloc(gres_ns->topo_cnt, sizeof(uint64_t));
		copy->topo_type_id =
			xcalloc(gres_ns->topo_cnt, sizeof(uint32_t));
		copy->topo_type_name =
			xcalloc(gres_ns->topo_cnt, sizeof(char *));

		for (inx = 0; inx < gres_ns->topo_cnt; inx++) {
			/* Bitmaps are optional per entry; copy those present */
			if (gres_ns->topo_core_bitmap[inx])
				copy->topo_core_bitmap[inx] = bit_copy(
					gres_ns->topo_core_bitmap[inx]);
			if (gres_ns->topo_res_core_bitmap[inx])
				copy->topo_res_core_bitmap[inx] = bit_copy(
					gres_ns->topo_res_core_bitmap[inx]);
			if (gres_ns->topo_gres_bitmap[inx])
				copy->topo_gres_bitmap[inx] = bit_copy(
					gres_ns->topo_gres_bitmap[inx]);

			copy->topo_gres_cnt_alloc[inx] =
				gres_ns->topo_gres_cnt_alloc[inx];
			copy->topo_gres_cnt_avail[inx] =
				gres_ns->topo_gres_cnt_avail[inx];
			copy->topo_type_id[inx] = gres_ns->topo_type_id[inx];
			copy->topo_type_name[inx] =
				xstrdup(gres_ns->topo_type_name[inx]);
		}
	}

	/* Per-type arrays, all of them type_cnt entries long */
	if (gres_ns->type_cnt) {
		copy->type_cnt = gres_ns->type_cnt;
		copy->type_cnt_alloc =
			xcalloc(gres_ns->type_cnt, sizeof(uint64_t));
		copy->type_cnt_avail =
			xcalloc(gres_ns->type_cnt, sizeof(uint64_t));
		copy->type_id = xcalloc(gres_ns->type_cnt, sizeof(uint32_t));
		copy->type_name = xcalloc(gres_ns->type_cnt, sizeof(char *));

		for (inx = 0; inx < gres_ns->type_cnt; inx++) {
			copy->type_cnt_alloc[inx] =
				gres_ns->type_cnt_alloc[inx];
			copy->type_cnt_avail[inx] =
				gres_ns->type_cnt_avail[inx];
			copy->type_id[inx] = gres_ns->type_id[inx];
			copy->type_name[inx] =
				xstrdup(gres_ns->type_name[inx]);
		}
	}

	return copy;
}

/*
 * list_for_each() callback: duplicate one node GRES record into a new list.
 * IN x - gres_state_t node record to copy
 * IN arg - list_t that the copy is appended to
 * RET always 0, so the list iteration is never stopped by this callback
 */
static int _foreach_node_state_dup(void *x, void *arg)
{
	gres_state_t *src_state = x;
	list_t *dest_list = arg;
	gres_state_t *dup_state;
	void *dup_ns;

	if (!_find_context_by_id(src_state->plugin_id)) {
		error("Could not find plugin id %u to dup node record",
		      src_state->plugin_id);
		return 0;
	}

	/* Nothing to append when the source record carries no data */
	if (!(dup_ns = _node_state_dup(src_state->gres_data)))
		return 0;

	dup_state = gres_create_state(src_state, GRES_STATE_SRC_STATE_PTR,
				      GRES_STATE_TYPE_NODE, dup_ns);

	if (gres_id_shared(dup_state->config_flags)) {
		/*
		 * Because "gres/'shared'" follows "gres/gpu" (see gres_init)
		 * the sharing gres is already on the destination list.
		 * gres_id_sharing currently only includes gpus, so looking
		 * up the gpu plugin id is sufficient.
		 */
		void *sharing_state = list_find_first(dest_list, gres_find_id,
						      &gpu_plugin_id);

		_set_alt_gres(dup_state, sharing_state);
	}
	list_append(dest_list, dup_state);

	return 0;
}

/*
 * Duplicate a node's GRES state list (used for will-run logic)
 * IN gres_list - node gres state information
 * RET a copy of gres_list, or NULL if gres_list is NULL or no GRES context
 *     is configured (gres_context_cnt is not greater than zero)
 */
extern list_t *gres_node_state_list_dup(list_t *gres_list)
{
	list_t *dup_list = NULL;

	if (!gres_list)
		return NULL;

	xassert(gres_context_cnt >= 0);

	/* The per-record callback looks up contexts, so hold the lock */
	slurm_mutex_lock(&gres_context_lock);
	if (gres_context_cnt > 0) {
		dup_list = list_create(_gres_node_list_delete);
		(void) list_for_each(gres_list, _foreach_node_state_dup,
				     dup_list);
	}
	slurm_mutex_unlock(&gres_context_lock);

	return dup_list;
}

static int _node_state_dealloc(void *x, void *arg)
{
	gres_state_t *gres_state_node = x;
	int i;
	gres_node_state_t *gres_ns;

	gres_ns = (gres_node_state_t *) gres_state_node->gres_data;
	gres_ns->gres_cnt_alloc = 0;
	if (gres_ns->gres_bit_alloc)
		bit_clear_all(gres_ns->gres_bit_alloc);

	if (gres_ns->topo_cnt && !gres_ns->topo_gres_cnt_alloc) {
		error("gres_node_state_dealloc_all: gres/%s topo_cnt!=0 "
		      "and topo_gres_cnt_alloc is NULL",
		      gres_state_node->gres_name);
	} else if (gres_ns->topo_cnt) {
		for (i = 0; i < gres_ns->topo_cnt; i++) {
			gres_ns->topo_gres_cnt_alloc[i] = 0;
		}
	} else {
		/*
		 * This array can be set at startup if a job has been allocated
		 * specific GRES and the node has not registered with the
		 * details needed to track individual GRES (rather than only
		 * a GRES count).
		 */
		xfree(gres_ns->topo_gres_cnt_alloc);
	}

	for (i = 0; i < gres_ns->type_cnt; i++) {
		gres_ns->type_cnt_alloc[i] = 0;
	}

	return 0;
}

/*
 * Deallocate all resources on this node previously allocated to any jobs.
 *	This function is used to synchronize state after slurmctld restarts or
 *	is reconfigured.
 * IN gres_list - node gres state information, a NULL list is ignored
 */
extern void gres_node_state_dealloc_all(list_t *gres_list)
{
	if (gres_list) {
		xassert(gres_context_cnt >= 0);

		(void) list_for_each(gres_list, _node_state_dealloc, NULL);
	}
}

/*
 * Build (or return the cached) "used GRES" string for one node GRES record.
 * IN gres_ns - node GRES state; its gres_used field is updated in place
 * IN gres_name - GRES name used as the prefix of every entry in the string
 * RET gres_ns->gres_used (the record's own string, not a copy), or NULL if
 *     gres_cnt_avail is zero
 */
static char *_node_gres_used(gres_node_state_t *gres_ns, char *gres_name)
{
	char *sep = "";
	int i, j;

	xassert(gres_ns);

	if (!gres_ns->gres_cnt_avail) {
		return NULL;
	} else if ((gres_ns->topo_cnt != 0) && (gres_ns->no_consume == false)) {
		/*
		 * Topology data is present: always rebuild the string, emitting
		 * one entry per distinct topo_type_id. topo_printed marks the
		 * topo indices that were already folded into an entry.
		 */
		bitstr_t *topo_printed = bit_alloc(gres_ns->topo_cnt);
		xfree(gres_ns->gres_used);    /* Free any cached value */
		for (i = 0; i < gres_ns->topo_cnt; i++) {
			/*
			 * For non-shared gres, we record which indices have
			 * gres allocated. For shared gres, we record the count
			 * of allocated gres at each index (may be >1, as
			 * opposed to non-shared gres which is never >1)
			 *
			 * topo_gres_bitmap is used for non-shared gres, while
			 * topo_gres_cnt_alloc_str is used for shared gres
			 * (shard, mps).
			 */
			bitstr_t *topo_gres_bitmap = NULL;
			char *topo_gres_cnt_alloc_str = NULL;

			uint64_t gres_alloc_cnt = 0;
			char *gres_alloc_idx, tmp_str[64];
			bool is_shared;

			if (bit_test(topo_printed, i))
				continue;
			bit_set(topo_printed, i);

			is_shared = gres_is_shared_name(gres_name);
			if (is_shared) {
				/* Start the "alloc/avail" list with entry i */
				uint64_t alloc, avail;
				alloc = gres_ns->topo_gres_cnt_alloc[i];
				avail = gres_ns->topo_gres_cnt_avail[i];
				xstrfmtcat(topo_gres_cnt_alloc_str,
					   "%"PRIu64"/%"PRIu64,
					   alloc, avail);
				gres_alloc_cnt += alloc;
			} else if (gres_ns->topo_gres_bitmap[i]) {
				/* Work on a copy, it is modified below */
				topo_gres_bitmap =
					bit_copy(gres_ns->
						 topo_gres_bitmap[i]);
			}
			/* Fold in every later topo entry of the same type */
			for (j = i + 1; j < gres_ns->topo_cnt; j++) {
				if (bit_test(topo_printed, j))
					continue;
				if (gres_ns->topo_type_id[i] !=
				    gres_ns->topo_type_id[j])
					continue;
				bit_set(topo_printed, j);
				if (is_shared) {
					uint64_t alloc, avail;
					alloc = gres_ns->topo_gres_cnt_alloc[j];
					avail = gres_ns->topo_gres_cnt_avail[j];
					xstrfmtcat(topo_gres_cnt_alloc_str,
						   ",%"PRIu64"/%"PRIu64,
						   alloc, avail);
					gres_alloc_cnt += alloc;
				} else if (gres_ns->topo_gres_bitmap[j]) {
					/*
					 * Bitmaps of a different size are
					 * silently left out of the union.
					 */
					if (!topo_gres_bitmap) {
						topo_gres_bitmap =
							bit_copy(gres_ns->
								 topo_gres_bitmap[j]);
					} else if (bit_size(topo_gres_bitmap) ==
						   bit_size(gres_ns->
							    topo_gres_bitmap[j])){
						bit_or(topo_gres_bitmap,
						       gres_ns->
						       topo_gres_bitmap[j]);
					}
				}
			}
			/*
			 * Non-shared: keep only the bits of this type that are
			 * also set in gres_bit_alloc and count them. If the
			 * sizes differ the count stays 0.
			 */
			if (!is_shared && gres_ns->gres_bit_alloc &&
			    topo_gres_bitmap &&
			    (bit_size(topo_gres_bitmap) ==
			     bit_size(gres_ns->gres_bit_alloc))) {
				bit_and(topo_gres_bitmap,
					gres_ns->gres_bit_alloc);
				gres_alloc_cnt = bit_set_count(topo_gres_bitmap);
			}
			/* Pick the text that goes between the parentheses */
			if (is_shared) {
				gres_alloc_idx = topo_gres_cnt_alloc_str;
			} else if (gres_alloc_cnt > 0) {
				bit_fmt(tmp_str, sizeof(tmp_str),
					topo_gres_bitmap);
				gres_alloc_idx = tmp_str;
			} else {
				gres_alloc_idx = "N/A";
			}
			/* name:type:count(IDX:indices) or name:type:count(a/b,...) */
			xstrfmtcat(gres_ns->gres_used,
				   "%s%s:%s:%"PRIu64"(%s%s)", sep, gres_name,
				   gres_ns->topo_type_name[i], gres_alloc_cnt,
				   is_shared ? "" : "IDX:", gres_alloc_idx);
			sep = ",";
			FREE_NULL_BITMAP(topo_gres_bitmap);
			xfree(topo_gres_cnt_alloc_str);
		}
		FREE_NULL_BITMAP(topo_printed);
	} else if (gres_ns->gres_used) {
		;	/* Used cached value */
	} else if (gres_ns->type_cnt == 0) {
		/* No types: a single "name:count" entry (0 if no_consume) */
		if (gres_ns->no_consume) {
			xstrfmtcat(gres_ns->gres_used, "%s:0", gres_name);
		} else {
			xstrfmtcat(gres_ns->gres_used, "%s:%"PRIu64,
				   gres_name, gres_ns->gres_cnt_alloc);
		}
	} else {
		/* One "name:type:count" entry per type (0 if no_consume) */
		for (i = 0; i < gres_ns->type_cnt; i++) {
			if (gres_ns->no_consume) {
				xstrfmtcat(gres_ns->gres_used,
					   "%s%s:%s:0", sep, gres_name,
					   gres_ns->type_name[i]);
			} else {
				xstrfmtcat(gres_ns->gres_used,
					   "%s%s:%s:%"PRIu64, sep, gres_name,
					   gres_ns->type_name[i],
					   gres_ns->type_cnt_alloc[i]);
			}
			sep = ",";
		}
	}

	return gres_ns->gres_used;
}

/*
 * list_for_each() callback: write one node GRES record to the log via info().
 * IN x - gres_state_t node record to log
 * IN arg - name of the node (char *), used in the heading line only
 * RET always 0
 */
static int _foreach_node_state_log(void *x, void *arg)
{
	gres_state_t *gres_state_node = x;
	gres_node_state_t *gres_ns = gres_state_node->gres_data;
	char *node_name = arg;
	char *line = NULL;
	char fmt_buf[128];
	int row, col, inx;

	xassert(gres_ns);

	info("gres/%s: state for %s", gres_state_node->gres_name, node_name);

	/* Overall counts; a found count of NO_VAL64 is reported as "TBD" */
	if (gres_ns->gres_cnt_found != NO_VAL64) {
		snprintf(fmt_buf, sizeof(fmt_buf), "%"PRIu64,
			 gres_ns->gres_cnt_found);
	} else {
		snprintf(fmt_buf, sizeof(fmt_buf), "TBD");
	}

	if (!gres_ns->no_consume) {
		info("  gres_cnt found:%s configured:%"PRIu64" avail:%"PRIu64" alloc:%"PRIu64,
		     fmt_buf, gres_ns->gres_cnt_config,
		     gres_ns->gres_cnt_avail,
		     gres_ns->gres_cnt_alloc);
	} else {
		info("  gres_cnt found:%s configured:%"PRIu64" avail:%"PRIu64" no_consume",
		     fmt_buf, gres_ns->gres_cnt_config,
		     gres_ns->gres_cnt_avail);
	}

	/* Allocation bitmap and the used-GRES string stored in the record */
	if (!gres_ns->gres_bit_alloc) {
		info("  gres_bit_alloc:NULL");
	} else {
		bit_fmt(fmt_buf, sizeof(fmt_buf), gres_ns->gres_bit_alloc);
		info("  gres_bit_alloc:%s of %d",
		     fmt_buf, (int) bit_size(gres_ns->gres_bit_alloc));
	}

	info("  gres_used:%s", gres_ns->gres_used);

	/* Link matrix, one log line per row */
	if (gres_ns->links_cnt && gres_ns->link_len) {
		for (row = 0; row < gres_ns->link_len; row++) {
			for (col = 0; col < gres_ns->link_len; col++) {
				xstrfmtcat(line, "%s%d", col ? ", " : "",
					   gres_ns->links_cnt[row][col]);
			}
			info("  links[%d]:%s", row, line);
			xfree(line);
		}
	}

	/* Topology entries */
	for (inx = 0; inx < gres_ns->topo_cnt; inx++) {
		info("  topo[%d]:%s(%u)", inx, gres_ns->topo_type_name[inx],
		     gres_ns->topo_type_id[inx]);

		if (!gres_ns->topo_core_bitmap[inx]) {
			info("   topo_core_bitmap[%d]:NULL", inx);
		} else {
			bit_fmt(fmt_buf, sizeof(fmt_buf),
				gres_ns->topo_core_bitmap[inx]);
			info("   topo_core_bitmap[%d]:%s of %d", inx, fmt_buf,
			     (int) bit_size(gres_ns->topo_core_bitmap[inx]));
		}

		if (!gres_ns->topo_gres_bitmap[inx]) {
			info("   topo_gres_bitmap[%d]:NULL", inx);
		} else {
			bit_fmt(fmt_buf, sizeof(fmt_buf),
				gres_ns->topo_gres_bitmap[inx]);
			info("   topo_gres_bitmap[%d]:%s of %d", inx, fmt_buf,
			     (int) bit_size(gres_ns->topo_gres_bitmap[inx]));
		}

		info("   topo_gres_cnt_alloc[%d]:%"PRIu64, inx,
		     gres_ns->topo_gres_cnt_alloc[inx]);
		info("   topo_gres_cnt_avail[%d]:%"PRIu64, inx,
		     gres_ns->topo_gres_cnt_avail[inx]);
	}

	/* Per-type counters */
	for (inx = 0; inx < gres_ns->type_cnt; inx++) {
		info("  type[%d]:%s(%u)", inx, gres_ns->type_name[inx],
		     gres_ns->type_id[inx]);
		info("   type_cnt_alloc[%d]:%"PRIu64, inx,
		     gres_ns->type_cnt_alloc[inx]);
		info("   type_cnt_avail[%d]:%"PRIu64, inx,
		     gres_ns->type_cnt_avail[inx]);
	}

	return 0;
}

/*
 * Log a node's current gres state
 * Does nothing unless DEBUG_FLAG_GRES is set in slurm_conf.debug_flags.
 * IN gres_list - generated by gres_node_config_validate(), may be NULL
 * IN node_name - name of the node for which the gres information applies
 */
extern void gres_node_state_log(list_t *gres_list, char *node_name)
{
	if (!gres_list)
		return;
	if (!(slurm_conf.debug_flags & DEBUG_FLAG_GRES))
		return;

	xassert(gres_context_cnt >= 0);

	(void) list_for_each(gres_list, _foreach_node_state_log, node_name);
}

/* Find node_state_t gres record with any allocated gres (key is unused) */
static int _find_node_state_with_alloc_gres(void *x, void *key)
{
	gres_state_t *gres_state_node = (gres_state_t *) x;

	if (((gres_node_state_t *) gres_state_node->gres_data)->gres_cnt_alloc)
		return 1;
	else
		return 0;
}

/*
 * Test whether any record on a node's gres list has allocated gres.
 * IN gres_list - node gres state information, may be NULL
 * RET true if at least one record has a non-zero gres_cnt_alloc
 */
extern bool gres_node_state_list_has_alloc_gres(list_t *gres_list)
{
	void *match = NULL;

	if (gres_list) {
		match = list_find_first(gres_list,
					_find_node_state_with_alloc_gres,
					NULL);
	}

	return (match != NULL);
}

/*
 * Build a string indicating a node's drained GRES
 * IN gres_list - generated by gres_node_config_validate(); not examined by
 *	this implementation, which always reports "N/A"
 * RET - string, must be xfreed by caller
 */
extern char *gres_get_node_drain(list_t *gres_list)
{
	return xstrdup("N/A");
}

/*
 * list_for_each() callback: append one GRES record's "used" string to the
 * comma separated string being accumulated.
 * IN x - gres_state_t node record
 * IN/OUT arg - char ** holding the accumulated string (NULL while empty)
 * RET always 0
 */
static int _foreach_get_node_used(void *x, void *arg)
{
	gres_state_t *gres_state_node = x;
	char **gres_usedp = arg;
	char *record_used;

	record_used = _node_gres_used(gres_state_node->gres_data,
				      gres_state_node->gres_name);
	if (!record_used)
		return 0;	/* Nothing to report for this record */

	if (*gres_usedp)
		xstrcat(*gres_usedp, ",");
	xstrcat(*gres_usedp, record_used);

	return 0;
}

/*
 * Build a string indicating a node's used GRES
 * IN gres_list - generated by gres_node_config_validate(), may be NULL
 * RET - string, must be xfreed by caller; NULL if gres_list is NULL or no
 *	record contributed anything
 */
extern char *gres_get_node_used(list_t *gres_list)
{
	char *gres_used = NULL;

	if (!gres_list)
		return NULL;

	(void) list_for_each(gres_list, _foreach_get_node_used, &gres_used);

	return gres_used;
}

/*
 * Give the total system count of a given GRES
 * IN name - GRES name to look up among the configured GRES contexts
 * IN case_insensitive - compare names with xstrcasecmp() instead of xstrcmp()
 * RET total_cnt of the first matching context, or NO_VAL64 if name is NULL
 *     or not found
 */
extern uint64_t gres_get_system_cnt(char *name, bool case_insensitive)
{
	uint64_t total = NO_VAL64;

	if (!name)
		return NO_VAL64;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	for (int inx = 0; inx < gres_context_cnt; inx++) {
		int differs;

		if (case_insensitive)
			differs = xstrcasecmp(gres_context[inx].gres_name,
					      name);
		else
			differs = xstrcmp(gres_context[inx].gres_name, name);

		if (differs)
			continue;

		total = gres_context[inx].total_cnt;
		break;
	}
	slurm_mutex_unlock(&gres_context_lock);

	return total;
}


/*
 * Get the count of a node's GRES
 * IN gres_list - List of Gres records for this node to track usage
 * IN name - name of gres, either "<gres>" or "<gres>:<type>"
 * RET for "<gres>" the record's gres_cnt_config; for "<gres>:<type>" the
 *     type_cnt_avail of the matching type; 0 if the list is NULL/empty, name
 *     is NULL, or no matching context, record or type is found
 */
extern uint64_t gres_node_config_cnt(list_t *gres_list, char *name)
{
	gres_state_t *gres_state_node;
	gres_node_state_t *gres_ns;
	uint64_t count = 0;

	if (!gres_list || !name || !list_count(gres_list))
		return count;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	for (int ctx = 0; ctx < gres_context_cnt; ctx++) {
		bool whole_gres = !xstrcasecmp(gres_context[ctx].gres_name,
					       name);
		char *type_str = NULL;

		/* Skip contexts matching neither "<gres>" nor "<gres>:" */
		if (!whole_gres &&
		    xstrncasecmp(name, gres_context[ctx].gres_name_colon,
				 gres_context[ctx].gres_name_colon_len))
			continue;

		if (!whole_gres) {
			/* The type is whatever follows the first ':' */
			if (!(type_str = strchr(name, ':'))) {
				error("Invalid gres name '%s'", name);
				break;
			}
			type_str++;
		}

		/* Find this context's gres_state entry on the node's list */
		gres_state_node = list_find_first(gres_list, gres_find_id,
						  &gres_context[ctx].plugin_id);
		if (!gres_state_node || !gres_state_node->gres_data)
			break;
		gres_ns = gres_state_node->gres_data;

		if (whole_gres) {
			count = gres_ns->gres_cnt_config;
		} else {
			uint32_t type_id = gres_build_id(type_str);

			for (int t = 0; t < gres_ns->type_cnt; t++) {
				if (gres_ns->type_id[t] != type_id)
					continue;
				count = gres_ns->type_cnt_avail[t];
				break;
			}
		}
		break;	/* Only the first matching context is considered */
	}
	slurm_mutex_unlock(&gres_context_lock);

	return count;
}

/*
 * Free a job GRES state record and everything it owns.
 * IN gres_js - record to free, a NULL pointer is ignored
 */
extern void gres_job_state_delete(gres_job_state_t *gres_js)
{
	if (!gres_js)
		return;

	/* Allocation arrays (sized by node_cnt) are released first */
	gres_job_clear_alloc(gres_js);

	/* Selection arrays are sized by total_node_cnt */
	if (gres_js->gres_bit_select) {
		for (int n = 0; n < gres_js->total_node_cnt; n++)
			FREE_NULL_BITMAP(gres_js->gres_bit_select[n]);
		xfree(gres_js->gres_bit_select);
	}
	if (gres_js->gres_per_bit_select) {
		for (int n = 0; n < gres_js->total_node_cnt; n++)
			xfree(gres_js->gres_per_bit_select[n]);
		xfree(gres_js->gres_per_bit_select);
	}

	/* res_gpu_cores is sized by res_array_size */
	if (gres_js->res_gpu_cores) {
		for (int n = 0; n < gres_js->res_array_size; n++)
			FREE_NULL_BITMAP(gres_js->res_gpu_cores[n]);
		xfree(gres_js->res_gpu_cores);
	}

	xfree(gres_js->gres_cnt_node_alloc);
	xfree(gres_js->gres_cnt_node_select);
	xfree(gres_js->type_name);
	xfree(gres_js);
}

/*
 * Release the allocation arrays of a job GRES record and reset its node_cnt
 * to zero. The record itself is not freed.
 * IN gres_js - job GRES record, dereferenced unconditionally
 */
extern void gres_job_clear_alloc(gres_job_state_t *gres_js)
{
	int n;

	/* Per-node elements first, for each array that exists */
	if (gres_js->gres_bit_alloc) {
		for (n = 0; n < gres_js->node_cnt; n++)
			FREE_NULL_BITMAP(gres_js->gres_bit_alloc[n]);
	}
	if (gres_js->gres_bit_step_alloc) {
		for (n = 0; n < gres_js->node_cnt; n++)
			FREE_NULL_BITMAP(gres_js->gres_bit_step_alloc[n]);
	}
	if (gres_js->gres_per_bit_alloc) {
		for (n = 0; n < gres_js->node_cnt; n++)
			xfree(gres_js->gres_per_bit_alloc[n]);
	}
	if (gres_js->gres_per_bit_step_alloc) {
		for (n = 0; n < gres_js->node_cnt; n++)
			xfree(gres_js->gres_per_bit_step_alloc[n]);
	}

	/* Then the arrays themselves */
	xfree(gres_js->gres_bit_alloc);
	xfree(gres_js->gres_bit_step_alloc);
	xfree(gres_js->gres_per_bit_alloc);
	xfree(gres_js->gres_per_bit_step_alloc);
	xfree(gres_js->gres_cnt_step_alloc);
	xfree(gres_js->gres_cnt_node_alloc);

	gres_js->node_cnt = 0;
}

/*
 * Delete one element of a job GRES list: the job state it carries is freed
 * first, then the members of the gres_state_t wrapper.
 * IN list_element - gres_state_t job record
 */
extern void gres_job_list_delete(void *list_element)
{
	gres_state_t *gres_state_job = list_element;

	gres_job_state_delete(gres_state_job->gres_data);
	gres_state_job->gres_data = NULL;

	_gres_state_delete_members(gres_state_job);
}

/*
 * Ensure consistency of gres_per_* options
 * Modify task and node count as needed for consistency with GRES options
 * IN/OUT gres_state_job - job GRES record to check; its gres_per_job (and
 *	total_gres) may be set from the task count
 * IN/OUT gres_js_val - pointers to the job's node, task, socket and cpu
 *	counts; unset values may be filled in and the node range may be
 *	narrowed
 * RET -1 on failure, 0 on success
 */
static int _test_gres_cnt(gres_state_t *gres_state_job,
			  gres_job_state_validate_t *gres_js_val)
{
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	int req_nodes, req_tasks, req_tasks_per_node, req_tasks_per_socket;
	int req_sockets, req_cpus_per_task;
	uint16_t cpus_per_gres;

	/*
	 * Ensure gres_per_job is not exceeded by gres_per_node, gres_per_task
	 * or gres_per_socket
	 */
	if (gres_js->gres_per_job &&
	    ((gres_js->gres_per_node &&
	      (gres_js->gres_per_node > gres_js->gres_per_job)) ||
	     (gres_js->gres_per_task &&
	      (gres_js->gres_per_task > gres_js->gres_per_job)) ||
	     (gres_js->gres_per_socket &&
	      (gres_js->gres_per_socket >
	       gres_js->gres_per_job)))) {
		error("Failed to ensure --%ss >= --gres=%s/--%ss-per-node >= --%ss-per-socket",
		      gres_state_job->gres_name,
		      gres_state_job->gres_name,
		      gres_state_job->gres_name,
		      gres_state_job->gres_name);
		return -1;
	}

	/*
	 * Ensure gres_per_node is not exceeded by gres_per_task or
	 * gres_per_socket
	 */
	if (gres_js->gres_per_node &&
	    ((gres_js->gres_per_task &&
	      (gres_js->gres_per_task > gres_js->gres_per_node)) ||
	     (gres_js->gres_per_socket &&
	      (gres_js->gres_per_socket >
	       gres_js->gres_per_node)))) {
		error("Failed to ensure --%ss >= --%ss-per-task",
		      gres_state_job->gres_name,
		      gres_state_job->gres_name);
		return -1;
	}

	/* gres_per_socket requires sockets-per-node count specification */
	if (gres_js->gres_per_socket) {
		if (*gres_js_val->sockets_per_node == NO_VAL16) {
			error("--%ss-per-socket option requires --sockets-per-node specification",
			      gres_state_job->gres_name);
			return -1;
		}
	}

	/*
	 * Make sure --cpus-per-gres is not combined with --cpus-per-task.
	 * This is only enforced outside of slurmctld.
	 */
	if (!running_in_slurmctld() && gres_js->cpus_per_gres &&
	    (*gres_js_val->cpus_per_task != NO_VAL16)) {
		error("--cpus-per-%s is mutually exclusive with --cpus-per-task",
		      gres_state_job->gres_name);
		return -1;
	}


	/*
	 * Ensure gres_per_job is multiple of gres_per_node
	 * Ensure node count is consistent with GRES parameters
	 * On success min_nodes and max_nodes are both set to the node count
	 * implied by gres_per_job / gres_per_node.
	 */
	if (gres_js->gres_per_job && gres_js->gres_per_node) {
		if (gres_js->gres_per_job % gres_js->gres_per_node){
			/* gres_per_job not multiple of gres_per_node */
			error("Failed to validate job spec, --%ss is not multiple of --gres=%s/--%ss-per-node",
			      gres_state_job->gres_name,
			      gres_state_job->gres_name,
			      gres_state_job->gres_name);
			return -1;
		}
		req_nodes = gres_js->gres_per_job /
			gres_js->gres_per_node;
		if (((*gres_js_val->min_nodes != NO_VAL) &&
		     (req_nodes < *gres_js_val->min_nodes)) ||
		    (req_nodes > *gres_js_val->max_nodes)) {
			error("Failed to validate job spec. Based on --%s and --gres=%s/--%ss-per-node required nodes (%u) doesn't fall between min_nodes (%u) and max_nodes (%u) boundaries.",
			      gres_state_job->gres_name,
			      gres_state_job->gres_name,
			      gres_state_job->gres_name,
			      req_nodes,
			      *gres_js_val->min_nodes,
			      *gres_js_val->max_nodes);
			return -1;
		}
		*gres_js_val->min_nodes = *gres_js_val->max_nodes = req_nodes;
	}

	/*
	 * Ensure gres_per_node is multiple of gres_per_socket
	 * Ensure sockets_per_node is consistent with GRES parameters
	 */
	if (gres_js->gres_per_node && gres_js->gres_per_socket) {
		if (gres_js->gres_per_node %
		    gres_js->gres_per_socket) {
			/* gres_per_node not multiple of gres_per_socket */
			error("Failed to validate job spec, --gres=%s/--%ss-per-node not multiple of --%ss-per-socket.",
			      gres_state_job->gres_name,
			      gres_state_job->gres_name,
			      gres_state_job->gres_name);
			return -1;
		}
		req_sockets = gres_js->gres_per_node /
			gres_js->gres_per_socket;
		if (*gres_js_val->sockets_per_node == NO_VAL16)
			*gres_js_val->sockets_per_node = req_sockets;
		else if (*gres_js_val->sockets_per_node != req_sockets) {
			error("Failed to validate job spec. Based on --gres=%s/--%ss-per-node and --%ss-per-socket required number of sockets differ from --sockets-per-node.",
			      gres_state_job->gres_name,
			      gres_state_job->gres_name,
			      gres_state_job->gres_name);
			return -1;
		}
	}

	/*
	 * Ensure num_tasks is a multiple of ntasks_per_gres
	 */
	if (gres_js->ntasks_per_gres &&
	    (gres_js->ntasks_per_gres != NO_VAL16) &&
	    (*gres_js_val->num_tasks != NO_VAL)) {
		int tmp = *gres_js_val->num_tasks / gres_js->ntasks_per_gres;
		if ((tmp * gres_js->ntasks_per_gres) !=
		    *gres_js_val->num_tasks) {
			error("Failed to validate job spec, -n/--ntasks has to be a multiple of --ntasks-per-%s.",
			      gres_state_job->gres_name);
			return -1;
		}
	}

	/*
	 * Ensure gres_per_job is multiple of gres_per_task
	 * Ensure task count is consistent with GRES parameters
	 * If gres_per_job is not set it is derived from the task count;
	 * gres_per_task without either of them is rejected.
	 */
	if (gres_js->gres_per_task) {
		if(gres_js->gres_per_job) {
			if (gres_js->gres_per_job %
			    gres_js->gres_per_task) {
				/* gres_per_job not multiple of gres_per_task */
				error("Failed to validate job spec, --%ss not multiple of --%ss-per-task",
				      gres_state_job->gres_name,
				      gres_state_job->gres_name);
				return -1;
			}
			req_tasks = gres_js->gres_per_job /
				gres_js->gres_per_task;
			if (*gres_js_val->num_tasks == NO_VAL)
				*gres_js_val->num_tasks = req_tasks;
			else if (*gres_js_val->num_tasks != req_tasks) {
				if (running_in_slurmctld()) {
					/* requesting new task count */
					gres_js->total_gres =
						gres_js->gres_per_job =
						*gres_js_val->num_tasks *
						gres_js->gres_per_task;
				} else {
					/*
					 * Anywhere outside of the slurmctld we
					 * are asking for something incorrect.
					 */
					error("Failed to validate job spec. Based on --%ss and --%ss-per-task number of requested tasks differ from -n/--ntasks.",
					      gres_state_job->gres_name,
					      gres_state_job->gres_name);
					return -1;
				}
			}
		} else if (*gres_js_val->num_tasks != NO_VAL) {
			gres_js->gres_per_job = *gres_js_val->num_tasks *
				gres_js->gres_per_task;
		} else if (!xstrcmp(gres_state_job->gres_name, "gpu")) {
			error("Failed to validate job spec. --%ss-per-task or --tres-per-task used without either --%ss or -n/--ntasks is not allowed.",
			      gres_state_job->gres_name,
			      gres_state_job->gres_name);
			return -1;
		} else {
			error("Failed to validate job spec. --tres-per-task used without -n/--ntasks is not allowed.");
			return -1;
		}
	}

	/*
	 * Ensure gres_per_node is multiple of gres_per_task
	 * Ensure tasks_per_node is consistent with GRES parameters
	 */
	if (gres_js->gres_per_node && gres_js->gres_per_task) {
		if (gres_js->gres_per_node %
		    gres_js->gres_per_task) {
			/* gres_per_node not multiple of gres_per_task */
			error("Failed to validate job spec, --gres=%s/--%ss-per-node not multiple of --%ss-per-task.",
			      gres_state_job->gres_name,
			      gres_state_job->gres_name,
			      gres_state_job->gres_name);
			return -1;
		}
		req_tasks_per_node = gres_js->gres_per_node /
			gres_js->gres_per_task;
		if ((*gres_js_val->ntasks_per_node == NO_VAL16) ||
		    (*gres_js_val->ntasks_per_node == 0))
			*gres_js_val->ntasks_per_node = req_tasks_per_node;
		else if (*gres_js_val->ntasks_per_node != req_tasks_per_node) {
			error("Failed to validate job spec. Based on --gres=%s/--%ss-per-node and --%ss-per-task requested number of tasks per node differ from --ntasks-per-node.",
			      gres_state_job->gres_name,
			      gres_state_job->gres_name,
			      gres_state_job->gres_name);
			return -1;
		}
	}

	/*
	 * Ensure gres_per_socket is multiple of gres_per_task
	 * Ensure ntasks_per_socket is consistent with GRES parameters
	 */
	if (gres_js->gres_per_socket && gres_js->gres_per_task) {
		if (gres_js->gres_per_socket %
		    gres_js->gres_per_task) {
			/* gres_per_socket not multiple of gres_per_task */
			error("Failed to validate job spec, --%ss-per-socket not multiple of --%ss-per-task.",
			      gres_state_job->gres_name,
			      gres_state_job->gres_name);
			return -1;
		}
		req_tasks_per_socket = gres_js->gres_per_socket /
			gres_js->gres_per_task;
		if ((*gres_js_val->ntasks_per_socket == NO_VAL16) ||
		    (*gres_js_val->ntasks_per_socket == 0))
			*gres_js_val->ntasks_per_socket = req_tasks_per_socket;
		else if (*gres_js_val->ntasks_per_socket !=
			 req_tasks_per_socket) {
			error("Failed to validate job spec. Based on --%ss-per-socket and --%ss-per-task requested number of tasks per sockets differ from --ntasks-per-socket.",
			      gres_state_job->gres_name,
			      gres_state_job->gres_name);
			return -1;
		}
	}

	/*
	 * Ensure that cpus_per_gres * gres_per_task == cpus_per_task
	 * def_cpus_per_gres is used when cpus_per_gres is not set.
	 */
	if (gres_js->cpus_per_gres)
		cpus_per_gres = gres_js->cpus_per_gres;
	else
		cpus_per_gres = gres_js->def_cpus_per_gres;
	if (cpus_per_gres && gres_js->gres_per_task) {
		req_cpus_per_task = cpus_per_gres * gres_js->gres_per_task;
		if ((*gres_js_val->cpus_per_task == NO_VAL16) ||
		    (*gres_js_val->cpus_per_task == 0))
			*gres_js_val->cpus_per_task = req_cpus_per_task;
		else if (*gres_js_val->cpus_per_task != req_cpus_per_task) {
			error("Failed to validate job spec. Based on --cpus-per-%s and --%ss-per-task requested number of cpus differ from -c/--cpus-per-task.",
			      gres_state_job->gres_name,
			      gres_state_job->gres_name);
			return -1;
		}
	}

	/*
	 * Ensure gres_per_job >= node count: fail if it is below min_nodes,
	 * and lower max_nodes to gres_per_job if max_nodes is larger.
	 */
	if (gres_js->gres_per_job) {
		if ((*gres_js_val->min_nodes != NO_VAL) &&
		    (gres_js->gres_per_job < *gres_js_val->min_nodes)) {
			error("Failed to validate job spec, --%ss < -N",
			      gres_state_job->gres_name);
			return -1;
		}
		if ((*gres_js_val->max_nodes != NO_VAL) &&
		    (gres_js->gres_per_job < *gres_js_val->max_nodes)) {
			*gres_js_val->max_nodes = gres_js->gres_per_job;
		}
	}

	return 0;
}

/*
 * Reentrant TRES specification parse logic
 * IN in_val - initial input string
 * OUT type_ptr - GRES type of the parsed entry, must be xfreed by caller;
 *	set to NULL on error
 * OUT context_inx_ptr - index into gres_context[] of the matching GRES,
 *	written only when a configured GRES name was parsed
 * OUT cnt - count of values, written only on success
 * IN/OUT save_ptr - NULL on initial call, otherwise value from previous call;
 *	reset to NULL on error
 * RET SLURM_SUCCESS or an error code
 */
static int _get_next_gres(char *in_val, char **type_ptr, int *context_inx_ptr,
			  uint64_t *cnt, char **save_ptr)
{
	char *tres_type = "gres";
	char *gres_name = NULL, *gres_type = NULL;
	uint64_t parsed_cnt = 0;
	int rc, ctx;

	xassert(cnt);
	xassert(save_ptr);

	rc = slurm_get_next_tres(&tres_type, in_val, &gres_name, &gres_type,
				 &parsed_cnt, save_ptr);

	if (gres_name) {
		/* Accept an exact "name" match or a "name:" prefix match */
		for (ctx = 0; ctx < gres_context_cnt; ctx++) {
			if (!xstrcmp(gres_name, gres_context[ctx].gres_name))
				break;
			if (!xstrncmp(gres_name,
				      gres_context[ctx].gres_name_colon,
				      gres_context[ctx].gres_name_colon_len))
				break;
		}
		if (ctx < gres_context_cnt) {
			*context_inx_ptr = ctx;
		} else {
			debug("%s: Failed to locate GRES %s", __func__,
			      gres_name);
			rc = ESLURM_INVALID_GRES;
		}
	}
	xfree(gres_name);

	if (rc == SLURM_SUCCESS) {
		/* Ownership of the type string passes to the caller */
		*cnt = parsed_cnt;
		*type_ptr = gres_type;
		return rc;
	}

	/* Error: stop the parse and hand nothing back */
	*save_ptr = NULL;
	if ((rc == ESLURM_INVALID_TRES) && running_in_slurmctld()) {
		info("%s: Invalid GRES job specification %s", __func__,
		     in_val);
	}
	xfree(gres_type);
	*type_ptr = NULL;

	return rc;
}

/*
 * TRES specification parse logic
 * in_val IN - initial input string
 * cnt OUT - count of values
 * gres_list IN/OUT - where to search for (or add) new job TRES record
 * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
 * rc OUT - unchanged or an error code
 * RET gres - job record to set value in, found or created by this function,
 *	or NULL at the end of the input or on error
 *
 * NOTE: The parse position is kept in a function-local static. A call whose
 * *save_ptr is neither NULL nor the value stored by the previous call is
 * rejected with SLURM_ERROR ("parsing error").
 */
static gres_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt,
					list_t *gres_list, char **save_ptr,
					int *rc)
{
	/* Position reached by the previous call, shared by all callers */
	static char *prev_save_ptr = NULL;
	int context_inx = NO_VAL, my_rc = SLURM_SUCCESS;
	gres_job_state_t *gres_js = NULL;
	gres_state_t *gres_state_job = NULL;
	gres_key_t job_search_key;
	char *type = NULL, *name = NULL;

	xassert(save_ptr);
	if (!in_val && (*save_ptr == NULL)) {
		return NULL;
	}

	if (*save_ptr == NULL) {
		/* First call for this string: start at its beginning */
		prev_save_ptr = in_val;
	} else if (*save_ptr != prev_save_ptr) {
		error("%s: parsing error", __func__);
		my_rc = SLURM_ERROR;
		goto fini;
	}

	if (prev_save_ptr[0] == '\0') {	/* Empty input token */
		*save_ptr = NULL;
		return NULL;
	}

	/*
	 * context_inx is still NO_VAL if the parser returned no GRES name;
	 * that ends the parse without reporting an error through rc.
	 */
	if ((my_rc = _get_next_gres(in_val, &type, &context_inx,
				    cnt, &prev_save_ptr)) ||
	    (context_inx == NO_VAL)) {
		prev_save_ptr = NULL;
		goto fini;
	}

	/* Find the job GRES record */
	job_search_key.config_flags = gres_context[context_inx].config_flags;
	job_search_key.plugin_id = gres_context[context_inx].plugin_id;
	job_search_key.type_id = gres_build_id(type);
	gres_state_job = list_find_first(gres_list, gres_find_job_by_key,
					 &job_search_key);

	if (gres_state_job) {
		gres_js = gres_state_job->gres_data;
	} else {
		/* No record for this GRES/type yet: create and append one */
		gres_js = xmalloc(sizeof(gres_job_state_t));
		gres_js->type_id = job_search_key.type_id;
		gres_js->type_name = type;
		type = NULL;	/* String moved above */

		gres_state_job = gres_create_state(
			&gres_context[context_inx], GRES_STATE_SRC_CONTEXT_PTR,
			GRES_STATE_TYPE_JOB, gres_js);
		list_append(gres_list, gres_state_job);
	}

	/* Common exit: release temporaries and publish position and status */
fini:	xfree(name);
	xfree(type);
	if (my_rc != SLURM_SUCCESS) {
		prev_save_ptr = NULL;
		if ((my_rc == ESLURM_INVALID_GRES) && running_in_slurmctld()) {
			info("%s: Invalid GRES job specification %s", __func__,
			     in_val);
		}
		*rc = my_rc;
	}
	*save_ptr = prev_save_ptr;
	return gres_state_job;
}

/*
 * Test whether a job or step GRES specification carries no GRES count.
 * IN gres_data - gres_job_state_t if is_job is true, else gres_step_state_t
 * IN is_job - selects how gres_data is interpreted
 * RET true if none of the per job/step, per node, per socket or per task
 *     counts is set (e.g. only cpus_per_gres or mem_per_gres was given),
 *     false if any of those counts is set
 */
static bool _generic_state(void *gres_data, bool is_job)
{
	bool has_count;

	if (is_job) {
		gres_job_state_t *gres_js = gres_data;

		has_count = (gres_js->gres_per_job ||
			     gres_js->gres_per_node ||
			     gres_js->gres_per_socket ||
			     gres_js->gres_per_task);
	} else {
		gres_step_state_t *gres_ss = gres_data;

		has_count = (gres_ss->gres_per_step ||
			     gres_ss->gres_per_node ||
			     gres_ss->gres_per_socket ||
			     gres_ss->gres_per_task);
	}

	return !has_count;
}

/*
 * Record one GRES record in job_validate->over_array, which holds one entry
 * per plugin_id noting whether that plugin has been seen with and/or without
 * a type name.
 * IN gres_state - job or step GRES record (per job_validate->is_job)
 * IN/OUT job_validate - over_array/over_count are updated; overlap_merge is
 *	set once a plugin already in the array is seen both with and without a
 *	type
 */
static void _set_over_array(gres_state_t *gres_state,
			   job_validate_t *job_validate)
{
	overlap_check_t *entry = NULL;
	char *type_name;
	bool new_entry;
	int inx = 0;

	if (job_validate->is_job) {
		gres_job_state_t *gres_js = gres_state->gres_data;

		type_name = gres_js->type_name;
	} else {
		gres_step_state_t *gres_ss = gres_state->gres_data;

		type_name = gres_ss->type_name;
	}

	xassert(job_validate->over_array);

	/* Locate the entry for this plugin, if one was already recorded */
	while ((inx < job_validate->over_count) &&
	       (job_validate->over_array[inx].plugin_id !=
		gres_state->plugin_id))
		inx++;

	/*
	 * When nothing matched, inx equals over_count and references the
	 * first unused slot.
	 */
	entry = &job_validate->over_array[inx];
	xassert(entry);

	new_entry = (inx >= job_validate->over_count);
	if (new_entry) {
		job_validate->over_count++;
		entry->plugin_id = gres_state->plugin_id;
	}

	if (type_name) {
		entry->with_type = true;
		if (!new_entry && entry->without_type)
			job_validate->overlap_merge = true;
	} else {
		entry->without_type = true;
		entry->without_type_state = gres_state->gres_data;
		if (!new_entry && entry->with_type)
			job_validate->overlap_merge = true;
	}
}

/*
 * List callback used with list_delete_all() by _merge_generic_data().
 * Copies cpus_per_gres and mem_per_gres from the generic (type-less) record
 * into every other record of the same plugin that has no value of its own.
 * IN x - gres_state_t list entry
 * IN arg - merge_generic_t describing the generic record
 * RET 1 for the generic record itself (so the list deletes it), else 0
 */
static int _foreach_merge_generic_data(void *x, void *arg)
{
	merge_generic_t *merge_generic = arg;
	gres_state_t *gres_state = x;

	/* Records of other plugins are left alone */
	if (gres_state->plugin_id != merge_generic->plugin_id)
		return 0;

	/* This is the generic record being merged away */
	if (gres_state->gres_data == merge_generic->generic_gres_data)
		return 1;

	if (!merge_generic->is_job) {
		gres_step_state_t *src = merge_generic->generic_gres_data;
		gres_step_state_t *dst = gres_state->gres_data;

		if (!dst->cpus_per_gres)
			dst->cpus_per_gres = src->cpus_per_gres;
		if (!dst->mem_per_gres)
			dst->mem_per_gres = src->mem_per_gres;
		return 0;
	}

	{
		gres_job_state_t *src = merge_generic->generic_gres_data;
		gres_job_state_t *dst = gres_state->gres_data;

		if (!dst->cpus_per_gres)
			dst->cpus_per_gres = src->cpus_per_gres;
		if (!dst->mem_per_gres)
			dst->mem_per_gres = src->mem_per_gres;
	}

	return 0;
}

/*
 * For every plugin recorded both with and without a type, push the type-less
 * record's generic settings (*_per_gres) onto the typed records and remove
 * the type-less record from the list.
 * IN/OUT gres_list - job or step GRES list
 * IN job_validate - overlap information built by _set_over_array()
 * RET SLURM_SUCCESS, or ESLURM_INVALID_GRES_TYPE if a type-less record holds
 *	more than generic settings (processing stops at that record)
 */
static int _merge_generic_data(
	list_t *gres_list, job_validate_t *job_validate)
{
	merge_generic_t merge_generic = {
		.is_job = job_validate->is_job,
	};

	for (int i = 0; i < job_validate->over_count; i++) {
		overlap_check_t *entry = &job_validate->over_array[i];

		/* Nothing to merge unless both flavors were seen */
		if (!entry->with_type || !entry->without_type_state)
			continue;

		if (!_generic_state(entry->without_type_state,
				    job_validate->is_job))
			return ESLURM_INVALID_GRES_TYPE;

		/* Propagate generic parameters */
		merge_generic.plugin_id = entry->plugin_id;
		merge_generic.generic_gres_data = entry->without_type_state;

		(void) list_delete_all(gres_list,
				       _foreach_merge_generic_data,
				       &merge_generic);
	}

	return SLURM_SUCCESS;
}

static int _foreach_set_over_array(void *x, void *arg)
{
	_set_over_array(x, arg);

	return 0;
}

/*
 * List callback validating one job GRES record.
 * Checks the record's counts with _test_gres_cnt(), rejects a request that
 * mixes sharing and shared GRES, accumulates the CPUs implied by
 * cpus_per_gres for GPU records and records the entry in the overlap array.
 * IN x - job gres_state_t list entry
 * IN/OUT arg - job_validate_t; rc is set to ESLURM_INVALID_GRES on failure
 * RET 0 to continue, -1 on failure
 */
static int _foreach_job_state_validate(void *x, void *arg)
{
	job_validate_t *job_validate = arg;
	gres_state_t *gres_state_job = x;
	gres_job_state_t *gres_js = gres_state_job->gres_data;

	if (_test_gres_cnt(gres_state_job, job_validate->gres_js_val) != 0)
		goto invalid;

	if (!job_validate->have_gres_sharing) {
		if (gres_id_sharing(gres_state_job->plugin_id))
			job_validate->have_gres_sharing = true;
	}
	if (gres_id_shared(gres_state_job->config_flags))
		job_validate->have_gres_shared = true;

	/* Sharing and shared GRES may not appear in the same request */
	if (job_validate->have_gres_sharing && job_validate->have_gres_shared)
		goto invalid;

	if (job_validate->cpus_per_gres &&
	    (gres_state_job->plugin_id == gres_get_gpu_plugin_id()))
		job_validate->tmp_min_cpus +=
			job_validate->cpus_per_gres * gres_js->total_gres;

	_set_over_array(gres_state_job, job_validate);

	return 0;

invalid:
	job_validate->rc = ESLURM_INVALID_GRES;
	return -1;
}

/*
 * Parse a job's TRES/GRES request strings and build its job GRES list.
 *
 * IN/OUT gres_js_val - request description:
 *	- cpus_per_tres, mem_per_tres, tres_freq, tres_per_job, tres_per_node,
 *	  tres_per_socket and tres_per_task are the input strings
 *	- *gres_list must be NULL on entry; it is set to the list built here
 *	  and freed again (reset to NULL) if the request yields no record
 *	- *num_tasks and *min_cpus may be filled in or raised as a side effect
 * RET SLURM_SUCCESS or a Slurm error code
 *
 * NOTE: When an error is returned after the list was created, the list is
 *	 not freed here.
 * NOTE: An error recorded while validating the individual records is never
 *	 replaced by the result of the later generic-data merge.
 */
extern int gres_job_state_validate(gres_job_state_validate_t *gres_js_val)
{
	int rc = SLURM_SUCCESS, size;
	bool requested_gpu = false;
	gres_state_t *gres_state_job;
	gres_job_state_t *gres_js;
	uint64_t cnt = 0;
	char *cpus_per_tres;
	char *mem_per_tres;
	char *tres_freq;
	char *tres_per_job;
	char *tres_per_node;
	char *tres_per_socket;
	char *tres_per_task;
	job_validate_t job_validate = {
		.gres_js_val = gres_js_val,
		.is_job = true,
		.rc = SLURM_SUCCESS,
	};

	xassert(gres_js_val);
	xassert(gres_js_val->gres_list);
	xassert(!*gres_js_val->gres_list);

	cpus_per_tres = gres_js_val->cpus_per_tres;
	mem_per_tres = gres_js_val->mem_per_tres;
	tres_freq = gres_js_val->tres_freq;
	tres_per_job = gres_js_val->tres_per_job;
	tres_per_node = gres_js_val->tres_per_node;
	tres_per_socket = gres_js_val->tres_per_socket;
	tres_per_task = gres_js_val->tres_per_task;

	if (tres_per_task && running_in_slurmctld() && !running_cons_tres()) {
		char *tmp = xstrdup(tres_per_task);
		/*
		 * Check if cpus_per_task is the only part of tres_per_task. If
		 * so, continue with validation. If not, then the request is
		 * invalid: reject the request.
		 */
		slurm_option_update_tres_per_task(0, "cpu", &tmp);
		if (tmp) {
			xfree(tmp);
			return ESLURM_UNSUPPORTED_GRES;
		}
	}

	if (running_in_slurmctld() && !running_cons_tres() &&
	    (cpus_per_tres || tres_per_job || tres_per_socket || mem_per_tres))
		return ESLURM_UNSUPPORTED_GRES;

	if (!cpus_per_tres && !tres_per_job && !tres_per_node &&
	    !tres_per_socket && !tres_per_task && !mem_per_tres &&
	    !gres_js_val->ntasks_per_tres)
		return SLURM_SUCCESS;

	if ((tres_per_task || (*gres_js_val->ntasks_per_tres != NO_VAL16)) &&
	    (*gres_js_val->num_tasks == NO_VAL) &&
	    (*gres_js_val->min_nodes != NO_VAL) &&
	    (*gres_js_val->min_nodes == *gres_js_val->max_nodes)) {
		/* Implicitly set task count */
		if (*gres_js_val->ntasks_per_tres != NO_VAL16)
			*gres_js_val->num_tasks = *gres_js_val->min_nodes *
				*gres_js_val->ntasks_per_tres;
		else if (*gres_js_val->ntasks_per_node != NO_VAL16)
			*gres_js_val->num_tasks = *gres_js_val->min_nodes *
				*gres_js_val->ntasks_per_node;
		else if (*gres_js_val->cpus_per_task == NO_VAL16)
			*gres_js_val->num_tasks = *gres_js_val->min_nodes;
	}

	xassert(gres_context_cnt >= 0);

	/*
	 * Set new values as requested
	 */
	*gres_js_val->gres_list = list_create(gres_job_list_delete);

	slurm_mutex_lock(&gres_context_lock);
	if (cpus_per_tres) {
		char *in_val = cpus_per_tres, *save_ptr = NULL;
		while ((gres_state_job = _get_next_job_gres(in_val, &cnt,
							    *gres_js_val->
							    gres_list,
							    &save_ptr, &rc))) {
			gres_js = gres_state_job->gres_data;
			gres_js->cpus_per_gres = cnt;
			in_val = NULL;
			gres_js->ntasks_per_gres =
				*gres_js_val->ntasks_per_tres;

			/*
			 * In theory MAX(cpus_per_gres) shouldn't matter because
			 * we should only allow one gres name to have
			 * cpus_per_gres and it should be the same for all types
			 * (e.g., gpu:k80 vs gpu:tesla) of that same gres (gpu)
			 */
			job_validate.cpus_per_gres =
				MAX(job_validate.cpus_per_gres, cnt);
		}
	}
	if (tres_per_job) {
		char *in_val = tres_per_job, *save_ptr = NULL;
		while ((gres_state_job = _get_next_job_gres(in_val, &cnt,
							    *gres_js_val->
							    gres_list,
							    &save_ptr, &rc))) {
			if (!requested_gpu &&
			    (!xstrcmp(gres_state_job->gres_name, "gpu")))
				requested_gpu = true;
			gres_js = gres_state_job->gres_data;
			gres_js->gres_per_job = cnt;
			in_val = NULL;
			gres_js->total_gres =
				MAX(gres_js->total_gres, cnt);
			gres_js->ntasks_per_gres =
				*gres_js_val->ntasks_per_tres;
		}
	}
	if (tres_per_node) {
		char *in_val = tres_per_node, *save_ptr = NULL;
		while ((gres_state_job = _get_next_job_gres(in_val, &cnt,
							    *gres_js_val->
							    gres_list,
							    &save_ptr, &rc))) {
			if (!requested_gpu &&
			    (!xstrcmp(gres_state_job->gres_name, "gpu")))
				requested_gpu = true;
			gres_js = gres_state_job->gres_data;
			gres_js->gres_per_node = cnt;
			in_val = NULL;
			/* Scale to a job total when the node count is known */
			if (*gres_js_val->min_nodes != NO_VAL)
				cnt *= *gres_js_val->min_nodes;
			gres_js->total_gres =
				MAX(gres_js->total_gres, cnt);
			gres_js->ntasks_per_gres =
				*gres_js_val->ntasks_per_tres;
		}
	}
	if (tres_per_socket) {
		char *in_val = tres_per_socket, *save_ptr = NULL;
		while ((gres_state_job = _get_next_job_gres(in_val, &cnt,
							    *gres_js_val->
							    gres_list,
							    &save_ptr, &rc))) {
			if (!requested_gpu &&
			    (!xstrcmp(gres_state_job->gres_name, "gpu")))
				requested_gpu = true;
			gres_js = gres_state_job->gres_data;
			gres_js->gres_per_socket = cnt;
			in_val = NULL;
			if ((*gres_js_val->min_nodes != NO_VAL) &&
			    (*gres_js_val->sockets_per_node != NO_VAL16)) {
				cnt *= (*gres_js_val->min_nodes *
					*gres_js_val->sockets_per_node);
			} else if ((*gres_js_val->num_tasks != NO_VAL) &&
				   (*gres_js_val->ntasks_per_socket !=
				    NO_VAL16)) {
				cnt *= ROUNDUP(*gres_js_val->num_tasks,
					       *gres_js_val->ntasks_per_socket);
			} else if (*gres_js_val->sockets_per_node != NO_VAL16) {
				/* default 1 node */
				cnt *= *gres_js_val->sockets_per_node;
			}
			gres_js->total_gres =
				MAX(gres_js->total_gres, cnt);
			gres_js->ntasks_per_gres =
				*gres_js_val->ntasks_per_tres;
		}
	}
	if (tres_per_task) {
		char *in_val = tres_per_task, *save_ptr = NULL;
		while ((gres_state_job = _get_next_job_gres(in_val, &cnt,
							    *gres_js_val->
							    gres_list,
							    &save_ptr, &rc))) {
			if (!requested_gpu &&
			    (!xstrcmp(gres_state_job->gres_name, "gpu")))
				requested_gpu = true;
			gres_js = gres_state_job->gres_data;
			gres_js->gres_per_task = cnt;
			in_val = NULL;
			/* Scale to a job total when the task count is known */
			if (*gres_js_val->num_tasks != NO_VAL)
				cnt *= *gres_js_val->num_tasks;
			gres_js->total_gres =
				MAX(gres_js->total_gres, cnt);
			gres_js->ntasks_per_gres =
				*gres_js_val->ntasks_per_tres;
		}
	}
	if (mem_per_tres) {
		char *in_val = mem_per_tres, *save_ptr = NULL;
		while ((gres_state_job = _get_next_job_gres(in_val, &cnt,
							    *gres_js_val->
							    gres_list,
							    &save_ptr, &rc))) {
			gres_js = gres_state_job->gres_data;
			gres_js->mem_per_gres = cnt;
			in_val = NULL;
			gres_js->ntasks_per_gres =
				*gres_js_val->ntasks_per_tres;
		}
	}

	/*
	 * *gres_js_val->num_tasks and *gres_js_val->ntasks_per_tres could be 0
	 * on requeue
	 */
	if (!gres_js_val->ntasks_per_tres ||
	    !*gres_js_val->ntasks_per_tres ||
	    (*gres_js_val->ntasks_per_tres == NO_VAL16)) {
		/* do nothing */
	} else if (requested_gpu && list_count(*gres_js_val->gres_list)) {
		/* Set num_tasks = gpus * ntasks/gpu */
		uint64_t gpus = _get_job_gres_list_cnt(
			*gres_js_val->gres_list, "gpu", NULL);
		if (gpus != NO_VAL64)
			*gres_js_val->num_tasks =
				gpus * *gres_js_val->ntasks_per_tres;
		else {
			error("%s: Can't set num_tasks = gpus * *ntasks_per_tres because there are no allocated GPUs",
			      __func__);
			rc = ESLURM_INVALID_GRES;
		}
	} else if (*gres_js_val->num_tasks &&
		   (*gres_js_val->num_tasks != NO_VAL)) {
		/*
		 * If job_gres_list empty, and ntasks_per_tres is specified,
		 * then derive GPUs according to how many tasks there are.
		 * GPU GRES = [ntasks / (ntasks_per_tres)]
		 * For now, only generate type-less GPUs.
		 */
		uint32_t gpus = *gres_js_val->num_tasks /
			*gres_js_val->ntasks_per_tres;
		char *save_ptr = NULL, *gres = NULL, *in_val;
		xstrfmtcat(gres, "gres/gpu:%u", gpus);
		in_val = gres;
		while ((gres_state_job = _get_next_job_gres(in_val, &cnt,
							    *gres_js_val->
							    gres_list,
							    &save_ptr, &rc))) {
			gres_js = gres_state_job->gres_data;
			gres_js->ntasks_per_gres =
				*gres_js_val->ntasks_per_tres;
			/* Simulate a tres_per_job specification */
			gres_js->gres_per_job = cnt;
			gres_js->total_gres =
				MAX(gres_js->total_gres, cnt);
			in_val = NULL;
		}
		if (list_count(*gres_js_val->gres_list) == 0)
			error("%s: Failed to add generated GRES %s (via ntasks_per_tres) to gres_list",
			      __func__, gres);
		else
			requested_gpu = true;
		xfree(gres);
	} else {
		error("%s: --ntasks-per-tres needs either a GRES GPU specification or a node/ntask specification",
		      __func__);
		rc = ESLURM_INVALID_GRES;
	}

	slurm_mutex_unlock(&gres_context_lock);

	if (rc != SLURM_SUCCESS)
		return rc;
	size = list_count(*gres_js_val->gres_list);
	if (size == 0) {
		FREE_NULL_LIST(*gres_js_val->gres_list);
		return rc;
	}

	/*
	 * If someone requested [mem|cpus]_per_tres but didn't request any GPUs
	 * (even if --exclusive was used), then error. For now we only test for
	 * GPUs since --[mem|cpus]-per-gpu are the only allowed
	 * [mem|cpus]_per_gres options. Even though --exclusive means that you
	 * will be allocated all of the GRES on the node, we still require that
	 * GPUs are explicitly requested when --[mem|cpus]-per-gpu is used.
	 */
	if (mem_per_tres && (!requested_gpu)) {
		error("Requested mem_per_tres=%s but did not request any GPU.",
		      mem_per_tres);
		return ESLURM_INVALID_GRES;
	}
	if (cpus_per_tres && (!requested_gpu)) {
		error("Requested cpus_per_tres=%s but did not request any GPU.",
		      cpus_per_tres);
		return ESLURM_INVALID_GRES;
	}

	/*
	 * Check for record overlap (e.g. "gpu:2,gpu:tesla:1")
	 * Ensure tres_per_job >= tres_per_node >= tres_per_socket
	 */
	/* One slot per list record is enough: each record adds at most one */
	job_validate.over_array = xcalloc(size, sizeof(overlap_check_t));

	(void) list_for_each(*gres_js_val->gres_list,
			     _foreach_job_state_validate,
			     &job_validate);

	if (job_validate.tmp_min_cpus > *gres_js_val->min_cpus)
		*gres_js_val->min_cpus = job_validate.tmp_min_cpus;

	if (((*gres_js_val->cpus_per_task) != NO_VAL16) &&
	    ((*gres_js_val->num_tasks) != NO_VAL)) {
		cnt = (*gres_js_val->cpus_per_task) * (*gres_js_val->num_tasks);
		if (*gres_js_val->min_cpus < cnt)
			*gres_js_val->min_cpus = cnt;
	}

	if (job_validate.have_gres_shared &&
	    (job_validate.rc == SLURM_SUCCESS) &&
	    tres_freq &&
	    strstr(tres_freq, "gpu")) {
		job_validate.rc = ESLURM_INVALID_GRES;
	}

	/*
	 * Merge generic data if possible. Only attempted while the request is
	 * still valid so that an error found above is not overwritten by the
	 * result of the merge.
	 */
	if (job_validate.overlap_merge &&
	    (job_validate.rc == SLURM_SUCCESS))
		job_validate.rc = _merge_generic_data(*gres_js_val->gres_list,
						      &job_validate);

	xfree(job_validate.over_array);

	return job_validate.rc;
}

/*
 * List find callback: match a job GRES record that has a per-job, per-socket
 * or per-task count set.
 * IN x - job gres_state_t list entry
 * IN arg - unused
 * RET 1 on match, otherwise 0
 */
static int _find_gres_per_jst(void *x, void *arg)
{
	gres_state_t *gres_state_job = x;
	gres_job_state_t *job_state = gres_state_job->gres_data;
	bool match = (job_state->gres_per_job ||
		      job_state->gres_per_socket ||
		      job_state->gres_per_task);

	return match ? 1 : 0;
}

/*
 * Determine if a job's specified GRES can be supported. This is designed to
 * prevent the running of a job using the GRES options only supported by the
 * select/cons_tres plugin when switching (on slurmctld restart) from the
 * cons_tres plugin to any other select plugin.
 *
 * IN gres_list - List of GRES records for this job to track usage
 * RET SLURM_SUCCESS, or ESLURM_UNSUPPORTED_GRES if cons_tres is not running
 *	and any record has a per-job, per-socket or per-task count
 */
extern int gres_job_revalidate(list_t *gres_list)
{
	if (gres_list && !running_cons_tres() &&
	    list_find_first(gres_list, _find_gres_per_jst, NULL))
		return ESLURM_UNSUPPORTED_GRES;

	return SLURM_SUCCESS;
}

/*
 * List find callback: match a job GRES record with at least one populated
 * gres_bit_alloc element among its node_cnt nodes.
 * This indicates the allocated GRES has a File configuration parameter and is
 * tracking individual file assignments.
 * IN x - job gres_state_t list entry
 * IN arg - unused
 * RET 1 on match, otherwise 0
 */
static int _find_job_has_gres_bits(void *x, void *arg)
{
	gres_state_t *gres_state_job = x;
	gres_job_state_t *job_state = gres_state_job->gres_data;

	if (!job_state->gres_bit_alloc)
		return 0;

	for (int i = 0; i < job_state->node_cnt; i++) {
		if (job_state->gres_bit_alloc[i])
			return 1;
	}

	return 0;
}


static int _find_invalid_job_gres_on_node(void *x, void *arg)
{
	gres_state_t *gres_state_job = x;
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	validate_job_gres_cnt_t *validate_job_gres_cnt = arg;
	gres_state_t *gres_state_node;
	uint32_t plugin_id;
	int job_gres_cnt, node_gres_cnt = 0;

	if (!gres_js ||
	    !gres_js->gres_bit_alloc ||
	    (gres_js->node_cnt <= validate_job_gres_cnt->node_inx) ||
	    !gres_js->gres_bit_alloc[validate_job_gres_cnt->node_inx])
		return 0;

	job_gres_cnt = bit_size(
		gres_js->gres_bit_alloc[validate_job_gres_cnt->node_inx]);

	if (gres_id_shared(gres_state_job->config_flags))
		plugin_id = gpu_plugin_id;
	else
		plugin_id = gres_state_job->plugin_id;

	if ((gres_state_node = list_find_first(validate_job_gres_cnt->
					       node_gres_list,
					       gres_find_id,
					       &plugin_id))) {
		gres_node_state_t *gres_ns = gres_state_node->gres_data;
		node_gres_cnt = (int) gres_ns->gres_cnt_config;
		if (gres_js->type_id) {
			bool found_type = false;
			gres_node_state_t *gres_ns = gres_state_node->gres_data;

			for (int i = 0; i < gres_ns->type_cnt; i++) {
				if (gres_ns->type_id[i] == gres_js->type_id) {
					found_type = true;
					break;
				}
			}
			if (!found_type) {
				error("%s: Killing job %u: gres/%s type %s not found on node %s",
				      __func__,
				      validate_job_gres_cnt->job_id,
				      gres_state_job->gres_name,
				      gres_js->type_name,
				      validate_job_gres_cnt->node_name);
				return 1;
			}
		}
	}

	if (job_gres_cnt != node_gres_cnt) {
		error("%s: Killing job %u: gres/%s count mismatch on node "
		      "%s (%d != %d)",
		      __func__, validate_job_gres_cnt->job_id,
		      gres_state_job->gres_name,
		      validate_job_gres_cnt-> node_name,
		      job_gres_cnt, node_gres_cnt);
		return 1;
	}

	return 0;
}

/*
 * Determine if a job's specified GRES are currently valid. This is designed to
 * manage jobs allocated GRES which are either no longer supported or a GRES
 * configured with the "File" option in gres.conf where the count has changed,
 * in which case we don't know how to map the job's old GRES bitmap onto the
 * current GRES bitmaps.
 *
 * IN job_id - ID of job being validated (used for logging)
 * IN job_gres_list - List of GRES records for this job to track usage
 * IN node_bitmap - nodes to test; the Nth node found in the bitmap is matched
 *	against index N of the job's per-node GRES arrays
 * RET SLURM_SUCCESS or ESLURM_INVALID_GRES
 */
extern int gres_job_revalidate2(uint32_t job_id, list_t *job_gres_list,
				bitstr_t *node_bitmap)
{
	validate_job_gres_cnt_t args = {
		.job_id = job_id,
		.node_inx = -1,
	};
	node_record_t *node_ptr;
	int i = 0;

	if (!job_gres_list || !node_bitmap)
		return SLURM_SUCCESS;

	/* Only jobs tracking individual GRES bitmaps need to be checked */
	if (!list_find_first(job_gres_list, _find_job_has_gres_bits, NULL))
		return SLURM_SUCCESS;

	while ((node_ptr = next_node_bitmap(node_bitmap, &i))) {
		/* If no node_ptr->gres_list we are invalid */
		if (!node_ptr->gres_list)
			return ESLURM_INVALID_GRES;

		args.node_inx++;
		args.node_gres_list = node_ptr->gres_list;
		args.node_name = node_ptr->name;

		if (list_find_first(job_gres_list,
				    _find_invalid_job_gres_on_node,
				    &args))
			return ESLURM_INVALID_GRES;

		i++;
	}

	return SLURM_SUCCESS;
}

/*
 * Find a sock_gres_t record in a list by matching the plugin_id and type_id
 *	from a gres_state_t job record
 * IN x - a sock_gres_t record to test
 * IN key - the gres_state_t record (from a job) we want to match
 * RET 1 on match, otherwise 0
 */
extern int gres_find_sock_by_job_state(void *x, void *key)
{
	sock_gres_t *sock_data = (sock_gres_t *) x;
	gres_state_t *job_gres_state = (gres_state_t *) key;
	gres_job_state_t *sock_gres_js, *gres_js;

	gres_js = (gres_job_state_t *) job_gres_state->gres_data;
	sock_gres_js = sock_data->gres_state_job->gres_data;

	if ((sock_data->gres_state_job->plugin_id ==
	     job_gres_state->plugin_id) &&
	    (sock_gres_js->type_id == gres_js->type_id))
		return 1;
	return 0;
}

/*
 * Create a copy of a job's gres state list covering all of the job's nodes
 * IN gres_list - List of Gres records for this job to track usage
 * RET The copy, or NULL if gres_list is NULL or nothing could be copied
 */
extern list_t *gres_job_state_list_dup(list_t *gres_list)
{
	/* Node index -1 makes gres_job_state_extract() copy every node */
	const int all_nodes = -1;

	return gres_job_state_extract(gres_list, all_nodes);
}

/*
 * Allocate a gres_job_state_t and copy the scalar fields plus type_name from
 * gres_js. The per-node arrays and bitmaps are not copied here.
 * IN gres_js - record to copy from (must not be NULL)
 * RET newly allocated record
 */
static gres_job_state_t *_job_state_dup_common(gres_job_state_t *gres_js)
{
	gres_job_state_t *copy = xmalloc(sizeof(gres_job_state_t));

	/* Type identity */
	copy->type_id = gres_js->type_id;
	copy->type_name = xstrdup(gres_js->type_name);
	copy->flags = gres_js->flags;

	/* Requested counts */
	copy->gres_per_job = gres_js->gres_per_job;
	copy->gres_per_node = gres_js->gres_per_node;
	copy->gres_per_socket = gres_js->gres_per_socket;
	copy->gres_per_task = gres_js->gres_per_task;
	copy->total_gres = gres_js->total_gres;

	/* Per-GRES settings and their defaults */
	copy->cpus_per_gres = gres_js->cpus_per_gres;
	copy->def_cpus_per_gres = gres_js->def_cpus_per_gres;
	copy->mem_per_gres = gres_js->mem_per_gres;
	copy->def_mem_per_gres = gres_js->def_mem_per_gres;
	copy->ntasks_per_gres = gres_js->ntasks_per_gres;

	/* Array dimensions */
	copy->node_cnt = gres_js->node_cnt;
	copy->total_node_cnt = gres_js->total_node_cnt;
	copy->res_array_size = gres_js->res_array_size;

	return copy;
}

/*
 * Copy gres_job_state_t record for ALL nodes
 * IN gres_js - record to copy, may be NULL
 * RET newly allocated deep copy (scalar fields, per-node counters, bitmaps
 *	and per-bit counter arrays), or NULL if gres_js is NULL
 *
 * NOTE: Per-bit counter arrays hold one uint64_t per bit of the matching
 *	 bitmap, so they are copied as bit_cnt * sizeof(uint64_t) bytes. Nodes
 *	 lacking either the bitmap or the counter array are left NULL in the
 *	 copy.
 */
extern void *gres_job_state_dup(gres_job_state_t *gres_js)
{

	int i;
	gres_job_state_t *new_gres_js;

	if (gres_js == NULL)
		return NULL;

	new_gres_js = _job_state_dup_common(gres_js);

	if (gres_js->gres_cnt_node_alloc) {
		i = sizeof(uint64_t) * gres_js->node_cnt;
		new_gres_js->gres_cnt_node_alloc = xmalloc(i);
		memcpy(new_gres_js->gres_cnt_node_alloc,
		       gres_js->gres_cnt_node_alloc, i);
	}
	if (gres_js->gres_cnt_step_alloc) {
		new_gres_js->gres_cnt_step_alloc = xcalloc(
			gres_js->node_cnt,
			sizeof(*new_gres_js->gres_cnt_step_alloc));
		memcpy(new_gres_js->gres_cnt_step_alloc,
		       gres_js->gres_cnt_step_alloc,
		       (sizeof(*new_gres_js->gres_cnt_step_alloc) *
			gres_js->node_cnt));
	}
	if (gres_js->gres_bit_alloc) {
		new_gres_js->gres_bit_alloc = xcalloc(gres_js->node_cnt,
						      sizeof(bitstr_t *));
		for (i = 0; i < gres_js->node_cnt; i++) {
			if (gres_js->gres_bit_alloc[i] == NULL)
				continue;
			new_gres_js->gres_bit_alloc[i] =
				bit_copy(gres_js->gres_bit_alloc[i]);
		}
	}
	if (gres_js->gres_per_bit_alloc && gres_js->gres_bit_alloc) {
		new_gres_js->gres_per_bit_alloc = xcalloc(gres_js->node_cnt,
							  sizeof(uint64_t *));
		for (i = 0; i < gres_js->node_cnt; i++) {
			int bit_cnt;

			/* Skip nodes with no bitmap or no per-bit counts */
			if (!gres_js->gres_bit_alloc[i] ||
			    !gres_js->gres_per_bit_alloc[i])
				continue;

			bit_cnt = bit_size(gres_js->gres_bit_alloc[i]);
			new_gres_js->gres_per_bit_alloc[i] = xcalloc(
				bit_cnt, sizeof(uint64_t));
			memcpy(new_gres_js->gres_per_bit_alloc[i],
			       gres_js->gres_per_bit_alloc[i],
			       bit_cnt * sizeof(uint64_t));
		}
	}
	if (gres_js->gres_bit_step_alloc) {
		new_gres_js->gres_bit_step_alloc = xcalloc(gres_js->node_cnt,
							   sizeof(bitstr_t *));
		for (i = 0; i < gres_js->node_cnt; i++) {
			if (!gres_js->gres_bit_step_alloc[i])
				continue;
			new_gres_js->gres_bit_step_alloc[i] =
				bit_copy(gres_js->gres_bit_step_alloc[i]);
		}
	}
	/* Step per-bit counts are sized by the job's allocation bitmap */
	if (gres_js->gres_per_bit_step_alloc && gres_js->gres_bit_alloc) {
		new_gres_js->gres_per_bit_step_alloc = xcalloc(
			gres_js->node_cnt, sizeof(uint64_t *));
		for (i = 0; i < gres_js->node_cnt; i++) {
			int bit_cnt;

			/* Skip nodes with no bitmap or no per-bit counts */
			if (!gres_js->gres_bit_alloc[i] ||
			    !gres_js->gres_per_bit_step_alloc[i])
				continue;

			bit_cnt = bit_size(gres_js->gres_bit_alloc[i]);
			new_gres_js->gres_per_bit_step_alloc[i] = xcalloc(
				bit_cnt, sizeof(uint64_t));
			memcpy(new_gres_js->gres_per_bit_step_alloc[i],
			       gres_js->gres_per_bit_step_alloc[i],
			       bit_cnt * sizeof(uint64_t));
		}
	}
	if (gres_js->gres_cnt_node_select) {
		i = sizeof(uint64_t) * gres_js->total_node_cnt;
		new_gres_js->gres_cnt_node_select = xmalloc(i);
		memcpy(new_gres_js->gres_cnt_node_select,
		       gres_js->gres_cnt_node_select, i);
	}
	if (gres_js->gres_bit_select) {
		new_gres_js->gres_bit_select = xcalloc(gres_js->total_node_cnt,
						       sizeof(bitstr_t *));
		for (i = 0; i < gres_js->total_node_cnt; i++) {
			if (gres_js->gres_bit_select[i] == NULL)
				continue;
			new_gres_js->gres_bit_select[i] =
				bit_copy(gres_js->gres_bit_select[i]);
		}
	}
	if (gres_js->gres_per_bit_select && gres_js->gres_bit_select) {
		new_gres_js->gres_per_bit_select =
			xcalloc(gres_js->total_node_cnt, sizeof(uint64_t *));
		for (i = 0; i < gres_js->total_node_cnt; i++) {
			int bit_cnt;

			/* Skip nodes with no bitmap or no per-bit counts */
			if (!gres_js->gres_bit_select[i] ||
			    !gres_js->gres_per_bit_select[i])
				continue;

			bit_cnt = bit_size(gres_js->gres_bit_select[i]);
			new_gres_js->gres_per_bit_select[i] = xcalloc(
				bit_cnt, sizeof(uint64_t));
			memcpy(new_gres_js->gres_per_bit_select[i],
			       gres_js->gres_per_bit_select[i],
			       bit_cnt * sizeof(uint64_t));
		}
	}

	if (gres_js->res_gpu_cores) {
		new_gres_js->res_gpu_cores = xcalloc(gres_js->res_array_size,
						     sizeof(bitstr_t *));
		for (i = 0; i < gres_js->res_array_size; i++) {
			if (gres_js->res_gpu_cores[i] == NULL)
				continue;
			new_gres_js->res_gpu_cores[i] =
				bit_copy(gres_js->res_gpu_cores[i]);
		}
	}

	return new_gres_js;
}

/*
 * Copy gres_job_state_t record for one specific node (stepd)
 * IN gres_js - record to copy, may be NULL
 * IN job_node_index - index of the node within the job's per-node arrays
 * RET newly allocated single-node copy (node_cnt and total_node_cnt are 1 and
 *	the node's data sits at index 0), or NULL if gres_js is NULL
 */
static void *_job_state_dup2(gres_job_state_t *gres_js, int job_node_index)
{
	gres_job_state_t *copy;
	bitstr_t *node_bits = NULL;

	if (!gres_js)
		return NULL;

	copy = _job_state_dup_common(gres_js);
	copy->node_cnt = 1;
	copy->total_node_cnt = 1;

	if (gres_js->gres_cnt_node_alloc) {
		copy->gres_cnt_node_alloc = xmalloc(sizeof(uint64_t));
		copy->gres_cnt_node_alloc[0] =
			gres_js->gres_cnt_node_alloc[job_node_index];
	}

	if (gres_js->gres_bit_alloc)
		node_bits = gres_js->gres_bit_alloc[job_node_index];

	if (node_bits) {
		copy->gres_bit_alloc = xmalloc(sizeof(bitstr_t *));
		copy->gres_bit_alloc[0] = bit_copy(node_bits);
	}

	/* Per-bit counts hold one uint64_t per bit of the node's bitmap */
	if (node_bits && gres_js->gres_per_bit_alloc) {
		copy->gres_per_bit_alloc = xmalloc(sizeof(uint64_t *));
		copy->gres_per_bit_alloc[0] = xcalloc(bit_size(node_bits),
						      sizeof(uint64_t));
		memcpy(copy->gres_per_bit_alloc[0],
		       gres_js->gres_per_bit_alloc[job_node_index],
		       bit_size(node_bits) * sizeof(uint64_t));
	}

	/*
	 * gres_js->gres_cnt_node_select and gres_js->gres_bit_select are
	 * deliberately not copied: they are based off the entire cluster and
	 * are not needed for the stepd.
	 */

	return copy;
}

/*
 * List callback: duplicate one job GRES record and append the copy to
 * job_state_extract->new_list, creating that list on first use.
 * A job_node_index of -1 copies all nodes, any other value copies that single
 * node.
 * IN x - job gres_state_t list entry
 * IN/OUT arg - job_state_extract_t with the node index and the output list
 * RET 0 on success, -1 if the record's data could not be duplicated
 */
static int _foreach_job_state_extract(void *x, void *arg)
{
	job_state_extract_t *extract = arg;
	gres_state_t *src_state = x;
	gres_state_t *dst_state;
	void *dst_data;

	if (extract->job_node_index != -1)
		dst_data = _job_state_dup2(src_state->gres_data,
					   extract->job_node_index);
	else
		dst_data = gres_job_state_dup(src_state->gres_data);

	if (!dst_data)
		return -1;

	if (!extract->new_list)
		extract->new_list = list_create(gres_job_list_delete);

	dst_state = gres_create_state(src_state, GRES_STATE_SRC_STATE_PTR,
				      GRES_STATE_TYPE_JOB, dst_data);
	list_append(extract->new_list, dst_state);

	return 0;
}

/*
 * Create a (partial) copy of a job's gres state for a particular node index
 * IN gres_list - List of Gres records for this job to track usage
 * IN job_node_index - zero-origin index to the node, or -1 to copy all nodes
 * RET The copy, or NULL if gres_list is NULL or no record was copied
 */
extern list_t *gres_job_state_extract(list_t *gres_list, int job_node_index)
{
	job_state_extract_t extract = {
		.job_node_index = job_node_index,
	};

	if (!gres_list)
		return NULL;

	(void) list_for_each(gres_list, _foreach_job_state_extract, &extract);

	return extract.new_list;
}

/*
 * Pack a job's current gres status, called from slurmctld for save/restore
 * IN gres_list - job GRES list to pack
 * IN/OUT buffer - location to write state to
 * IN job_id - job's ID (not referenced by this function)
 * IN details - if set then pack job step allocation details (only needed to
 *		save/restore job state, not needed in job credential for
 *		slurmd task binding)
 * IN protocol_version - protocol version to pack for
 * RET value returned by _pack_state()
 *
 * NOTE: A job's allocation to steps is not recorded here, but recovered with
 *	 the job step state information upon slurmctld restart.
 */
extern int gres_job_state_pack(list_t *gres_list, buf_t *buffer,
			       uint32_t job_id, bool details,
			       uint16_t protocol_version)
{
	pack_state_t pack_state = { 0 };

	pack_state.buffer = buffer;
	pack_state.details = details;
	pack_state.magic = GRES_MAGIC;
	pack_state.protocol_version = protocol_version;

	return _pack_state(gres_list, &pack_state, _foreach_job_state_pack);
}

/*
 * Unpack a job's current gres status, called from slurmctld for save/restore
 * OUT gres_list - restored state stored by gres_job_state_pack(); the list is
 *	created here if *gres_list is NULL and GRES contexts are configured
 * IN/OUT buffer - location to read state from
 * IN job_id - job's ID (used for logging)
 * IN protocol_version - protocol version the buffer was packed with
 * RET SLURM_SUCCESS or SLURM_ERROR on an unpack failure
 *
 * NOTE: Records whose plugin_id has no configured context are logged, freed
 *	 and skipped; this is not treated as an error.
 */
extern int gres_job_state_unpack(list_t **gres_list, buf_t *buffer,
				 uint32_t job_id,
				 uint16_t protocol_version)
{
	int i = 0, rc = SLURM_SUCCESS;
	uint32_t magic = 0, plugin_id = 0, utmp32 = 0;
	uint16_t rec_cnt = 0;
	uint8_t  has_more = 0;
	gres_state_t *gres_state_job;
	gres_job_state_t *gres_js = NULL;
	bool locked = false;

	safe_unpack16(&rec_cnt, buffer);
	if (rec_cnt == 0)
		return SLURM_SUCCESS;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	locked = true;
	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
		*gres_list = list_create(gres_job_list_delete);
	}

	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
		slurm_gres_context_t *gres_ctx;
		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
			break;
		rec_cnt--;

		if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;
			safe_unpack32(&plugin_id, buffer);
			gres_js = xmalloc(sizeof(gres_job_state_t));
			safe_unpack16(&gres_js->cpus_per_gres, buffer);
			safe_unpack16(&gres_js->flags, buffer);
			safe_unpack64(&gres_js->gres_per_job, buffer);
			safe_unpack64(&gres_js->gres_per_node, buffer);
			safe_unpack64(&gres_js->gres_per_socket, buffer);
			safe_unpack64(&gres_js->gres_per_task, buffer);
			safe_unpack64(&gres_js->mem_per_gres, buffer);
			safe_unpack16(&gres_js->ntasks_per_gres, buffer);
			safe_unpack64(&gres_js->total_gres, buffer);
			safe_unpackstr(&gres_js->type_name, buffer);
			/* type_id is not packed, rebuild it from the name */
			gres_js->type_id =
				gres_build_id(gres_js->type_name);
			safe_unpack32(&gres_js->node_cnt, buffer);
			if (gres_js->node_cnt > NO_VAL)
				goto unpack_error;

			/* Each optional section is preceded by a flag byte */
			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_unpack64_array(
					&gres_js->gres_cnt_node_alloc,
					&utmp32, buffer);
			}

			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_xcalloc(gres_js->gres_bit_alloc,
					     gres_js->node_cnt,
					     sizeof(bitstr_t *));
				for (i = 0; i < gres_js->node_cnt; i++) {
					unpack_bit_str_hex(&gres_js->
							   gres_bit_alloc[i],
							   buffer);
				}
			}
			/* One flag byte per node for the per-bit counts */
			for (i = 0; i < gres_js->node_cnt; i++) {
				safe_unpack8(&has_more, buffer);
				if (!has_more)
					continue;
				if (!gres_js->gres_per_bit_alloc)
					safe_xcalloc(
						gres_js->gres_per_bit_alloc,
						gres_js->node_cnt,
						sizeof(uint64_t *));
				safe_unpack64_array(
					&gres_js->gres_per_bit_alloc[i],
					&utmp32, buffer);
			}
			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_xcalloc(gres_js->gres_bit_step_alloc,
					     gres_js->node_cnt,
					     sizeof(bitstr_t *));
				for (i = 0; i < gres_js->node_cnt; i++) {
					unpack_bit_str_hex(&gres_js->
							   gres_bit_step_alloc[i],
							   buffer);
				}
			}
			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_xcalloc(gres_js->gres_cnt_step_alloc,
					     gres_js->node_cnt,
					     sizeof(uint64_t));
				for (i = 0; i < gres_js->node_cnt; i++) {
					safe_unpack64(&gres_js->
						      gres_cnt_step_alloc[i],
						      buffer);
				}
			}
			/* One flag byte per node for the per-bit step counts */
			for (i = 0; i < gres_js->node_cnt; i++) {
				safe_unpack8(&has_more, buffer);
				if (!has_more)
					continue;
				if (!gres_js->gres_per_bit_step_alloc)
					safe_xcalloc(
						gres_js->gres_per_bit_step_alloc,
						gres_js->node_cnt,
						sizeof(uint64_t *));
				safe_unpack64_array(
					&gres_js->gres_per_bit_step_alloc[i],
					&utmp32, buffer);
			}
		} else {
			error("%s: protocol_version %hu not supported",
			      __func__, protocol_version);
			goto unpack_error;
		}

		if (!(gres_ctx = _find_context_by_id(plugin_id))) {
			/*
			 * A likely sign that GresPlugins has changed.
			 * Not a fatal error, skip over the data.
			 */
			error("%s: no plugin configured to unpack data type %u from job %u. This is likely due to a difference in the GresTypes configured in slurm.conf on different cluster nodes.",
			      __func__, plugin_id, job_id);
			gres_job_state_delete(gres_js);
			/*
			 * Clear the pointer so that an unpack failure in a
			 * later record does not free this record again.
			 */
			gres_js = NULL;
			continue;
		}

		gres_state_job = gres_create_state(
			gres_ctx, GRES_STATE_SRC_CONTEXT_PTR,
			GRES_STATE_TYPE_JOB, gres_js);
		gres_js = NULL;	/* nothing left to free on error */
		list_append(*gres_list, gres_state_job);
	}
	slurm_mutex_unlock(&gres_context_lock);
	return rc;

unpack_error:
	error("%s: unpack error from job %u", __func__, job_id);
	if (gres_js)
		gres_job_state_delete(gres_js);
	if (locked)
		slurm_mutex_unlock(&gres_context_lock);
	return SLURM_ERROR;
}

/*
 * Pack one gres_prep_t record.
 * Writes the magic number, plugin_id and node_cnt, then for each of
 * gres_cnt_node_alloc and gres_bit_alloc a flag byte followed by the data if
 * present.
 * IN in - gres_prep_t record to pack
 * IN protocol_version - protocol version to pack for; versions older than
 *	SLURM_MIN_PROTOCOL_VERSION are logged and nothing is packed
 * IN/OUT buffer - location to write state to
 */
extern void gres_prep_pack(void *in, uint16_t protocol_version, buf_t *buffer)
{
	gres_prep_t *gres_prep = in;
	uint8_t filled;

	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
		error("%s: protocol_version %hu not supported",
		      __func__, protocol_version);
		return;
	}

	pack32((uint32_t) GRES_MAGIC, buffer);
	pack32(gres_prep->plugin_id, buffer);
	pack32(gres_prep->node_cnt, buffer);

	filled = gres_prep->gres_cnt_node_alloc ? 1 : 0;
	pack8(filled, buffer);
	if (filled)
		pack64_array(gres_prep->gres_cnt_node_alloc,
			     gres_prep->node_cnt, buffer);

	filled = gres_prep->gres_bit_alloc ? 1 : 0;
	pack8(filled, buffer);
	if (filled) {
		for (int i = 0; i < gres_prep->node_cnt; i++)
			pack_bit_str_hex(gres_prep->gres_bit_alloc[i], buffer);
	}
}

/*
 * Pack a job's allocated gres information for use by prolog/epilog
 * IN gres_list - list of gres_prep_t records, may be NULL
 * IN/OUT buffer - location to write state to
 * IN protocol_version - protocol version to pack for
 * RET SLURM_SUCCESS (also when the protocol version is not supported, in
 *	which case only a zero record count is packed)
 *
 * The record count is written first as a placeholder and rewritten once all
 * records have been packed.
 *
 * When 24.11 is no longer supported this can be removed.
 */
extern int gres_prep_pack_legacy(list_t *gres_list, buf_t *buffer,
				 uint16_t protocol_version)
{
	uint32_t count_offset, end_offset;
	uint16_t rec_cnt = 0;
	list_itr_t *iter;
	gres_prep_t *gres_prep;

	count_offset = get_buf_offset(buffer);
	pack16(rec_cnt, buffer);	/* placeholder if data */

	if (!gres_list)
		return SLURM_SUCCESS;

	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
		error("%s: protocol_version %hu not supported",
		      __func__, protocol_version);
		return SLURM_SUCCESS;
	}

	iter = list_iterator_create(gres_list);
	for (gres_prep = list_next(iter); gres_prep;
	     gres_prep = list_next(iter)) {
		gres_prep_pack(gres_prep, protocol_version, buffer);
		rec_cnt++;
	}
	list_iterator_destroy(iter);

	/* Go back and store the real record count */
	end_offset = get_buf_offset(buffer);
	set_buf_offset(buffer, count_offset);
	pack16(rec_cnt, buffer);
	set_buf_offset(buffer, end_offset);

	return SLURM_SUCCESS;
}

/*
 * Release a gres_prep_t record and everything it owns.
 * Suitable as a list destructor; a NULL argument is ignored.
 */
static void _prep_list_del(void *x)
{
	gres_prep_t *prep = x;

	if (prep == NULL)
		return;

	if (prep->gres_bit_alloc != NULL) {
		/* One bitmap per node, then the array holding them */
		for (int node_inx = 0; node_inx < prep->node_cnt; node_inx++) {
			FREE_NULL_BITMAP(prep->gres_bit_alloc[node_inx]);
		}
		xfree(prep->gres_bit_alloc);
	}

	xfree(prep->gres_cnt_node_alloc);
	xfree(prep->node_list);
	xfree(prep);
}

/*
 * Unpack one gres_prep_t record written by gres_prep_pack()
 * OUT object - set to the newly allocated gres_prep_t record, or to NULL if
 *		no configured plugin matches the record's plugin_id (the data
 *		is skipped and SLURM_SUCCESS is still returned)
 * IN protocol_version - protocol version selecting the wire format
 * IN/OUT buffer - location to read state from
 * RET SLURM_SUCCESS or SLURM_ERROR (record is freed, *object is not written)
 *
 * NOTE: calls _find_context_by_id(); gres_prep_unpack_list() and
 * gres_prep_unpack_legacy() hold gres_context_lock around this call.
 */
static int _gres_prep_unpack(void **object, uint16_t protocol_version,
			     buf_t *buffer)
{
	uint32_t magic = 0, utmp32 = 0;
	uint8_t filled = 0;
	gres_prep_t *gres_prep = NULL;

	gres_prep = xmalloc(sizeof(gres_prep_t));

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		safe_unpack32(&magic, buffer);
		if (magic != GRES_MAGIC)
			goto unpack_error;
		safe_unpack32(&gres_prep->plugin_id, buffer);
		safe_unpack32(&gres_prep->node_cnt, buffer);
		if (gres_prep->node_cnt > NO_VAL)
			goto unpack_error;
		safe_unpack8(&filled, buffer);
		if (filled) {
			safe_unpack64_array(
				&gres_prep->gres_cnt_node_alloc,
				&utmp32, buffer);
			/*
			 * The array is indexed by node index elsewhere, so a
			 * length that disagrees with node_cnt would permit
			 * out-of-bounds reads. Treat it as corrupt data.
			 */
			if (utmp32 != gres_prep->node_cnt)
				goto unpack_error;
		}
		safe_unpack8(&filled, buffer);
		if (filled) {
			safe_xcalloc(gres_prep->gres_bit_alloc,
				     gres_prep->node_cnt,
				     sizeof(bitstr_t *));
			/* Unsigned index: node_cnt comes from the buffer */
			for (uint32_t i = 0; i < gres_prep->node_cnt; i++) {
				unpack_bit_str_hex(&gres_prep->
						   gres_bit_alloc[i],
						   buffer);
			}
		}
	} else {
		error("%s: protocol_version %hu not supported",
		      __func__, protocol_version);
		goto unpack_error;
	}

	if (!_find_context_by_id(gres_prep->plugin_id)) {
		/*
		 * A likely sign that GresPlugins has changed.
		 * Not a fatal error, skip over the data.
		 */
		error("%s: no plugin configured to unpack data type %u",
		      __func__, gres_prep->plugin_id);
		_prep_list_del(gres_prep);
		gres_prep = NULL;
		/* Don't return SLURM_ERROR */
	}

	*object = gres_prep;

	return SLURM_SUCCESS;

unpack_error:
	error("%s: unpack error", __func__);
	/* Frees whatever was unpacked so far (partial bitmap array included) */
	_prep_list_del(gres_prep);

	return SLURM_ERROR;
}

/*
 * Unpack a list of gres_prep_t records
 * OUT out - list of unpacked records; released and set to NULL on failure
 * IN/OUT buffer - location to read state from
 * IN protocol_version - protocol version selecting the wire format
 * RET return code of slurm_unpack_list()
 */
extern int gres_prep_unpack_list(list_t **out, buf_t *buffer,
				 uint16_t protocol_version)
{
	int rc;

	/* _gres_prep_unpack() must be called with gres_context_lock held */
	slurm_mutex_lock(&gres_context_lock);
	rc = slurm_unpack_list(out, _gres_prep_unpack, _prep_list_del,
			       buffer, protocol_version);
	if (rc != SLURM_SUCCESS)
		FREE_NULL_LIST(*out);
	slurm_mutex_unlock(&gres_context_lock);

	return rc;
}

/*
 * Unpack a job's allocated gres information for use by prolog/epilog
 * OUT gres_list - restored state stored by gres_prep_pack_legacy(); a list is
 *		   created here if *gres_list is NULL and GRES contexts exist
 * IN/OUT buffer - location to read state from
 * IN protocol_version - protocol version selecting the wire format
 * RET SLURM_SUCCESS or SLURM_ERROR
 *
 * When 24.11 is no longer supported this can be removed.
 */
extern int gres_prep_unpack_legacy(list_t **gres_list, buf_t *buffer,
				   uint16_t protocol_version)
{
	uint16_t remaining = 0;
	gres_prep_t *prep = NULL;
	bool have_lock = false;

	safe_unpack16(&remaining, buffer);
	if (!remaining)
		return SLURM_SUCCESS;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	have_lock = true;
	if (!*gres_list && (gres_context_cnt > 0))
		*gres_list = list_create(_prep_list_del);

	for (; remaining; remaining--) {
		/* Stop quietly if the buffer runs out before the count does */
		if (!buffer || !remaining_buf(buffer))
			break;

		if (_gres_prep_unpack((void **) &prep, protocol_version,
				      buffer) != SLURM_SUCCESS)
			goto unpack_error;

		/* NULL means the record was skipped (no matching plugin) */
		if (!prep)
			continue;

		list_append(*gres_list, prep);
		prep = NULL;
	}
	slurm_mutex_unlock(&gres_context_lock);
	return SLURM_SUCCESS;

unpack_error:
	error("%s: unpack error", __func__);
	if (prep)
		_prep_list_del(prep);
	/* The count unpack can fail before the lock was ever taken */
	if (have_lock)
		slurm_mutex_unlock(&gres_context_lock);
	return SLURM_ERROR;
}


/*
 * list_for_each() callback: ask the plugin owning one job GRES record for its
 * prolog/epilog data and append the result to the output list.
 * IN x - gres_state_t record from the job's GRES list
 * IN/OUT arg - foreach_prep_build_env_t; prep_gres_list is created on demand
 * RET always 0 so that iteration continues
 */
static int _foreach_prep_build_env(void *x, void *arg)
{
	gres_state_t *gres_state = x;
	foreach_prep_build_env_t *args = arg;
	slurm_gres_context_t *ctx =
		_find_context_by_id(gres_state->plugin_id);
	gres_prep_t *prep;

	if (!ctx) {
		error("%s: gres not found in context. This should never happen",
		      __func__);
		return 0;
	}

	/* No plugin to call */
	if (!ctx->ops.prep_build_env)
		return 0;

	prep = (*(ctx->ops.prep_build_env))(gres_state->gres_data);
	/* No info to add for this plugin */
	if (!prep)
		return 0;

	prep->plugin_id = ctx->plugin_id;
	prep->node_list = xstrdup(args->node_list);

	if (!args->prep_gres_list)
		args->prep_gres_list = list_create(_prep_list_del);
	list_append(args->prep_gres_list, prep);

	return 0;
}

/*
 * Build List of information needed to set job's Prolog or Epilog environment
 * variables
 *
 * IN job_gres_list - job's GRES allocation info
 * IN node_list - node list string copied into every record that is built
 * RET information about the job's GRES allocation needed by Prolog or Epilog,
 *     NULL if job_gres_list is NULL or no plugin produced any data
 */
extern list_t *gres_g_prep_build_env(list_t *job_gres_list, char *node_list)
{
	foreach_prep_build_env_t args = {
		.node_list = node_list,
	};

	if (job_gres_list == NULL)
		return NULL;

	xassert(gres_context_cnt >= 0);

	/* The callback looks up GRES contexts, so hold the context lock */
	slurm_mutex_lock(&gres_context_lock);
	(void) list_for_each(job_gres_list, _foreach_prep_build_env, &args);
	slurm_mutex_unlock(&gres_context_lock);

	return args.prep_gres_list;
}

/*
 * list_for_each() callback: let the plugin owning one gres_prep_t record set
 * its prolog/epilog environment variables.
 * IN x - gres_prep_t record
 * IN arg - foreach_prep_set_env_t (environment pointer and node index)
 * RET always 0 so that iteration continues
 */
static int _foreach_prep_set_env(void *x, void *arg)
{
	gres_prep_t *prep = x;
	foreach_prep_set_env_t *args = arg;
	slurm_gres_context_t *ctx = _find_context_by_id(prep->plugin_id);

	if (!ctx) {
		error("%s: GRES ID %u not found in context",
		      __func__, prep->plugin_id);
		return 0;
	}

	/* Only call into plugins that implement the hook */
	if (ctx->ops.prep_set_env) {
		(*(ctx->ops.prep_set_env))(args->prep_env_ptr, prep,
					   args->node_inx);
	}

	return 0;
}

/*
 * Set environment variables as appropriate for a job's prolog or epilog based
 * GRES allocated to the job.
 *
 * IN/OUT prep_env_ptr - environment variable array; reset to NULL on entry
 * IN prep_gres_list - list of gres_prep_t records, e.g. as returned by
 *		       gres_g_prep_build_env() or gres_prep_unpack_list()
 * IN node_inx - zero origin node index
 */
extern void gres_g_prep_set_env(char ***prep_env_ptr,
				list_t *prep_gres_list, int node_inx)
{
	foreach_prep_set_env_t args = {
		.node_inx = node_inx,
		.prep_env_ptr = prep_env_ptr,
	};

	/* Caller always gets a defined value, even with no GRES list */
	*prep_env_ptr = NULL;
	if (prep_gres_list == NULL)
		return;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	(void) list_for_each(prep_gres_list, _foreach_prep_set_env, &args);
	slurm_mutex_unlock(&gres_context_lock);
}

/*
 * If core bitmap from slurmd differs in size from that in slurmctld,
 * then modify bitmap from slurmd so we can use bit_and, bit_or, etc.
 *
 * IN old_core_bitmap - bitmap to convert (not modified)
 * IN new_size - bit count of the bitmap to build
 * RET newly allocated bitmap of new_size bits
 */
static bitstr_t *_core_bitmap_rebuild(bitstr_t *old_core_bitmap, int new_size)
{
	bitstr_t *rebuilt = bit_alloc(new_size);
	int old_size = bit_size(old_core_bitmap);
	int scale, src, dst, off;

	if (old_size > new_size) {
		/*
		 * Shrinking: a new bit is set if any of the "scale"
		 * consecutive old bits it stands for is set.
		 */
		scale = old_size / new_size;
		for (dst = 0; dst < new_size; dst++) {
			src = dst * scale;
			for (off = 0; off < scale; off++) {
				if (!bit_test(old_core_bitmap, src + off))
					continue;
				bit_set(rebuilt, dst);
				break;
			}
		}
		return rebuilt;
	}

	/* Growing (or equal): every set old bit becomes "scale" new bits */
	scale = new_size / old_size;
	for (src = 0; src < old_size; src++) {
		if (!bit_test(old_core_bitmap, src))
			continue;
		dst = src * scale;
		for (off = 0; off < scale; off++)
			bit_set(rebuilt, dst + off);
	}

	return rebuilt;
}

/*
 * Make every GRES topology core bitmap of a node match the core count known
 * to the controller, rebuilding those whose size differs.
 * IN/OUT gres_ns - node's GRES state; topo_core_bitmap entries may be replaced
 * IN cores_ctld - core count expected for the node
 * IN node_name - name of the node (for logging)
 */
extern void gres_validate_node_cores(gres_node_state_t *gres_ns,
				     int cores_ctld, char *node_name)
{
	bool logged = false;

	if (!gres_ns->topo_cnt)
		return;

	if (!gres_ns->topo_core_bitmap) {
		error("Gres topo_core_bitmap is NULL on node %s", node_name);
		return;
	}

	for (int topo_inx = 0; topo_inx < gres_ns->topo_cnt; topo_inx++) {
		bitstr_t *current = gres_ns->topo_core_bitmap[topo_inx];
		int cores_slurmd;

		if (!current)
			continue;

		cores_slurmd = bit_size(current);
		if (cores_slurmd == cores_ctld)
			continue;

		/* Report the mismatch only once per call */
		if (!logged) {
			debug("Rebuilding node %s gres core bitmap (%d != %d)",
			      node_name, cores_slurmd, cores_ctld);
			logged = true;
		}

		gres_ns->topo_core_bitmap[topo_inx] =
			_core_bitmap_rebuild(current, cores_ctld);
		FREE_NULL_BITMAP(current);
	}
}

/*
 * Determine how many cores of a node can be used by a job with respect to one
 * GRES record.
 * IN gres_state_job - job's GRES record (gres_data is a gres_job_state_t)
 * IN gres_state_node - node's GRES record (gres_data is a gres_node_state_t)
 * IN use_total_gres - if set, do not subtract already allocated GRES; forced
 *		       on when the node GRES is flagged no_consume
 * IN core_start_bit - index of the node's first core
 * IN core_end_bit - index of the node's last core
 * IN job_id - job's ID (not referenced in this function)
 * IN node_name - name of the node (for logging)
 * RET 0 if the node has insufficient GRES,
 *     NO_VAL if the GRES places no restriction on cores (no topology
 *     information, or no per-job/node/socket/task count requested),
 *     otherwise the count of cores reachable through the topology entries
 *     that were selected
 */
static uint32_t _job_test(gres_state_t *gres_state_job,
			  gres_state_t *gres_state_node,
			  bool use_total_gres,
			  int core_start_bit, int core_end_bit,
			  uint32_t job_id, char *node_name)
{
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	gres_node_state_t *gres_ns = gres_state_node->gres_data;
	char *gres_name = gres_state_job->gres_name;
	int i, j, core_size, core_ctld, top_inx = -1;
	uint64_t gres_avail = 0, gres_total, gres_tmp;
	uint64_t min_gres_node = 0;
	uint32_t *cores_addnt = NULL; /* Additional cores avail from this GRES */
	uint32_t *cores_avail = NULL; /* cores initially avail from this GRES */
	uint32_t core_cnt = 0;
	bitstr_t *alloc_core_bitmap = NULL;
	bitstr_t *avail_core_bitmap = NULL;
	/*
	 * Shared GRES are taken from a single topology entry unless
	 * SELECT_MULTIPLE_SHARING_GRES_PJ is configured.
	 */
	bool use_single_dev = (gres_id_shared(gres_state_job->config_flags) &&
			       !(slurm_conf.select_type_param &
				 SELECT_MULTIPLE_SHARING_GRES_PJ));
	bool use_busy_dev;

	if (gres_ns->no_consume)
		use_total_gres = true;

	use_busy_dev = gres_use_busy_dev(gres_state_node, use_total_gres);

	/* Determine minimum GRES count needed on this node */
	if (gres_js->gres_per_job)
		min_gres_node = 1;
	min_gres_node = MAX(min_gres_node, gres_js->gres_per_node);
	min_gres_node = MAX(min_gres_node, gres_js->gres_per_socket);
	min_gres_node = MAX(min_gres_node, gres_js->gres_per_task);

	if (min_gres_node && gres_ns->topo_cnt) {
		/* Need to determine which specific cores can be used */
		gres_avail = gres_ns->gres_cnt_avail;
		if (!use_total_gres)
			gres_avail -= gres_ns->gres_cnt_alloc;
		if (min_gres_node > gres_avail)
			return (uint32_t) 0;	/* insufficient GRES avail */

		/*
		 * Default to the size of the node's core range, but prefer
		 * the size of the first topology core bitmap that exists.
		 */
		core_ctld = core_end_bit - core_start_bit + 1;
		for (i = 0; i < gres_ns->topo_cnt; i++) {
			if (!gres_ns->topo_core_bitmap[i])
				continue;
			core_ctld = bit_size(gres_ns->
						topo_core_bitmap[i]);
			break;
		}

		alloc_core_bitmap = bit_alloc(core_ctld);
		bit_set_all(alloc_core_bitmap);


		/*
		 * NOTE: avail_core_bitmap is copied here and freed below, but
		 * is not otherwise referenced in this function.
		 */
		avail_core_bitmap = bit_copy(alloc_core_bitmap);
		cores_addnt = xcalloc(gres_ns->topo_cnt,
				      sizeof(uint32_t));
		cores_avail = xcalloc(gres_ns->topo_cnt,
				      sizeof(uint32_t));
		/*
		 * Count the cores tied to each usable topology entry. Entries
		 * that are skipped keep cores_avail[i] == 0 and so are never
		 * selected below.
		 */
		for (i = 0; i < gres_ns->topo_cnt; i++) {
			if (gres_ns->topo_gres_cnt_avail[i] == 0)
				continue;
			if (use_busy_dev &&
			    (gres_ns->topo_gres_cnt_alloc[i] == 0))
				continue;
			if (!use_total_gres &&
			    (gres_ns->topo_gres_cnt_alloc[i] >=
			     gres_ns->topo_gres_cnt_avail[i]))
				continue;
			if (gres_js->type_name &&
			    (!gres_ns->topo_type_name[i] ||
			     (gres_ns->topo_type_id[i] !=
			      gres_js->type_id)))
				continue;
			if (!gres_ns->topo_core_bitmap[i]) {
				/* No core binding: every core in range counts */
				cores_avail[i] = core_end_bit -
					core_start_bit + 1;
				continue;
			}
			core_size = bit_size(gres_ns->topo_core_bitmap[i]);
			for (j = 0; j < core_size; j++) {
				if (bit_test(gres_ns->
					     topo_core_bitmap[i], j)) {
					cores_avail[i]++;
				}
			}
		}

		/* Pick the topology entries with the most cores available */
		gres_avail = 0;
		gres_total = 0;
		while (gres_avail < min_gres_node) {
			top_inx = -1;
			/*
			 * cores_addnt[j] is the number of cores entry j would
			 * contribute beyond those already in
			 * alloc_core_bitmap; top_inx tracks the best entry.
			 */
			for (j = 0; j < gres_ns->topo_cnt; j++) {
				if ((gres_avail == 0) ||
				    (cores_avail[j] == 0) ||
				    !gres_ns->topo_core_bitmap[j]) {
					cores_addnt[j] = cores_avail[j];
				} else {
					cores_addnt[j] = cores_avail[j] -
						bit_overlap(alloc_core_bitmap,
							    gres_ns->
							    topo_core_bitmap[j]);
				}

				if (top_inx == -1) {
					if (cores_avail[j])
						top_inx = j;
				} else if (cores_addnt[j] > cores_addnt[top_inx])
					top_inx = j;
			}
			/* No unused topology entry is left to pick from */
			if ((top_inx < 0) || (cores_avail[top_inx] == 0)) {
				if (gres_total < min_gres_node)
					core_cnt = 0;
				break;
			}
			cores_avail[top_inx] = 0;	/* Flag as used */
			gres_tmp = gres_ns->topo_gres_cnt_avail[top_inx];
			if (!use_total_gres &&
			    (gres_tmp >=
			     gres_ns->topo_gres_cnt_alloc[top_inx])) {
				gres_tmp -= gres_ns->
					topo_gres_cnt_alloc[top_inx];
			} else if (!use_total_gres) {
				gres_tmp = 0;
			}
			if (gres_id_shared(gres_state_job->config_flags) &&
			    gres_js->gres_per_task) {
				/*
				 * Remove remaining shared gres_per_task
				 * Because we don't allocate shared
				 * gres_per_task across multiple sharing gres.
				 * See _set_shared_task_bits() in
				 * gres_select_filter.c
				 */
				gres_tmp -= (gres_tmp % gres_js->gres_per_task);
			}
			if (gres_tmp == 0) {
				error("gres/%s: topology allocation error on node %s",
				      gres_name, node_name);
				break;
			}
			/* update counts of allocated cores and GRES */
			if (use_single_dev) {
				/*
				 * Process outside of loop after specific
				 * device selected
				 */
			} else if (!gres_ns->topo_core_bitmap[top_inx]) {
				bit_set_all(alloc_core_bitmap);
			} else if (gres_avail) {
				bit_or(alloc_core_bitmap,
				       gres_ns->
				       topo_core_bitmap[top_inx]);
			} else {
				/*
				 * First entry picked: alloc_core_bitmap still
				 * has every bit set, so the AND leaves exactly
				 * this entry's cores.
				 */
				bit_and(alloc_core_bitmap,
					gres_ns->
					topo_core_bitmap[top_inx]);
			}
			if (use_single_dev) {
				gres_total = MAX(gres_total, gres_tmp);
				gres_avail = gres_total;
			} else {
				/*
				 * Available GRES count is up to gres_tmp,
				 * but take 1 per loop to maximize available
				 * core count
				 */
				gres_avail += 1;
				gres_total += gres_tmp;
				core_cnt = bit_set_count(alloc_core_bitmap);
			}
		}
		/*
		 * Single-device case: account for the cores of the entry
		 * selected by the last loop pass, if it satisfied the request.
		 */
		if (use_single_dev && (top_inx >= 0) &&
		    (gres_avail >= min_gres_node)) {
			if (!gres_ns->topo_core_bitmap[top_inx]) {
				bit_set_all(alloc_core_bitmap);
			} else {
				bit_or(alloc_core_bitmap,
				       gres_ns->
				       topo_core_bitmap[top_inx]);
			}
			core_cnt = bit_set_count(alloc_core_bitmap);
		}
		FREE_NULL_BITMAP(alloc_core_bitmap);
		FREE_NULL_BITMAP(avail_core_bitmap);
		xfree(cores_addnt);
		xfree(cores_avail);
		return core_cnt;
	} else if (gres_js->type_name) {
		/* No topology to consider: check the requested type's counts */
		for (i = 0; i < gres_ns->type_cnt; i++) {
			if (gres_ns->type_name[i] &&
			    (gres_ns->type_id[i] ==
			     gres_js->type_id))
				break;
		}
		if (i >= gres_ns->type_cnt)
			return (uint32_t) 0;	/* no such type */
		gres_avail = gres_ns->type_cnt_avail[i];
		if (!use_total_gres)
			gres_avail -= gres_ns->type_cnt_alloc[i];
		gres_tmp = gres_ns->gres_cnt_avail;
		if (!use_total_gres)
			gres_tmp -= gres_ns->gres_cnt_alloc;
		/* Limited by both the per-type and the overall GRES count */
		gres_avail = MIN(gres_avail, gres_tmp);
		if (min_gres_node > gres_avail)
			return (uint32_t) 0;	/* insufficient GRES avail */
		return NO_VAL;
	} else {
		/* No topology and no type: check the overall GRES count only */
		gres_avail = gres_ns->gres_cnt_avail;
		if (!use_total_gres)
			gres_avail -= gres_ns->gres_cnt_alloc;
		if (min_gres_node > gres_avail)
			return (uint32_t) 0;	/* insufficient GRES avail */
		return NO_VAL;
	}
}

/*
 * list_for_each() callback: test one job GRES record against the node and
 * fold the result into the running minimum core count.
 * IN x - gres_state_t record from the job's GRES list
 * IN/OUT arg - foreach_job_test_t; core_cnt is updated
 * RET 0 to continue, -1 to stop once no cores are usable
 */
static int _foreach_job_test(void *x, void *arg)
{
	gres_state_t *gres_state_job = x;
	foreach_job_test_t *args = arg;
	gres_state_t *gres_state_node;
	uint32_t cores;

	gres_state_node = list_find_first(args->node_gres_list, gres_find_id,
					  &gres_state_job->plugin_id);
	if (!gres_state_node) {
		/* node lack resources required by the job */
		args->core_cnt = 0;
		return -1;
	}

	cores = _job_test(gres_state_job, gres_state_node,
			  args->use_total_gres,
			  args->core_start_bit, args->core_end_bit,
			  args->job_id, args->node_name);

	/* NO_VAL means "no restriction", so only real counts lower the min */
	if ((cores != NO_VAL) &&
	    ((args->core_cnt == NO_VAL) || (cores < args->core_cnt)))
		args->core_cnt = cores;

	return args->core_cnt ? 0 : -1;
}

/*
 * Determine how many cores on the node can be used by this job
 * IN job_gres_list  - job's gres_list built by gres_job_state_validate()
 * IN node_gres_list - node's gres_list built by gres_node_config_validate()
 * IN use_total_gres - if set then consider all gres resources as available,
 *		       and none are committed to running jobs
 * IN core_start_bit - index into core_bitmap for this node's first core
 * IN core_end_bit   - index into core_bitmap for this node's last core
 * IN job_id         - job's ID (for logging)
 * IN node_name      - name of the node (for logging)
 * RET: NO_VAL    - All cores on node are available
 *      otherwise - Count of available cores (0 if the job requests GRES but
 *		    node_gres_list is NULL)
 */
extern uint32_t gres_job_test(list_t *job_gres_list, list_t *node_gres_list,
			      bool use_total_gres,
			      int core_start_bit, int core_end_bit,
			      uint32_t job_id, char *node_name)
{
	foreach_job_test_t args = {
		.core_cnt = NO_VAL,
		.core_end_bit = core_end_bit,
		.core_start_bit = core_start_bit,
		.job_id = job_id,
		.node_gres_list = node_gres_list,
		.node_name = node_name,
		.use_total_gres = use_total_gres,
	};

	/* A job without GRES is not limited by GRES */
	if (!job_gres_list)
		return NO_VAL;

	/* The job wants GRES but the node has none */
	if (!node_gres_list)
		return 0;

	(void) list_for_each(job_gres_list, _foreach_job_test, &args);

	return args.core_cnt;
}

/*
 * Release a sock_gres_t record and everything it owns.
 * Suitable as a list destructor; a NULL argument is ignored.
 */
extern void gres_sock_delete(void *x)
{
	sock_gres_t *sock_gres = x;

	if (!sock_gres)
		return;

	FREE_NULL_BITMAP(sock_gres->bits_any_sock);

	if (sock_gres->bits_by_sock) {
		/* One bitmap per socket, then the array holding them */
		for (int sock = 0; sock < sock_gres->sock_cnt; sock++) {
			FREE_NULL_BITMAP(sock_gres->bits_by_sock[sock]);
		}
		xfree(sock_gres->bits_by_sock);
	}

	xfree(sock_gres->cnt_by_sock);
	xfree(sock_gres);
}

/*
 * list_for_each() callback: append "name[:type]:count" for one sock_gres_t
 * record to the string being built.
 * IN x - sock_gres_t record
 * IN/OUT arg - foreach_sock_str_t; gres_str grows and sep becomes " " after
 *		the first entry is written
 * RET always 0 so that iteration continues
 */
static int _foreach_sock_str(void *x, void *arg)
{
	sock_gres_t *sock_gres = x;
	foreach_sock_str_t *args = arg;
	gres_job_state_t *gres_js = sock_gres->gres_state_job->gres_data;
	char *gres_name = sock_gres->gres_state_job->gres_name;
	char *type_name = gres_js->type_name;
	uint64_t cnt;

	/* A negative socket index selects the GRES not bound to any socket */
	if (args->sock_inx < 0)
		cnt = sock_gres->cnt_any_sock;
	else if (sock_gres->cnt_by_sock)
		cnt = sock_gres->cnt_by_sock[args->sock_inx];
	else
		cnt = 0;

	/* Nothing to report for this record */
	if (!cnt)
		return 0;

	if (type_name) {
		xstrfmtcat(args->gres_str, "%s%s:%s:%"PRIu64,
			   args->sep, gres_name, type_name, cnt);
	} else {
		xstrfmtcat(args->gres_str, "%s%s:%"PRIu64,
			   args->sep, gres_name, cnt);
	}
	args->sep = " ";

	return 0;
}

/*
 * Build a string containing the GRES details for a given node and socket
 * sock_gres_list IN - List of sock_gres_t entries
 * sock_inx IN - zero-origin socket for which information is to be returned
 *		 if value < 0, then report GRES unconstrained by core
 * RET string, must call xfree() to release memory; NULL if the list is NULL
 *     or no entry has anything to report
 */
extern char *gres_sock_str(list_t *sock_gres_list, int sock_inx)
{
	foreach_sock_str_t args = {
		.gres_str = NULL,
		.sep = "",
		.sock_inx = sock_inx,
	};

	if (sock_gres_list == NULL)
		return NULL;

	(void) list_for_each(sock_gres_list, _foreach_sock_str, &args);

	return args.gres_str;
}

/*
 * Merge one job GRES record's allocation on a node into running totals.
 * IN gres_js - job GRES state
 * IN node_inx - index of the node within the job's allocation
 * IN/OUT gres_bit_alloc - bitmap OR'ed with the record's bitmap for the node;
 *			   allocated here if *gres_bit_alloc is NULL
 * IN/OUT gres_cnt - if not NULL, incremented by the record's count on the node
 */
static void _accumulate_job_gres_alloc(gres_job_state_t *gres_js,
				       int node_inx,
				       bitstr_t **gres_bit_alloc,
				       uint64_t *gres_cnt)
{
	bitstr_t *node_bits = NULL;

	if (gres_js->node_cnt <= node_inx) {
		error("gres_job_state_t node count less than node_inx. This should never happen");
		return;
	}

	if ((node_inx >= 0) && (node_inx < gres_js->node_cnt) &&
	    gres_js->gres_bit_alloc)
		node_bits = gres_js->gres_bit_alloc[node_inx];

	if (node_bits) {
		/* First contribution sizes the accumulated bitmap */
		if (!*gres_bit_alloc)
			*gres_bit_alloc = bit_alloc(bit_size(node_bits));
		bit_or(*gres_bit_alloc, node_bits);
	}

	if (gres_cnt && gres_js->gres_cnt_node_alloc)
		*gres_cnt += gres_js->gres_cnt_node_alloc[node_inx];
}

/*
 * list_for_each() callback: accumulate the allocation of every job or step
 * GRES record belonging to the plugin named in the arguments.
 * IN x - gres_state_t record from a job or step GRES list
 * IN/OUT arg - foreach_gres_accumulate_device_t; bitmap, count and
 *		sharing_gres_allocated are updated
 * RET always 0 so that iteration continues
 */
static int _accumulate_gres_device(void *x, void *arg)
{
	gres_state_t *gres_state = x;
	foreach_gres_accumulate_device_t *args = arg;

	/* Only records of the plugin being processed are of interest */
	if (gres_state->plugin_id != args->plugin_id)
		return 0;

	if (!args->is_job) {
		_accumulate_step_gres_alloc(gres_state,
					    args->gres_bit_alloc,
					    &args->gres_cnt,
					    args->gres_per_bit);
	} else {
		_accumulate_job_gres_alloc(gres_state->gres_data,
					   args->node_inx,
					   args->gres_bit_alloc,
					   &args->gres_cnt);
	}

	/* Does job have a sharing GRES (GPU)? */
	if (gres_id_sharing(args->plugin_id))
		args->sharing_gres_allocated = true;

	return 0;
}

/*
 * Set environment variables as required for a batch or interactive step
 * IN/OUT step - step record; step->env is updated by the plugins and
 *		 step->job_gres_list supplies the job's GRES allocation
 * IN node_inx - index of this node within the job's allocation
 */
extern void gres_g_job_set_env(stepd_step_rec_t *step, int node_inx)
{
	gres_internal_flags_t flags = GRES_INTERNAL_FLAG_NONE;
	bitstr_t *gres_bit_alloc = NULL;
	foreach_gres_accumulate_device_t accum = {
		.gres_bit_alloc = &gres_bit_alloc,
		.is_job = true,
		.node_inx = node_inx,
	};

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	for (int ctx_inx = 0; ctx_inx < gres_context_cnt; ctx_inx++) {
		slurm_gres_context_t *gres_ctx = &gres_context[ctx_inx];

		/* No plugin to call */
		if (!gres_ctx->ops.job_set_env)
			continue;

		/* Gather this plugin's bitmap and count from the job's GRES */
		if (step->job_gres_list) {
			accum.plugin_id = gres_ctx->plugin_id;
			(void) list_for_each(step->job_gres_list,
					     _accumulate_gres_device,
					     &accum);
		}

		/*
		 * Do not let MPS or Shard (shared GRES) clear any envs set for
		 * a GPU (sharing GRES) when a GPU is allocated but an
		 * MPS/Shard is not. Sharing GRES plugins always run before
		 * shared GRES, so we don't need to protect MPS/Shard from GPU.
		 */
		if (gres_id_shared(gres_ctx->config_flags) &&
		    accum.sharing_gres_allocated)
			flags |= GRES_INTERNAL_FLAG_PROTECT_ENV;

		if (step->flags & LAUNCH_EXT_LAUNCHER) {
			/*
			 * We need the step environment variables, but still
			 * use all the job's gres.
			 */
			(*(gres_ctx->ops.step_set_env))(&step->env,
							gres_bit_alloc,
							accum.gres_cnt,
							flags);
		} else {
			(*(gres_ctx->ops.job_set_env))(&step->env,
						       gres_bit_alloc,
						       accum.gres_cnt,
						       flags);
		}

		/* Start the next plugin from a clean slate */
		accum.gres_cnt = 0;
		FREE_NULL_BITMAP(gres_bit_alloc);
	}
	slurm_mutex_unlock(&gres_context_lock);
}

static int _job_state_log(void *x, void *arg)
{
	gres_state_t *gres_state_job = x;
	uint32_t job_id = *(uint32_t *)arg;
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	char *sparse_msg = "", tmp_str[128];
	int i;

	xassert(gres_js);
	info("gres_job_state gres:%s(%u) type:%s(%u) job:%u flags:%s",
	     gres_state_job->gres_name, gres_state_job->plugin_id,
	     gres_js->type_name,
	     gres_js->type_id, job_id, gres_flags2str(gres_js->flags));
	if (gres_js->cpus_per_gres)
		info("  cpus_per_gres:%u", gres_js->cpus_per_gres);
	else if (gres_js->def_cpus_per_gres)
		info("  def_cpus_per_gres:%u", gres_js->def_cpus_per_gres);
	if (gres_js->gres_per_job)
		info("  gres_per_job:%"PRIu64, gres_js->gres_per_job);
	if (gres_js->gres_per_node) {
		info("  gres_per_node:%"PRIu64" node_cnt:%u",
		     gres_js->gres_per_node, gres_js->node_cnt);
	}
	if (gres_js->gres_per_socket)
		info("  gres_per_socket:%"PRIu64, gres_js->gres_per_socket);
	if (gres_js->gres_per_task)
		info("  gres_per_task:%"PRIu64, gres_js->gres_per_task);
	if (gres_js->mem_per_gres)
		info("  mem_per_gres:%"PRIu64, gres_js->mem_per_gres);
	else if (gres_js->def_mem_per_gres)
		info("  def_mem_per_gres:%"PRIu64, gres_js->def_mem_per_gres);
	if (gres_js->ntasks_per_gres)
		info("  ntasks_per_gres:%u", gres_js->ntasks_per_gres);

	/*
	 * These arrays are only used for resource selection and may include
	 * data for many nodes not used in the resources eventually allocated
	 * to this job.
	 */
	if (gres_js->total_node_cnt) {
		sparse_msg = " (sparsely populated for resource selection)";
		info("  total_node_cnt:%u%s", gres_js->total_node_cnt,
		     sparse_msg);
	}
	for (i = 0; i < gres_js->total_node_cnt; i++) {
		if (gres_js->gres_cnt_node_select &&
		    gres_js->gres_cnt_node_select[i]) {
			info("  gres_cnt_node_select[%d]:%"PRIu64,
			     i, gres_js->gres_cnt_node_select[i]);
		}
		if (gres_js->gres_bit_select &&
		    gres_js->gres_bit_select[i]) {
			bit_fmt(tmp_str, sizeof(tmp_str),
				gres_js->gres_bit_select[i]);
			info("  gres_bit_select[%d]:%s of %d", i, tmp_str,
			     (int) bit_size(gres_js->gres_bit_select[i]));
		}
		if (gres_js->gres_bit_select &&
		    gres_js->gres_bit_select[i] &&
		    gres_js->gres_per_bit_select &&
		    gres_js->gres_per_bit_select[i]) {
			for (int j = 0;
			     (j = bit_ffs_from_bit(gres_js->gres_bit_select[i],
						   j)) >= 0;
			     j++) {
				info("  gres_per_bit_select[%d][%d]:%"PRIu64,
				     i, j, gres_js->gres_per_bit_select[i][j]);
			}
		}
	}

	if (gres_js->total_gres)
		info("  total_gres:%"PRIu64, gres_js->total_gres);
	if (gres_js->node_cnt)
		info("  node_cnt:%u", gres_js->node_cnt);
	for (i = 0; i < gres_js->node_cnt; i++) {
		if (gres_js->gres_cnt_node_alloc &&
		    gres_js->gres_cnt_node_alloc[i]) {
			info("  gres_cnt_node_alloc[%d]:%"PRIu64,
			     i, gres_js->gres_cnt_node_alloc[i]);
		} else if (gres_js->gres_cnt_node_alloc)
			info("  gres_cnt_node_alloc[%d]:NULL", i);

		if (gres_js->gres_bit_alloc && gres_js->gres_bit_alloc[i]) {
			bit_fmt(tmp_str, sizeof(tmp_str),
				gres_js->gres_bit_alloc[i]);
			info("  gres_bit_alloc[%d]:%s of %d", i, tmp_str,
			     (int) bit_size(gres_js->gres_bit_alloc[i]));
		} else if (gres_js->gres_bit_alloc)
			info("  gres_bit_alloc[%d]:NULL", i);

		if (gres_js->gres_bit_alloc &&
		    gres_js->gres_bit_alloc[i] &&
		    gres_js->gres_per_bit_alloc &&
		    gres_js->gres_per_bit_alloc[i]) {
			for (int j = 0;
			     (j = bit_ffs_from_bit(gres_js->gres_bit_alloc[i],
						   j)) >= 0;
			     j++) {
				info("  gres_per_bit_alloc[%d][%d]:%"PRIu64,
				     i, j, gres_js->gres_per_bit_alloc[i][j]);
			}
		}

		if (gres_js->gres_bit_step_alloc &&
		    gres_js->gres_bit_step_alloc[i]) {
			bit_fmt(tmp_str, sizeof(tmp_str),
				gres_js->gres_bit_step_alloc[i]);
			info("  gres_bit_step_alloc[%d]:%s of %d", i, tmp_str,
			     (int) bit_size(gres_js->gres_bit_step_alloc[i]));
		} else if (gres_js->gres_bit_step_alloc)
			info("  gres_bit_step_alloc[%d]:NULL", i);

		if (gres_js->gres_bit_step_alloc &&
		    gres_js->gres_bit_step_alloc[i] &&
		    gres_js->gres_per_bit_step_alloc &&
		    gres_js->gres_per_bit_step_alloc[i]) {
			for (int j = 0;
			     (j = bit_ffs_from_bit(
				      gres_js->gres_bit_step_alloc[i], j)) >= 0;
			     j++) {
				info("  gres_per_bit_step_alloc[%d][%d]:%"PRIu64,
				     i, j,
				     gres_js->gres_per_bit_step_alloc[i][j]);
			}
		}

		if (gres_js->gres_cnt_step_alloc) {
			info("  gres_cnt_step_alloc[%d]:%"PRIu64"", i,
			     gres_js->gres_cnt_step_alloc[i]);
		}
	}

	return 0;
}

/*
 * list_for_each() callback: add the total_gres of one matching job or step
 * GRES record to the running count.
 * IN x - gres_state_t record from a job or step GRES list
 * IN/OUT arg - foreach_gres_list_cnt_t; gres_cnt moves from NO_VAL64 to a
 *		real sum once the first matching record is counted
 * RET always 0 so that iteration continues
 */
static int _foreach_gres_list_cnt(void *x, void *arg)
{
	gres_state_t *gres_state = x;
	foreach_gres_list_cnt_t *args = arg;
	char *type_name;
	uint64_t total_gres;

	if (gres_state->plugin_id != args->plugin_id)
		return 0;

	if (!args->is_job) {
		gres_step_state_t *gres_ss = gres_state->gres_data;

		total_gres = gres_ss->total_gres;
		type_name = gres_ss->type_name;
	} else {
		gres_job_state_t *gres_js = gres_state->gres_data;

		total_gres = gres_js->total_gres;
		type_name = gres_js->type_name;
	}

	/* If we are filtering on GRES type, ignore other types */
	if (args->filter_type && xstrcasecmp(args->gres_type, type_name))
		return 0;

	/* Records without a usable count contribute nothing */
	if (!total_gres || (total_gres == NO_VAL64))
		return 0;

	/* NO_VAL64 marks "nothing found yet"; start summing from zero */
	if (args->gres_cnt == NO_VAL64)
		args->gres_cnt = 0;
	args->gres_cnt += total_gres;

	return 0;
}

/*
 * Extract from the job/step gres_list the count of GRES of the specified name
 * and (optionally) type. If no type is specified, then the count will include
 * all GRES of that name, regardless of type.
 *
 * IN gres_list  - job/step record's gres_list.
 * IN gres_name - the name of the GRES to query.
 * IN gres_type - (optional) the type of the GRES to query; NULL or an empty
 *		  string disables type filtering.
 * IN is_job - True if the GRES list is for the job, false if for the step.
 * RET The number of GRES in the job/step gres_list or NO_VAL64 if not found.
 */
static uint64_t _get_gres_list_cnt(list_t *gres_list, char *gres_name,
				   char *gres_type, bool is_job)
{
	foreach_gres_list_cnt_t args = {
		.gres_cnt = NO_VAL64,
		.gres_type = gres_type,
		.is_job = is_job,
	};

	/* A missing or empty list cannot contain the GRES */
	if (!gres_list || !list_count(gres_list))
		return args.gres_cnt;

	args.plugin_id = gres_build_id(gres_name);
	if (gres_type && (gres_type[0] != '\0'))
		args.filter_type = true;

	(void) list_for_each(gres_list, _foreach_gres_list_cnt, &args);

	return args.gres_cnt;
}

/* Job flavor of _get_gres_list_cnt(): gres_list is a job's GRES list */
static uint64_t _get_job_gres_list_cnt(list_t *gres_list, char *gres_name,
				       char *gres_type)
{
	const bool is_job = true;

	return _get_gres_list_cnt(gres_list, gres_name, gres_type, is_job);
}

/* Step flavor of _get_gres_list_cnt(): gres_list is a step's GRES list */
static uint64_t _get_step_gres_list_cnt(list_t *gres_list, char *gres_name,
					char *gres_type)
{
	const bool is_job = false;

	return _get_gres_list_cnt(gres_list, gres_name, gres_type, is_job);
}

/*
 * Log a job's current gres state
 * IN gres_list - generated by gres_job_state_validate()
 * IN job_id - job's ID
 *
 * Nothing is logged unless the GRES debug flag is set.
 */
extern void gres_job_state_log(list_t *gres_list, uint32_t job_id)
{
	if (!gres_list)
		return;
	if (!(slurm_conf.debug_flags & DEBUG_FLAG_GRES))
		return;

	(void) list_for_each(gres_list, _job_state_log, &job_id);
}

/*
 * list_find_first() callback: match GRES devices by their path.
 * IN x - gres_device_t list entry
 * IN key - gres_device_t whose path is being searched for
 * RET 1 if both paths compare equal under xstrcmp(), otherwise 0
 */
static int _find_device(void *x, void *key)
{
	gres_device_t *candidate = x;
	gres_device_t *wanted = key;

	return xstrcmp(candidate->path, wanted->path) ? 0 : 1;
}

/*
 * list_for_each() callback: mark one device as not allocated and add it to
 * the list of unique devices.
 * IN/OUT x - gres_device_t record; its alloc field is cleared
 * IN/OUT arg - address of the unique device list, created on first use
 * RET always 0 so that iteration continues
 */
static int _foreach_init_device_list(void *x, void *arg)
{
	gres_device_t *device = x;
	list_t **unique_devices = arg;

	device->alloc = 0;

	if (*unique_devices == NULL)
		*unique_devices = list_create(NULL);

	/*
	 * Keep the list unique by not adding duplicates (in the
	 * case of MPS and GPU)
	 */
	if (list_find_first(*unique_devices, _find_device, device))
		return 0;

	list_append(*unique_devices, device);

	return 0;
}

/*
 * list_for_each() callback: flag a device as allocated when its index is set
 * in gres_bit_alloc and, if a usable_gres bitmap is given, in that one too.
 * IN x - gres_device_t list element
 * IN arg - foreach_alloc_gres_device_t with the bitmaps and the unique
 *	    device list
 * RET always 0
 */
static int _foreach_alloc_gres_device(void *x, void *arg)
{
	gres_device_t *dev = x;
	foreach_alloc_gres_device_t *args = arg;
	gres_device_t *listed_dev;

	if (!bit_test(args->gres_bit_alloc, dev->index))
		return 0;
	if (args->usable_gres && !bit_test(args->usable_gres, dev->index))
		return 0;

	/*
	 * Two plugins can have device records pointing at the same file
	 * (GPU and MPS for example), so look the path up in the unique
	 * device list and flag both this record and the listed one in case
	 * they are different records.
	 */
	listed_dev = list_find_first(args->device_list, _find_device, dev);
	dev->alloc = 1;
	if (listed_dev)
		listed_dev->alloc = 1;

	return 0;
}

/*
 * Build the list of all GRES device files known to the loaded plugins and
 * flag those usable by the given job or step.
 *
 * IN gres_list - job or step GRES list; if NULL every device in the returned
 *		  list is left unallocated (alloc == 0)
 * IN is_job - true if gres_list is a job list, false if it is a step list
 * IN accel_bind_type - if non-zero, handed to _parse_accel_bind_type()
 *			together with tres_bind_str
 * IN tres_bind_str - binding specification handed to _get_usable_gres()
 * IN local_proc_id - handed to _get_usable_gres()
 * IN step - handed to _get_usable_gres()
 * RET list of gres_device_t records with "alloc" set to 1 on the usable
 *     devices, or NULL if no plugin reported any device. The list is created
 *     without a destructor, so it does not own the device records.
 */
extern list_t *gres_g_get_devices(list_t *gres_list, bool is_job,
				  uint16_t accel_bind_type, char *tres_bind_str,
				  int local_proc_id, stepd_step_rec_t *step)
{
	int j;
	bitstr_t *gres_bit_alloc = NULL;
	uint64_t *gres_per_bit = NULL;
	list_t *gres_devices;
	list_t *device_list = NULL;
	bitstr_t *usable_gres = NULL;

	xassert(gres_context_cnt >= 0);

	/*
	 * Create a unique device list of all possible GRES device files.
	 * Initialize each device to deny.
	 */
	slurm_mutex_lock(&gres_context_lock);
	for (j = 0; j < gres_context_cnt; j++) {
		if (!gres_context[j].ops.get_devices){
			gres_devices = gres_context[j].np_gres_devices;
		} else {
			gres_devices = (*(gres_context[j].ops.get_devices))();
		}
		if (!gres_devices || !list_count(gres_devices))
			continue;

		(void) list_for_each(gres_devices, _foreach_init_device_list,
				     &device_list);
	}

	if (!gres_list) {
		slurm_mutex_unlock(&gres_context_lock);
		return device_list;
	}

	if (accel_bind_type)
		_parse_accel_bind_type(accel_bind_type, tres_bind_str);

	for (j = 0; j < gres_context_cnt; j++) {
		/* We need to get a gres_bit_alloc with all the gres types
		 * merged (accumulated) together */
		foreach_gres_accumulate_device_t arg = {
			.gres_bit_alloc = &gres_bit_alloc,
			.gres_per_bit = &gres_per_bit,
			.is_job = is_job,
			.plugin_id = gres_context[j].plugin_id,
		};
		foreach_alloc_gres_device_t foreach_alloc_gres_device = {
			.device_list = device_list,
		};

		/*
		 * NOTE(review): gres_per_bit is handed out here but is never
		 * released or reset in this function; confirm who owns it.
		 */
		(void) list_for_each(gres_list, _accumulate_gres_device, &arg);

		if (!gres_bit_alloc)
			continue;
		if (!gres_context[j].ops.get_devices){
			gres_devices = gres_context[j].np_gres_devices;
		} else {
			gres_devices = (*(gres_context[j].ops.get_devices))();
		}
		if (!gres_devices) {
			error("We should had got gres_devices, but for some reason none were set in the plugin.");
			/*
			 * Drop this plugin's bitmap so that it is neither
			 * leaked nor reused for the next plugin.
			 */
			FREE_NULL_BITMAP(gres_bit_alloc);
			continue;
		}

		if (_get_usable_gres(j, local_proc_id, tres_bind_str,
				     &usable_gres, gres_bit_alloc, true, step,
				     gres_per_bit, NULL) == SLURM_ERROR) {
			/* Same cleanup as the success path below */
			FREE_NULL_BITMAP(gres_bit_alloc);
			FREE_NULL_BITMAP(usable_gres);
			continue;
		}

		foreach_alloc_gres_device.gres_bit_alloc = gres_bit_alloc;
		foreach_alloc_gres_device.usable_gres = usable_gres;

		(void) list_for_each(gres_devices, _foreach_alloc_gres_device,
				     &foreach_alloc_gres_device);

		FREE_NULL_BITMAP(gres_bit_alloc);
		FREE_NULL_BITMAP(usable_gres);
	}
	slurm_mutex_unlock(&gres_context_lock);

	return device_list;
}

/*
 * Release a gres_step_state_t together with everything it owns: the
 * node_in_use bitmap, the per-node allocation bitmaps, the per-node per-bit
 * count arrays, the per-node count array and the type name.
 * IN gres_data - gres_step_state_t to free, may be NULL
 */
static void _step_state_delete(void *gres_data)
{
	gres_step_state_t *gres_ss = gres_data;

	if (!gres_ss)
		return;

	FREE_NULL_BITMAP(gres_ss->node_in_use);

	if (gres_ss->gres_bit_alloc) {
		for (int node = 0; node < gres_ss->node_cnt; node++)
			FREE_NULL_BITMAP(gres_ss->gres_bit_alloc[node]);
		xfree(gres_ss->gres_bit_alloc);
	}

	if (gres_ss->gres_per_bit_alloc) {
		for (int node = 0; node < gres_ss->node_cnt; node++)
			xfree(gres_ss->gres_per_bit_alloc[node]);
		xfree(gres_ss->gres_per_bit_alloc);
	}

	xfree(gres_ss->gres_cnt_node_alloc);
	xfree(gres_ss->type_name);
	xfree(gres_ss);
}

/*
 * List destructor for step GRES records: frees the step specific data and
 * then the members of the gres_state_t itself.
 * IN list_element - gres_state_t of a step
 */
extern void gres_step_list_delete(void *list_element)
{
	gres_state_t *gres_state_step = list_element;
	void *step_data = gres_state_step->gres_data;

	/* Detach the step data first so no stale pointer is left behind */
	gres_state_step->gres_data = NULL;
	_step_state_delete(step_data);
	_gres_state_delete_members(gres_state_step);
}

/*
 * TRES specification parse logic
 * in_val IN - initial input string
 * cnt OUT - count of values
 * gres_list IN/OUT - where to search for (or add) new step TRES record
 * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
 * rc OUT - unchanged or an error code
 * RET gres - step record to set value in, found or created by this function
 *
 * NOTE: The parse position is also remembered in a function-local static
 * (prev_save_ptr). A call whose *save_ptr does not match it is rejected with
 * a "parsing error", so two parses cannot be interleaved.
 */
static gres_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt,
					 list_t *gres_list, char **save_ptr,
					 int *rc)
{
	/* Position reached by the previous call, kept across calls */
	static char *prev_save_ptr = NULL;
	int context_inx = NO_VAL, my_rc = SLURM_SUCCESS;
	gres_step_state_t *gres_ss = NULL;
	gres_state_t *gres_state_step = NULL;
	gres_key_t step_search_key;
	/* "name" is never assigned in this function; only "type" is filled in */
	char *type = NULL, *name = NULL;

	xassert(save_ptr);
	/* Neither a new string nor a parse in progress: nothing to do */
	if (!in_val && (*save_ptr == NULL)) {
		return NULL;
	}

	if (*save_ptr == NULL) {
		/* First call for this string: start at its beginning */
		prev_save_ptr = in_val;
	} else if (*save_ptr != prev_save_ptr) {
		/* Caller's position does not match the one remembered here */
		error("%s: parsing error", __func__);
		my_rc = SLURM_ERROR;
		goto fini;
	}

	if (prev_save_ptr[0] == '\0') {	/* Empty input token */
		*save_ptr = NULL;
		return NULL;
	}

	/*
	 * Parse the next record. On a parse error, or when no GRES context
	 * was identified, stop the iteration (NULL is returned via fini).
	 */
	if ((my_rc = _get_next_gres(in_val, &type, &context_inx,
				    cnt, &prev_save_ptr)) ||
	    (context_inx == NO_VAL)) {
		prev_save_ptr = NULL;
		goto fini;
	}

	/* Find the step GRES record */
	step_search_key.config_flags = gres_context[context_inx].config_flags;
	step_search_key.plugin_id = gres_context[context_inx].plugin_id;
	step_search_key.type_id = gres_build_id(type);
	gres_state_step = list_find_first(gres_list, gres_find_step_by_key,
					  &step_search_key);

	if (gres_state_step) {
		gres_ss = gres_state_step->gres_data;
	} else {
		/* No matching record yet: create one and append it to the list */
		gres_ss = xmalloc(sizeof(gres_step_state_t));
		gres_ss->type_id = step_search_key.type_id;
		gres_ss->type_name = type;
		type = NULL;	/* String moved above */
		gres_state_step = gres_create_state(
			&gres_context[context_inx], GRES_STATE_SRC_CONTEXT_PTR,
			GRES_STATE_TYPE_STEP, gres_ss);
		list_append(gres_list, gres_state_step);
	}

fini:	xfree(name);
	xfree(type);
	if (my_rc != SLURM_SUCCESS) {
		/* Report the error to the caller and end the iteration */
		prev_save_ptr = NULL;
		if (my_rc == ESLURM_INVALID_GRES && running_in_slurmctld())
			info("Invalid GRES step specification %s", in_val);
		*rc = my_rc;
	}
	/* Hand the current position back; NULL tells the caller to stop */
	*save_ptr = prev_save_ptr;
	return gres_state_step;
}

/*
 * Reconcile an ntasks_per_tres request with the step's task count and its
 * GPU request.
 *
 * - No "gpu" GRES in the list but a task count is given: a type-less GPU
 *   request of (num_tasks / ntasks_per_tres) GPUs is generated and added to
 *   new_step_list. The task count must be a multiple of ntasks_per_tres.
 * - A "gpu" GRES is in the list: if the task count is lower than
 *   (GPU count * ntasks_per_tres) it is raised to that value, and a non-zero
 *   cpu_count is raised to match at the same CPUs per task.
 * - Otherwise (no GPU and no task count): an error is logged.
 *
 * IN/OUT new_step_list - step GRES list being built
 * IN ntasks_per_tres - requested tasks per GPU
 * IN/OUT num_tasks - step task count or NO_VAL
 * IN/OUT cpu_count - step CPU count, 0 is left untouched
 * RET SLURM_SUCCESS or an error code
 *
 * NOTE(review): ntasks_per_tres and *num_tasks are used as divisors without a
 * zero check; confirm that callers never pass 0.
 */
static int _handle_ntasks_per_tres_step(list_t *new_step_list,
					uint16_t ntasks_per_tres,
					uint32_t *num_tasks,
					uint32_t *cpu_count)
{
	gres_state_t *gres_state_step;
	gres_step_state_t *gres_ss;
	uint64_t cnt = 0;
	int rc = SLURM_SUCCESS;

	uint64_t tmp = _get_step_gres_list_cnt(new_step_list, "gpu", NULL);
	if ((tmp == NO_VAL64) && (*num_tasks != NO_VAL)) {
		/*
		 * Generate GPUs from ntasks_per_tres when not specified
		 * and ntasks is specified
		 */
		uint32_t gpus = *num_tasks / ntasks_per_tres;
		/* For now, do type-less GPUs */
		char *save_ptr = NULL, *gres = NULL, *in_val;

		/*
		 * Validate before building the request string so that
		 * nothing is leaked by the early return.
		 */
		if (*num_tasks != ntasks_per_tres * gpus) {
			log_flag(GRES, "%s: -n/--ntasks %u is not a multiple of --ntasks-per-gpu=%u",
				 __func__, *num_tasks, ntasks_per_tres);
			return ESLURM_INVALID_GRES;
		}
		xstrfmtcat(gres, "gres/gpu:%u", gpus);
		in_val = gres;
		while ((gres_state_step =
			_get_next_step_gres(in_val, &cnt,
					    new_step_list,
					    &save_ptr, &rc))) {
			gres_ss = gres_state_step->gres_data;
			/* Simulate a tres_per_job specification */
			gres_ss->gres_per_step = cnt;
			gres_ss->ntasks_per_gres = ntasks_per_tres;
			gres_ss->total_gres =
				MAX(gres_ss->total_gres, cnt);
			in_val = NULL;
		}
		xfree(gres);
		xassert(list_count(new_step_list) != 0);
	} else if (tmp != NO_VAL64) {
		/* Tasks needed to serve every requested GPU */
		tmp = tmp * ntasks_per_tres;
		if (*num_tasks < tmp) {
			uint32_t cpus_per_task = *cpu_count / *num_tasks;
			*num_tasks = tmp;
			tmp = tmp * cpus_per_task;
			if (*cpu_count && (*cpu_count < tmp)) {
				/* step_spec->cpu_count == 0 means SSF_OVERSUBSCRIBE */
				*cpu_count = tmp;
			}
		}
	} else {
		error("%s: ntasks_per_tres was specified, but there was either no task count or no GPU specification to go along with it, or both were already specified.",
		      __func__);
		rc = SLURM_ERROR;
	}

	return rc;
}

/*
 * Parse the GRES/TRES options of a step request and build the step's GRES
 * list from them.
 *
 * IN cpus_per_tres - CPUs per TRES; only one TRES may be named
 * IN tres_per_step - TRES count for the whole step
 * IN tres_per_node - TRES count per node
 * IN tres_per_socket - TRES count per socket
 * IN tres_per_task - TRES count per task
 * IN mem_per_tres - memory per TRES
 * IN ntasks_per_tres - tasks per TRES or NO_VAL16 if not set
 * IN step_min_nodes - node count used to scale tres_per_node to a total
 * OUT step_gres_list - set to NULL on entry; receives the new list on
 *			success and stays NULL if no GRES was requested
 * IN job_id - not referenced by this function
 * IN step_id - not referenced by this function
 * IN/OUT num_tasks - step task count or NO_VAL, may be raised to satisfy
 *		      ntasks_per_tres
 * IN/OUT cpu_count - step CPU count, may be recalculated from cpus_per_tres
 *		      or ntasks_per_tres; 0 is left untouched
 * OUT err_msg - if not NULL, receives an xstrdup()'d message for the
 *		 cpus_per_tres errors instead of them being logged
 * RET SLURM_SUCCESS or an error code
 */
extern int gres_step_state_validate(char *cpus_per_tres,
				    char *tres_per_step,
				    char *tres_per_node,
				    char *tres_per_socket,
				    char *tres_per_task,
				    char *mem_per_tres,
				    uint16_t ntasks_per_tres,
				    uint32_t step_min_nodes,
				    list_t **step_gres_list,
				    uint32_t job_id,
				    uint32_t step_id,
				    uint32_t *num_tasks,
				    uint32_t *cpu_count, char **err_msg)
{
	int rc = SLURM_SUCCESS;
	gres_step_state_t *gres_ss;
	gres_state_t *gres_state_step;
	list_t *new_step_list;
	uint64_t cnt = 0;
	uint16_t cpus_per_gres = 0;
	char *cpus_per_gres_name = NULL;
	char *cpus_per_gres_type = NULL;

	*step_gres_list = NULL;
	xassert(gres_context_cnt >= 0);
	xassert(num_tasks);
	xassert(cpu_count);

	slurm_mutex_lock(&gres_context_lock);
	new_step_list = list_create(gres_step_list_delete);
	if (cpus_per_tres) {
		char *in_val = cpus_per_tres, *save_ptr = NULL;
		while ((gres_state_step = _get_next_step_gres(
						in_val, &cnt,
						new_step_list,
						&save_ptr, &rc))) {
			gres_ss = gres_state_step->gres_data;
			gres_ss->cpus_per_gres = cnt;
			in_val = NULL;
			/* Only a single cpus_per_tres value is allowed. */
			if (cpus_per_gres) {
				if (err_msg)
					*err_msg = xstrdup("You may only request cpus_per_tres for one tres");
				else
					error("You may only request cpus_per_tres for one tres");
				rc = ESLURM_INVALID_GRES;
				FREE_NULL_LIST(new_step_list);
				goto fini;
			} else {
				/*
				 * Remember which GRES the value applies to.
				 * The strings stay owned by the list record.
				 */
				cpus_per_gres = cnt;
				cpus_per_gres_name = gres_state_step->gres_name;
				cpus_per_gres_type = gres_ss->type_name;
			}
		}
	}
	if (tres_per_step) {
		char *in_val = tres_per_step, *save_ptr = NULL;
		while ((gres_state_step = _get_next_step_gres(
						in_val, &cnt,
						new_step_list,
						&save_ptr, &rc))) {
			gres_ss = gres_state_step->gres_data;
			gres_ss->gres_per_step = cnt;
			in_val = NULL;
			gres_ss->total_gres =
				MAX(gres_ss->total_gres, cnt);
		}
	}
	if (tres_per_node) {
		char *in_val = tres_per_node, *save_ptr = NULL;
		while ((gres_state_step = _get_next_step_gres(
						in_val, &cnt,
						new_step_list,
						&save_ptr, &rc))) {
			gres_ss = gres_state_step->gres_data;
			gres_ss->gres_per_node = cnt;
			in_val = NULL;
			gres_ss->total_gres =
				MAX(gres_ss->total_gres, step_min_nodes * cnt);
		}
	}
	if (tres_per_socket) {
		char *in_val = tres_per_socket, *save_ptr = NULL;
		while ((gres_state_step = _get_next_step_gres(
						in_val, &cnt,
						new_step_list,
						&save_ptr, &rc))) {
			gres_ss = gres_state_step->gres_data;
			gres_ss->gres_per_socket = cnt;
			in_val = NULL;
			// TODO: What is sockets_per_node and ntasks_per_socket?
			// if (*sockets_per_node != NO_VAL16) {
			//	cnt *= *sockets_per_node;
			// } else if ((*num_tasks != NO_VAL) &&
			//	   (*ntasks_per_socket != NO_VAL16)) {
			//	cnt *= ROUNDUP(*num_tasks, *ntasks_per_socket);
			// }
			// gres_ss->total_gres =
			//	MAX(gres_ss->total_gres, cnt);
		}
	}
	if (tres_per_task) {
		char *in_val = tres_per_task, *save_ptr = NULL;
		while ((gres_state_step = _get_next_step_gres(
						in_val, &cnt,
						new_step_list,
						&save_ptr, &rc))) {
			gres_ss = gres_state_step->gres_data;
			gres_ss->gres_per_task = cnt;
			in_val = NULL;
			if (*num_tasks != NO_VAL)
				cnt *= *num_tasks;
			gres_ss->total_gres =
				MAX(gres_ss->total_gres, cnt);
		}
	}
	if (mem_per_tres) {
		char *in_val = mem_per_tres, *save_ptr = NULL;
		while ((gres_state_step = _get_next_step_gres(
						in_val, &cnt,
						new_step_list,
						&save_ptr, &rc))) {
			gres_ss = gres_state_step->gres_data;
			gres_ss->mem_per_gres = cnt;
			in_val = NULL;
		}
	}

	/*
	 * Only continue if parsing succeeded, otherwise the return code of
	 * the helper would overwrite (and possibly hide) the parse error.
	 */
	if ((rc == SLURM_SUCCESS) && (ntasks_per_tres != NO_VAL16)) {
		rc = _handle_ntasks_per_tres_step(new_step_list,
						  ntasks_per_tres,
						  num_tasks,
						  cpu_count);
	}

	if ((rc == SLURM_SUCCESS) && cpus_per_gres && *cpu_count &&
	    running_in_slurmctld()) {
		/*
		 * Update cpu_count = the total requested gres * cpus_per_gres
		 *
		 * If SSF_OVERCOMMIT (step_spec->cpu_count == 0), don't update.
		 * Only update if in slurmctld because the step can inherit
		 * gres from the job_gres_list_req, which only exists in
		 * slurmctld.
		 */
		uint64_t gpu_cnt = _get_step_gres_list_cnt(new_step_list,
							   cpus_per_gres_name,
							   cpus_per_gres_type);

		if (gpu_cnt == NO_VAL64) {
			if (err_msg)
				*err_msg = xstrdup("cpus_per_gres also requires specifying the same gres");
			else
				error("cpus_per_gres also requires specifying the same gres");
			/* The list is released below since rc is an error */
			rc = ESLURM_INVALID_GRES;
		} else
			*cpu_count = gpu_cnt * cpus_per_gres;
	}

	if (list_count(new_step_list) == 0) {
		/* Nothing was requested: report "no list" rather than an empty one */
		FREE_NULL_LIST(new_step_list);
	} else {
		if (rc == SLURM_SUCCESS) {
			job_validate_t job_validate = {
				.over_array = xcalloc(list_count(new_step_list),
						      sizeof(overlap_check_t)),
			};

			(void) list_for_each(new_step_list,
					     _foreach_set_over_array,
					     &job_validate);

			if (job_validate.overlap_merge)
				rc = _merge_generic_data(new_step_list,
							 &job_validate);
			xfree(job_validate.over_array);
		}
		if (rc == SLURM_SUCCESS)
			*step_gres_list = new_step_list;
		else
			FREE_NULL_LIST(new_step_list);
	}
fini:
	slurm_mutex_unlock(&gres_context_lock);
	return rc;
}

/*
 * Create a copy of a step's gres state covering all of its nodes
 * IN gres_ss - step gres state to copy, must not be NULL
 * RET newly allocated gres_step_state_t
 *
 * NOTE(review): flags, type_id, type_name and ntasks_per_gres are not
 * copied here; confirm that this is intended.
 */
static void *_step_state_dup(gres_step_state_t *gres_ss)
{

	int i;
	gres_step_state_t *new_gres_ss;

	xassert(gres_ss);
	new_gres_ss = xmalloc(sizeof(gres_step_state_t));
	new_gres_ss->cpus_per_gres	= gres_ss->cpus_per_gres;
	new_gres_ss->gres_per_step	= gres_ss->gres_per_step;
	new_gres_ss->gres_per_node	= gres_ss->gres_per_node;
	new_gres_ss->gres_per_socket	= gres_ss->gres_per_socket;
	new_gres_ss->gres_per_task	= gres_ss->gres_per_task;
	new_gres_ss->mem_per_gres	= gres_ss->mem_per_gres;
	new_gres_ss->node_cnt		= gres_ss->node_cnt;
	new_gres_ss->total_gres	= gres_ss->total_gres;

	if (gres_ss->node_in_use)
		new_gres_ss->node_in_use = bit_copy(gres_ss->node_in_use);

	if (gres_ss->gres_cnt_node_alloc) {
		i = sizeof(uint64_t) * gres_ss->node_cnt;
		new_gres_ss->gres_cnt_node_alloc = xmalloc(i);
		memcpy(new_gres_ss->gres_cnt_node_alloc,
		       gres_ss->gres_cnt_node_alloc, i);
	}
	if (gres_ss->gres_bit_alloc) {
		new_gres_ss->gres_bit_alloc = xcalloc(gres_ss->node_cnt,
						      sizeof(bitstr_t *));
		for (i = 0; i < gres_ss->node_cnt; i++) {
			if (gres_ss->gres_bit_alloc[i] == NULL)
				continue;
			new_gres_ss->gres_bit_alloc[i] =
				bit_copy(gres_ss->gres_bit_alloc[i]);
		}
	}
	/*
	 * Test the source record here. The new record was only just
	 * allocated, so its gres_per_bit_alloc is still unset and testing it
	 * would skip this copy every time.
	 */
	if (gres_ss->gres_per_bit_alloc && gres_ss->gres_bit_alloc) {
		new_gres_ss->gres_per_bit_alloc = xcalloc(gres_ss->node_cnt,
							  sizeof(uint64_t *));
		for (i = 0; i < gres_ss->node_cnt; i++) {
			int bit_cnt;

			/* Nodes without a bitmap or counts have nothing to copy */
			if (!gres_ss->gres_bit_alloc[i] ||
			    !gres_ss->gres_per_bit_alloc[i])
				continue;
			/* One counter per bit of the node's allocation bitmap */
			bit_cnt = bit_size(gres_ss->gres_bit_alloc[i]);
			new_gres_ss->gres_per_bit_alloc[i] = xcalloc(
				bit_cnt, sizeof(uint64_t));
			memcpy(new_gres_ss->gres_per_bit_alloc[i],
			       gres_ss->gres_per_bit_alloc[i],
			       bit_cnt * sizeof(uint64_t));
		}
	}
	return new_gres_ss;
}

/*
 * Create a copy of a step's gres state reduced to a single node
 * IN gres_ss - step gres state to copy, must not be NULL
 * IN job_node_index - index of the node whose per-node data is copied into
 *		       element 0 of the new record's arrays
 * RET newly allocated gres_step_state_t with node_cnt == 1
 *
 * NOTE(review): node_in_use is copied unchanged, and flags, type_id,
 * type_name and ntasks_per_gres are not copied; confirm that this is
 * intended.
 */
static void *_step_state_dup2(gres_step_state_t *gres_ss, int job_node_index)
{
	gres_step_state_t *new_gres_ss;
	bool index_valid;

	xassert(gres_ss);
	new_gres_ss = xmalloc(sizeof(gres_step_state_t));
	new_gres_ss->cpus_per_gres	= gres_ss->cpus_per_gres;
	new_gres_ss->gres_per_step	= gres_ss->gres_per_step;
	new_gres_ss->gres_per_node	= gres_ss->gres_per_node;
	new_gres_ss->gres_per_socket	= gres_ss->gres_per_socket;
	new_gres_ss->gres_per_task	= gres_ss->gres_per_task;
	new_gres_ss->mem_per_gres	= gres_ss->mem_per_gres;
	new_gres_ss->node_cnt		= 1;
	new_gres_ss->total_gres	= gres_ss->total_gres;

	/* Every per-node array below is only read at a valid index */
	index_valid = (job_node_index >= 0) &&
		      (job_node_index < gres_ss->node_cnt);

	if (gres_ss->node_in_use)
		new_gres_ss->node_in_use = bit_copy(gres_ss->node_in_use);

	if (gres_ss->gres_cnt_node_alloc) {
		/*
		 * Always create the one-element array when the source has
		 * one; for an out of range index the count stays at the 0
		 * it was allocated with instead of being read out of bounds.
		 */
		new_gres_ss->gres_cnt_node_alloc = xmalloc(sizeof(uint64_t));
		if (index_valid)
			new_gres_ss->gres_cnt_node_alloc[0] =
				gres_ss->gres_cnt_node_alloc[job_node_index];
	}

	if (index_valid && gres_ss->gres_bit_alloc &&
	    gres_ss->gres_bit_alloc[job_node_index]) {
		new_gres_ss->gres_bit_alloc = xmalloc(sizeof(bitstr_t *));
		new_gres_ss->gres_bit_alloc[0] =
			bit_copy(gres_ss->gres_bit_alloc[job_node_index]);
	}
	if (index_valid && gres_ss->gres_per_bit_alloc &&
	    gres_ss->gres_per_bit_alloc[job_node_index] &&
	    gres_ss->gres_bit_alloc &&
	    gres_ss->gres_bit_alloc[job_node_index]) {
		/* One counter per bit of the node's allocation bitmap */
		int bit_cnt = bit_size(gres_ss->gres_bit_alloc[job_node_index]);
		new_gres_ss->gres_per_bit_alloc = xmalloc(sizeof(uint64_t *));
		new_gres_ss->gres_per_bit_alloc[0] = xcalloc(bit_cnt,
							     sizeof(uint64_t));
		memcpy(new_gres_ss->gres_per_bit_alloc[0],
		       gres_ss->gres_per_bit_alloc[job_node_index],
		       bit_cnt * sizeof(uint64_t));
	}
	return new_gres_ss;
}

/*
 * Duplicate a step's gres state for all of its nodes
 * IN gres_list - List of Gres records for this step to track usage
 * RET The copy, or NULL if gres_list is NULL or holds no records
 */
list_t *gres_step_state_list_dup(list_t *gres_list)
{
	/* A node index of -1 selects the all-nodes copy */
	const int all_nodes = -1;

	return gres_step_state_extract(gres_list, all_nodes);
}

static int _foreach_step_state_extract(void *x, void *arg)
{
	gres_state_t *gres_state_step = x;
	foreach_state_list_dup_t *foreach_state_list_dup = arg;
	gres_state_t *new_gres_state_step;
	void *new_gres_data;

	if (foreach_state_list_dup->job_node_index == -1)
		new_gres_data = _step_state_dup(gres_state_step->gres_data);
	else
		new_gres_data = _step_state_dup2(
			gres_state_step->gres_data,
			foreach_state_list_dup->job_node_index);

	if (!foreach_state_list_dup->new_gres_list)
		foreach_state_list_dup->new_gres_list =
			list_create(gres_step_list_delete);

	new_gres_state_step = gres_create_state(
		gres_state_step, GRES_STATE_SRC_STATE_PTR,
		GRES_STATE_TYPE_STEP, new_gres_data);
	list_append(foreach_state_list_dup->new_gres_list, new_gres_state_step);

	return 0;
}

/*
 * Duplicate a step's gres state, optionally reduced to one node
 * IN gres_list - List of Gres records for this step to track usage
 * IN job_node_index - zero-origin index to the node, or -1 to copy the
 *		       state of all nodes
 * RET The copy, or NULL if gres_list is NULL or holds no records
 */
list_t *gres_step_state_extract(list_t *gres_list, int job_node_index)
{
	foreach_state_list_dup_t dup_args = {
		.job_node_index = job_node_index,
	};

	if (!gres_list)
		return dup_args.new_gres_list;

	(void) list_for_each(gres_list, _foreach_step_state_extract,
			     &dup_args);

	return dup_args.new_gres_list;
}

/*
 * Pack a step's current gres status, called from slurmctld for save/restore
 * IN gres_list - generated by gres_stepmgr_step_alloc()
 * IN/OUT buffer - location to write state to
 * IN step_id - job and step ID (not referenced by this function)
 * IN protocol_version - version of the format to pack
 * RET return code of _pack_state()
 */
extern int gres_step_state_pack(list_t *gres_list, buf_t *buffer,
				slurm_step_id_t *step_id,
				uint16_t protocol_version)
{
	pack_state_t step_pack_args = {
		.protocol_version = protocol_version,
		.magic = GRES_MAGIC,
		.buffer = buffer,
	};
	int rc;

	rc = _pack_state(gres_list, &step_pack_args, _foreach_step_state_pack);

	return rc;
}

/*
 * Unpack a step's current gres status, called from slurmctld for save/restore
 * OUT gres_list - restored state stored by gres_step_state_pack(); the list
 *		   is created here if *gres_list is NULL and at least one
 *		   GRES context is loaded
 * IN/OUT buffer - location to read state from
 * IN step_id - job and step ID for logging
 * IN protocol_version - version of the packed format
 * RET SLURM_SUCCESS or SLURM_ERROR on an unpack error
 */
extern int gres_step_state_unpack(list_t **gres_list, buf_t *buffer,
				  slurm_step_id_t *step_id,
				  uint16_t protocol_version)
{
	int i, rc = SLURM_SUCCESS;
	uint32_t magic = 0, plugin_id = 0, uint32_tmp = 0;
	uint16_t rec_cnt = 0;
	uint8_t data_flag = 0;
	gres_state_t *gres_state_step;
	gres_step_state_t *gres_ss = NULL;
	/* Tells the unpack_error path whether gres_context_lock is held */
	bool locked = false;

	/*
	 * NOTE(review): the safe_* macros presumably jump to unpack_error on
	 * failure, which is why the label and the "locked" flag exist;
	 * confirm against their definition.
	 */
	safe_unpack16(&rec_cnt, buffer);
	if (rec_cnt == 0)
		return SLURM_SUCCESS;

	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	locked = true;
	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
		*gres_list = list_create(gres_step_list_delete);
	}

	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
		slurm_gres_context_t *gres_ctx;
		/* Stop quietly if the buffer ends before rec_cnt records */
		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
			break;
		rec_cnt--;
		if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;
			safe_unpack32(&plugin_id, buffer);
			/*
			 * From here on gres_ss owns the partially unpacked
			 * record; unpack_error frees it.
			 */
			gres_ss = xmalloc(sizeof(gres_step_state_t));
			safe_unpack16(&gres_ss->cpus_per_gres, buffer);
			safe_unpack16(&gres_ss->flags, buffer);
			safe_unpack64(&gres_ss->gres_per_step, buffer);
			safe_unpack64(&gres_ss->gres_per_node, buffer);
			safe_unpack64(&gres_ss->gres_per_socket, buffer);
			safe_unpack64(&gres_ss->gres_per_task, buffer);
			safe_unpack64(&gres_ss->mem_per_gres, buffer);
			safe_unpack64(&gres_ss->total_gres, buffer);
			safe_unpackstr(&gres_ss->type_name, buffer);
			/* type_id is not packed, it is rebuilt from the name */
			gres_ss->type_id = gres_build_id(gres_ss->type_name);
			safe_unpack32(&gres_ss->node_cnt, buffer);
			if (gres_ss->node_cnt > NO_VAL)
				goto unpack_error;
			unpack_bit_str_hex(&gres_ss->node_in_use, buffer);
			/* Each optional array is preceded by a presence flag */
			safe_unpack8(&data_flag, buffer);
			if (data_flag) {
				safe_unpack64_array(
					&gres_ss->gres_cnt_node_alloc,
					&uint32_tmp, buffer);
			}
			safe_unpack8(&data_flag, buffer);
			if (data_flag) {
				gres_ss->gres_bit_alloc =
					xcalloc(gres_ss->node_cnt,
						sizeof(bitstr_t *));
				for (i = 0; i < gres_ss->node_cnt; i++) {
					unpack_bit_str_hex(&gres_ss->
							   gres_bit_alloc[i],
							   buffer);
				}
			}
			/*
			 * gres_per_bit_alloc has one presence flag per node;
			 * the outer array is only allocated once some node
			 * has data.
			 */
			for (i = 0; i < gres_ss->node_cnt; i++) {
				safe_unpack8(&data_flag, buffer);
				if (!data_flag)
					continue;
				if (!gres_ss->gres_per_bit_alloc)
					safe_xcalloc(
						gres_ss->gres_per_bit_alloc,
						gres_ss->node_cnt,
						sizeof(uint64_t *));
				safe_unpack64_array(
					&gres_ss->gres_per_bit_alloc[i],
					&uint32_tmp, buffer);
			}
		} else {
			error("%s: protocol_version %hu not supported",
			      __func__, protocol_version);
			goto unpack_error;
		}

		if (!(gres_ctx = _find_context_by_id(plugin_id))) {
			/*
			 * A likely sign that GresPlugins has changed.
			 * Not a fatal error, skip over the data.
			 */
			info("%s: no plugin configured to unpack data type %u from %ps",
			     __func__, plugin_id, step_id);
			_step_state_delete(gres_ss);
			gres_ss = NULL;
			continue;
		}
		gres_state_step = gres_create_state(
			gres_ctx, GRES_STATE_SRC_CONTEXT_PTR,
			GRES_STATE_TYPE_STEP, gres_ss);
		/* The record now belongs to gres_state_step / the list */
		gres_ss = NULL;
		list_append(*gres_list, gres_state_step);
	}
	slurm_mutex_unlock(&gres_context_lock);
	return rc;

unpack_error:
	error("%s: unpack error from %ps", __func__, step_id);
	/* Free a record that was being unpacked but not yet added to the list */
	if (gres_ss)
		_step_state_delete(gres_ss);
	if (locked)
		slurm_mutex_unlock(&gres_context_lock);
	return SLURM_ERROR;
}

/*
 * list_for_each() callback: add the GRES count of a step record to the
 * running total if the record belongs to the wanted plugin.
 * IN x - gres_state_t of a step
 * IN/OUT arg - foreach_gres_list_cnt_t with the plugin_id to match and the
 *		running gres_cnt (NO_VAL64 until the first match)
 * RET always 0
 */
static int _foreach_step_count(void *x, void *arg)
{
	gres_state_t *gres_state_step = x;
	foreach_gres_list_cnt_t *foreach_gres_list_cnt = arg;
	gres_step_state_t *gres_ss = gres_state_step->gres_data;

	if (gres_state_step->plugin_id != foreach_gres_list_cnt->plugin_id)
		return 0;

	/*
	 * gres_cnt_node_alloc is optional in a step record (it is only
	 * unpacked or copied when present), so a record without it has
	 * nothing to add to the count.
	 */
	if (!gres_ss || !gres_ss->gres_cnt_node_alloc)
		return 0;

	/* gres_cnt_node_alloc has one element in slurmstepd */
	if (foreach_gres_list_cnt->gres_cnt == NO_VAL64)
		foreach_gres_list_cnt->gres_cnt =
			gres_ss->gres_cnt_node_alloc[0];
	else
		foreach_gres_list_cnt->gres_cnt +=
			gres_ss->gres_cnt_node_alloc[0];
	return 0;
}

/*
 * Return the count of GRES of a specific name on this machine
 * IN step_gres_list - generated by gres_stepmgr_step_alloc()
 * IN gres_name - name of the GRES to match
 * RET count of GRES of this specific name available to the job or NO_VAL64
 */
extern uint64_t gres_step_count(list_t *step_gres_list, char *gres_name)
{
	foreach_gres_list_cnt_t cnt_args = {
		.gres_cnt = NO_VAL64,
	};
	int i;

	if (!step_gres_list)
		return cnt_args.gres_cnt;

	slurm_mutex_lock(&gres_context_lock);
	for (i = 0; i < gres_context_cnt; i++) {
		if (xstrcmp(gres_context[i].gres_name, gres_name))
			continue;

		/* Only the first context carrying this name is counted */
		cnt_args.plugin_id = gres_context[i].plugin_id;
		(void) list_for_each(step_gres_list, _foreach_step_count,
				     &cnt_args);
		break;
	}
	slurm_mutex_unlock(&gres_context_lock);

	return cnt_args.gres_cnt;
}

/*
 * Convert usable_gres from a mask that only counts the gres in the
 * allocation into a mask over all gres on the node: the Nth set bit of
 * gres_bit_alloc receives the value of bit N of the old mask, every other
 * bit is '0'.
 *
 * IN/OUT - usable_gres (the old bitmap is freed and replaced)
 * IN - gres_bit_alloc
 */
static void _translate_step_to_global_device_index(bitstr_t **usable_gres,
						   bitstr_t *gres_bit_alloc)
{
	bitstr_t *node_mask = bit_alloc(bit_size(gres_bit_alloc));
	int last_alloc = bit_fls(gres_bit_alloc);
	int alloc_inx = 0;

	for (int node_inx = 0; node_inx <= last_alloc; node_inx++) {
		if (!bit_test(gres_bit_alloc, node_inx))
			continue;
		if (bit_test(*usable_gres, alloc_inx))
			bit_set(node_mask, node_inx);
		alloc_inx++;
	}

	FREE_NULL_BITMAP(*usable_gres);
	*usable_gres = node_mask;
}

/*
 * Convert a cpu_set_t into a newly allocated bitmap of cpu_count bits
 * IN cpu_set - CPU set to convert; if NULL every bit of the result is set
 * IN cpu_count - number of CPUs to examine and size of the bitmap
 * RET the new bitmap
 */
bitstr_t *cpu_set_to_bit_str(cpu_set_t *cpu_set, int cpu_count)
{
	bitstr_t *cpu_bitstr = bit_alloc(cpu_count);

	if (!cpu_set) {
		bit_set_all(cpu_bitstr);
		return cpu_bitstr;
	}

	for (int cpu = 0; cpu < cpu_count; cpu++) {
		if (CPU_ISSET(cpu, cpu_set))
			bit_set(cpu_bitstr, cpu);
	}

	return cpu_bitstr;
}

/*
 * list_for_each() callback over gres_conf_list: mark the gres of one
 * configuration record as usable if the record has no CPU bitmap or its CPUs
 * overlap the task's CPUs.
 * IN x - gres_slurmd_conf_t list element
 * IN/OUT arg - foreach_closest_usable_gres_t; gres_inx is advanced by the
 *		record's count for every record of the matching plugin that
 *		fits into the bitmap
 * RET always 0
 */
static int _foreach_closest_usable_gres(void *x, void *arg)
{
	gres_slurmd_conf_t *conf = x;
	foreach_closest_usable_gres_t *args = arg;
	bool usable;

	if (conf->plugin_id != args->plugin_id)
		return 0;

	/* The record would reach past the end of the bitmap: skip it */
	if ((args->gres_inx + conf->count) > args->bitmap_size) {
		error("GRES %s bitmap overflow ((%d + %"PRIu64") > %d)",
		      conf->name, args->gres_inx, conf->count,
		      args->bitmap_size);
		return 0;
	}

	usable = !conf->cpus_bitmap ||
		 bit_overlap_any(conf->cpus_bitmap, args->task_cpus_bitmap);
	if (usable)
		bit_nset(args->usable_gres, args->gres_inx,
			 args->gres_inx + conf->count - 1);

	args->gres_inx += conf->count;

	return 0;
}

/*
 * Given a GRES plugin ID, return a bitmap representing those GRES
 * which are available from the CPUs current allocated to this process.
 * This function only works with task/cgroup and constrained devices or
 * if the job step has access to the entire node's resources.
 *
 * IN plugin_id - plugin whose gres_conf_list records are considered
 * IN gres_bit_alloc - gres allocated to the job/step; the result is
 *		       restricted to these bits
 * IN task_cpu_set - CPUs of the task, NULL is treated as all CPUs
 * RET newly allocated bitmap, or NULL if gres_conf_list is NULL or empty
 */
static bitstr_t *_get_closest_usable_gres(uint32_t plugin_id,
					  bitstr_t *gres_bit_alloc,
					  cpu_set_t *task_cpu_set)
{
	gres_slurmd_conf_t *first_conf;
	foreach_closest_usable_gres_t foreach_closest_usable_gres = {
		.gres_inx = 0,
		.plugin_id = plugin_id,
	};

	if (!gres_conf_list) {
		error("gres_conf_list is null!");
		return NULL;
	}

	/*
	 * The CPU count is read from the first configuration record, so an
	 * empty list has to be rejected before it is dereferenced.
	 */
	if (!(first_conf = list_peek(gres_conf_list))) {
		error("gres_conf_list is empty!");
		return NULL;
	}

	foreach_closest_usable_gres.task_cpus_bitmap = cpu_set_to_bit_str(
		task_cpu_set, first_conf->cpu_cnt);
	foreach_closest_usable_gres.bitmap_size = bit_size(gres_bit_alloc);
	foreach_closest_usable_gres.usable_gres =
		bit_alloc(foreach_closest_usable_gres.bitmap_size);

	(void) list_for_each(gres_conf_list, _foreach_closest_usable_gres,
			     &foreach_closest_usable_gres);

	FREE_NULL_BITMAP(foreach_closest_usable_gres.task_cpus_bitmap);

	/* Only gres that are part of the allocation can be usable */
	bit_and(foreach_closest_usable_gres.usable_gres, gres_bit_alloc);

	return foreach_closest_usable_gres.usable_gres;
}

/*
 * list_for_each() callback over gres_conf_list: look for a free slot for the
 * task among the slots of one configuration record.
 * Preference order: a record whose CPUs are a superset per bit_super_set()
 * (ends the search), then the first record whose CPUs overlap the task's
 * CPUs, then the first record with a free slot.
 * IN x - gres_slurmd_conf_t list element
 * IN/OUT arg - foreach_gres_to_task_t; gres_inx is advanced by the record's
 *		count for every record of the matching plugin
 * RET -1 to stop the iteration once the preferred slot was found, else 0
 */
static int _foreach_gres_to_task(void *x, void *arg)
{
	gres_slurmd_conf_t *conf = x;
	foreach_gres_to_task_t *args = arg;
	int first_slot, end_slot;

	if (conf->plugin_id != args->plugin_id)
		return 0;

	/* Slot range of this record; gres_inx moves on to the next record */
	first_slot = args->gres_inx * args->ntasks_per_gres;
	args->gres_inx += conf->count;
	end_slot = args->gres_inx * args->ntasks_per_gres;

	/* No free slot on any gres of this record */
	if (!bit_set_count_range(args->gres_slots, first_slot, end_slot))
		return 0;

	if (!conf->cpus_bitmap) {
		/* No CPU information: only used if nothing was found so far */
		if (args->best_slot == -1)
			args->best_slot = bit_ffs_from_bit(args->gres_slots,
							   first_slot);
		return 0;
	}

	if (bit_super_set(args->task_cpus_bitmap, conf->cpus_bitmap)) {
		args->best_slot = bit_ffs_from_bit(args->gres_slots,
						   first_slot);
		return -1;
	}

	/* A slot on an overlapping record was already picked: keep it */
	if (args->overlap)
		return 0;

	if (bit_overlap_any(args->task_cpus_bitmap, conf->cpus_bitmap)) {
		args->best_slot = bit_ffs_from_bit(args->gres_slots,
						   first_slot);
		args->overlap = true;
		return 0;
	}

	if (args->best_slot == -1)
		args->best_slot = bit_ffs_from_bit(args->gres_slots,
						   first_slot);

	return 0;
}

/*
 * Select the best available gres from gres_slots
 * IN task_cpu_set - CPUs of the task
 * IN ntasks_per_gres - number of task slots per gres
 * IN/OUT gres_slots - free task slots, the selected slot is cleared
 * IN plugin_id - plugin whose gres_conf_list records are considered
 * RET index of the gres owning the selected slot or -1 if no slot is free
 */
static int _assign_gres_to_task(cpu_set_t *task_cpu_set, int ntasks_per_gres,
				bitstr_t *gres_slots, uint32_t plugin_id)
{
	foreach_gres_to_task_t slot_args = {
		.best_slot = -1,
		.gres_inx = 0,
		.gres_slots = gres_slots,
		.ntasks_per_gres = ntasks_per_gres,
		.overlap = false,
		.plugin_id = plugin_id,
		.task_cpus_bitmap = cpu_set_to_bit_str(
			task_cpu_set,
			((gres_slurmd_conf_t *)list_peek(gres_conf_list))->
			cpu_cnt),
	};

	(void) list_for_each(gres_conf_list, _foreach_gres_to_task,
			     &slot_args);
	FREE_NULL_BITMAP(slot_args.task_cpus_bitmap);

	if (slot_args.best_slot == -1) {
		log_flag(GRES, "%s Can't find free slot", __func__);
		return -1;
	}

	/* Take the slot and translate it back into a gres index */
	bit_clear(slot_args.gres_slots, slot_args.best_slot);
	return (slot_args.best_slot / slot_args.ntasks_per_gres);
}

/*
 * Given the cpu affinity of all tasks, return a bitmap binding a single gres to
 * this task.
 * IN context_inx - index into gres_context of the plugin to use
 * IN ntasks_per_gres - number of task slots per gres
 * IN local_proc_id - index of this task in step->task
 * IN step - step record, its tasks' cpu_set values are read
 * IN gres_bit_alloc - gres allocated to the step
 * RET newly allocated bitmap with the selected gres set, or a copy of
 *     gres_bit_alloc if it holds at most one gres
 */
static bitstr_t *_get_single_usable_gres(int context_inx,
					 int ntasks_per_gres,
					 int local_proc_id,
					 stepd_step_rec_t *step,
					 bitstr_t *gres_bit_alloc)
{
	int32_t alloc_cnt = bit_set_count(gres_bit_alloc);
	bitstr_t *slots = NULL;
	bitstr_t *selected = NULL;
	int gres_inx = 0;

	/* No need to select gres if there is only 1 to use */
	if (alloc_cnt <= 1) {
		log_flag(GRES, "%s: (task %d) No need to select single gres since count is 0 or 1",
			 __func__, local_proc_id);
		return bit_copy(gres_bit_alloc);
	}

	/*
	 * "slots" holds ntasks_per_gres task slots for every gres on the
	 * node; only the slots of gres allocated to the step are set.
	 */
	if (ntasks_per_gres == 1) {
		slots = bit_copy(gres_bit_alloc);
	} else {
		int bit;

		slots = bit_alloc(bit_size(gres_bit_alloc) * ntasks_per_gres);
		bit = bit_ffs_from_bit(gres_bit_alloc, 0);
		while (bit >= 0) {
			bit_nset(slots, bit * ntasks_per_gres,
				 (((bit + 1) * ntasks_per_gres) - 1));
			bit = bit_ffs_from_bit(gres_bit_alloc, bit + 1);
		}
	}

	/*
	 * Replay the slot selection for every task up to and including this
	 * one, each selection taking its slot out of "slots". That way no
	 * gres receives more than ntasks_per_gres tasks. The result of the
	 * last round is the assignment of this task.
	 */
	for (int task = 0; task <= local_proc_id; task++) {
		gres_inx = _assign_gres_to_task(
			step->task[task]->cpu_set, ntasks_per_gres, slots,
			gres_context[context_inx].plugin_id);
	}
	FREE_NULL_BITMAP(slots);

	/* Return a bitmap with this as the only usable GRES */
	selected = bit_alloc(bit_size(gres_bit_alloc));
	if (gres_inx < 0) {
		error("%s Can't find free slot for local_proc_id = %d, continue using block distribution",
		     __func__, local_proc_id);
		gres_inx = bit_get_bit_num(gres_bit_alloc,
					   local_proc_id % alloc_cnt);
	}
	bit_set(selected, gres_inx);

	if (slurm_conf.debug_flags & DEBUG_FLAG_GRES) {
		char *selected_str = bit_fmt_hexmask_trim(selected);

		log_flag(GRES, "%s: local_proc_id = %d; usable_gres: %s",
			 __func__, local_proc_id, selected_str);
		xfree(selected_str);
	}

	return selected;
}

/*
 * Configure the GRES hardware allocated to the current step while privileged
 *
 * IN step_gres_list - Step's GRES specification
 * IN node_id        - relative position of this node in step (not referenced
 *                     by this function)
 * IN settings       - string containing configuration settings for the hardware
 */
extern void gres_g_step_hardware_init(list_t *step_gres_list,
				      uint32_t node_id, char *settings)
{
	if (!step_gres_list)
		return;

	xassert(gres_context_cnt >= 0);
	slurm_mutex_lock(&gres_context_lock);
	for (int i = 0; i < gres_context_cnt; i++) {
		gres_state_t *gres_state_step;
		gres_step_state_t *gres_ss;

		/* Plugin without a hardware init hook */
		if (!gres_context[i].ops.step_hardware_init)
			continue;

		gres_state_step = list_find_first(step_gres_list, gres_find_id,
						  &gres_context[i].plugin_id);
		if (!gres_state_step)
			continue;

		gres_ss = gres_state_step->gres_data;
		if (!gres_ss)
			continue;

		/* The hook needs the bitmap of a single-node step record */
		if ((gres_ss->node_cnt != 1) ||
		    !gres_ss->gres_bit_alloc ||
		    !gres_ss->gres_bit_alloc[0])
			continue;

		if (settings)
			debug2("settings: %s", settings);
		(*(gres_context[i].ops.step_hardware_init))(
			gres_ss->gres_bit_alloc[0], settings);
	}
	slurm_mutex_unlock(&gres_context_lock);
}

/*
 * Optionally undo GRES hardware configuration while privileged
 * Calls the step_hardware_fini hook of every plugin that provides one.
 */
extern void gres_g_step_hardware_fini(void)
{
	xassert(gres_context_cnt >= 0);
	slurm_mutex_lock(&gres_context_lock);
	for (int i = 0; i < gres_context_cnt; i++) {
		if (gres_context[i].ops.step_hardware_fini)
			(*(gres_context[i].ops.step_hardware_fini)) ();
	}
	slurm_mutex_unlock(&gres_context_lock);
}

/*
 * Given a set of GRES masks or maps and the local process ID, return the bitmap
 * of GRES that should be available to this task.
 *
 * IN map_or_mask
 * IN local_proc_id
 * IN gres_bit_alloc
 * IN is_map
 * IN get_devices
 *
 * RET usable_gres
 */
static bitstr_t *_get_usable_gres_map_or_mask(char *map_or_mask,
					      int local_proc_id,
					      bitstr_t *gres_bit_alloc,
					      bool is_map,
					      bool get_devices)
{
	bitstr_t *usable_gres = NULL;
	char *tmp, *tok, *save_ptr = NULL, *plus_save = NULL, *mult;
	int i, task_offset = 0, pass_start, task_mult, bitmap_size;
	int value, min, max, value_bits;

	if (!map_or_mask || !map_or_mask[0])
		return NULL;

	bitmap_size = bit_size(gres_bit_alloc);
	/* Number of non-sign bits available in a parsed mask value */
	value_bits = (int) (sizeof(value) * 8) - 1;
	min = (is_map ?  0 : 1);
	/*
	 * Largest acceptable value: the highest bit index for a map, or a mask
	 * with every bit of the bitmap set. Computed with unsigned arithmetic
	 * and clamped so we never shift a negative value or shift by the full
	 * width of the type (both undefined behavior).
	 */
	if (is_map)
		max = bitmap_size - 1;
	else if (bitmap_size >= value_bits)
		max = (int) (~0U >> 1);
	else
		max = (int) ((1U << bitmap_size) - 1);

	/*
	 * The list is cycled through repeatedly until the entry covering
	 * local_proc_id is reached.
	 */
	while (usable_gres == NULL) {
		pass_start = task_offset;
		tmp = xstrdup(map_or_mask);
		/*
		 * Cut the string at the first '+' separator. strtok_r() is
		 * used instead of strtok() to avoid shared static state.
		 */
		(void) strtok_r(tmp, "+", &plus_save);
		tok = strtok_r(tmp, ",", &save_ptr);
		while (tok) {
			if ((mult = strchr(tok, '*')))
				task_mult = atoi(mult + 1);
			else
				task_mult = 1;
			/*
			 * A zero or negative count would keep task_offset from
			 * advancing and the outer loop would never terminate.
			 */
			if (task_mult < 1) {
				error("Repetition count of %d not allowed in gres binding mask, using 1 instead",
				      task_mult);
				task_mult = 1;
			}
			if ((local_proc_id >= task_offset) &&
			    (local_proc_id <= (task_offset + task_mult - 1))) {
				value = strtol(tok, NULL, 0);
				usable_gres = bit_alloc(bitmap_size);
				if ((value < min) || (value > max)) {
					error("Invalid map or mask value specified.");
					xfree(tmp);
					goto end;	/* Bad value */
				}
				if (is_map)
					bit_set(usable_gres, value);
				else
					for (i = 0; (i < bitmap_size) &&
						    (i < value_bits); i++) {
						if ((value >> i) & 0x1)
							bit_set(usable_gres, i);
					}
				break;	/* All done */
			} else {
				task_offset += task_mult;
			}
			tok = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp);

		/*
		 * A pass that consumed no entries (e.g. the list is only
		 * commas) can never reach local_proc_id; give up instead of
		 * spinning forever.
		 */
		if (!usable_gres && (task_offset == pass_start)) {
			error("No values found in gres binding map or mask.");
			return NULL;
		}
	}

end:
	if (gres_use_local_device_index()) {
		if (get_devices)
			_translate_step_to_global_device_index(
				&usable_gres, gres_bit_alloc);
		else{
			bit_consolidate(usable_gres);
		}
	} else {
		bit_and(usable_gres, gres_bit_alloc);
	}

	return usable_gres;
}

/*
 * Fold one step GRES record's single-node allocation into the caller's
 * accumulators: OR the allocation bitmap into *gres_bit_alloc (allocating it
 * on first use), add the node count to *gres_cnt, and add the per-bit counts
 * to *gres_per_bit (allocating it on first use). gres_cnt and gres_per_bit
 * may be NULL to skip those accumulations.
 */
static void _accumulate_step_gres_alloc(gres_state_t *gres_state_step,
					bitstr_t **gres_bit_alloc,
					uint64_t *gres_cnt,
					uint64_t **gres_per_bit)
{
	gres_step_state_t *step_data = gres_state_step->gres_data;
	bitstr_t *node_bits = NULL;

	/* Since this should only run on the node node_cnt should always be 1 */
	if (step_data->node_cnt != 1) {
		error("gres_step_state_t node count not 1 while on node. This should never happen");
		return;
	}

	if (step_data->gres_bit_alloc)
		node_bits = step_data->gres_bit_alloc[0];

	if (node_bits) {
		if (!*gres_bit_alloc)
			*gres_bit_alloc = bit_alloc(bit_size(node_bits));
		bit_or(*gres_bit_alloc, node_bits);
	}

	if (gres_cnt && step_data->gres_cnt_node_alloc)
		*gres_cnt += step_data->gres_cnt_node_alloc[0];

	/* Per-bit counts need both an output array and an allocation bitmap */
	if (!gres_per_bit || !node_bits)
		return;
	if (!step_data->gres_per_bit_alloc ||
	    !step_data->gres_per_bit_alloc[0])
		return;

	if (!*gres_per_bit)
		*gres_per_bit = xcalloc(bit_size(node_bits),
					sizeof(uint64_t));
	for (int bit = 0; bit < bit_size(node_bits); bit++)
		(*gres_per_bit)[bit] += step_data->gres_per_bit_alloc[0][bit];
}

/*
 * Walk the set bits of test_gres, claiming one device per set bit until
 * *gres_needed reaches zero or no set bits remain. Each claimed bit is cleared
 * from gres_bit_avail and, when set_usable_gres is true, set in usable_gres.
 */
static void _filter_gres_per_task(bitstr_t *test_gres,
				  bitstr_t *gres_bit_avail,
				  bitstr_t *usable_gres,
				  uint64_t *gres_needed,
				  bool set_usable_gres)
{
	int bit = 0;

	while (*gres_needed) {
		bit = bit_ffs_from_bit(test_gres, bit);
		if (bit < 0)
			break;

		(*gres_needed)--;
		bit_clear(gres_bit_avail, bit);
		if (set_usable_gres)
			bit_set(usable_gres, bit);
		bit++;
	}
}

/*
 * Given a required gres_per_task count, determine which gres should be assigned
 * to this task. Prefer gres with cpu affinity that match the task.
 *
 * RET usable_gres
 */
static bitstr_t *_get_gres_per_task(bitstr_t *gres_bit_alloc,
				    uint64_t gres_per_task,
				    stepd_step_rec_t *step,
				    uint32_t plugin_id,
				    int local_proc_id)
{
	bitstr_t *task_gres = bit_alloc(bit_size(gres_bit_alloc));
	bitstr_t *unclaimed = bit_copy(gres_bit_alloc);

	/*
	 * Replay the selection for every task up to and including this one:
	 * earlier tasks consume devices from "unclaimed" so that this task
	 * only picks from what is left. Only this task's picks are recorded.
	 */
	for (int task = 0; task <= local_proc_id; task++) {
		uint64_t remaining = gres_per_task;
		bool is_target = (task == local_proc_id);

		/* First: Try to select devices with cpu affinity */
		if (remaining) {
			bitstr_t *affine_gres = _get_closest_usable_gres(
				plugin_id, unclaimed,
				step->task[task]->cpu_set);
			_filter_gres_per_task(affine_gres, unclaimed,
					      task_gres, &remaining,
					      is_target);
			FREE_NULL_BITMAP(affine_gres);
		}

		/* Second: Select any available device */
		if (remaining)
			_filter_gres_per_task(unclaimed, unclaimed,
					      task_gres, &remaining,
					      is_target);

		if (remaining) {
			error("Not enough gres to bind %"PRIu64" per task",
			      gres_per_task);
			break;
		}
	}

	FREE_NULL_BITMAP(unclaimed);
	return task_gres;
}

/*
 * Walk the set bits of test_gres, taking shared GRES from each device's
 * remaining count in gres_per_bit_avail until *gres_needed reaches zero.
 * With use_single_dev a device is used only if it can satisfy the whole
 * remaining need; otherwise any device with at least one unit left is used.
 * When set_usable_gres is true each device taken from is set in usable_gres.
 */
static void _filter_shared_gres_per_task(bitstr_t *test_gres,
					 bitstr_t *usable_gres,
					 uint64_t *gres_per_bit_avail,
					 uint64_t *gres_needed,
					 bool use_single_dev,
					 bool set_usable_gres)
{
	int bit = 0;

	while (*gres_needed) {
		uint64_t take, threshold;

		bit = bit_ffs_from_bit(test_gres, bit);
		if (bit < 0)
			break;

		take = MIN(gres_per_bit_avail[bit], *gres_needed);
		threshold = use_single_dev ? *gres_needed : 1;

		if (take >= threshold) {
			gres_per_bit_avail[bit] -= take;
			*gres_needed -= take;

			if (set_usable_gres)
				bit_set(usable_gres, bit);
		}
		bit++;
	}
}

/*
 * Given a required gres_per_task count, determine which shared gres should be
 * assigned to this task. Prefer gres with core affinity that match the task
 * and prefer allocating shared gres belonging to a single device if possible.
 */
static bitstr_t *_get_shared_gres_per_task(bitstr_t *gres_bit_alloc,
					   uint64_t *gres_per_bit,
					   uint64_t gres_per_task,
					   stepd_step_rec_t *step,
					   uint32_t sharing_plugin_id,
					   int local_proc_id)
{
	bitstr_t *task_gres = bit_alloc(bit_size(gres_bit_alloc));
	uint64_t *avail_per_bit = xcalloc(bit_size(gres_bit_alloc),
					  sizeof(uint64_t));

	/* Work on a private copy of the per-device counts */
	memcpy(avail_per_bit, gres_per_bit,
	       bit_size(gres_bit_alloc) * sizeof(uint64_t));

	/*
	 * Replay the selection for every task up to and including this one so
	 * that earlier tasks' consumption is accounted for before this task
	 * picks. Only this task's picks are recorded in task_gres.
	 */
	for (int task = 0; task <= local_proc_id; task++) {
		bool is_target = (task == local_proc_id);
		uint64_t remaining = gres_per_task;
		bitstr_t *affine_gres = _get_closest_usable_gres(
			sharing_plugin_id, gres_bit_alloc,
			step->task[task]->cpu_set);

		/*
		 * Compare this selection priority with _set_shared_task_bits()
		 * in gres_select_filter.c
		 *
		 * Pass 0: a single device with core affinity with sufficient
		 *	available shared gres
		 * Pass 1: a single device with sufficient available shared
		 *	gres
		 * Pass 2: devices with core affinity with any available
		 *	shared gres
		 * Pass 3: devices with any available shared gres
		 */
		for (int pass = 0; (pass < 4) && remaining; pass++) {
			bitstr_t *candidates =
				(pass & 1) ? gres_bit_alloc : affine_gres;
			bool single_dev = (pass < 2);

			_filter_shared_gres_per_task(candidates, task_gres,
						     avail_per_bit,
						     &remaining, single_dev,
						     is_target);
		}
		FREE_NULL_BITMAP(affine_gres);

		if (remaining) {
			error("Not enough shared gres to bind %"PRIu64" per task",
			      gres_per_task);
			break;
		}
	}

	xfree(avail_per_bit);
	return task_gres;
}

/* Convert old binding options to current gres binding format
 *
 * IN accel_bind_type - GRES binding options (old format, a bitmap)
 * IN/OUT tres_bind_str - TRES binding directives (new format, a string)
 */
/*
 * Append "gres/gpu:closest" and/or "gres/nic:closest" to the binding string
 * for each matching bit in accel_bind_type, separated by '+' when the string
 * is already non-NULL.
 *
 * NOTE(review): tres_bind_str is received by value, so the appended result is
 * written to this function's local copy of the pointer. If xstrfmtcat()
 * allocates or reallocates the string, the caller's pointer is presumably not
 * updated -- confirm against the xstrfmtcat() definition and the callers.
 */
static void _parse_accel_bind_type(uint16_t accel_bind_type, char *tres_bind_str)
{
	bool want_gpu = (accel_bind_type & ACCEL_BIND_CLOSEST_GPU);
	bool want_nic = (accel_bind_type & ACCEL_BIND_CLOSEST_NIC);

	if (want_gpu) {
		const char *sep = tres_bind_str ? "+" : "";

		xstrfmtcat(tres_bind_str, "%sgres/gpu:closest", sep);
	}
	if (want_nic) {
		const char *sep = tres_bind_str ? "+" : "";

		xstrfmtcat(tres_bind_str, "%sgres/nic:closest", sep);
	}
}

/*
 * Work out which GRES of one plugin a task may use, based on the binding
 * directive for that plugin found in tres_bind_str.
 *
 * IN context_inx - index into gres_context[] of the plugin being processed
 * IN proc_id - local task id the binding is computed for
 * IN tres_bind_str - binding directives; searched for "gres/<name>:"
 * OUT usable_gres_ptr - set to the resulting bitmap, or NULL when there is no
 *	allocation bitmap, no directive for this plugin, or an early return
 * IN/OUT gres_bit_alloc - GRES allocated to the step; may be consolidated in
 *	place when LAUNCH_GRES_ALLOW_TASK_SHARING is set (see below)
 * IN get_devices - selects between the device-lookup and environment paths
 * IN step - step record (flags and per-task cpu_set are read)
 * IN gres_per_bit - per-device shared GRES counts, used for shared GRES only
 * IN/OUT flags - if non-NULL, GRES_INTERNAL_FLAG_VERBOSE is OR'ed in when the
 *	directive starts with "verbose,"
 * RET SLURM_SUCCESS, or SLURM_ERROR if the binding type is not recognized
 */
static int _get_usable_gres(int context_inx, int proc_id,
			    char *tres_bind_str, bitstr_t **usable_gres_ptr,
			    bitstr_t *gres_bit_alloc,  bool get_devices,
			    stepd_step_rec_t *step, uint64_t *gres_per_bit,
			    gres_internal_flags_t *flags)
{
	char *tres_name = NULL, *sep;
	bitstr_t *usable_gres = NULL;
	uint32_t plugin_id = gres_context[context_inx].plugin_id;
	*usable_gres_ptr = NULL;

	/* Nothing allocated or no binding requested: leave the task unbound */
	if (!gres_bit_alloc || !tres_bind_str)
		return SLURM_SUCCESS;

	/* Locate this plugin's directive, i.e. the text after "gres/<name>:" */
	tres_name = xstrdup_printf("gres/%s:",
				   gres_context[context_inx].gres_name);
	sep = xstrstr(tres_bind_str, tres_name);
	if (!sep) {
		xfree(tres_name);
		return SLURM_SUCCESS;
	}
	sep += strlen(tres_name);
	xfree(tres_name);

	/* Optional "verbose," prefix ahead of the binding type */
	if (!xstrncasecmp(sep, "verbose,", 8)){
		sep += 8;
		if (flags)
			*flags |= GRES_INTERNAL_FLAG_VERBOSE;
	}

	if (step->flags & LAUNCH_GRES_ALLOW_TASK_SHARING) {
		/* No per-task device restriction is returned in this mode */
		if (get_devices)
			return SLURM_SUCCESS;
		/*
		 * Overwrite device index setting to use the global node/job
		 * GRES index, rather than the index local to the task. This
		 * ensures that the GRES environment variable is set correctly
		 * on the task when multiple devices are constrained to the
		 * task, and only the environment variables are bound to
		 * specific GRES.
		 */
		use_local_index = false;
		dev_index_mode_set = true;

		/*
		 * Consolidate allocated gres bitstring so that we get the GRES
		 * device index of the GRES within the context of the job, and
		 * not within the context of the whole node, unless specifically
		 * required with the GRES_CONF_GLOBAL_INDEX flag.
		 * Note this modifies the caller's gres_bit_alloc in place.
		 */
		if (!(gres_context[context_inx].config_flags &
		      GRES_CONF_GLOBAL_INDEX))
			bit_consolidate(gres_bit_alloc);
	}

	/* Plugins flagged GRES_CONF_GLOBAL_INDEX never use task-local indexes */
	if (gres_context[context_inx].config_flags & GRES_CONF_GLOBAL_INDEX) {
		use_local_index = false;
		dev_index_mode_set = true;
	}

	if (!gres_id_shared(gres_context[context_inx].config_flags)) {
		if (!xstrncasecmp(sep, "map_gpu:", 8)) { // Old Syntax
			usable_gres = _get_usable_gres_map_or_mask(
				(sep + 8), proc_id, gres_bit_alloc,
				true, get_devices);
		} else if (!xstrncasecmp(sep, "mask_gpu:", 9)) { // Old Syntax
			usable_gres = _get_usable_gres_map_or_mask(
				(sep + 9), proc_id, gres_bit_alloc,
				false, get_devices);
		} else if (!xstrncasecmp(sep, "map:", 4)) {
			usable_gres = _get_usable_gres_map_or_mask(
				(sep + 4), proc_id, gres_bit_alloc,
				true, get_devices);
		} else if (!xstrncasecmp(sep, "mask:", 5)) {
			usable_gres = _get_usable_gres_map_or_mask(
				(sep + 5), proc_id, gres_bit_alloc,
				false, get_devices);
		} else if (!xstrncasecmp(sep, "single:", 7)) {
			/*
			 * With task-local indexes in the environment path the
			 * task's one device is always index 0.
			 */
			if (!get_devices && gres_use_local_device_index()) {
				usable_gres = bit_alloc(
					bit_size(gres_bit_alloc));
				bit_set(usable_gres, 0);
			} else {
				usable_gres = _get_single_usable_gres(
					context_inx, slurm_atoul(sep + 7),
					proc_id, step, gres_bit_alloc);
			}
		} else if (!xstrncasecmp(sep, "closest", 7)) {
			usable_gres = _get_closest_usable_gres(
				plugin_id, gres_bit_alloc,
				step->task[proc_id]->cpu_set);
			if (!get_devices && gres_use_local_device_index())
				bit_consolidate(usable_gres);
		} else if (!xstrncasecmp(sep, "per_task:", 9)) {
			/*
			 * With task-local indexes in the environment path the
			 * task's devices are simply indexes 0..count-1.
			 * NOTE(review): a count of 0 or one larger than the
			 * bitmap is passed to bit_nset() unchecked -- confirm
			 * it is validated before reaching this point.
			 */
			if (!get_devices && gres_use_local_device_index()) {
				usable_gres = bit_alloc(
					bit_size(gres_bit_alloc));
				bit_nset(usable_gres, 0,
					 slurm_atoul(sep + 9) - 1);
			} else {
				usable_gres = _get_gres_per_task(
					gres_bit_alloc, slurm_atoul(sep + 9),
					step, plugin_id, proc_id);
			}
		} else if (!xstrncasecmp(sep, "none", 4)) {
			/* No binding: the task may use the whole allocation */
			usable_gres = bit_copy(gres_bit_alloc);
		} else
			return SLURM_ERROR;
	} else { // Shared gres only support per_task binding for now
		if (!xstrncasecmp(sep, "per_task:", 9)) {
			/* Affinity is evaluated using the GPU plugin id */
			usable_gres = _get_shared_gres_per_task(
				gres_bit_alloc, gres_per_bit,
				slurm_atoul(sep + 9),
				step, gpu_plugin_id, proc_id);
			if (!get_devices && gres_use_local_device_index())
				bit_consolidate(usable_gres);
		} else if (!xstrncasecmp(sep, "none", 4)) {
			usable_gres = bit_copy(gres_bit_alloc);
		} else
			return SLURM_ERROR;
	}

	/* An empty result falls back to the first allocated device */
	if (usable_gres && !bit_set_count(usable_gres)) {
		error("Bind request %s does not specify any devices within the allocation for task %d. Binding to the first device in the allocation instead.",
		      tres_bind_str, proc_id);
		if (!get_devices && gres_use_local_device_index())
			bit_set(usable_gres, 0);
		else
			bit_set(usable_gres, bit_ffs(gres_bit_alloc));
	}

	*usable_gres_ptr = usable_gres;

	return SLURM_SUCCESS;
}

/*
 * Set environment as required for all tasks of a job step
 */
extern void gres_g_step_set_env(stepd_step_rec_t *step)
{
	int i;
	bitstr_t *gres_bit_alloc = NULL;
	foreach_gres_accumulate_device_t foreach_gres_accumulate_device = {
		.gres_bit_alloc = &gres_bit_alloc,
		.is_job = false,
	};

	xassert(gres_context_cnt >= 0);
	slurm_mutex_lock(&gres_context_lock);
	for (i = 0; i < gres_context_cnt; i++) {
		/*
		 * Flags are reset for every plugin (as gres_g_task_set_env()
		 * does) so that GRES_INTERNAL_FLAG_PROTECT_ENV set for one
		 * shared GRES is not carried over to the plugins processed
		 * after it.
		 */
		gres_internal_flags_t flags = GRES_INTERNAL_FLAG_NONE;
		slurm_gres_context_t *gres_ctx = &gres_context[i];
		if (!gres_ctx->ops.step_set_env)
			continue;	/* No plugin to call */
		if (!step->step_gres_list) {
			/* Clear GRES environment variables */
			(*(gres_ctx->ops.step_set_env))(
				&step->env, NULL, 0, GRES_INTERNAL_FLAG_NONE);
			continue;
		}
		/* Gather this plugin's allocation bitmap and count */
		foreach_gres_accumulate_device.plugin_id = gres_ctx->plugin_id;
		(void) list_for_each(step->step_gres_list,
				     _accumulate_gres_device,
				     &foreach_gres_accumulate_device);

		/*
		 * Do not let MPS or Shard (shared GRES) clear any envs set for
		 * a GPU (sharing GRES) when a GPU is allocated but an
		 * MPS/Shard is not. Sharing GRES plugins always run before
		 * shared GRES, so we don't need to protect MPS/Shard from GPU.
		 */
		if (gres_id_shared(gres_ctx->config_flags) &&
		    foreach_gres_accumulate_device.sharing_gres_allocated)
			flags |= GRES_INTERNAL_FLAG_PROTECT_ENV;

		(*(gres_ctx->ops.step_set_env))(
			&step->env,
			gres_bit_alloc,
			foreach_gres_accumulate_device.gres_cnt,
			flags);
		/* Reset the accumulators before the next plugin */
		foreach_gres_accumulate_device.gres_cnt = 0;
		FREE_NULL_BITMAP(gres_bit_alloc);
	}
	slurm_mutex_unlock(&gres_context_lock);
}

/*
 * Change the task's inherited environment (from the step, and set by
 * gres_g_step_set_env()). Use this to implement GPU task binding.
 */
extern void gres_g_task_set_env(stepd_step_rec_t *step, int local_proc_id)
{
	bitstr_t *task_gres = NULL;
	bitstr_t *gres_bit_alloc = NULL;
	uint64_t *gres_per_bit = NULL;
	foreach_gres_accumulate_device_t accum = {
		.gres_bit_alloc = &gres_bit_alloc,
		.gres_per_bit = &gres_per_bit,
		.is_job = false,
	};

	/* Fold the old-style binding bitmap into the binding string */
	if (step->accel_bind_type)
		_parse_accel_bind_type(step->accel_bind_type, step->tres_bind);

	xassert(gres_context_cnt >= 0);
	slurm_mutex_lock(&gres_context_lock);
	for (int inx = 0; inx < gres_context_cnt; inx++) {
		slurm_gres_context_t *gres_ctx = &gres_context[inx];
		gres_internal_flags_t flags = GRES_INTERNAL_FLAG_NONE;

		if (!gres_ctx->ops.task_set_env)
			continue;	/* No plugin to call */

		if (!step->step_gres_list) {
			/* Clear GRES environment variables */
			(*(gres_ctx->ops.task_set_env))(
				&step->envtp->env, NULL, 0, NULL,
				GRES_INTERNAL_FLAG_NONE);
			continue;
		}

		/* Gather this plugin's allocation for the step */
		accum.plugin_id = gres_ctx->plugin_id;
		(void) list_for_each(step->step_gres_list,
				     _accumulate_gres_device, &accum);

		/* The plugin is only called if the binding could be resolved */
		if (_get_usable_gres(inx, local_proc_id, step->tres_bind,
				     &task_gres, gres_bit_alloc, false, step,
				     gres_per_bit, &flags) != SLURM_ERROR) {
			/*
			 * Do not let MPS or Shard (shared GRES) clear any envs
			 * set for a GPU (sharing GRES) when a GPU is allocated
			 * but an MPS/Shard is not. Sharing GRES plugins always
			 * run before shared GRES, so we don't need to protect
			 * MPS/Shard from GPU.
			 */
			if (gres_id_shared(gres_ctx->config_flags) &&
			    accum.sharing_gres_allocated)
				flags |= GRES_INTERNAL_FLAG_PROTECT_ENV;

			(*(gres_ctx->ops.task_set_env))(
				&step->envtp->env,
				gres_bit_alloc,
				accum.gres_cnt,
				task_gres, flags);
		}

		/* Reset the accumulators before the next plugin */
		accum.gres_cnt = 0;
		xfree(gres_per_bit);
		FREE_NULL_BITMAP(gres_bit_alloc);
		FREE_NULL_BITMAP(task_gres);
	}
	slurm_mutex_unlock(&gres_context_lock);
}

/* Log the allocation bitmap and any per-bit counts of node i of a step */
static void _step_state_log_node(gres_step_state_t *gres_ss, int i)
{
	bitstr_t *node_bits = gres_ss->gres_bit_alloc[i];
	int bit;

	if (!node_bits) {
		info("  gres_bit_alloc[%d]:NULL", i);
	} else {
		char bits_str[128];

		bit_fmt(bits_str, sizeof(bits_str), node_bits);
		info("  gres_bit_alloc[%d]:%s of %d", i, bits_str,
		     (int)bit_size(node_bits));
	}

	if (!gres_ss->gres_per_bit_alloc || !gres_ss->gres_per_bit_alloc[i])
		return;

	/* One line per allocated bit */
	bit = bit_ffs_from_bit(node_bits, 0);
	while (bit >= 0) {
		info("  gres_per_bit_alloc[%d][%d]:%" PRIu64, i, bit,
		     gres_ss->gres_per_bit_alloc[i][bit]);
		bit = bit_ffs_from_bit(node_bits, bit + 1);
	}
}

/*
 * list_for_each() callback: log one step GRES record.
 * x is the gres_state_t, arg the slurm_step_id_t used in the header line.
 * Always returns 0.
 */
static int _step_state_log(void *x, void *arg)
{
	gres_state_t *gres_state_step = x;
	slurm_step_id_t *step_id = arg;
	gres_step_state_t *gres_ss = gres_state_step->gres_data;

	xassert(gres_ss);
	info("gres:%s type:%s(%u) %ps flags:%s state",
	     gres_state_step->gres_name,
	     gres_ss->type_name, gres_ss->type_id, step_id,
	     gres_flags2str(gres_ss->flags));

	/* Only non-zero request fields are reported */
	if (gres_ss->cpus_per_gres)
		info("  cpus_per_gres:%u", gres_ss->cpus_per_gres);
	if (gres_ss->gres_per_step)
		info("  gres_per_step:%"PRIu64, gres_ss->gres_per_step);
	if (gres_ss->gres_per_node)
		info("  gres_per_node:%"PRIu64" node_cnt:%u",
		     gres_ss->gres_per_node, gres_ss->node_cnt);
	if (gres_ss->gres_per_socket)
		info("  gres_per_socket:%"PRIu64, gres_ss->gres_per_socket);
	if (gres_ss->gres_per_task)
		info("  gres_per_task:%"PRIu64, gres_ss->gres_per_task);
	if (gres_ss->mem_per_gres)
		info("  mem_per_gres:%"PRIu64, gres_ss->mem_per_gres);

	if (!gres_ss->node_in_use) {
		info("  node_in_use:NULL");
		return 0;
	}
	if (!gres_ss->gres_bit_alloc) {
		info("  gres_bit_alloc:NULL");
		return 0;
	}

	/* Per-node detail for the nodes flagged in node_in_use */
	for (int node = 0; node < gres_ss->node_cnt; node++) {
		if (bit_test(gres_ss->node_in_use, node))
			_step_state_log_node(gres_ss, node);
	}

	return 0;
}

/*
 * Log a step's current gres state
 * IN gres_list - generated by gres_stepmgr_step_alloc()
 * IN job_id - job's ID
 * IN step_id - step's ID
 */
extern void gres_step_state_log(list_t *gres_list, uint32_t job_id,
				uint32_t step_id)
{
	slurm_step_id_t log_step_id = {
		.job_id = job_id,
		.step_het_comp = NO_VAL,
		.step_id = step_id,
	};

	/* Nothing to log without a list */
	if (!gres_list)
		return;
	/* Only log when the GRES debug flag is enabled */
	if (!(slurm_conf.debug_flags & DEBUG_FLAG_GRES))
		return;

	(void) list_for_each(gres_list, _step_state_log, &log_step_id);
}

/*
 * Return TRUE if the given config flags mark a GRES that consumes GRES
 * count > 1 for a single device file (e.g. MPS)
 */
extern bool gres_id_shared(uint32_t config_flags)
{
	/* Shared GRES are identified by the GRES_CONF_SHARED flag */
	return (config_flags & GRES_CONF_SHARED) ? true : false;
}
/*
 * Return TRUE if this plugin ID shares resources with another GRES that
 * consumes subsets of its resources (e.g. GPU)
 */
extern bool gres_id_sharing(uint32_t plugin_id)
{
	/* Only the GPU plugin is treated as a sharing GRES */
	return (plugin_id == gpu_plugin_id) ? true : false;
}

/*
 * list_for_each() callback: record one node GRES record's plugin id and the
 * requested count type in the next slot of the output arrays.
 * Returns -1 once the arrays are full, otherwise 0.
 */
static int _foreach_node_count(void *x, void *arg)
{
	gres_state_t *gres_state_node = x;
	foreach_node_count_t *args = arg;
	gres_node_state_t *gres_ns = gres_state_node->gres_data;
	uint64_t count = 0;

	xassert(gres_ns);

	/* Unrecognized value types report a count of 0 */
	if (args->val_type == GRES_VAL_TYPE_FOUND)
		count = gres_ns->gres_cnt_found;
	else if (args->val_type == GRES_VAL_TYPE_CONFIG)
		count = gres_ns->gres_cnt_config;
	else if (args->val_type == GRES_VAL_TYPE_AVAIL)
		count = gres_ns->gres_cnt_avail;
	else if (args->val_type == GRES_VAL_TYPE_ALLOC)
		count = gres_ns->gres_cnt_alloc;

	args->gres_count_ids[args->index] = gres_state_node->plugin_id;
	args->gres_count_vals[args->index] = count;
	args->index++;

	return (args->index >= args->array_len) ? -1 : 0;
}

/*
 * Fill in an array of GRES type ids contained within the given node gres_list
 *		and an array of corresponding counts of those GRES types.
 * IN gres_list - a List of GRES types found on a node.
 * IN arr_len - Length of the arrays (the number of elements in the gres_list).
 * OUT gres_count_ids, gres_count_vals - the GRES type ID's and values found
 *		in the gres_list.
 * IN val_type - Type of value desired, see GRES_VAL_TYPE_*
 * RET SLURM_SUCCESS or error code
 */
extern int gres_node_count(list_t *gres_list, int arr_len,
			   uint32_t *gres_count_ids,
			   uint64_t *gres_count_vals,
			   int val_type)
{
	foreach_node_count_t args = {
		.array_len = arr_len,
		.gres_count_ids = gres_count_ids,
		.gres_count_vals = gres_count_vals,
		.val_type = val_type,
	};

	/* The output arrays must have room for at least one entry */
	if (arr_len < 1)
		return EINVAL;

	/* The callback stops the walk once arr_len entries are filled */
	(void) list_for_each(gres_list, _foreach_node_count, &args);

	return SLURM_SUCCESS;
}
/*
 * Pack one gres_device_t into buffer. The field order here must match
 * _gres_device_unpack().
 */
static void _gres_device_pack(
	void *in, uint16_t protocol_version, buf_t *buffer)
{
	gres_device_t *dev = in;

	/* DON'T PACK dev->alloc */
	pack32(dev->index, buffer);
	pack32(dev->dev_num, buffer);

	/* Device descriptor */
	pack32(dev->dev_desc.type, buffer);
	pack32(dev->dev_desc.major, buffer);
	pack32(dev->dev_desc.minor, buffer);

	/* Strings */
	packstr(dev->path, buffer);
	packstr(dev->unique_id, buffer);
}

/* Pack the list of GRES devices into buffer using the current protocol */
extern void gres_send_stepd(buf_t *buffer, list_t *gres_devices)
{
	uint16_t protocol_version = SLURM_PROTOCOL_VERSION;

	slurm_pack_list(gres_devices, _gres_device_pack, buffer,
			protocol_version);
}

/*
 * Unpack one gres_device_t from buffer, in the field order written by
 * _gres_device_pack(). On success *object is set to the new device and
 * SLURM_SUCCESS is returned; on failure the partial device is destroyed and
 * SLURM_ERROR is returned.
 */
static int _gres_device_unpack(void **object, uint16_t protocol_version,
			       buf_t *buffer)
{
	gres_device_t *dev = xmalloc(sizeof(*dev));
	uint32_t u32 = 0;

	/* Each 32-bit field goes through a temporary before being stored */
	safe_unpack32(&u32, buffer);
	dev->index = u32;
	safe_unpack32(&u32, buffer);
	dev->dev_num = u32;

	safe_unpack32(&u32, buffer);
	dev->dev_desc.type = u32;
	safe_unpack32(&u32, buffer);
	dev->dev_desc.major = u32;
	safe_unpack32(&u32, buffer);
	dev->dev_desc.minor = u32;

	safe_unpackstr(&dev->path, buffer);
	safe_unpackstr(&dev->unique_id, buffer);

	*object = dev;
	return SLURM_SUCCESS;

unpack_error:
	error("%s: failed", __func__);
	destroy_gres_device(dev);
	return SLURM_ERROR;
}

/*
 * Unpack a list of GRES devices from buffer into *gres_devices using the
 * current protocol version. The unpack result is ignored.
 */
extern void gres_recv_stepd(buf_t *buffer, list_t **gres_devices)
{
	uint16_t protocol_version = SLURM_PROTOCOL_VERSION;

	(void) slurm_unpack_list(gres_devices, _gres_device_unpack,
				 destroy_gres_device, buffer,
				 protocol_version);
}

/* Send GRES information to slurmstepd on the specified file descriptor */
extern void gres_g_send_stepd(int fd, slurm_msg_t *msg)
{
	int len;
	uint32_t step_id;
	cred_data_enum_t check;
	slurm_cred_t *cred = NULL;
	/* Tracks whether gres_context_lock is held when we reach rwfail */
	bool locked = false;

	/* Setup the gres_device list and other plugin-specific data */
	xassert(gres_context_cnt >= 0);

	slurm_mutex_lock(&gres_context_lock);
	locked = true;
	xassert(gres_context_buf);

	/* Length-prefixed copy of the packed GRES context */
	len = get_buf_offset(gres_context_buf);
	safe_write(fd, &len, sizeof(len));
	safe_write(fd, get_buf_data(gres_context_buf), len);

	slurm_mutex_unlock(&gres_context_lock);
	locked = false;

	if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH) {
		batch_job_launch_msg_t *job = msg->data;
		step_id = SLURM_BATCH_SCRIPT;
		cred = job->cred;
	} else {
		launch_tasks_request_msg_t *job = msg->data;
		step_id = job->step_id.step_id;
		cred = job->cred;
	}

	/* If we are a special step we get the JOB_GRES_LIST */
	if (step_id >= SLURM_MAX_NORMAL_STEP_ID)
		check = CRED_DATA_JOB_GRES_LIST;
	else
		check = CRED_DATA_STEP_GRES_LIST;
	/* Send the merged slurm.conf/gres.conf and autodetect data */
	if (slurm_cred_get(cred, check)) {
		len = get_buf_offset(gres_conf_buf);
		safe_write(fd, &len, sizeof(len));
		safe_write(fd, get_buf_data(gres_conf_buf), len);
	}

	return;
rwfail:
	error("%s: failed", __func__);
	/*
	 * The second group of writes above happens after the lock has been
	 * released, so only unlock if the failure occurred while it was held.
	 */
	if (locked)
		slurm_mutex_unlock(&gres_context_lock);

	return;
}

/* Receive GRES information from slurmd on the specified file descriptor */
extern int gres_g_recv_stepd(int fd, slurm_msg_t *msg)
{
	int len, rc = SLURM_ERROR;
	buf_t *buffer = NULL;
	uint32_t step_id;
	slurm_cred_t *cred = NULL;
	cred_data_enum_t check;

	slurm_mutex_lock(&gres_context_lock);

	/* Length-prefixed packed GRES context */
	safe_read(fd, &len, sizeof(int));
	buffer = init_buf(len);
	safe_read(fd, buffer->head, len);
	rc = _unpack_context_buf(buffer);
	if (rc == SLURM_ERROR)
		goto rwfail;
	FREE_NULL_BUFFER(buffer);

	if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH) {
		batch_job_launch_msg_t *batch_req = msg->data;

		step_id = SLURM_BATCH_SCRIPT;
		cred = batch_req->cred;
	} else {
		launch_tasks_request_msg_t *launch_req = msg->data;

		step_id = launch_req->step_id.step_id;
		cred = launch_req->cred;
	}

	/* If we are a special step we get the JOB_GRES_LIST */
	check = (step_id >= SLURM_MAX_NORMAL_STEP_ID) ?
		CRED_DATA_JOB_GRES_LIST : CRED_DATA_STEP_GRES_LIST;

	/* Recv the merged slurm.conf/gres.conf and autodetect data */
	if (slurm_cred_get(cred, check)) {
		safe_read(fd, &len, sizeof(int));
		buffer = init_buf(len);
		safe_read(fd, buffer->head, len);
		rc = _unpack_gres_conf(buffer);
		if (rc == SLURM_ERROR)
			goto rwfail;
		FREE_NULL_BUFFER(buffer);
	}
	slurm_mutex_unlock(&gres_context_lock);
	goto load_plugins;

rwfail:
	FREE_NULL_BUFFER(buffer);
	error("%s: failed", __func__);
	slurm_mutex_unlock(&gres_context_lock);

load_plugins:
	/*
	 * Both the success and failure paths finish the same way.
	 * NOTE(review): after a read/unpack failure the value returned is
	 * still the result of loading the plugins, not the failure itself --
	 * confirm this is intended.
	 */

	/* Set debug flags only */
	(void) gres_init();

	return _load_specific_gres_plugins();
}

/* Get generic GRES data types here. Call the plugin for others */
/*
 * Get generic GRES data types here. Call the plugin for others
 *
 * IN gres_ss - step GRES state to read
 * IN node_inx - node index, must be below gres_ss->node_cnt
 * IN data_type - GRES_STEP_DATA_COUNT or GRES_STEP_DATA_BITMAP
 * IN/OUT data - COUNT: uint64_t the node's count is added to.
 *	BITMAP: bitstr_t * that receives a copy of the node's allocation
 *	bitmap if NULL on entry, otherwise the bitmap is OR'ed into it.
 * RET SLURM_SUCCESS, EINVAL or ESLURM_INVALID_NODE_COUNT
 */
static int _get_step_info(gres_step_state_t *gres_ss,
			  uint32_t node_inx, enum gres_step_data_type data_type,
			  void *data)
{
	uint64_t *u64_data = (uint64_t *) data;
	bitstr_t **bit_data = (bitstr_t **) data;
	int rc = SLURM_SUCCESS;

	if (!gres_ss || !data)
		return EINVAL;
	if (node_inx >= gres_ss->node_cnt)
		return ESLURM_INVALID_NODE_COUNT;

	switch (data_type) {
	case GRES_STEP_DATA_COUNT:
		/* The count array may not be populated; contribute nothing */
		if (gres_ss->gres_cnt_node_alloc)
			*u64_data += gres_ss->gres_cnt_node_alloc[node_inx];
		break;
	case GRES_STEP_DATA_BITMAP:
		/* Skip nodes without a bitmap rather than copying NULL */
		if (gres_ss->gres_bit_alloc &&
		    gres_ss->gres_bit_alloc[node_inx]) {
			if (!*bit_data) {
				*bit_data = bit_copy(
					gres_ss->gres_bit_alloc[node_inx]);
			} else {
				xassert(bit_size(*bit_data) ==
					bit_size(gres_ss->gres_bit_alloc[
							 node_inx]));
				bit_or(*bit_data,
				       gres_ss->gres_bit_alloc[node_inx]);
			}
		}
		break;
	default:
		error("%s: unknown enum given %d", __func__, data_type);
		rc = EINVAL;
		break;
	}

	return rc;
}

/*
 * list_for_each() callback used by gres_get_step_info().
 * IN x - gres_state_t entry from the step's GRES list
 * IN/OUT arg - foreach_step_info_t describing the request; its rc field is
 *	overwritten with the result of every entry whose plugin_id matches
 * RET 0 to keep going (entry skipped or handled), -1 once a lookup fails
 */
static int _foreach_get_step_info(void *x, void *arg)
{
	gres_state_t *entry = x;
	foreach_step_info_t *req = arg;

	/* Only entries of the requested GRES plugin are of interest */
	if (entry->plugin_id == req->plugin_id) {
		req->rc = _get_step_info(entry->gres_data, req->node_inx,
					 req->data_type, req->data);
		if (req->rc != SLURM_SUCCESS)
			return -1;
	}

	return 0;
}

/*
 * Get data from a step's GRES data structure
 * IN step_gres_list - step's GRES data structure
 * IN gres_name - name of a GRES type
 * IN node_inx - zero-origin index of the node within the job's allocation
 *	for which data is desired. Note this can differ from the step's
 *	node allocation index.
 * IN data_type - type of data to get from the step's data
 * IN/OUT data - where the result is accumulated, see _get_step_info():
 *	a count is added to the uint64_t already stored there, and a bitmap
 *	is either OR'd into an existing bitstr_t or returned as a new
 *	bit_copy() when the pointed-to bitstr_t * is NULL.
 *	NOTE(review): a bitmap created that way is a copy, not a pointer
 *	into the step's data - presumably the caller must free it; confirm
 *	against callers.
 * RET - SLURM_SUCCESS or error code. ESLURM_INVALID_GRES is returned when
 *	the list is NULL or holds no entry for gres_name.
 */
extern int gres_get_step_info(list_t *step_gres_list, char *gres_name,
			      uint32_t node_inx,
			      enum gres_step_data_type data_type, void *data)
{
	foreach_step_info_t request;

	if (!data)
		return EINVAL;
	if (!step_gres_list)	/* No GRES allocated */
		return ESLURM_INVALID_GRES;

	xassert(gres_context_cnt >= 0);

	/* Result stays ESLURM_INVALID_GRES unless a list entry matches */
	request = (foreach_step_info_t) {
		.data = data,
		.data_type = data_type,
		.node_inx = node_inx,
		.plugin_id = gres_build_id(gres_name),
		.rc = ESLURM_INVALID_GRES,
	};

	(void) list_for_each(step_gres_list, _foreach_get_step_info, &request);

	return request.rc;
}

/* Return the current value of autodetect_flags */
extern uint32_t gres_get_autodetect_flags(void)
{
	return autodetect_flags;
}

/*
 * Zero every GRES entry of a TRES count array, leaving other TRES untouched.
 * IN/OUT tres_cnt - array indexed like assoc_mgr_tres_array, with at least
 *	g_tres_count elements
 * IN locked - true if the caller already holds the assoc_mgr TRES read lock;
 *	otherwise it is taken and released here
 */
extern void gres_clear_tres_cnt(uint64_t *tres_cnt, bool locked)
{
	assoc_mgr_lock_t tres_read_lock = { .tres = READ_LOCK };
	const bool take_lock = !locked;

	/*
	 * If gres_context_lock is ever locked/unlocked here, it should happen
	 * in between assoc_mgr_lock() and before assoc_mgr_unlock().
	 */

	if (take_lock)
		assoc_mgr_lock(&tres_read_lock);

	/* Reset only the GRES counters; they get incremented later. */
	for (int inx = 0; inx < g_tres_count; inx++) {
		if (!xstrcasecmp(assoc_mgr_tres_array[inx]->type, "gres"))
			tres_cnt[inx] = 0;
	}

	if (take_lock)
		assoc_mgr_unlock(&tres_read_lock);
}

/*
 * Format a device id as "<b|c> <major>:<minor> rwm".
 * IN gres_dev - device id to format; 'b' is used for DEV_TYPE_BLOCK and 'c'
 *	for every other type
 * RET newly allocated string built with xstrfmtcat()
 */
extern char *gres_device_id2str(gres_device_id_t *gres_dev)
{
	char *dev_str = NULL;
	char type_char = 'c';

	if (gres_dev->type == DEV_TYPE_BLOCK)
		type_char = 'b';

	xstrfmtcat(dev_str, "%c %u:%u rwm", type_char, gres_dev->major,
		   gres_dev->minor);

	return dev_str;
}


/*
 * Release a gres_device_t record together with the strings it owns.
 * IN gres_device_ptr - gres_device_t to free; NULL is ignored
 */
extern void destroy_gres_device(void *gres_device_ptr)
{
	gres_device_t *dev = gres_device_ptr;

	if (dev) {
		xfree(dev->path);
		xfree(dev->unique_id);
		xfree(dev);
	}
}

/*
 * Destroy a gres_slurmd_conf_t record, free its memory
 * IN x - gres_slurmd_conf_t to free. Passing NULL is a caller bug and still
 *	trips xassert(), but it is otherwise ignored, the same way
 *	destroy_gres_device() ignores it.
 */
extern void destroy_gres_slurmd_conf(void *x)
{
	gres_slurmd_conf_t *p = (gres_slurmd_conf_t *) x;

	xassert(p);
	/*
	 * xassert() is presumably compiled out of production builds; do not
	 * dereference NULL when it is.
	 */
	if (!p)
		return;

	xfree(p->cpus);
	FREE_NULL_BITMAP(p->cpus_bitmap);
	xfree(p->file);		/* Only used by slurmd */
	xfree(p->links);
	xfree(p->name);
	xfree(p->type_name);
	xfree(p->unique_id);
	xfree(p);
}

/*
 * Convert GRES config_flags to a comma separated string of flag names.
 * IN config_flags - GRES_CONF_* bits to describe
 * RET pointer to a static buffer inside this function (empty string if no
 *	known flag is set). The buffer is overwritten by the next call, so
 *	this function is not re-entrant and the result must not be freed.
 */
extern char *gres_flags2str(uint32_t config_flags)
{
	/* Output order follows the order of this table */
	static const struct {
		uint32_t flag;
		const char *label;
	} flag_table[] = {
		{ GRES_CONF_COUNT_ONLY, "CountOnly" },
		{ GRES_CONF_EXPLICIT, "Explicit" },
		{ GRES_CONF_HAS_FILE, "HAS_FILE" },
		{ GRES_CONF_LOADED, "LOADED" },
		{ GRES_CONF_HAS_TYPE, "HAS_TYPE" },
		{ GRES_CONF_ENV_NVML, "ENV_NVML" },
		{ GRES_CONF_ENV_RSMI, "ENV_RSMI" },
		{ GRES_CONF_ENV_ONEAPI, "ENV_ONEAPI" },
		{ GRES_CONF_ENV_OPENCL, "ENV_OPENCL" },
		{ GRES_CONF_ENV_DEF, "ENV_DEFAULT" },
		{ GRES_CONF_SHARED, "SHARED" },
		{ GRES_CONF_ONE_SHARING, "ONE_SHARING" },
	};
	static char flag_str[128];
	const int table_len = sizeof(flag_table) / sizeof(flag_table[0]);
	const char *delim = "";

	flag_str[0] = '\0';
	for (int i = 0; i < table_len; i++) {
		if (!(config_flags & flag_table[i].flag))
			continue;
		/* All labels plus separators fit within flag_str */
		strcat(flag_str, delim);
		strcat(flag_str, flag_table[i].label);
		delim = ",";
	}

	return flag_str;
}

/*
 * Build a gres_slurmd_conf_t record from gres_slurmd_conf_in and store it in
 * gres_list.
 * IN/OUT gres_list - list of gres_slurmd_conf_t records. If its first record
 *	has a count of 0 that record is overwritten in place, otherwise a new
 *	record is appended.
 * IN/OUT gres_slurmd_conf_in - template to copy from. Strings and the CPU
 *	bitmap are duplicated, so the caller keeps ownership of the template.
 *	Its config_flags gain GRES_CONF_ENV_SET when GRES_CONF_ENV_DEF is set.
 */
extern void add_gres_to_list(list_t *gres_list,
			     gres_slurmd_conf_t *gres_slurmd_conf_in)
{
	gres_slurmd_conf_t *rec = list_peek(gres_list);
	bool reuse_placeholder = (rec && (rec->count == 0));

	/*
	 * A leading record with a count of 0 is a placeholder created in
	 * _merge_config(); fill it in rather than adding another record.
	 * NOTE(review): members the placeholder already holds are overwritten
	 * below without being freed - confirm it never carries any.
	 */
	if (!reuse_placeholder)
		rec = xmalloc(sizeof(*rec));

	rec->cpu_cnt = gres_slurmd_conf_in->cpu_cnt;
	if (gres_slurmd_conf_in->cpus_bitmap) {
		bitstr_t *affinity = bit_copy(gres_slurmd_conf_in->cpus_bitmap);

		/*
		 * Resize the copy (down, or possibly up) so that bitmaps of
		 * system-detected devices are the same size as bitmaps of
		 * configured devices.
		 */
		if (bit_size(affinity) != gres_slurmd_conf_in->cpu_cnt) {
			/* Fewest bits able to hold the highest set CPU */
			int64_t min_bits = bit_fls(affinity) + 1;

			if (min_bits > gres_slurmd_conf_in->cpu_cnt) {
				char *mask_str = bit_fmt_hexmask_trim(affinity);
				fatal("This CPU affinity bitmask (%s) does not fit within the CPUs configured for this node (%d). Make sure that the node's CPU count is configured correctly.",
				      mask_str, gres_slurmd_conf_in->cpu_cnt);
				xfree(mask_str);
			}
			bit_realloc(affinity, gres_slurmd_conf_in->cpu_cnt);
		}
		rec->cpus_bitmap = affinity;
	}

	/* "Default" env flag means all env flags: expand it on the template */
	if ((gres_slurmd_conf_in->config_flags & GRES_CONF_ENV_DEF) &&
	    ((gres_slurmd_conf_in->config_flags & GRES_CONF_ENV_SET) !=
	     GRES_CONF_ENV_SET))
		gres_slurmd_conf_in->config_flags |= GRES_CONF_ENV_SET;

	/* Start from the template's flags, then add what can be derived */
	rec->config_flags = gres_slurmd_conf_in->config_flags;
	if (gres_slurmd_conf_in->file) {
		hostlist_t *file_hl =
			hostlist_create(gres_slurmd_conf_in->file);

		rec->config_flags |= GRES_CONF_HAS_FILE;
		/* The file expression expands to more than one entry */
		if (hostlist_count(file_hl) > 1)
			rec->config_flags |= GRES_CONF_HAS_MULT;
		hostlist_destroy(file_hl);
	}
	if (gres_slurmd_conf_in->type_name)
		rec->config_flags |= GRES_CONF_HAS_TYPE;

	/* Deep copy the remaining members */
	rec->name = xstrdup(gres_slurmd_conf_in->name);
	rec->type_name = xstrdup(gres_slurmd_conf_in->type_name);
	rec->file = xstrdup(gres_slurmd_conf_in->file);
	rec->cpus = xstrdup(gres_slurmd_conf_in->cpus);
	rec->links = xstrdup(gres_slurmd_conf_in->links);
	rec->unique_id = xstrdup(gres_slurmd_conf_in->unique_id);
	rec->count = gres_slurmd_conf_in->count;
	rec->plugin_id = gres_build_id(gres_slurmd_conf_in->name);

	/* A reused placeholder is already on the list */
	if (!reuse_placeholder)
		list_append(gres_list, rec);
}

/*
 * Prefix every element of a comma separated GRES string with "gres/".
 * IN gres_str - comma separated GRES specification, may be NULL
 * RET newly allocated string, or NULL if gres_str is NULL. Elements that
 *	already start with "gres/" do not end up with a doubled prefix.
 */
extern char *gres_prepend_tres_type(const char *gres_str)
{
	char *tres_str;

	if (!gres_str)
		return NULL;

	/* Prefix the first element, then every element after a comma */
	tres_str = xstrdup_printf("gres/%s", gres_str);
	xstrsubstituteall(tres_str, ",", ",gres/");
	/* Collapse prefixes doubled up by the two steps above */
	xstrsubstituteall(tres_str, "gres/gres/", "gres/");

	return tres_str;
}

/*
 * Decide whether only an already busy device of this GRES may be used.
 * IN gres_state_node - node GRES state to test
 * IN use_total_gres - if true the answer is always false
 * RET true only if the GRES is shared, flagged GRES_CONF_ONE_SHARING and
 *	already has a non-zero allocated count on the node
 */
extern bool gres_use_busy_dev(gres_state_t *gres_state_node,
			      bool use_total_gres)
{
	gres_node_state_t *node_state = gres_state_node->gres_data;

	if (use_total_gres)
		return false;
	if (!gres_id_shared(gres_state_node->config_flags))
		return false;
	if (!(gres_state_node->config_flags & GRES_CONF_ONE_SHARING))
		return false;

	/* If in use, we must use the ONE already active GRES of this type */
	return (node_state->gres_cnt_alloc != 0);
}

/*
 * Return the plugin id made from gres_build_id("gpu").
 * This simply hands back the value stored in gpu_plugin_id.
 */
extern uint32_t gres_get_gpu_plugin_id(void)
{
	return gpu_plugin_id;
}

/*
 * Test whether a GRES name is known.
 * IN name - GRES name to look up
 * RET false if name is NULL or empty, otherwise true if
 *	gres_get_system_cnt() returns anything other than NO_VAL64 for it
 */
extern bool gres_valid_name(char *name)
{
	if (!name || !name[0])
		return false;

	return (gres_get_system_cnt(name, false) != NO_VAL64);
}
