/*****************************************************************************\
* gpu_nvml.c - Support nvml interface to an Nvidia GPU.
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#define _GNU_SOURCE
#include <nvml.h>
#include <math.h>
#include "../common/gpu_common.h"
#if defined (__APPLE__)
extern slurmd_conf_t *conf __attribute__((weak_import));
#else
slurmd_conf_t *conf = NULL;
#endif
/*
* #defines needed to test nvml.
*/
#define MAX_CPUS 0x8000
#define ULONG_BYTES (sizeof(unsigned long))
#define ULONG_BITS (ULONG_BYTES * 8)
/*
* The # of unsigned longs needed to accommodate a bitmask array capable
* of representing MAX_CPUS cpus (will vary if 32-bit or 64-bit)
* E.g. for a 130 CPU 64-bit machine: (130 + 63) / 64 = 3.02
* -> Integer division floor -> 3 ulongs to represent 130 CPUs
*/
#define CPU_SET_SIZE ((MAX_CPUS + (ULONG_BITS-1)) / ULONG_BITS)
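/*
 * Illustrative arithmetic only (not used by the code): with 64-bit unsigned
 * longs, CPU_SET_SIZE = (0x8000 + 63) / 64 = 512 ulongs; with 32-bit
 * unsigned longs it would be (0x8000 + 31) / 32 = 1024.
 */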
#define NVLINK_SELF -1
#define NVLINK_NONE 0
#define FREQS_SIZE 512
#define MIG_LINE_SIZE 128
typedef struct {
char *files; /* Includes MIG cap files and parent GPU device file */
char *links; /* MIG doesn't support NVLinks, but use for sorting */
char *profile_name; /* <GPU_type>_<slice_cnt>g.<mem>gb */
char *unique_id;
/* `MIG-<GPU-UUID>/<GPU instance ID>/<compute instance ID>` */
} nvml_mig_t;
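/*
 * Illustrative example of a populated nvml_mig_t (actual values depend on the
 * GPU model, driver, and MIG configuration):
 *   profile_name: "a100_3g.20gb"
 *   unique_id:    "MIG-<GPU-UUID>/1/0" (older-driver format; see
 *                 _nvml_use_mig_uuid() below)
 *   files:        "/dev/nvidia0,/dev/nvidia-caps/nvidia-cap21,/dev/nvidia-caps/nvidia-cap22"
 */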
static bitstr_t *saved_gpus = NULL;
/*
* These variables are required by the generic plugin interface. If they
* are not found in the plugin, the plugin loader will ignore the plugin.
*
* plugin_name - A string giving a human-readable description of the
* plugin. There is no maximum length, but the symbol must refer to
* a valid string.
*
* plugin_type - A string suggesting the type of the plugin or its
* applicability to a particular form of data or method of data handling.
* If the low-level plugin API is used, the contents of this string are
* unimportant and may be anything. Slurm uses the higher-level plugin
* interface which requires this string to be of the form
*
* <application>/<method>
*
* where <application> is a description of the intended application of
* the plugin (e.g., "auth" for Slurm authentication) and <method> is a
* description of how this plugin satisfies that application. Slurm will
* only load authentication plugins if the plugin_type string has a prefix
* of "auth/".
*
* plugin_version - an unsigned 32-bit integer containing the Slurm version
* (major.minor.micro combined into a single number).
*/
const char plugin_name[] = "GPU NVML plugin";
const char plugin_type[] = "gpu/nvml";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
static int gpumem_pos = -1;
static int gpuutil_pos = -1;
static pid_t init_pid = 0;
/*
* Converts a cpu_set returned from the NVML API into a Slurm bitstr_t
*
* This function accounts for the endianness of the machine.
*
* cpu_set_bitstr: (IN/OUT) A preallocated bitstr_t via bit_alloc() that is
* bitstr_size bits wide. This will get filled in.
* cpu_set: (IN) The cpu_set array returned by nvmlDeviceGetCpuAffinity()
* cpu_set_size: (IN) The size of the cpu_set array
*/
static void _set_cpu_set_bitstr(bitstr_t *cpu_set_bitstr,
unsigned long *cpu_set,
unsigned int cpu_set_size)
{
int j, k, b;
int bit_cur;
int bitstr_bits = (int) bit_size(cpu_set_bitstr);
int cpu_set_bits = (cpu_set_size * ULONG_BITS);
// If this fails, then something went horribly wrong
if (bitstr_bits != cpu_set_bits)
fatal("%s: bitstr_bits != cpu_set_bits", __func__);
bit_cur = bitstr_bits - 1;
// Iterate through each cpu_set long int
for (j = cpu_set_size - 1; j >= 0; --j) {
// Iterate through the bytes of the jth ulong bitmask
char *bitmask = (char *) &cpu_set[j];
#ifdef SLURM_BIGENDIAN
for (k = 0; k < ULONG_BYTES; ++k) {
#else
for (k = ULONG_BYTES - 1; k >= 0; --k) {
#endif // SLURM_BIGENDIAN
unsigned char byte = bitmask[k];
unsigned char mask;
// If byte is zero, nothing to set
if (byte == 0) {
bit_cur -= 8;
continue;
}
// Test each bit of byte, from MSB to LSB. Set if needed
mask = 0x80;
for (b = 0; b < 8; ++b) {
if (byte & mask)
bit_set(cpu_set_bitstr, bit_cur);
mask >>= 1;
bit_cur--;
}
xassert(mask == 0x00);
}
}
xassert(bit_cur == -1);
// If NVML gave us an empty CPU affinity, then something is very wrong
if (bit_set_count(cpu_set_bitstr) == 0)
fatal("%s: cpu_set_bitstr is empty! No CPU affinity for device",
__func__);
}
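/*
 * Worked example of the conversion above (values chosen for illustration):
 * if cpu_set[0] == 0x3 and every other element is 0, then bits 0 and 1 of
 * cpu_set_bitstr end up set, i.e. the GPU is affinitized to CPUs 0 and 1,
 * regardless of the machine's endianness.
 */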
/*
* Initialize the NVML library. This takes a few seconds
*/
static void _nvml_init(void)
{
pid_t my_pid = conf->pid ? conf->pid : getpid();
nvmlReturn_t nvml_rc;
if (init_pid == my_pid)
return;
init_pid = my_pid;
DEF_TIMERS;
START_TIMER;
nvml_rc = nvmlInit();
END_TIMER;
debug3("nvmlInit() took %ld microseconds", DELTA_TIMER);
if (nvml_rc != NVML_SUCCESS)
error("Failed to initialize NVML: %s",
nvmlErrorString(nvml_rc));
else
debug2("Successfully initialized NVML");
}
/*
* Undo _nvml_init
*/
static void _nvml_shutdown(void)
{
nvmlReturn_t nvml_rc;
DEF_TIMERS;
START_TIMER;
nvml_rc = nvmlShutdown();
init_pid = 0;
END_TIMER;
debug3("nvmlShutdown() took %ld microseconds", DELTA_TIMER);
if (nvml_rc != NVML_SUCCESS)
error("Failed to shut down NVML: %s", nvmlErrorString(nvml_rc));
else
debug2("Successfully shut down NVML");
}
/*
* Get the handle to the GPU for the passed index
*
* index (IN) The GPU index (corresponds to PCI Bus ID order)
* device (OUT) The device handle
*
* Returns true if successful, false if not
*/
static bool _nvml_get_handle(int index, nvmlDevice_t *device)
{
nvmlReturn_t nvml_rc;
nvml_rc = nvmlDeviceGetHandleByIndex(index, device);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get device handle for GPU %d: %s", index,
nvmlErrorString(nvml_rc));
return false;
}
return true;
}
/*
* Get all possible memory frequencies for the device
*
* device (IN) The device handle
* mem_freqs_size (IN/OUT) The size of the mem_freqs array; this will be
* overwritten with the number of memory freqs found.
* mem_freqs (OUT) The possible memory frequencies, sorted in
* descending order
*
* Return true if successful, false if not.
*/
static bool _nvml_get_mem_freqs(nvmlDevice_t *device, uint32_t *mem_freqs_size,
uint32_t *mem_freqs)
{
nvmlReturn_t nvml_rc;
DEF_TIMERS;
START_TIMER;
unsigned int *nvml_mem_freqs = mem_freqs;
unsigned int *nvml_mem_freqs_size = mem_freqs_size;
nvml_rc = nvmlDeviceGetSupportedMemoryClocks(*device,
nvml_mem_freqs_size,
nvml_mem_freqs);
END_TIMER;
debug3("nvmlDeviceGetSupportedMemoryClocks() took %ld microseconds",
DELTA_TIMER);
if (nvml_rc != NVML_SUCCESS) {
error("%s: Failed to get supported memory frequencies for the "
"GPU : %s", __func__, nvmlErrorString(nvml_rc));
return false;
}
qsort(mem_freqs, *mem_freqs_size, sizeof(uint32_t),
slurm_sort_uint32_list_desc);
if ((*mem_freqs_size > 1) &&
(mem_freqs[0] <= mem_freqs[(*mem_freqs_size)-1])) {
error("%s: mem frequencies are not stored in descending order!",
__func__);
return false;
}
return true;
}
/*
* Get all possible graphics frequencies for the device
*
* device (IN) The device handle
* mem_freq (IN) The memory frequency to get graphics freqs for.
* gfx_freqs_size (IN/OUT) The size of the gfx_freqs array; this will
* be overwritten with the number of graphics freqs found.
* gfx_freqs (OUT) The possible graphics frequencies, sorted in
* descending order
*
* Return true if successful, false if not.
*/
static bool _nvml_get_gfx_freqs(nvmlDevice_t *device, uint32_t mem_freq,
uint32_t *gfx_freqs_size, uint32_t *gfx_freqs)
{
nvmlReturn_t nvml_rc;
DEF_TIMERS;
START_TIMER;
unsigned int *nvml_gfx_freqs = gfx_freqs;
unsigned int *nvml_gfx_freqs_size = gfx_freqs_size;
nvml_rc = nvmlDeviceGetSupportedGraphicsClocks(*device, mem_freq,
nvml_gfx_freqs_size,
nvml_gfx_freqs);
END_TIMER;
debug3("nvmlDeviceGetSupportedGraphicsClocks() took %ld microseconds",
DELTA_TIMER);
if (nvml_rc != NVML_SUCCESS) {
error("%s: Failed to get supported graphics frequencies for the"
" GPU at mem frequency %u: %s", __func__, mem_freq,
nvmlErrorString(nvml_rc));
return false;
}
qsort(gfx_freqs, *gfx_freqs_size, sizeof(uint32_t),
slurm_sort_uint32_list_desc);
if ((*gfx_freqs_size > 1) &&
(gfx_freqs[0] <= gfx_freqs[(*gfx_freqs_size)-1])) {
error("%s: gfx frequencies are not stored in descending order!",
__func__);
return false;
}
return true;
}
/*
* Print out all possible graphics frequencies for the given device and mem
* freq. If there are many frequencies, only prints out a few.
*
* device (IN) The device handle
* mem_freq (IN) The memory frequency to get graphics freqs for.
* gfx_freqs_size (IN) The size of the gfx_freqs array
* gfx_freqs (IN) A preallocated empty array of size gfx_freqs_size
* to fill with possible graphics frequencies
* l (IN) The log level at which to print
*
* NOTE: The contents of gfx_freqs will be modified during use.
*/
static void _nvml_print_gfx_freqs(nvmlDevice_t *device, uint32_t mem_freq,
uint32_t gfx_freqs_size, uint32_t *gfx_freqs,
log_level_t l)
{
uint32_t size = gfx_freqs_size;
if (!_nvml_get_gfx_freqs(device, mem_freq, &size, gfx_freqs))
return;
gpu_common_print_freqs(gfx_freqs, size, l, "GPU Graphics", 8);
}
/*
* Print out all possible memory and graphics frequencies for the given device.
* If there are more than FREQS_CONCISE frequencies, prints a summary instead
*
* device (IN) The device handle
* l (IN) The log level at which to print
*/
static void _nvml_print_freqs(nvmlDevice_t *device, log_level_t l)
{
uint32_t mem_size = FREQS_SIZE;
uint32_t mem_freqs[FREQS_SIZE] = {0};
uint32_t gfx_freqs[FREQS_SIZE] = {0};
uint32_t i;
bool concise = false;
if (!_nvml_get_mem_freqs(device, &mem_size, mem_freqs))
return;
if (mem_size > FREQS_CONCISE)
concise = true;
log_var(l, "Possible GPU Memory Frequencies (%u):", mem_size);
log_var(l, "-------------------------------");
if (concise) {
// first, next, ..., middle, ..., penultimate, last
unsigned int tmp;
log_var(l, " *%u MHz [0]", mem_freqs[0]);
_nvml_print_gfx_freqs(device, mem_freqs[0], FREQS_SIZE,
gfx_freqs, l);
log_var(l, " *%u MHz [1]", mem_freqs[1]);
_nvml_print_gfx_freqs(device, mem_freqs[1], FREQS_SIZE,
gfx_freqs, l);
log_var(l, " ...");
tmp = (mem_size - 1) / 2;
log_var(l, " *%u MHz [%u]", mem_freqs[tmp], tmp);
_nvml_print_gfx_freqs(device, mem_freqs[tmp], FREQS_SIZE,
gfx_freqs, l);
log_var(l, " ...");
tmp = mem_size - 2;
log_var(l, " *%u MHz [%u]", mem_freqs[tmp], tmp);
_nvml_print_gfx_freqs(device, mem_freqs[tmp], FREQS_SIZE,
gfx_freqs, l);
tmp = mem_size - 1;
log_var(l, " *%u MHz [%u]", mem_freqs[tmp], tmp);
_nvml_print_gfx_freqs(device, mem_freqs[tmp], FREQS_SIZE,
gfx_freqs, l);
return;
}
for (i = 0; i < mem_size; ++i) {
log_var(l," *%u MHz [%u]", mem_freqs[i], i);
_nvml_print_gfx_freqs(device, mem_freqs[i], FREQS_SIZE,
gfx_freqs, l);
}
}
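/*
 * Sketch of the concise output above, assuming a device that reports 7 memory
 * frequencies and FREQS_CONCISE < 7: entries [0], [1], [3] (middle), [5]
 * (penultimate) and [6] (last) are printed with "..." between the gaps, and
 * the supported graphics frequencies are listed under each entry.
 */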
/*
* Get the nearest valid memory and graphics clock frequencies
*
* device (IN) The NVML GPU device handle
* mem_freq (IN/OUT) The requested memory frequency, in MHz. This
* will be overwritten with the output value, if different.
* gfx_freq (IN/OUT) The requested graphics frequency, in MHz. This
* will be overwritten with the output value, if different.
*/
static void _nvml_get_nearest_freqs(nvmlDevice_t *device, uint32_t *mem_freq,
uint32_t *gfx_freq)
{
uint32_t mem_freqs[FREQS_SIZE] = {0};
uint32_t mem_freqs_size = FREQS_SIZE;
uint32_t gfx_freqs[FREQS_SIZE] = {0};
uint32_t gfx_freqs_size = FREQS_SIZE;
// Get the memory frequencies
if (!_nvml_get_mem_freqs(device, &mem_freqs_size, mem_freqs))
return;
// Set the nearest valid memory frequency for the requested frequency
gpu_common_get_nearest_freq(mem_freq, mem_freqs_size, mem_freqs);
// Get the graphics frequencies at this memory frequency
if (!_nvml_get_gfx_freqs(device, *mem_freq, &gfx_freqs_size, gfx_freqs))
return;
// Set the nearest valid graphics frequency for the requested frequency
gpu_common_get_nearest_freq(gfx_freq, gfx_freqs_size, gfx_freqs);
}
/*
* Set the memory and graphics clock frequencies for the GPU
*
* device (IN) The NVML GPU device handle
* mem_freq (IN) The memory clock frequency, in MHz
* gfx_freq (IN) The graphics clock frequency, in MHz
*
* Returns true if successful, false if not
*/
static bool _nvml_set_freqs(nvmlDevice_t *device, uint32_t mem_freq,
uint32_t gfx_freq)
{
nvmlReturn_t nvml_rc;
DEF_TIMERS;
START_TIMER;
nvml_rc = nvmlDeviceSetApplicationsClocks(*device, mem_freq, gfx_freq);
END_TIMER;
debug3("nvmlDeviceSetApplicationsClocks(%u, %u) took %ld microseconds",
mem_freq, gfx_freq, DELTA_TIMER);
if (nvml_rc != NVML_SUCCESS) {
error("%s: Failed to set memory and graphics clock frequency "
"pair (%u, %u) for the GPU: %s", __func__, mem_freq,
gfx_freq, nvmlErrorString(nvml_rc));
return false;
}
return true;
}
/*
* Reset the memory and graphics clock frequencies for the GPU to the same
* default frequencies that are used after system reboot or driver reload. This
* default cannot be changed.
*
* device (IN) The NVML GPU device handle
*
* Returns true if successful, false if not
*/
static bool _nvml_reset_freqs(nvmlDevice_t *device)
{
nvmlReturn_t nvml_rc;
DEF_TIMERS;
START_TIMER;
nvml_rc = nvmlDeviceResetApplicationsClocks(*device);
END_TIMER;
debug3("nvmlDeviceResetApplicationsClocks() took %ld microseconds",
DELTA_TIMER);
if (nvml_rc != NVML_SUCCESS) {
error("%s: Failed to reset GPU frequencies to the hardware default: %s",
__func__, nvmlErrorString(nvml_rc));
return false;
}
return true;
}
/*
* Get the memory or graphics clock frequency that the GPU is currently running
* at
*
* device (IN) The NVML GPU device handle
* type (IN) The clock type to query. Either NVML_CLOCK_GRAPHICS or
* NVML_CLOCK_MEM.
*
* Returns the clock frequency in MHz if successful, or 0 if not
*/
static uint32_t _nvml_get_freq(nvmlDevice_t *device, nvmlClockType_t type)
{
nvmlReturn_t nvml_rc;
uint32_t freq = 0;
char *type_str = "unknown";
DEF_TIMERS;
switch (type) {
case NVML_CLOCK_GRAPHICS:
type_str = "graphics";
break;
case NVML_CLOCK_MEM:
type_str = "memory";
break;
default:
error("%s: Unsupported clock type", __func__);
break;
}
START_TIMER;
unsigned int *nvml_freq = &freq;
nvml_rc = nvmlDeviceGetApplicationsClock(*device, type, nvml_freq);
END_TIMER;
debug3("nvmlDeviceGetApplicationsClock(%s) took %ld microseconds",
type_str, DELTA_TIMER);
if (nvml_rc != NVML_SUCCESS) {
error("%s: Failed to get the GPU %s frequency: %s", __func__,
type_str, nvmlErrorString(nvml_rc));
return 0;
}
return freq;
}
static uint32_t _nvml_get_gfx_freq(nvmlDevice_t *device)
{
return _nvml_get_freq(device, NVML_CLOCK_GRAPHICS);
}
static uint32_t _nvml_get_mem_freq(nvmlDevice_t *device)
{
return _nvml_get_freq(device, NVML_CLOCK_MEM);
}
/*
* Reset the frequencies of each GPU in the step to the hardware default
* NOTE: NVML must be initialized beforehand
*
* gpus (IN) A bitmap specifying the GPUs on which to operate.
*/
static void _reset_freq(bitstr_t *gpus)
{
int gpu_len = bit_size(gpus);
int i = -1, count = 0, count_set = 0;
bool freq_reset = false;
/*
* Reset the frequency of each device allocated to the step
*/
for (i = 0; i < gpu_len; i++) {
nvmlDevice_t device;
if (!bit_test(gpus, i))
continue;
count++;
if (!_nvml_get_handle(i, &device))
continue;
debug2("Memory frequency before reset: %u",
_nvml_get_mem_freq(&device));
debug2("Graphics frequency before reset: %u",
_nvml_get_gfx_freq(&device));
freq_reset =_nvml_reset_freqs(&device);
debug2("Memory frequency after reset: %u",
_nvml_get_mem_freq(&device));
debug2("Graphics frequency after reset: %u",
_nvml_get_gfx_freq(&device));
if (freq_reset) {
log_flag(GRES, "Successfully reset GPU[%d]", i);
count_set++;
} else {
log_flag(GRES, "Failed to reset GPU[%d]", i);
}
}
if (count_set != count) {
log_flag(GRES, "%s: Could not reset frequencies for all GPUs. Set %d/%d total GPUs",
__func__, count_set, count);
fprintf(stderr, "Could not reset frequencies for all GPUs. "
"Set %d/%d total GPUs\n", count_set, count);
}
}
/*
* Set the frequencies of each GPU specified for the step
* NOTE: NVML must be initialized beforehand
*
* gpus (IN) A bitmap specifying the GPUs on which to operate.
* gpu_freq (IN) The frequencies to set each of the GPUs to. If a NULL or
* empty memory or graphics frequency is specified, then GpuFreqDef
* will be consulted, which defaults to "high,memory=high" if not
* set.
*/
static void _set_freq(bitstr_t *gpus, char *gpu_freq)
{
bool verbose_flag = false;
int gpu_len = 0;
int i = -1, count = 0, count_set = 0;
unsigned int gpu_freq_num = 0, mem_freq_num = 0;
bool freq_set = false, freq_logged = false;
char *tmp = NULL;
bool task_cgroup = false;
bool constrained_devices = false;
bool cgroups_active = false;
/*
* Parse frequency information
*/
debug2("_parse_gpu_freq(%s)", gpu_freq);
gpu_common_parse_gpu_freq(gpu_freq, &gpu_freq_num, &mem_freq_num,
&verbose_flag);
if (verbose_flag)
debug2("verbose_flag ON");
tmp = gpu_common_freq_value_to_string(mem_freq_num);
debug2("Requested GPU memory frequency: %s", tmp);
xfree(tmp);
tmp = gpu_common_freq_value_to_string(gpu_freq_num);
debug2("Requested GPU graphics frequency: %s", tmp);
xfree(tmp);
if (!mem_freq_num && !gpu_freq_num) {
debug2("%s: No frequencies to set", __func__);
return;
}
// Check if GPUs are constrained by cgroups
cgroup_conf_init();
if (slurm_cgroup_conf.constrain_devices)
constrained_devices = true;
// Check if task/cgroup plugin is loaded
if (xstrstr(slurm_conf.task_plugin, "cgroup"))
task_cgroup = true;
// If both of these are true, then GPUs will be constrained
if (constrained_devices && task_cgroup) {
cgroups_active = true;
gpu_len = bit_set_count(gpus);
debug2("%s: cgroups are configured. Using LOCAL GPU IDs",
__func__);
} else {
gpu_len = bit_size(gpus);
debug2("%s: cgroups are NOT configured. Assuming GLOBAL GPU IDs",
__func__);
}
/*
* Set the frequency of each device allocated to the step
*/
for (i = 0; i < gpu_len; i++) {
char *sep = "";
nvmlDevice_t device;
unsigned int gpu_freq = gpu_freq_num, mem_freq = mem_freq_num;
// Only check the global GPU bitstring if not using cgroups
if (!cgroups_active && !bit_test(gpus, i)) {
debug2("Passing over NVML device %u", i);
continue;
}
count++;
if (!_nvml_get_handle(i, &device))
continue;
debug2("Setting frequency of NVML device %u", i);
_nvml_get_nearest_freqs(&device, &mem_freq, &gpu_freq);
debug2("Memory frequency before set: %u",
_nvml_get_mem_freq(&device));
debug2("Graphics frequency before set: %u",
_nvml_get_gfx_freq(&device));
freq_set = _nvml_set_freqs(&device, mem_freq, gpu_freq);
debug2("Memory frequency after set: %u",
_nvml_get_mem_freq(&device));
debug2("Graphics frequency after set: %u",
_nvml_get_gfx_freq(&device));
if (mem_freq) {
xstrfmtcat(tmp, "%smemory_freq:%u", sep, mem_freq);
sep = ",";
}
if (gpu_freq) {
xstrfmtcat(tmp, "%sgraphics_freq:%u", sep, gpu_freq);
}
if (freq_set) {
log_flag(GRES, "Successfully set GPU[%d] %s", i, tmp);
count_set++;
} else {
log_flag(GRES, "Failed to set GPU[%d] %s", i, tmp);
}
if (verbose_flag && !freq_logged) {
fprintf(stderr, "GpuFreq=%s\n", tmp);
freq_logged = true; /* Just log for first GPU */
}
xfree(tmp);
}
if (count_set != count) {
log_flag(GRES, "%s: Could not set frequencies for all GPUs. Set %d/%d total GPUs",
__func__, count_set, count);
fprintf(stderr, "Could not set frequencies for all GPUs. "
"Set %d/%d total GPUs\n", count_set, count);
}
}
/*
* Get the version of the system's graphics driver
*/
static void _nvml_get_driver(char *driver, unsigned int len)
{
nvmlReturn_t nvml_rc = nvmlSystemGetDriverVersion(driver, len);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get the NVIDIA graphics driver version: %s",
nvmlErrorString(nvml_rc));
driver[0] = '\0';
}
}
/*
* Get the version of the NVML library
*/
static void _nvml_get_version(char *version, unsigned int len)
{
nvmlReturn_t nvml_rc = nvmlSystemGetNVMLVersion(version, len);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get the NVML library version: %s",
nvmlErrorString(nvml_rc));
version[0] = '\0';
}
}
/*
* Get the total # of GPUs in the system
*/
extern void gpu_p_get_device_count(uint32_t *device_count)
{
unsigned int *nvml_device_count = device_count;
nvmlReturn_t nvml_rc = nvmlDeviceGetCount(nvml_device_count);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get device count: %s",
nvmlErrorString(nvml_rc));
*device_count = 0;
}
}
/*
* Get the name of the GPU
*/
static void _nvml_get_device_name(nvmlDevice_t *device, char *device_name,
unsigned int size)
{
nvmlReturn_t nvml_rc = nvmlDeviceGetName(*device, device_name, size);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get name of the GPU: %s",
nvmlErrorString(nvml_rc));
}
gpu_common_underscorify_tolower(device_name);
}
/*
* Get the UUID of the device, since device index can fluctuate
*/
static void _nvml_get_device_uuid(nvmlDevice_t *device, char *uuid,
unsigned int len)
{
nvmlReturn_t nvml_rc = nvmlDeviceGetUUID(*device, uuid, len);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get UUID of GPU: %s",
nvmlErrorString(nvml_rc));
}
}
/*
* Get the PCI Bus ID of the device, since device index can fluctuate
*/
static void _nvml_get_device_pci_info(nvmlDevice_t *device, nvmlPciInfo_t *pci)
{
nvmlReturn_t nvml_rc = nvmlDeviceGetPciInfo(*device, pci);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get PCI info of GPU: %s",
nvmlErrorString(nvml_rc));
}
}
/*
* Retrieves minor number for the device. The minor number for the device is
* such that the Nvidia device node file for each GPU will have the form
* /dev/nvidia[minor_number].
*/
static void _nvml_get_device_minor_number(nvmlDevice_t *device,
unsigned int *minor)
{
nvmlReturn_t nvml_rc = nvmlDeviceGetMinorNumber(*device, minor);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get minor number of GPU: %s",
nvmlErrorString(nvml_rc));
*minor = NO_VAL;
}
}
/*
* Retrieves an array of unsigned longs (sized to cpuSetSize) of bitmasks with
* the ideal CPU affinity for the GPU.
*
* cpu_set: an array reference in which to return a bitmask of CPUs. 64 CPUs per
* unsigned long on 64-bit machines, 32 on 32-bit machines.
*
* For example, on 32-bit machines, if processors 0, 1, 32, and 33 are ideal for
* the device and cpuSetSize == 2, result[0] = 0x3, result[1] = 0x3.
*/
static void _nvml_get_device_affinity(nvmlDevice_t *device, unsigned int size,
unsigned long *cpu_set)
{
nvmlReturn_t nvml_rc = nvmlDeviceGetCpuAffinity(*device, size, cpu_set);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get cpu affinity of GPU: %s",
nvmlErrorString(nvml_rc));
}
}
/*
* Returns the busId string of the connected endpoint device of an nvlink lane.
* If query fails, an empty string is returned.
* The returned string must be xfree'd.
*
* device - the GPU device
* lane - the nvlink lane that we are checking
*
* device <---lane---> endpoint/remote device
*/
static char *_nvml_get_nvlink_remote_pcie(nvmlDevice_t *device,
unsigned int lane)
{
nvmlPciInfo_t pci_info;
nvmlReturn_t nvml_rc;
memset(&pci_info, 0, sizeof(pci_info));
nvml_rc = nvmlDeviceGetNvLinkRemotePciInfo(*device, lane, &pci_info);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get PCI info of endpoint device for lane %d: %s",
lane, nvmlErrorString(nvml_rc));
return xstrdup("");
} else {
return xstrdup(pci_info.busId);
}
}
/*
* Does a linear search for string str in array of strings str_arr, starting
* from index 0.
* Returns the index of the first match found, else returns -1 if not found.
*
* str - the string to search for
* str_array - the array of strings to search in
* size - the size of str_arr
*/
static int _get_index_from_str_arr(char *str, char **str_arr, unsigned int size)
{
int i;
if (str_arr == NULL || str == NULL)
return -1;
for (i = 0; i < size; ++i) {
if (xstrcmp(str, str_arr[i]) == 0) {
return i;
}
}
return -1;
}
/*
* Allocates and returns a string that is a comma-separated list of nvlink
* counts from this device to every other GPU (NVLINK_SELF for itself). If
* device_count is 0, an empty string is returned.
* The string must be xfree'd.
*
* device - the current GPU to get the nvlink info for
* index - the index of the current GPU as returned by NVML. Based on PCI bus id
* device_lut - an array of PCI busid's for each GPU. The index is the GPU index
* device_count - the size of device_lut
*/
static char *_nvml_get_nvlink_info(nvmlDevice_t *device, int index,
char **device_lut, uint32_t device_count)
{
unsigned int i;
nvmlReturn_t nvml_rc;
nvmlEnableState_t is_active;
int *links = xcalloc(device_count, sizeof(int));
char *links_str = NULL, *sep = "";
// Initialize links; xcalloc() already zeroed the array (0 == NVLINK_NONE)
links[index] = NVLINK_SELF;
// Query all nvlink lanes
for (i = 0; i < NVML_NVLINK_MAX_LINKS; ++i) {
nvml_rc = nvmlDeviceGetNvLinkState(*device, i, &is_active);
if (nvml_rc == NVML_ERROR_INVALID_ARGUMENT) {
debug3("Device/lane %d is invalid", i);
continue;
} else if (nvml_rc == NVML_ERROR_NOT_SUPPORTED) {
debug3("Device %d does not support "
"nvmlDeviceGetNvLinkState()", i);
break;
} else if (nvml_rc != NVML_SUCCESS) {
error("Failed to get nvlink info from GPU: %s",
nvmlErrorString(nvml_rc));
}
// See if nvlink lane is active
if (is_active == NVML_FEATURE_ENABLED) {
char *busid;
int k;
debug3("nvlink %d is enabled", i);
/*
* Count link endpoints to determine single and double
* links. E.g. if already a single link (1), increment
* to a double (2).
*/
busid = _nvml_get_nvlink_remote_pcie(device, i);
k = _get_index_from_str_arr(busid, device_lut,
device_count);
// Ignore self and not-founds
if ((k != index) && (k != -1)) {
links[k]++;
}
xfree(busid);
} else
debug3("nvlink %d is disabled", i);
}
// Convert links to comma separated string
for (i = 0; i < device_count; ++i) {
xstrfmtcat(links_str, "%s%d", sep, links[i]);
sep = ",";
}
xfree(links);
if (!links_str)
links_str = xstrdup("");
return links_str;
}
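/*
 * Illustrative links string for a hypothetical 4-GPU system, as seen from
 * GPU 0: "-1,2,1,0" means GPU 0 is itself (NVLINK_SELF), has a double link
 * to GPU 1, a single link to GPU 2, and no link to GPU 3. This matches the
 * Links= format used in gres.conf.
 */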
/* MIG requires CUDA 11.1 and NVIDIA driver 450.80.02 or later */
#if HAVE_MIG_SUPPORT
static void _free_nvml_mig_members(nvml_mig_t *nvml_mig)
{
if (!nvml_mig)
return;
xfree(nvml_mig->files);
xfree(nvml_mig->links);
xfree(nvml_mig->profile_name);
xfree(nvml_mig->unique_id);
}
/*
* Get the handle to the MIG device for the passed GPU device and MIG index
*
* device (IN) The GPU device handle
* mig_index (IN) The MIG index
* mig (OUT) The MIG device handle
*
* Returns true if successful, false if not
*/
static bool _nvml_get_mig_handle(nvmlDevice_t *device, unsigned int mig_index,
nvmlDevice_t *mig)
{
nvmlReturn_t nvml_rc = nvmlDeviceGetMigDeviceHandleByIndex(*device,
mig_index,
mig);
if (nvml_rc == NVML_ERROR_NOT_FOUND)
/* Not found is ok */
return false;
else if (nvml_rc != NVML_SUCCESS) {
error("Failed to get MIG device at MIG index %u: %s",
mig_index, nvmlErrorString(nvml_rc));
return false;
}
return true;
}
/*
* Get the maximum count of MIGs possible
*/
static void _nvml_get_max_mig_device_count(nvmlDevice_t *device,
unsigned int *count)
{
nvmlReturn_t nvml_rc = nvmlDeviceGetMaxMigDeviceCount(*device, count);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get MIG device count: %s",
nvmlErrorString(nvml_rc));
*count = 0;
return;
}
if (count && (*count == 0))
error("MIG device count is 0; MIG is either disabled or not supported");
}
/*
* Get the GPU instance ID of a MIG device handle
*/
static void _nvml_get_gpu_instance_id(nvmlDevice_t *mig, unsigned int *gi_id)
{
nvmlReturn_t nvml_rc = nvmlDeviceGetGpuInstanceId(*mig, gi_id);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get MIG GPU instance ID: %s",
nvmlErrorString(nvml_rc));
*gi_id = 0;
}
}
/*
* Get the compute instance ID of a MIG device handle
*/
static void _nvml_get_compute_instance_id(nvmlDevice_t *mig, unsigned int *ci_id)
{
nvmlReturn_t nvml_rc = nvmlDeviceGetComputeInstanceId(*mig, ci_id);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get MIG GPU instance ID: %s",
nvmlErrorString(nvml_rc));
*ci_id = 0;
}
}
/*
* Get the MIG mode of the device
*
* If current_mode is 1, that means the device is MIG-capable and enabled.
* If pending_mode is different than current_mode, then current_mode will be
* changed to match pending_mode on the next "activation trigger" (device
* unbind, device reset, or machine reboot)
*/
static void _nvml_get_device_mig_mode(nvmlDevice_t *device,
unsigned int *current_mode,
unsigned int *pending_mode)
{
nvmlReturn_t nvml_rc = nvmlDeviceGetMigMode(*device, current_mode,
pending_mode);
if (nvml_rc == NVML_ERROR_NOT_SUPPORTED)
/* This device doesn't support MIG mode */
return;
else if (nvml_rc != NVML_SUCCESS) {
error("Failed to get MIG mode of the GPU: %s",
nvmlErrorString(nvml_rc));
}
}
/*
* Get the minor numbers for the GPU instance and compute instance for a MIG
* device.
*
* gpu_minor (IN) The minor number of the parent GPU of the MIG device.
* gi_id (IN) The GPU instance ID of the MIG device.
* ci_id (IN) The compute instance ID of the MIG device.
* gi_minor (OUT) The minor number of the GPU instance.
* ci_minor (OUT) The minor number of the compute instance.
*
* Returns SLURM_SUCCESS on success and SLURM_ERROR on failure.
*/
static int _nvml_get_mig_minor_numbers(unsigned int gpu_minor,
unsigned int gi_id, unsigned int ci_id,
unsigned int *gi_minor,
unsigned int *ci_minor)
{
/* Parse mig-minors file for minor numbers */
FILE *fp = NULL;
int rc = SLURM_ERROR;
char *path = "/proc/driver/nvidia-caps/mig-minors";
char gi_fmt[MIG_LINE_SIZE];
char ci_fmt[MIG_LINE_SIZE];
char tmp_str[MIG_LINE_SIZE];
unsigned int tmp_val;
int i = 0;
/* You can't have more than 7 compute instances per GPU instance */
xassert(ci_id <= 7);
/* Clear storage for minor numbers */
*gi_minor = 0;
*ci_minor = 0;
fp = fopen(path, "r");
if (!fp) {
error("Could not open file `%s`", path);
return rc;
}
snprintf(gi_fmt, MIG_LINE_SIZE, "gpu%u/gi%u/access", gpu_minor,
gi_id);
snprintf(ci_fmt, MIG_LINE_SIZE, "gpu%u/gi%u/ci%u/access", gpu_minor,
gi_id, ci_id);
while (1) {
int found = 0;
int count = 0;
i++;
count = fscanf(fp, "%127s%u", tmp_str, &tmp_val);
if (count == EOF) {
error("mig-minors: %d: Reached end of file. Could not find GPU=%u|GI=%u|CI=%u",
i, gpu_minor, gi_id, ci_id);
break;
} else if (count != 2) {
error("mig-minors: %d: Could not find tmp_str and/or tmp_val",
i);
break;
}
if (!xstrcmp(tmp_str, gi_fmt)) {
found = 1;
*gi_minor = tmp_val;
}
if (!xstrcmp(tmp_str, ci_fmt)) {
found = 1;
*ci_minor = tmp_val;
}
if (found)
debug3("mig-minors: %d: Found `%s %u`", i, tmp_str,
tmp_val);
if ((*gi_minor != 0) && (*ci_minor != 0)) {
rc = SLURM_SUCCESS;
debug3("GPU:%u|GI:%u,GI_minor=%u|CI:%u,CI_minor=%u",
gpu_minor, gi_id, *gi_minor, ci_id, *ci_minor);
break;
}
}
fclose(fp);
return rc;
}
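/*
 * The mig-minors file consists of "<path> <minor>" pairs; for example
 * (minor numbers are illustrative only):
 *   gpu0/gi1/access     30
 *   gpu0/gi1/ci0/access 31
 * With gpu_minor=0, gi_id=1 and ci_id=0, the parser above would return
 * gi_minor=30 and ci_minor=31.
 */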
/*
 * Return true if MIG mode is currently enabled on the device, false otherwise.
 *
 * Also logs a notice if the pending MIG mode differs from the current mode;
 * the pending mode takes effect on the next "activation trigger" (device
 * unbind, device reset, or machine reboot).
 */
static bool _nvml_is_device_mig(nvmlDevice_t *device)
{
unsigned int current_mode = NVML_DEVICE_MIG_DISABLE;
unsigned int pending_mode = NVML_DEVICE_MIG_DISABLE;
_nvml_get_device_mig_mode(device, &current_mode, &pending_mode);
if (current_mode == NVML_DEVICE_MIG_DISABLE &&
pending_mode == NVML_DEVICE_MIG_ENABLE)
info("MIG is disabled, but set to be enabled on next GPU reset");
else if (current_mode == NVML_DEVICE_MIG_ENABLE &&
pending_mode == NVML_DEVICE_MIG_DISABLE)
info("MIG is enabled, but set to be disabled on next GPU reset");
if (current_mode == NVML_DEVICE_MIG_ENABLE)
return true;
else
return false;
}
/*
* According to NVIDIA documentation:
* "With drivers >= R470 (470.42.01+), each MIG device is assigned a GPU UUID
* starting with MIG-<UUID>."
* https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#:~:text=CUDA_VISIBLE_DEVICES%20has%20been,instance%20ID%3E
*/
static bool _nvml_use_mig_uuid()
{
static bool nvml_use_mig_uuid;
static bool set = false;
if (!set) {
int m_major = 470, m_minor = 42, m_rev = 1; /* 470.42.01 */
int major, minor, rev;
char v[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
_nvml_get_driver(v, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE);
sscanf(v, "%d.%d.%d", &major, &minor, &rev);
if ((major > m_major) ||
((major == m_major) && (minor > m_minor)) ||
((major == m_major) && (minor == m_minor) &&
(rev >= m_rev)))
nvml_use_mig_uuid = true;
else
nvml_use_mig_uuid = false;
set = true;
}
return nvml_use_mig_uuid;
}
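/*
 * Resulting unique_id formats (illustrative): with drivers >= 470.42.01 the
 * MIG UUID reported by NVML is used as-is (e.g. "MIG-<UUID>"); with older
 * drivers the ID is composed as "MIG-<GPU-UUID>/<GI ID>/<CI ID>", matching
 * the legacy CUDA_VISIBLE_DEVICES enumeration format.
 */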
/*
* Print out a MIG device and return a populated nvml_mig struct.
*
* device (IN) The MIG device handle
* gpu_minor (IN) The GPU minor number
* mig_index (IN) The MIG index
* gpu_uuid (IN) The UUID string of the parent GPU
* nvml_mig (OUT) An nvml_mig_t struct. This function sets profile_name,
* files, links, and unique_id. profile_name should already be
* populated with the parent GPU type string, and files should
* already be populated with the parent GPU device file.
*
* Returns SLURM_SUCCESS or SLURM_ERROR. Caller must xfree() struct fields.
*
* files includes a comma-separated string of NVIDIA capability device files
* (/dev/nvidia-caps/...) associated with the compute instance behind this MIG
* device.
*/
static int _handle_mig(nvmlDevice_t *device, unsigned int gpu_minor,
unsigned int mig_index, char *gpu_uuid,
nvml_mig_t *nvml_mig)
{
nvmlDevice_t mig;
/* Use the V2 size so it can fit extra MIG info */
char mig_uuid[NVML_DEVICE_UUID_V2_BUFFER_SIZE] = {0};
char device_name[NVML_DEVICE_NAME_BUFFER_SIZE] = {0};
char *str;
unsigned int gi_id;
unsigned int ci_id;
unsigned int gi_minor;
unsigned int ci_minor;
xassert(nvml_mig);
if (!_nvml_get_mig_handle(device, mig_index, &mig))
return SLURM_ERROR;
_nvml_get_device_uuid(&mig, mig_uuid,
NVML_DEVICE_UUID_V2_BUFFER_SIZE);
_nvml_get_gpu_instance_id(&mig, &gi_id);
_nvml_get_compute_instance_id(&mig, &ci_id);
if (_nvml_get_mig_minor_numbers(gpu_minor, gi_id, ci_id, &gi_minor,
&ci_minor) != SLURM_SUCCESS)
return SLURM_ERROR;
_nvml_get_device_name(&mig, device_name,
NVML_DEVICE_NAME_BUFFER_SIZE);
if (device_name[0] && (str = strstr(device_name, "mig_"))) {
/* Adding 3 to skip "mig" but keep "_" */
xstrfmtcat(nvml_mig->profile_name, "%s", str + 3);
} else { /* Backup: generate name from attributes */
nvmlDeviceAttributes_t attributes;
nvmlReturn_t nvml_rc;
nvml_rc = nvmlDeviceGetAttributes(mig, &attributes);
if (nvml_rc != NVML_SUCCESS) {
error("Failed to get MIG attributes: %s",
nvmlErrorString(nvml_rc));
return SLURM_ERROR;
}
xstrfmtcat(nvml_mig->profile_name, "_");
if (attributes.computeInstanceSliceCount !=
attributes.gpuInstanceSliceCount)
xstrfmtcat(nvml_mig->profile_name, "%uc.",
attributes.computeInstanceSliceCount);
/* Divide MB by 1024 (2^10) to get GB, rounding up */
xstrfmtcat(nvml_mig->profile_name, "%ug.%lugb",
attributes.gpuInstanceSliceCount,
(unsigned long)(ROUNDUP(attributes.memorySizeMB,
1024)));
}
if (_nvml_use_mig_uuid())
xstrfmtcat(nvml_mig->unique_id, "%s", mig_uuid);
else
xstrfmtcat(nvml_mig->unique_id, "MIG-%s/%u/%u", gpu_uuid, gi_id, ci_id);
/* Allow access to both the GPU instance and the compute instance */
xstrfmtcat(nvml_mig->files, ",/dev/nvidia-caps/nvidia-cap%u,/dev/nvidia-caps/nvidia-cap%u",
gi_minor, ci_minor);
debug2("GPU minor %u, MIG index %u:", gpu_minor, mig_index);
debug2(" MIG Profile: %s", nvml_mig->profile_name);
debug2(" MIG UUID: %s", mig_uuid);
debug2(" UniqueID: %s", nvml_mig->unique_id);
debug2(" GPU Instance (GI) ID: %u", gi_id);
debug2(" Compute Instance (CI) ID: %u", ci_id);
debug2(" GI Minor Number: %u", gi_minor);
debug2(" CI Minor Number: %u", ci_minor);
debug2(" Device Files: %s", nvml_mig->files);
return SLURM_SUCCESS;
}
#endif
/*
* Creates and returns a gres conf list of detected nvidia gpus on the node.
* If an error occurs, return NULL
* Caller is responsible for freeing the list.
*
* If the NVIDIA NVML API exists (comes with CUDA), then query GPU info,
* so the user doesn't need to specify manually in gres.conf.
* Specifically populate cpu affinity and nvlink information
*/
static list_t *_get_system_gpu_list_nvml(node_config_load_t *node_config)
{
bitstr_t *enabled_cpus_bits = NULL;
uint32_t i;
uint32_t device_count = 0;
list_t *gres_list_system = list_create(destroy_gres_slurmd_conf);
char driver[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
char version[NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE];
char **device_lut;
nvmlPciInfo_t pci_info;
xassert(node_config->xcpuinfo_mac_to_abs);
_nvml_init();
_nvml_get_driver(driver, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE);
_nvml_get_version(version, NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE);
debug("Systems Graphics Driver Version: %s", driver);
debug("NVML Library Version: %s", version);
debug2("NVML API Version: %u", NVML_API_VERSION);
#ifdef NVML_NO_UNVERSIONED_FUNC_DEFS
debug2("NVML_NO_UNVERSIONED_FUNC_DEFS is set, for backwards compatibility");
#endif
gpu_p_get_device_count(&device_count);
debug2("Total CPU count: %d", node_config->cpu_cnt);
debug2("Device count: %d", device_count);
// Create a device index --> PCI Bus ID lookup table
device_lut = xcalloc(device_count, sizeof(char *));
/*
* Loop through to create device to PCI busId lookup table
*/
for (i = 0; i < device_count; ++i) {
nvmlDevice_t device;
if (!_nvml_get_handle(i, &device))
continue;
memset(&pci_info, 0, sizeof(pci_info));
_nvml_get_device_pci_info(&device, &pci_info);
device_lut[i] = xstrdup(pci_info.busId);
}
if (!(slurm_conf.conf_flags & CONF_FLAG_ECORE)) {
enabled_cpus_bits = bit_alloc(MAX_CPUS);
for (i = 0; i < conf->block_map_size; i++) {
bit_set(enabled_cpus_bits, conf->block_map[i]);
}
}
/*
* Loop through all the GPUs on the system and add to gres_list_system
*/
for (i = 0; i < device_count; ++i) {
nvmlDevice_t device;
char uuid[NVML_DEVICE_UUID_BUFFER_SIZE] = {0};
unsigned int minor_number = 0;
unsigned long cpu_set[CPU_SET_SIZE] = {0};
char *cpu_aff_mac_range = NULL;
char *device_file = NULL;
char *nvlinks = NULL;
char device_name[NVML_DEVICE_NAME_BUFFER_SIZE] = {0};
bool mig_mode = false, added_mig = false;
gres_slurmd_conf_t gres_slurmd_conf = {
.config_flags =
GRES_CONF_ENV_NVML | GRES_CONF_AUTODETECT,
.count = 1,
.cpu_cnt = node_config->cpu_cnt,
.name = "gpu",
};
if (!_nvml_get_handle(i, &device)) {
error("Creating null GRES GPU record");
add_gres_to_list(gres_list_system, &gres_slurmd_conf);
continue;
}
#if HAVE_MIG_SUPPORT
mig_mode = _nvml_is_device_mig(&device);
#endif
memset(&pci_info, 0, sizeof(pci_info));
_nvml_get_device_name(&device, device_name,
NVML_DEVICE_NAME_BUFFER_SIZE);
_nvml_get_device_uuid(&device, uuid,
NVML_DEVICE_UUID_BUFFER_SIZE);
_nvml_get_device_pci_info(&device, &pci_info);
_nvml_get_device_minor_number(&device, &minor_number);
if (minor_number == NO_VAL)
continue;
_nvml_get_device_affinity(&device, CPU_SET_SIZE, cpu_set);
// Convert from nvml cpu bitmask to slurm bitstr_t (machine fmt)
gres_slurmd_conf.cpus_bitmap = bit_alloc(MAX_CPUS);
_set_cpu_set_bitstr(gres_slurmd_conf.cpus_bitmap,
cpu_set, CPU_SET_SIZE);
if (enabled_cpus_bits) {
/*
* Mask out E-cores that may be included from nvml's cpu
* affinity bitstring.
*/
bit_and(gres_slurmd_conf.cpus_bitmap,
enabled_cpus_bits);
}
// Convert from bitstr_t to cpu range str
cpu_aff_mac_range = bit_fmt_full(gres_slurmd_conf.cpus_bitmap);
// Convert cpu range str from machine to abstract(slurm) format
if (node_config->xcpuinfo_mac_to_abs(cpu_aff_mac_range,
&gres_slurmd_conf.cpus)) {
error(" Conversion from machine to abstract failed");
FREE_NULL_BITMAP(gres_slurmd_conf.cpus_bitmap);
xfree(cpu_aff_mac_range);
continue;
}
nvlinks = _nvml_get_nvlink_info(&device, i, device_lut,
device_count);
xstrfmtcat(device_file, "/dev/nvidia%u", minor_number);
debug2("GPU index %u:", i);
debug2(" Name: %s", device_name);
debug2(" UUID: %s", uuid);
debug2(" PCI Domain/Bus/Device: %u:%u:%u", pci_info.domain,
pci_info.bus, pci_info.device);
debug2(" PCI Bus ID: %s", pci_info.busId);
debug2(" NVLinks: %s", nvlinks);
debug2(" Device File (minor number): %s", device_file);
if (minor_number != i)
debug("Note: GPU index %u is different from minor "
"number %u", i, minor_number);
debug2(" CPU Affinity Range - Machine: %s",
cpu_aff_mac_range);
debug2(" Core Affinity Range - Abstract: %s",
gres_slurmd_conf.cpus);
debug2(" MIG mode: %s", mig_mode ? "enabled" : "disabled");
if (mig_mode) {
#if HAVE_MIG_SUPPORT
unsigned int max_mig_count;
unsigned int mig_count = 0;
char *tmp_device_name = xstrdup(device_name);
char *tok = xstrchr(tmp_device_name, '-');
if (tok) {
/*
* Here we are clearing everything after the
* first '-' so we can avoid the extra stuff
* after the real type of gpu since we are going
* to add a suffix here of the profile name.
*/
tok[0] = '\0';
}
_nvml_get_max_mig_device_count(&device, &max_mig_count);
/* Count number of actual MIGs */
for (unsigned int j = 0; j < max_mig_count; j++) {
nvmlDevice_t mig;
if (_nvml_get_mig_handle(&device, j, &mig)) {
/*
* Assume MIG indexes start at 0 and are
* contiguous
*/
xassert(j == mig_count);
mig_count++;
} else
break;
}
debug2(" MIG count: %u", mig_count);
if (mig_count == 0)
error("MIG mode is enabled, but no MIG devices were found. Please either create MIG instances, disable MIG mode, remove AutoDetect=nvml, or remove GPUs from the configuration completely.");
for (unsigned int j = 0; j < mig_count; j++) {
nvml_mig_t nvml_mig = { 0 };
nvml_mig.files = xstrdup(device_file);
nvml_mig.profile_name =
xstrdup(tmp_device_name);
/* If the MIG device exists, print it and populate nvml_mig */
if (_handle_mig(&device, minor_number, j,
uuid, &nvml_mig) !=
SLURM_SUCCESS) {
_free_nvml_mig_members(&nvml_mig);
continue;
}
/*
* Add MIG device to GRES list. MIG does not
* support NVLinks. CPU affinity, CPU count, and
* device name will be the same as non-MIG GPU.
*/
gres_slurmd_conf.file = nvml_mig.files;
gres_slurmd_conf.type_name =
nvml_mig.profile_name;
gres_slurmd_conf.unique_id = nvml_mig.unique_id;
gres_slurmd_conf.config_flags |=
GRES_CONF_GLOBAL_INDEX;
added_mig = true;
add_gres_to_list(gres_list_system,
&gres_slurmd_conf);
_free_nvml_mig_members(&nvml_mig);
}
xfree(tmp_device_name);
#endif
}
if (!added_mig) {
gres_slurmd_conf.file = device_file;
gres_slurmd_conf.links = nvlinks;
gres_slurmd_conf.type_name = device_name;
add_gres_to_list(gres_list_system, &gres_slurmd_conf);
}
// Print out possible memory frequencies for this device
_nvml_print_freqs(&device, LOG_LEVEL_DEBUG2);
FREE_NULL_BITMAP(gres_slurmd_conf.cpus_bitmap);
xfree(gres_slurmd_conf.cpus);
xfree(cpu_aff_mac_range);
xfree(device_file);
xfree(nvlinks);
}
FREE_NULL_BITMAP(enabled_cpus_bits);
/*
* Free lookup table
*/
for (i = 0; i < device_count; ++i)
xfree(device_lut[i]);
xfree(device_lut);
info("%u GPU system device(s) detected", device_count);
return gres_list_system;
}
static char *_get_nvml_func_str(void *fname)
{
if (fname == nvmlDeviceGetComputeRunningProcesses)
return "Compute";
return "Graphics";
}
static int _get_nvml_process_info(nvmlReturn_t (*get_proc)(nvmlDevice_t,
unsigned int *,
nvmlProcessInfo_t *),
nvmlDevice_t device, pid_t pid,
acct_gather_data_t *data)
{
nvmlReturn_t rc;
nvmlProcessInfo_t *proc_info;
unsigned int proc_cnt = 0;
/*
* Get the number of "[Compute|Graphics]" processes. If there are no
* processes proc_cnt will be 0 and rc == NVML_SUCCESS, if there are
* processes proc_cnt will be set and rc == NVML_ERROR_INSUFFICIENT_SIZE
*/
rc = get_proc(device, &proc_cnt, NULL);
if ((rc != NVML_SUCCESS) && (rc != NVML_ERROR_INSUFFICIENT_SIZE)) {
error("NVML: Failed to get %s running process count(%d): %s",
_get_nvml_func_str(get_proc), rc, nvmlErrorString(rc));
return SLURM_ERROR;
}
if (proc_cnt) {
proc_info = xcalloc(proc_cnt, sizeof(*proc_info));
rc = get_proc(device, &proc_cnt, proc_info);
if (rc != NVML_SUCCESS) {
if (rc == NVML_ERROR_INSUFFICIENT_SIZE) {
log_flag(JAG, "NVML: Failed to get %s running procs(%d): %s. New processes started in between calls, accounting not gathered during this interval",
_get_nvml_func_str(get_proc),
rc, nvmlErrorString(rc));
} else {
error("NVML: Failed to get %s running procs(%d): %s",
_get_nvml_func_str(get_proc),
rc, nvmlErrorString(rc));
}
xfree(proc_info);
return SLURM_ERROR;
}
for (int i = 0; i < proc_cnt; i++) {
if (proc_info[i].pid != pid)
continue;
/* usedGpuMemory is reported in bytes; converted to MB when logged */
data[gpumem_pos].size_read += proc_info[i].usedGpuMemory;
break;
}
xfree(proc_info);
log_flag(JAG, "pid %d has GPUUtil=%lu and MemMB=%lu",
pid, data[gpuutil_pos].size_read,
data[gpumem_pos].size_read / 1048576);
}
return SLURM_SUCCESS;
}
static int _get_gpumem(nvmlDevice_t device, pid_t pid, acct_gather_data_t *data)
{
if (_get_nvml_process_info(nvmlDeviceGetComputeRunningProcesses, device,
pid, data) != SLURM_SUCCESS)
return SLURM_ERROR;
if (_get_nvml_process_info(nvmlDeviceGetGraphicsRunningProcesses,
device, pid, data) != SLURM_SUCCESS)
return SLURM_ERROR;
return SLURM_SUCCESS;
}
static int _get_gpuutil(nvmlDevice_t device, pid_t pid,
acct_gather_data_t *data)
{
nvmlReturn_t rc;
nvmlProcessUtilizationSample_t *proc_util;
unsigned int cnt = 0;
/*
* Sending NULL will fill in cnt with the number of processes so we can
* use that to allocate the array correctly afterwards. An rc of
* NVML_SUCCESS means no processes yet.
*/
rc = nvmlDeviceGetProcessUtilization(device, NULL, &cnt,
data[gpuutil_pos].last_time);
if (rc == NVML_SUCCESS || !cnt)
return SLURM_SUCCESS;
if (rc != NVML_ERROR_INSUFFICIENT_SIZE) {
error("NVML: Failed to get process count for gpu utilization(%d): %s",
rc, nvmlErrorString(rc));
return SLURM_ERROR;
}
proc_util = xcalloc(cnt, sizeof(*proc_util));
rc = nvmlDeviceGetProcessUtilization(device, proc_util, &cnt,
data[gpuutil_pos].last_time);
if (rc == NVML_ERROR_NOT_FOUND) {
debug2("Couldn't find pid %d, probably hasn't started yet or has already finished",
pid);
xfree(proc_util);
return SLURM_SUCCESS;
#if HAVE_MIG_SUPPORT
} else if ((rc == NVML_ERROR_NOT_SUPPORTED) &&
_nvml_is_device_mig(&device)) {
/*
* NOTE: At the moment you can not query MIGs for
* utilization. This will probably be fixed in the
* future and hopefully this will start working.
*/
debug2("On MIG-enabled GPUs, querying process utilization is not currently supported.");
#endif
} else if (rc != NVML_SUCCESS) {
error("NVML: Failed to get usage(%d): %s", rc,
nvmlErrorString(rc));
xfree(proc_util);
return SLURM_ERROR;
}
for (int i = 0; i < cnt; i++) {
if (proc_util[i].pid != pid)
continue;
data[gpuutil_pos].last_time = proc_util[i].timeStamp;
data[gpuutil_pos].size_read += proc_util[i].smUtil;
break;
}
xfree(proc_util);
return SLURM_SUCCESS;
}
extern int init(void)
{
if (running_in_slurmstepd()) {
gpu_get_tres_pos(&gpumem_pos, &gpuutil_pos);
}
debug("%s: %s loaded", __func__, plugin_name);
return SLURM_SUCCESS;
}
extern void fini(void)
{
_nvml_shutdown();
debug("%s: unloading %s", __func__, plugin_name);
}
extern list_t *gpu_p_get_system_gpu_list(node_config_load_t *node_config)
{
list_t *gres_list_system = NULL;
if (!(gres_list_system = _get_system_gpu_list_nvml(node_config)))
error("System GPU detection failed");
return gres_list_system;
}
extern void gpu_p_step_hardware_init(bitstr_t *usable_gpus, char *tres_freq)
{
char *freq = NULL;
char *tmp = NULL;
xassert(tres_freq);
xassert(usable_gpus);
if (!usable_gpus)
return; /* Job allocated no GPUs */
if (!tres_freq)
return; /* No TRES frequency spec */
if (!(tmp = strstr(tres_freq, "gpu:")))
return; /* No GPU frequency spec */
freq = xstrdup(tmp + 4);
if ((tmp = strchr(freq, ';')))
tmp[0] = '\0';
// Save a copy of the GPUs affected, so we can reset things afterwards
FREE_NULL_BITMAP(saved_gpus);
saved_gpus = bit_copy(usable_gpus);
_nvml_init();
// Set the frequency of each GPU index specified in the bitstr
_set_freq(usable_gpus, freq);
xfree(freq);
}
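/*
 * Example of the tres_freq handling above (hypothetical input): for
 * tres_freq = "gpu:medium,memory=high;cpu:low", the code extracts
 * freq = "medium,memory=high" and hands it to _set_freq(), which passes it
 * on to gpu_common_parse_gpu_freq().
 */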
extern void gpu_p_step_hardware_fini(void)
{
if (!saved_gpus)
return;
// Reset the frequencies back to the hardware default
_reset_freq(saved_gpus);
FREE_NULL_BITMAP(saved_gpus);
_nvml_shutdown();
}
extern char *gpu_p_test_cpu_conv(char *cpu_range)
{
unsigned long cpu_set[CPU_SET_SIZE];
bitstr_t *cpu_aff_mac_bitstr;
int i;
char *result;
info("%s: cpu_range: %s", __func__, cpu_range);
if (!cpu_range) {
error("cpu_range is null");
return xstrdup("");
}
if (cpu_range[0] != '~') {
error("cpu_range doesn't start with `~`!");
return xstrdup("");
}
// Initialize cpu_set to 0
for (i = 0; i < CPU_SET_SIZE; ++i) {
cpu_set[i] = 0;
}
if (xstrcmp(cpu_range, "~zero") == 0) {
// nothing
} else if (xstrcmp(cpu_range, "~max") == 0) {
for (i = 0; i < CPU_SET_SIZE; ++i) {
cpu_set[i] = -1UL;
}
} else if (xstrcmp(cpu_range, "~one") == 0) {
cpu_set[0] = 1;
} else if (xstrcmp(cpu_range, "~three") == 0) {
cpu_set[0] = 3;
} else if (xstrcmp(cpu_range, "~half") == 0) {
cpu_set[0] = 0xff00;
} else if (cpu_range[1] == 'X') {
/*
* Put in all -1's for each X
* Limit to CPU_SET_SIZE
*/
int count = MIN(strlen(&cpu_range[1]), CPU_SET_SIZE);
for (i = 0; i < count; ++i) {
cpu_set[i] = -1UL;
}
for (i = count; i < CPU_SET_SIZE; ++i) {
cpu_set[i] = 0;
}
} else {
error("Unknown test keyword");
return xstrdup("");
}
// Print out final cpu set
for (i = 0; i < CPU_SET_SIZE; ++i) {
if ((signed) cpu_set[i] == -1)
printf("X");
else {
if (cpu_set[i] > 9)
printf("(%lu)", cpu_set[i]);
else
printf("%lu", cpu_set[i]);
}
}
printf("\n");
cpu_aff_mac_bitstr = bit_alloc(MAX_CPUS);
// Convert from nvml cpu bitmask to slurm bitstr_t (machine fmt)
_set_cpu_set_bitstr(cpu_aff_mac_bitstr, cpu_set, CPU_SET_SIZE);
// Convert from bitstr_t to cpu range str
result = bit_fmt_full(cpu_aff_mac_bitstr);
FREE_NULL_BITMAP(cpu_aff_mac_bitstr);
return result;
}
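/*
 * Example for the test helper above (expected results, assuming 64-bit
 * unsigned longs): "~three" sets cpu_set[0] = 3, which should convert to the
 * machine range string "0-1"; "~half" (cpu_set[0] = 0xff00) should yield
 * "8-15".
 */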
extern int gpu_p_energy_read(uint32_t dv_ind, gpu_status_t *gpu)
{
return SLURM_SUCCESS;
}
extern int gpu_p_usage_read(pid_t pid, acct_gather_data_t *data)
{
uint32_t device_count = 0;
bool track_gpumem, track_gpuutil;
track_gpumem = (gpumem_pos != -1);
track_gpuutil = (gpuutil_pos != -1);
if (!track_gpuutil && !track_gpumem) {
debug2("%s: We are not tracking TRES gpuutil/gpumem", __func__);
return SLURM_SUCCESS;
}
_nvml_init();
gpu_p_get_device_count(&device_count);
data[gpumem_pos].size_read = 0;
data[gpuutil_pos].size_read = 0;
for (int i = 0; i < device_count; i++) {
nvmlDevice_t device;
if (!_nvml_get_handle(i, &device))
continue;
if (track_gpumem)
_get_gpumem(device, pid, data);
if (track_gpuutil)
_get_gpuutil(device, pid, data);
log_flag(JAG, "pid %d has GPUUtil=%lu and MemMB=%lu",
pid,
data[gpuutil_pos].size_read,
data[gpumem_pos].size_read / 1048576);
}
return SLURM_SUCCESS;
}