| /*****************************************************************************\ |
| * gpu_nvml.c - Support the NVML interface to an NVIDIA GPU. |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #define _GNU_SOURCE |
| |
| #include <nvml.h> |
| #include <math.h> |
| |
| #include "../common/gpu_common.h" |
| |
| #if defined (__APPLE__) |
| extern slurmd_conf_t *conf __attribute__((weak_import)); |
| #else |
| slurmd_conf_t *conf = NULL; |
| #endif |
| |
| /* |
| * #defines needed to test nvml. |
| */ |
| #define MAX_CPUS 0x8000 |
| #define ULONG_BYTES (sizeof(unsigned long)) |
| #define ULONG_BITS (ULONG_BYTES * 8) |
| /* |
| * The # of unsigned longs needed to accommodate a bitmask array capable |
| * of representing MAX_CPUS cpus (will vary if 32-bit or 64-bit) |
| * E.g. for a 130 CPU 64-bit machine: (130 + 63) / 64 = 3.02 |
| * -> Integer division floor -> 3 ulongs to represent 130 CPUs |
| */ |
| #define CPU_SET_SIZE ((MAX_CPUS + (ULONG_BITS-1)) / ULONG_BITS) |
| #define NVLINK_SELF -1 |
| #define NVLINK_NONE 0 |
| #define FREQS_SIZE 512 |
| |
| #define MIG_LINE_SIZE 128 |
| |
| typedef struct { |
| char *files; /* Includes MIG cap files and parent GPU device file */ |
| char *links; /* MIG doesn't support NVLinks, but use for sorting */ |
| char *profile_name; /* <GPU_type>_<slice_cnt>g.<mem>gb */ |
| char *unique_id; |
| /* `MIG-<GPU-UUID>/<GPU instance ID>/<compute instance ID>` */ |
| } nvml_mig_t; |
| |
| static bitstr_t *saved_gpus = NULL; |
| /* |
| * These variables are required by the generic plugin interface. If they |
| * are not found in the plugin, the plugin loader will ignore it. |
| * |
| * plugin_name - A string giving a human-readable description of the |
| * plugin. There is no maximum length, but the symbol must refer to |
| * a valid string. |
| * |
| * plugin_type - A string suggesting the type of the plugin or its |
| * applicability to a particular form of data or method of data handling. |
| * If the low-level plugin API is used, the contents of this string are |
| * unimportant and may be anything. Slurm uses the higher-level plugin |
| * interface which requires this string to be of the form |
| * |
| * <application>/<method> |
| * |
| * where <application> is a description of the intended application of |
| * the plugin (e.g., "auth" for Slurm authentication) and <method> is a |
| * description of how this plugin satisfies that application. Slurm will |
| * only load authentication plugins if the plugin_type string has a prefix |
| * of "auth/". |
| * |
| * plugin_version - an unsigned 32-bit integer containing the Slurm version |
| * (major.minor.micro combined into a single number). |
| */ |
| const char plugin_name[] = "GPU NVML plugin"; |
| const char plugin_type[] = "gpu/nvml"; |
| const uint32_t plugin_version = SLURM_VERSION_NUMBER; |
| |
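| /* |
| * gpumem_pos/gpuutil_pos are the positions of the GPU memory and GPU |
| * utilization TRES in the acct_gather data array (set via gpu_get_tres_pos() |
| * in init(); -1 while untracked). init_pid records which process last |
| * initialized NVML so the library is (re)initialized at most once per process. |
| */ |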
| static int gpumem_pos = -1; |
| static int gpuutil_pos = -1; |
| static pid_t init_pid = 0; |
| |
| /* |
| * Converts a cpu_set returned from the NVML API into a Slurm bitstr_t |
| * |
| * This function accounts for the endianness of the machine. |
| * |
| * cpu_set_bitstr: (IN/OUT) A preallocated bitstr_t via bit_alloc() that is |
| * bitstr_size bits wide. This will get filled in. |
| * cpu_set: (IN) The cpu_set array returned by nvmlDeviceGetCpuAffinity() |
| * cpu_set_size: (IN) The size of the cpu_set array |
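| * |
| * Regardless of endianness, bit i of the cpu_set array ends up as bit i of |
| * cpu_set_bitstr. E.g. cpu_set[0] == 0x3 (and all other elements 0) sets |
| * bits 0 and 1, i.e. CPUs 0 and 1. |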
| */ |
| static void _set_cpu_set_bitstr(bitstr_t *cpu_set_bitstr, |
| unsigned long *cpu_set, |
| unsigned int cpu_set_size) |
| { |
| int j, k, b; |
| int bit_cur; |
| int bitstr_bits = (int) bit_size(cpu_set_bitstr); |
| int cpu_set_bits = (cpu_set_size * ULONG_BITS); |
| |
| // If this fails, then something went horribly wrong |
| if (bitstr_bits != cpu_set_bits) |
| fatal("%s: bitstr_bits != cpu_set_bits", __func__); |
| |
| bit_cur = bitstr_bits - 1; |
| |
| // Iterate through each cpu_set long int |
| for (j = cpu_set_size - 1; j >= 0; --j) { |
| // Iterate through the bytes of the jth ulong bitmask |
| char *bitmask = (char *) &cpu_set[j]; |
| #ifdef SLURM_BIGENDIAN |
| for (k = 0; k < ULONG_BYTES; ++k) { |
| #else |
| for (k = ULONG_BYTES - 1; k >= 0; --k) { |
| #endif // SLURM_BIGENDIAN |
| unsigned char byte = bitmask[k]; |
| unsigned char mask; |
| // If byte is zero, nothing to set |
| if (byte == 0) { |
| bit_cur -= 8; |
| continue; |
| } |
| |
| // Test each bit of byte, from MSB to LSB. Set if needed |
| mask = 0x80; |
| for (b = 0; b < 8; ++b) { |
| if (byte & mask) |
| bit_set(cpu_set_bitstr, bit_cur); |
| mask >>= 1; |
| bit_cur--; |
| } |
| xassert(mask == 0x00); |
| } |
| } |
| |
| xassert(bit_cur == -1); |
| // If NVML gave us an empty CPU affinity, then something is very wrong |
| if (bit_set_count(cpu_set_bitstr) == 0) |
| fatal("%s: cpu_set_bitstr is empty! No CPU affinity for device", |
| __func__); |
| } |
| |
| |
| /* |
| * Initialize the NVML library. This takes a few seconds |
| */ |
| static void _nvml_init(void) |
| { |
| pid_t my_pid = conf->pid ? conf->pid : getpid(); |
| nvmlReturn_t nvml_rc; |
| |
| if (init_pid == my_pid) |
| return; |
| |
| init_pid = my_pid; |
| |
| DEF_TIMERS; |
| START_TIMER; |
| nvml_rc = nvmlInit(); |
| END_TIMER; |
| debug3("nvmlInit() took %ld microseconds", DELTA_TIMER); |
| if (nvml_rc != NVML_SUCCESS) |
| error("Failed to initialize NVML: %s", |
| nvmlErrorString(nvml_rc)); |
| else |
| debug2("Successfully initialized NVML"); |
| } |
| |
| /* |
| * Undo _nvml_init |
| */ |
| static void _nvml_shutdown(void) |
| { |
| nvmlReturn_t nvml_rc; |
| DEF_TIMERS; |
| START_TIMER; |
| nvml_rc = nvmlShutdown(); |
| init_pid = 0; |
| END_TIMER; |
| debug3("nvmlShutdown() took %ld microseconds", DELTA_TIMER); |
| if (nvml_rc != NVML_SUCCESS) |
| error("Failed to shut down NVML: %s", nvmlErrorString(nvml_rc)); |
| else |
| debug2("Successfully shut down NVML"); |
| } |
| |
| /* |
| * Get the handle to the GPU for the passed index |
| * |
| * index (IN) The GPU index (corresponds to PCI Bus ID order) |
| * device (OUT) The device handle |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _nvml_get_handle(int index, nvmlDevice_t *device) |
| { |
| nvmlReturn_t nvml_rc; |
| nvml_rc = nvmlDeviceGetHandleByIndex(index, device); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get device handle for GPU %d: %s", index, |
| nvmlErrorString(nvml_rc)); |
| return false; |
| } |
| return true; |
| } |
| |
| /* |
| * Get all possible memory frequencies for the device |
| * |
| * device (IN) The device handle |
| * mem_freqs_size (IN/OUT) The size of the mem_freqs array; this will be |
| * overwritten with the number of memory freqs found. |
| * mem_freqs (OUT) The possible memory frequencies, sorted in |
| * descending order |
| * |
| * Return true if successful, false if not. |
| */ |
| static bool _nvml_get_mem_freqs(nvmlDevice_t *device, uint32_t *mem_freqs_size, |
| uint32_t *mem_freqs) |
| { |
| nvmlReturn_t nvml_rc; |
| DEF_TIMERS; |
| START_TIMER; |
| unsigned int *nvml_mem_freqs = mem_freqs; |
| unsigned int *nvml_mem_freqs_size = mem_freqs_size; |
| nvml_rc = nvmlDeviceGetSupportedMemoryClocks(*device, |
| nvml_mem_freqs_size, |
| nvml_mem_freqs); |
| END_TIMER; |
| debug3("nvmlDeviceGetSupportedMemoryClocks() took %ld microseconds", |
| DELTA_TIMER); |
| |
| if (nvml_rc != NVML_SUCCESS) { |
| error("%s: Failed to get supported memory frequencies for the " |
| "GPU : %s", __func__, nvmlErrorString(nvml_rc)); |
| return false; |
| } |
| |
| qsort(mem_freqs, *mem_freqs_size, sizeof(uint32_t), |
| slurm_sort_uint32_list_desc); |
| |
| if ((*mem_freqs_size > 1) && |
| (mem_freqs[0] <= mem_freqs[(*mem_freqs_size)-1])) { |
| error("%s: mem frequencies are not stored in descending order!", |
| __func__); |
| return false; |
| } |
| return true; |
| } |
| |
| /* |
| * Get all possible graphics frequencies for the device |
| * |
| * device (IN) The device handle |
| * mem_freq (IN) The memory frequency to get graphics freqs for. |
| * gfx_freqs_size (IN/OUT) The size of the gfx_freqs array; this will |
| * be overwritten with the number of graphics freqs found. |
| * gfx_freqs (OUT) The possible graphics frequencies, sorted in |
| * descending order |
| * |
| * Return true if successful, false if not. |
| */ |
| static bool _nvml_get_gfx_freqs(nvmlDevice_t *device, uint32_t mem_freq, |
| uint32_t *gfx_freqs_size, uint32_t *gfx_freqs) |
| { |
| nvmlReturn_t nvml_rc; |
| DEF_TIMERS; |
| START_TIMER; |
| |
| unsigned int *nvml_gfx_freqs = gfx_freqs; |
| unsigned int *nvml_gfx_freqs_size = gfx_freqs_size; |
| nvml_rc = nvmlDeviceGetSupportedGraphicsClocks(*device, mem_freq, |
| nvml_gfx_freqs_size, |
| nvml_gfx_freqs); |
| END_TIMER; |
| debug3("nvmlDeviceGetSupportedGraphicsClocks() took %ld microseconds", |
| DELTA_TIMER); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("%s: Failed to get supported graphics frequencies for the" |
| " GPU at mem frequency %u: %s", __func__, mem_freq, |
| nvmlErrorString(nvml_rc)); |
| return false; |
| } |
| |
| qsort(gfx_freqs, *gfx_freqs_size, sizeof(uint32_t), |
| slurm_sort_uint32_list_desc); |
| |
| if ((*gfx_freqs_size > 1) && |
| (gfx_freqs[0] <= gfx_freqs[(*gfx_freqs_size)-1])) { |
| error("%s: gfx frequencies are not stored in descending order!", |
| __func__); |
| return false; |
| } |
| return true; |
| } |
| |
| /* |
| * Print out all possible graphics frequencies for the given device and mem |
| * freq. If there are many frequencies, only prints out a few. |
| * |
| * device (IN) The device handle |
| * mem_freq (IN) The memory frequency to get graphics freqs for. |
| * gfx_freqs_size (IN) The size of the gfx_freqs array |
| * gfx_freqs (IN) A preallocated empty array of size gfx_freqs_size |
| * to fill with possible graphics frequencies |
| * l (IN) The log level at which to print |
| * |
| * NOTE: The contents of gfx_freqs will be modified during use. |
| */ |
| static void _nvml_print_gfx_freqs(nvmlDevice_t *device, uint32_t mem_freq, |
| uint32_t gfx_freqs_size, uint32_t *gfx_freqs, |
| log_level_t l) |
| { |
| uint32_t size = gfx_freqs_size; |
| |
| if (!_nvml_get_gfx_freqs(device, mem_freq, &size, gfx_freqs)) |
| return; |
| |
| gpu_common_print_freqs(gfx_freqs, size, l, "GPU Graphics", 8); |
| } |
| |
| /* |
| * Print out all possible memory and graphics frequencies for the given device. |
| * If there are more than FREQS_CONCISE frequencies, prints a summary instead |
| * |
| * device (IN) The device handle |
| * l (IN) The log level at which to print |
| */ |
| static void _nvml_print_freqs(nvmlDevice_t *device, log_level_t l) |
| { |
| uint32_t mem_size = FREQS_SIZE; |
| uint32_t mem_freqs[FREQS_SIZE] = {0}; |
| uint32_t gfx_freqs[FREQS_SIZE] = {0}; |
| uint32_t i; |
| bool concise = false; |
| |
| if (!_nvml_get_mem_freqs(device, &mem_size, mem_freqs)) |
| return; |
| |
| if (mem_size > FREQS_CONCISE) |
| concise = true; |
| |
| log_var(l, "Possible GPU Memory Frequencies (%u):", mem_size); |
| log_var(l, "-------------------------------"); |
| if (concise) { |
| // first, next, ..., middle, ..., penultimate, last |
| unsigned int tmp; |
| log_var(l, " *%u MHz [0]", mem_freqs[0]); |
| _nvml_print_gfx_freqs(device, mem_freqs[0], FREQS_SIZE, |
| gfx_freqs, l); |
| log_var(l, " *%u MHz [1]", mem_freqs[1]); |
| _nvml_print_gfx_freqs(device, mem_freqs[1], FREQS_SIZE, |
| gfx_freqs, l); |
| log_var(l, " ..."); |
| tmp = (mem_size - 1) / 2; |
| log_var(l, " *%u MHz [%u]", mem_freqs[tmp], tmp); |
| _nvml_print_gfx_freqs(device, mem_freqs[tmp], FREQS_SIZE, |
| gfx_freqs, l); |
| log_var(l, " ..."); |
| tmp = mem_size - 2; |
| log_var(l, " *%u MHz [%u]", mem_freqs[tmp], tmp); |
| _nvml_print_gfx_freqs(device, mem_freqs[tmp], FREQS_SIZE, |
| gfx_freqs, l); |
| tmp = mem_size - 1; |
| log_var(l, " *%u MHz [%u]", mem_freqs[tmp], tmp); |
| _nvml_print_gfx_freqs(device, mem_freqs[tmp], FREQS_SIZE, |
| gfx_freqs, l); |
| return; |
| } |
| |
| for (i = 0; i < mem_size; ++i) { |
| log_var(l, " *%u MHz [%u]", mem_freqs[i], i); |
| _nvml_print_gfx_freqs(device, mem_freqs[i], FREQS_SIZE, |
| gfx_freqs, l); |
| } |
| } |
| |
| /* |
| * Get the nearest valid memory and graphics clock frequencies |
| * |
| * device (IN) The NVML GPU device handle |
| * mem_freq (IN/OUT) The requested memory frequency, in MHz. This |
| * will be overwritten with the output value, if different. |
| * gfx_freq (IN/OUT) The requested graphics frequency, in MHz. This |
| * will be overwritten with the output value, if different. |
| */ |
| static void _nvml_get_nearest_freqs(nvmlDevice_t *device, uint32_t *mem_freq, |
| uint32_t *gfx_freq) |
| { |
| uint32_t mem_freqs[FREQS_SIZE] = {0}; |
| uint32_t mem_freqs_size = FREQS_SIZE; |
| uint32_t gfx_freqs[FREQS_SIZE] = {0}; |
| uint32_t gfx_freqs_size = FREQS_SIZE; |
| |
| // Get the memory frequencies |
| if (!_nvml_get_mem_freqs(device, &mem_freqs_size, mem_freqs)) |
| return; |
| |
| // Set the nearest valid memory frequency for the requested frequency |
| gpu_common_get_nearest_freq(mem_freq, mem_freqs_size, mem_freqs); |
| |
| // Get the graphics frequencies at this memory frequency |
| if (!_nvml_get_gfx_freqs(device, *mem_freq, &gfx_freqs_size, gfx_freqs)) |
| return; |
| // Set the nearest valid graphics frequency for the requested frequency |
| gpu_common_get_nearest_freq(gfx_freq, gfx_freqs_size, gfx_freqs); |
| } |
| |
| /* |
| * Set the memory and graphics clock frequencies for the GPU |
| * |
| * device (IN) The NVML GPU device handle |
| * mem_freq (IN) The memory clock frequency, in MHz |
| * gfx_freq (IN) The graphics clock frequency, in MHz |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _nvml_set_freqs(nvmlDevice_t *device, uint32_t mem_freq, |
| uint32_t gfx_freq) |
| { |
| nvmlReturn_t nvml_rc; |
| DEF_TIMERS; |
| START_TIMER; |
| nvml_rc = nvmlDeviceSetApplicationsClocks(*device, mem_freq, gfx_freq); |
| END_TIMER; |
| debug3("nvmlDeviceSetApplicationsClocks(%u, %u) took %ld microseconds", |
| mem_freq, gfx_freq, DELTA_TIMER); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("%s: Failed to set memory and graphics clock frequency " |
| "pair (%u, %u) for the GPU: %s", __func__, mem_freq, |
| gfx_freq, nvmlErrorString(nvml_rc)); |
| return false; |
| } |
| return true; |
| } |
| |
| /* |
| * Reset the memory and graphics clock frequencies for the GPU to the same |
| * default frequencies that are used after system reboot or driver reload. This |
| * default cannot be changed. |
| * |
| * device (IN) The NVML GPU device handle |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _nvml_reset_freqs(nvmlDevice_t *device) |
| { |
| nvmlReturn_t nvml_rc; |
| DEF_TIMERS; |
| |
| START_TIMER; |
| nvml_rc = nvmlDeviceResetApplicationsClocks(*device); |
| END_TIMER; |
| debug3("nvmlDeviceResetApplicationsClocks() took %ld microseconds", |
| DELTA_TIMER); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("%s: Failed to reset GPU frequencies to the hardware default: %s", |
| __func__, nvmlErrorString(nvml_rc)); |
| return false; |
| } |
| return true; |
| } |
| |
| /* |
| * Get the memory or graphics clock frequency that the GPU is currently running |
| * at |
| * |
| * device (IN) The NVML GPU device handle |
| * type (IN) The clock type to query. Either NVML_CLOCK_GRAPHICS or |
| * NVML_CLOCK_MEM. |
| * |
| * Returns the clock frequency in MHz if successful, or 0 if not |
| */ |
| static uint32_t _nvml_get_freq(nvmlDevice_t *device, nvmlClockType_t type) |
| { |
| nvmlReturn_t nvml_rc; |
| uint32_t freq = 0; |
| char *type_str = "unknown"; |
| DEF_TIMERS; |
| |
| switch (type) { |
| case NVML_CLOCK_GRAPHICS: |
| type_str = "graphics"; |
| break; |
| case NVML_CLOCK_MEM: |
| type_str = "memory"; |
| break; |
| default: |
| error("%s: Unsupported clock type", __func__); |
| break; |
| } |
| |
| START_TIMER; |
| unsigned int *nvml_freq = &freq; |
| nvml_rc = nvmlDeviceGetApplicationsClock(*device, type, nvml_freq); |
| END_TIMER; |
| debug3("nvmlDeviceGetApplicationsClock(%s) took %ld microseconds", |
| type_str, DELTA_TIMER); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("%s: Failed to get the GPU %s frequency: %s", __func__, |
| type_str, nvmlErrorString(nvml_rc)); |
| return 0; |
| } |
| return freq; |
| } |
| |
| static uint32_t _nvml_get_gfx_freq(nvmlDevice_t *device) |
| { |
| return _nvml_get_freq(device, NVML_CLOCK_GRAPHICS); |
| } |
| |
| static uint32_t _nvml_get_mem_freq(nvmlDevice_t *device) |
| { |
| return _nvml_get_freq(device, NVML_CLOCK_MEM); |
| } |
| |
| /* |
| * Reset the frequencies of each GPU in the step to the hardware default |
| * NOTE: NVML must be initialized beforehand |
| * |
| * gpus (IN) A bitmap specifying the GPUs on which to operate. |
| */ |
| static void _reset_freq(bitstr_t *gpus) |
| { |
| int gpu_len = bit_size(gpus); |
| int i = -1, count = 0, count_set = 0; |
| bool freq_reset = false; |
| |
| /* |
| * Reset the frequency of each device allocated to the step |
| */ |
| for (i = 0; i < gpu_len; i++) { |
| nvmlDevice_t device; |
| if (!bit_test(gpus, i)) |
| continue; |
| count++; |
| |
| if (!_nvml_get_handle(i, &device)) |
| continue; |
| |
| debug2("Memory frequency before reset: %u", |
| _nvml_get_mem_freq(&device)); |
| debug2("Graphics frequency before reset: %u", |
| _nvml_get_gfx_freq(&device)); |
| freq_reset = _nvml_reset_freqs(&device); |
| debug2("Memory frequency after reset: %u", |
| _nvml_get_mem_freq(&device)); |
| debug2("Graphics frequency after reset: %u", |
| _nvml_get_gfx_freq(&device)); |
| |
| if (freq_reset) { |
| log_flag(GRES, "Successfully reset GPU[%d]", i); |
| count_set++; |
| } else { |
| log_flag(GRES, "Failed to reset GPU[%d]", i); |
| } |
| } |
| |
| if (count_set != count) { |
| log_flag(GRES, "%s: Could not reset frequencies for all GPUs. Set %d/%d total GPUs", |
| __func__, count_set, count); |
| fprintf(stderr, "Could not reset frequencies for all GPUs. " |
| "Set %d/%d total GPUs\n", count_set, count); |
| } |
| } |
| |
| /* |
| * Set the frequencies of each GPU specified for the step |
| * NOTE: NVML must be initialized beforehand |
| * |
| * gpus (IN) A bitmap specifying the GPUs on which to operate. |
| * gpu_freq (IN) The frequencies to set each of the GPUs to. If a NULL or |
| * empty memory or graphics frequency is specified, then GpuFreqDef |
| * will be consulted, which defaults to "high,memory=high" if not |
| * set. |
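| * |
| * E.g. (illustrative) a gpu_freq spec such as "medium,memory=high,verbose" |
| * or an explicit value in MHz such as "1230"; parsing is handled by |
| * gpu_common_parse_gpu_freq(). |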
| */ |
| static void _set_freq(bitstr_t *gpus, char *gpu_freq) |
| { |
| bool verbose_flag = false; |
| int gpu_len = 0; |
| int i = -1, count = 0, count_set = 0; |
| unsigned int gpu_freq_num = 0, mem_freq_num = 0; |
| bool freq_set = false, freq_logged = false; |
| char *tmp = NULL; |
| bool task_cgroup = false; |
| bool constrained_devices = false; |
| bool cgroups_active = false; |
| |
| /* |
| * Parse frequency information |
| */ |
| debug2("_parse_gpu_freq(%s)", gpu_freq); |
| gpu_common_parse_gpu_freq(gpu_freq, &gpu_freq_num, &mem_freq_num, |
| &verbose_flag); |
| if (verbose_flag) |
| debug2("verbose_flag ON"); |
| |
| tmp = gpu_common_freq_value_to_string(mem_freq_num); |
| debug2("Requested GPU memory frequency: %s", tmp); |
| xfree(tmp); |
| tmp = gpu_common_freq_value_to_string(gpu_freq_num); |
| debug2("Requested GPU graphics frequency: %s", tmp); |
| xfree(tmp); |
| |
| if (!mem_freq_num && !gpu_freq_num) { |
| debug2("%s: No frequencies to set", __func__); |
| return; |
| } |
| |
| // Check if GPUs are constrained by cgroups |
| cgroup_conf_init(); |
| if (slurm_cgroup_conf.constrain_devices) |
| constrained_devices = true; |
| |
| // Check if task/cgroup plugin is loaded |
| if (xstrstr(slurm_conf.task_plugin, "cgroup")) |
| task_cgroup = true; |
| |
| // If both of these are true, then GPUs will be constrained |
| if (constrained_devices && task_cgroup) { |
| cgroups_active = true; |
| gpu_len = bit_set_count(gpus); |
| debug2("%s: cgroups are configured. Using LOCAL GPU IDs", |
| __func__); |
| } else { |
| gpu_len = bit_size(gpus); |
| debug2("%s: cgroups are NOT configured. Assuming GLOBAL GPU IDs", |
| __func__); |
| } |
| |
| /* |
| * Set the frequency of each device allocated to the step |
| */ |
| for (i = 0; i < gpu_len; i++) { |
| char *sep = ""; |
| nvmlDevice_t device; |
| unsigned int gpu_freq = gpu_freq_num, mem_freq = mem_freq_num; |
| |
| // Only check the global GPU bitstring if not using cgroups |
| if (!cgroups_active && !bit_test(gpus, i)) { |
| debug2("Passing over NVML device %u", i); |
| continue; |
| } |
| count++; |
| |
| if (!_nvml_get_handle(i, &device)) |
| continue; |
| debug2("Setting frequency of NVML device %u", i); |
| _nvml_get_nearest_freqs(&device, &mem_freq, &gpu_freq); |
| |
| debug2("Memory frequency before set: %u", |
| _nvml_get_mem_freq(&device)); |
| debug2("Graphics frequency before set: %u", |
| _nvml_get_gfx_freq(&device)); |
| freq_set = _nvml_set_freqs(&device, mem_freq, gpu_freq); |
| debug2("Memory frequency after set: %u", |
| _nvml_get_mem_freq(&device)); |
| debug2("Graphics frequency after set: %u", |
| _nvml_get_gfx_freq(&device)); |
| |
| if (mem_freq) { |
| xstrfmtcat(tmp, "%smemory_freq:%u", sep, mem_freq); |
| sep = ","; |
| } |
| if (gpu_freq) { |
| xstrfmtcat(tmp, "%sgraphics_freq:%u", sep, gpu_freq); |
| } |
| |
| if (freq_set) { |
| log_flag(GRES, "Successfully set GPU[%d] %s", i, tmp); |
| count_set++; |
| } else { |
| log_flag(GRES, "Failed to set GPU[%d] %s", i, tmp); |
| } |
| |
| if (verbose_flag && !freq_logged) { |
| fprintf(stderr, "GpuFreq=%s\n", tmp); |
| freq_logged = true; /* Just log for first GPU */ |
| } |
| xfree(tmp); |
| } |
| |
| if (count_set != count) { |
| log_flag(GRES, "%s: Could not set frequencies for all GPUs. Set %d/%d total GPUs", |
| __func__, count_set, count); |
| fprintf(stderr, "Could not set frequencies for all GPUs. " |
| "Set %d/%d total GPUs\n", count_set, count); |
| } |
| } |
| |
| |
| /* |
| * Get the version of the system's graphics driver |
| */ |
| static void _nvml_get_driver(char *driver, unsigned int len) |
| { |
| nvmlReturn_t nvml_rc = nvmlSystemGetDriverVersion(driver, len); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get the NVIDIA graphics driver version: %s", |
| nvmlErrorString(nvml_rc)); |
| driver[0] = '\0'; |
| } |
| } |
| |
| /* |
| * Get the version of the NVML library |
| */ |
| static void _nvml_get_version(char *version, unsigned int len) |
| { |
| nvmlReturn_t nvml_rc = nvmlSystemGetNVMLVersion(version, len); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get the NVML library version: %s", |
| nvmlErrorString(nvml_rc)); |
| version[0] = '\0'; |
| } |
| } |
| |
| /* |
| * Get the total # of GPUs in the system |
| */ |
| extern void gpu_p_get_device_count(uint32_t *device_count) |
| { |
| unsigned int *nvml_device_count = device_count; |
| nvmlReturn_t nvml_rc = nvmlDeviceGetCount(nvml_device_count); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get device count: %s", |
| nvmlErrorString(nvml_rc)); |
| *device_count = 0; |
| } |
| } |
| |
| /* |
| * Get the name of the GPU |
| */ |
| static void _nvml_get_device_name(nvmlDevice_t *device, char *device_name, |
| unsigned int size) |
| { |
| nvmlReturn_t nvml_rc = nvmlDeviceGetName(*device, device_name, size); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get name of the GPU: %s", |
| nvmlErrorString(nvml_rc)); |
| } |
| gpu_common_underscorify_tolower(device_name); |
| } |
| |
| /* |
| * Get the UUID of the device, since device index can fluctuate |
| */ |
| static void _nvml_get_device_uuid(nvmlDevice_t *device, char *uuid, |
| unsigned int len) |
| { |
| nvmlReturn_t nvml_rc = nvmlDeviceGetUUID(*device, uuid, len); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get UUID of GPU: %s", |
| nvmlErrorString(nvml_rc)); |
| } |
| } |
| |
| /* |
| * Get the PCI Bus ID of the device, since device index can fluctuate |
| */ |
| static void _nvml_get_device_pci_info(nvmlDevice_t *device, nvmlPciInfo_t *pci) |
| { |
| nvmlReturn_t nvml_rc = nvmlDeviceGetPciInfo(*device, pci); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get PCI info of GPU: %s", |
| nvmlErrorString(nvml_rc)); |
| } |
| } |
| |
| /* |
| * Retrieves the minor number for the device. The minor number is such that |
| * the NVIDIA device node file for the GPU has the form |
| * /dev/nvidia[minor_number]. |
| */ |
| static void _nvml_get_device_minor_number(nvmlDevice_t *device, |
| unsigned int *minor) |
| { |
| nvmlReturn_t nvml_rc = nvmlDeviceGetMinorNumber(*device, minor); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get minor number of GPU: %s", |
| nvmlErrorString(nvml_rc)); |
| *minor = NO_VAL; |
| } |
| } |
| |
| /* |
| * Retrieves an array of unsigned longs (sized to cpuSetSize) holding bitmasks |
| * of the ideal CPU affinity for the GPU. |
| * |
| * cpu_set: an array reference in which to return a bitmask of CPUs. 64 CPUs per |
| * unsigned long on 64-bit machines, 32 on 32-bit machines. |
| * |
| * For example, on 32-bit machines, if processors 0, 1, 32, and 33 are ideal for |
| * the device and cpuSetSize == 2, result[0] = 0x3, result[1] = 0x3. |
| */ |
| static void _nvml_get_device_affinity(nvmlDevice_t *device, unsigned int size, |
| unsigned long *cpu_set) |
| { |
| nvmlReturn_t nvml_rc = nvmlDeviceGetCpuAffinity(*device, size, cpu_set); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get cpu affinity of GPU: %s", |
| nvmlErrorString(nvml_rc)); |
| } |
| } |
| |
| /* |
| * Returns the busId string of the device at the remote end of an NVLink lane. |
| * If the query fails, an empty string is returned. |
| * The returned string must be xfree'd. |
| * |
| * device - the GPU device |
| * lane - the nvlink lane that we are checking |
| * |
| * device <---lane---> endpoint/remote device |
| */ |
| static char *_nvml_get_nvlink_remote_pcie(nvmlDevice_t *device, |
| unsigned int lane) |
| { |
| nvmlPciInfo_t pci_info; |
| nvmlReturn_t nvml_rc; |
| |
| memset(&pci_info, 0, sizeof(pci_info)); |
| nvml_rc = nvmlDeviceGetNvLinkRemotePciInfo(*device, lane, &pci_info); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get PCI info of endpoint device for lane %d: %s", |
| lane, nvmlErrorString(nvml_rc)); |
| return xstrdup(""); |
| } else { |
| return xstrdup(pci_info.busId); |
| } |
| } |
| |
| /* |
| * Does a linear search for string str in array of strings str_arr, starting |
| * from index 0. |
| * Returns the index of the first match found, else returns -1 if not found. |
| * |
| * str - the string to search for |
| * str_array - the array of strings to search in |
| * size - the size of str_arr |
| */ |
| static int _get_index_from_str_arr(char *str, char **str_arr, unsigned int size) |
| { |
| int i; |
| if (str_arr == NULL || str == NULL) |
| return -1; |
| for (i = 0; i < size; ++i) { |
| if (xstrcmp(str, str_arr[i]) == 0) { |
| return i; |
| } |
| } |
| return -1; |
| } |
| |
| /* |
| * Allocates and returns a comma-separated string describing the NVLinks of the |
| * device. If no link information can be built, an empty string is returned. |
| * The string must be xfree'd. |
| * |
| * device - the current GPU to get the nvlink info for |
| * index - the index of the current GPU as returned by NVML. Based on PCI bus id |
| * device_lut - an array of PCI busid's for each GPU. The index is the GPU index |
| * device_count - the size of device_lut |
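| * |
| * E.g. with 4 GPUs in the system, a returned string of "-1,2,0,0" for GPU 0 |
| * means self (NVLINK_SELF), a double link to GPU 1, and no links to GPUs 2 |
| * and 3 (NVLINK_NONE). |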
| */ |
| static char *_nvml_get_nvlink_info(nvmlDevice_t *device, int index, |
| char **device_lut, uint32_t device_count) |
| { |
| unsigned int i; |
| nvmlReturn_t nvml_rc; |
| nvmlEnableState_t is_active; |
| int *links = xcalloc(device_count, sizeof(int)); |
| char *links_str = NULL, *sep = ""; |
| |
| // Initialize links; xcalloc() already zeroed the array (i.e. NVLINK_NONE) |
| links[index] = NVLINK_SELF; |
| |
| // Query all nvlink lanes |
| for (i = 0; i < NVML_NVLINK_MAX_LINKS; ++i) { |
| nvml_rc = nvmlDeviceGetNvLinkState(*device, i, &is_active); |
| if (nvml_rc == NVML_ERROR_INVALID_ARGUMENT) { |
| debug3("Device/lane %d is invalid", i); |
| continue; |
| } else if (nvml_rc == NVML_ERROR_NOT_SUPPORTED) { |
| debug3("Device %d does not support " |
| "nvmlDeviceGetNvLinkState()", i); |
| break; |
| } else if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get nvlink info from GPU: %s", |
| nvmlErrorString(nvml_rc)); |
| } |
| // See if nvlink lane is active |
| if (is_active == NVML_FEATURE_ENABLED) { |
| char *busid; |
| int k; |
| debug3("nvlink %d is enabled", i); |
| |
| /* |
| * Count link endpoints to determine single and double |
| * links. E.g. if already a single link (1), increment |
| * to a double (2). |
| */ |
| busid = _nvml_get_nvlink_remote_pcie(device, i); |
| k = _get_index_from_str_arr(busid, device_lut, |
| device_count); |
| // Ignore self and not-founds |
| if ((k != index) && (k != -1)) { |
| links[k]++; |
| } |
| xfree(busid); |
| } else |
| debug3("nvlink %d is disabled", i); |
| } |
| |
| // Convert links to comma separated string |
| for (i = 0; i < device_count; ++i) { |
| xstrfmtcat(links_str, "%s%d", sep, links[i]); |
| sep = ","; |
| } |
| |
| xfree(links); |
| if (!links_str) |
| links_str = xstrdup(""); |
| return links_str; |
| } |
| |
| /* MIG requires CUDA 11.1 and NVIDIA driver 450.80.02 or later */ |
| #if HAVE_MIG_SUPPORT |
| |
| static void _free_nvml_mig_members(nvml_mig_t *nvml_mig) |
| { |
| if (!nvml_mig) |
| return; |
| |
| xfree(nvml_mig->files); |
| xfree(nvml_mig->links); |
| xfree(nvml_mig->profile_name); |
| xfree(nvml_mig->unique_id); |
| } |
| |
| /* |
| * Get the handle to the MIG device for the passed GPU device and MIG index |
| * |
| * device (IN) The GPU device handle |
| * mig_index (IN) The MIG index |
| * mig (OUT) The MIG device handle |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _nvml_get_mig_handle(nvmlDevice_t *device, unsigned int mig_index, |
| nvmlDevice_t *mig) |
| { |
| nvmlReturn_t nvml_rc = nvmlDeviceGetMigDeviceHandleByIndex(*device, |
| mig_index, |
| mig); |
| if (nvml_rc == NVML_ERROR_NOT_FOUND) |
| /* Not found is ok */ |
| return false; |
| else if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get MIG device at MIG index %u: %s", |
| mig_index, nvmlErrorString(nvml_rc)); |
| return false; |
| } |
| return true; |
| } |
| |
| /* |
| * Get the maximum count of MIGs possible |
| */ |
| static void _nvml_get_max_mig_device_count(nvmlDevice_t *device, |
| unsigned int *count) |
| { |
| nvmlReturn_t nvml_rc = nvmlDeviceGetMaxMigDeviceCount(*device, count); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get MIG device count: %s", |
| nvmlErrorString(nvml_rc)); |
| *count = 0; |
| return; |
| } |
| |
| if (count && (*count == 0)) |
| error("MIG device count is 0; MIG is either disabled or not supported"); |
| } |
| |
| /* |
| * Get the GPU instance ID of a MIG device handle |
| */ |
| static void _nvml_get_gpu_instance_id(nvmlDevice_t *mig, unsigned int *gi_id) |
| { |
| nvmlReturn_t nvml_rc = nvmlDeviceGetGpuInstanceId(*mig, gi_id); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get MIG GPU instance ID: %s", |
| nvmlErrorString(nvml_rc)); |
| *gi_id = 0; |
| } |
| } |
| |
| /* |
| * Get the compute instance ID of a MIG device handle |
| */ |
| static void _nvml_get_compute_instance_id(nvmlDevice_t *mig, unsigned int *ci_id) |
| { |
| nvmlReturn_t nvml_rc = nvmlDeviceGetComputeInstanceId(*mig, ci_id); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get MIG GPU instance ID: %s", |
| nvmlErrorString(nvml_rc)); |
| *ci_id = 0; |
| } |
| } |
| |
| /* |
| * Get the MIG mode of the device |
| * |
| * If current_mode is 1, that means the device is MIG-capable and enabled. |
| * If pending_mode is different than current_mode, then current_mode will be |
| * changed to match pending_mode on the next "activation trigger" (device |
| * unbind, device reset, or machine reboot) |
| */ |
| static void _nvml_get_device_mig_mode(nvmlDevice_t *device, |
| unsigned int *current_mode, |
| unsigned int *pending_mode) |
| { |
| nvmlReturn_t nvml_rc = nvmlDeviceGetMigMode(*device, current_mode, |
| pending_mode); |
| if (nvml_rc == NVML_ERROR_NOT_SUPPORTED) |
| /* This device doesn't support MIG mode */ |
| return; |
| else if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get MIG mode of the GPU: %s", |
| nvmlErrorString(nvml_rc)); |
| } |
| } |
| |
| /* |
| * Get the minor numbers for the GPU instance and compute instance for a MIG |
| * device. |
| * |
| * gpu_minor (IN) The minor number of the parent GPU of the MIG device. |
| * gi_id (IN) The GPU instance ID of the MIG device. |
| * ci_id (IN) The compute instance ID of the MIG device. |
| * gi_minor (OUT) The minor number of the GPU instance. |
| * ci_minor (OUT) The minor number of the compute instance. |
| * |
| * Returns SLURM_SUCCESS on success and SLURM_ERROR on failure. |
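| * |
| * The mig-minors file is expected to hold "<path> <minor>" pairs, e.g. |
| * "gpu0/gi1/access 30" and "gpu0/gi1/ci0/access 31" (illustrative values); |
| * the minors of the matching GI and CI "access" entries are returned. |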
| */ |
| static int _nvml_get_mig_minor_numbers(unsigned int gpu_minor, |
| unsigned int gi_id, unsigned int ci_id, |
| unsigned int *gi_minor, |
| unsigned int *ci_minor) |
| { |
| /* Parse mig-minors file for minor numbers */ |
| FILE *fp = NULL; |
| int rc = SLURM_ERROR; |
| char *path = "/proc/driver/nvidia-caps/mig-minors"; |
| char gi_fmt[MIG_LINE_SIZE]; |
| char ci_fmt[MIG_LINE_SIZE]; |
| char tmp_str[MIG_LINE_SIZE]; |
| unsigned int tmp_val; |
| int i = 0; |
| |
| /* You can't have more than 7 compute instances per GPU instance */ |
| xassert(ci_id <= 7); |
| |
| /* Clear storage for minor numbers */ |
| *gi_minor = 0; |
| *ci_minor = 0; |
| |
| fp = fopen(path, "r"); |
| if (!fp) { |
| error("Could not open file `%s`", path); |
| return rc; |
| } |
| |
| snprintf(gi_fmt, MIG_LINE_SIZE, "gpu%u/gi%u/access", gpu_minor, |
| gi_id); |
| snprintf(ci_fmt, MIG_LINE_SIZE, "gpu%u/gi%u/ci%u/access", gpu_minor, |
| gi_id, ci_id); |
| |
| while (1) { |
| int found = 0; |
| int count = 0; |
| |
| i++; |
| count = fscanf(fp, "%127s%u", tmp_str, &tmp_val); |
| if (count == EOF) { |
| error("mig-minors: %d: Reached end of file. Could not find GPU=%u|GI=%u|CI=%u", |
| i, gpu_minor, gi_id, ci_id); |
| break; |
| } else if (count != 2) { |
| error("mig-minors: %d: Could not find tmp_str and/or tmp_val", |
| i); |
| break; |
| } |
| |
| if (!xstrcmp(tmp_str, gi_fmt)) { |
| found = 1; |
| *gi_minor = tmp_val; |
| } |
| |
| if (!xstrcmp(tmp_str, ci_fmt)) { |
| found = 1; |
| *ci_minor = tmp_val; |
| } |
| |
| if (found) |
| debug3("mig-minors: %d: Found `%s %u`", i, tmp_str, |
| tmp_val); |
| |
| if ((*gi_minor != 0) && (*ci_minor != 0)) { |
| rc = SLURM_SUCCESS; |
| debug3("GPU:%u|GI:%u,GI_minor=%u|CI:%u,CI_minor=%u", |
| gpu_minor, gi_id, *gi_minor, ci_id, *ci_minor); |
| break; |
| } |
| } |
| |
| fclose(fp); |
| return rc; |
| } |
| |
| /* |
| * Return true if MIG mode is currently enabled on the device, false otherwise. |
| * |
| * If the pending MIG mode differs from the current mode, the change will take |
| * effect on the next "activation trigger" (device unbind, device reset, or |
| * machine reboot), and a note is logged. |
| */ |
| static bool _nvml_is_device_mig(nvmlDevice_t *device) |
| { |
| unsigned int current_mode = NVML_DEVICE_MIG_DISABLE; |
| unsigned int pending_mode = NVML_DEVICE_MIG_DISABLE; |
| |
| _nvml_get_device_mig_mode(device, ¤t_mode, &pending_mode); |
| |
| if (current_mode == NVML_DEVICE_MIG_DISABLE && |
| pending_mode == NVML_DEVICE_MIG_ENABLE) |
| info("MIG is disabled, but set to be enabled on next GPU reset"); |
| else if (current_mode == NVML_DEVICE_MIG_ENABLE && |
| pending_mode == NVML_DEVICE_MIG_DISABLE) |
| info("MIG is enabled, but set to be disabled on next GPU reset"); |
| |
| if (current_mode == NVML_DEVICE_MIG_ENABLE) |
| return true; |
| else |
| return false; |
| } |
| |
| /* |
| * According to NVIDIA documentation: |
| * "With drivers >= R470 (470.42.01+), each MIG device is assigned a GPU UUID |
| * starting with MIG-<UUID>." |
| * https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#:~:text=CUDA_VISIBLE_DEVICES%20has%20been,instance%20ID%3E |
| */ |
| static bool _nvml_use_mig_uuid() |
| { |
| static bool nvml_use_mig_uuid; |
| static bool set = false; |
| |
| if (!set) { |
| int m_major = 470, m_minor = 42, m_rev = 1; /* 470.42.01 */ |
| int major, minor, rev; |
| char v[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; |
| |
| _nvml_get_driver(v, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE); |
| sscanf(v, "%d.%d.%d", &major, &minor, &rev); |
| |
| if ((major > m_major) || |
| ((major == m_major) && (minor > m_minor)) || |
| ((major == m_major) && (minor == m_minor) && |
| (rev >= m_rev))) |
| nvml_use_mig_uuid = true; |
| else |
| nvml_use_mig_uuid = false; |
| set = true; |
| } |
| |
| return nvml_use_mig_uuid; |
| } |
| |
| /* |
| * Print out a MIG device and return a populated nvml_mig struct. |
| * |
| * device (IN) The MIG device handle |
| * gpu_minor (IN) The GPU minor number |
| * mig_index (IN) The MIG index |
| * gpu_uuid (IN) The UUID string of the parent GPU |
| * nvml_mig (OUT) An nvml_mig_t struct. This function sets profile_name, |
| * files, links, and unique_id. profile_name should already be |
| * populated with the parent GPU type string, and files should |
| * already be populated with the parent GPU device file. |
| * |
| * Returns SLURM_SUCCESS or SLURM_ERROR. Caller must xfree() struct fields. |
| * |
| * files includes a comma-separated string of NVIDIA capability device files |
| * (/dev/nvidia-caps/...) associated with the compute instance behind this MIG |
| * device. |
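| * |
| * E.g. the resulting profile_name might be "a100_3g.20gb" (assuming an A100 |
| * parent GPU with a 3-slice, 20GB profile), following the |
| * <GPU_type>_<slice_cnt>g.<mem>gb pattern noted for nvml_mig_t. |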
| */ |
| static int _handle_mig(nvmlDevice_t *device, unsigned int gpu_minor, |
| unsigned int mig_index, char *gpu_uuid, |
| nvml_mig_t *nvml_mig) |
| { |
| nvmlDevice_t mig; |
| /* Use the V2 size so it can fit extra MIG info */ |
| char mig_uuid[NVML_DEVICE_UUID_V2_BUFFER_SIZE] = {0}; |
| char device_name[NVML_DEVICE_NAME_BUFFER_SIZE] = {0}; |
| char *str; |
| unsigned int gi_id; |
| unsigned int ci_id; |
| unsigned int gi_minor; |
| unsigned int ci_minor; |
| |
| xassert(nvml_mig); |
| |
| if (!_nvml_get_mig_handle(device, mig_index, &mig)) |
| return SLURM_ERROR; |
| |
| _nvml_get_device_uuid(&mig, mig_uuid, |
| NVML_DEVICE_UUID_V2_BUFFER_SIZE); |
| _nvml_get_gpu_instance_id(&mig, &gi_id); |
| _nvml_get_compute_instance_id(&mig, &ci_id); |
| |
| if (_nvml_get_mig_minor_numbers(gpu_minor, gi_id, ci_id, &gi_minor, |
| &ci_minor) != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| |
| _nvml_get_device_name(&mig, device_name, |
| NVML_DEVICE_NAME_BUFFER_SIZE); |
| if (device_name[0] && (str = strstr(device_name, "mig_"))) { |
| /* Adding 3 to skip "mig" but keep "_" */ |
| xstrfmtcat(nvml_mig->profile_name, "%s", str + 3); |
| } else { /* Backup: generate name from attributes */ |
| nvmlDeviceAttributes_t attributes; |
| nvmlReturn_t nvml_rc; |
| nvml_rc = nvmlDeviceGetAttributes(mig, &attributes); |
| if (nvml_rc != NVML_SUCCESS) { |
| error("Failed to get MIG attributes: %s", |
| nvmlErrorString(nvml_rc)); |
| return SLURM_ERROR; |
| } |
| xstrfmtcat(nvml_mig->profile_name, "_"); |
| |
| if (attributes.computeInstanceSliceCount != |
| attributes.gpuInstanceSliceCount) |
| xstrfmtcat(nvml_mig->profile_name, "%uc.", |
| attributes.computeInstanceSliceCount); |
| |
| /* Divide MB by 1024 (2^10) to get GB, and then round */ |
| xstrfmtcat(nvml_mig->profile_name, "%ug.%lugb", |
| attributes.gpuInstanceSliceCount, |
| (unsigned long)(ROUNDUP(attributes.memorySizeMB, |
| 1024))); |
| } |
| |
| if (_nvml_use_mig_uuid()) |
| xstrfmtcat(nvml_mig->unique_id, "%s", mig_uuid); |
| else |
| xstrfmtcat(nvml_mig->unique_id, "MIG-%s/%u/%u", gpu_uuid, gi_id, ci_id); |
| |
| /* Allow access to both the GPU instance and the compute instance */ |
| xstrfmtcat(nvml_mig->files, ",/dev/nvidia-caps/nvidia-cap%u,/dev/nvidia-caps/nvidia-cap%u", |
| gi_minor, ci_minor); |
| |
| debug2("GPU minor %u, MIG index %u:", gpu_minor, mig_index); |
| debug2(" MIG Profile: %s", nvml_mig->profile_name); |
| debug2(" MIG UUID: %s", mig_uuid); |
| debug2(" UniqueID: %s", nvml_mig->unique_id); |
| debug2(" GPU Instance (GI) ID: %u", gi_id); |
| debug2(" Compute Instance (CI) ID: %u", ci_id); |
| debug2(" GI Minor Number: %u", gi_minor); |
| debug2(" CI Minor Number: %u", ci_minor); |
| debug2(" Device Files: %s", nvml_mig->files); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| #endif |
| |
| /* |
| * Creates and returns a gres conf list of the NVIDIA GPUs detected on the node. |
| * If an error occurs, NULL is returned. |
| * The caller is responsible for freeing the list. |
| * |
| * If the NVIDIA NVML API is available (it ships with CUDA), query the GPU info |
| * so the user doesn't need to specify it manually in gres.conf. |
| * Specifically, populate CPU affinity and NVLink information. |
| */ |
| static list_t *_get_system_gpu_list_nvml(node_config_load_t *node_config) |
| { |
| bitstr_t *enabled_cpus_bits = NULL; |
| uint32_t i; |
| uint32_t device_count = 0; |
| list_t *gres_list_system = list_create(destroy_gres_slurmd_conf); |
| char driver[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; |
| char version[NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE]; |
| char **device_lut; |
| nvmlPciInfo_t pci_info; |
| |
| xassert(node_config->xcpuinfo_mac_to_abs); |
| |
| _nvml_init(); |
| _nvml_get_driver(driver, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE); |
| _nvml_get_version(version, NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE); |
| debug("Systems Graphics Driver Version: %s", driver); |
| debug("NVML Library Version: %s", version); |
| debug2("NVML API Version: %u", NVML_API_VERSION); |
| #ifdef NVML_NO_UNVERSIONED_FUNC_DEFS |
| debug2("NVML_NO_UNVERSIONED_FUNC_DEFS is set, for backwards compatibility"); |
| #endif |
| |
| gpu_p_get_device_count(&device_count); |
| |
| debug2("Total CPU count: %d", node_config->cpu_cnt); |
| debug2("Device count: %d", device_count); |
| |
| // Create a device index --> PCI Bus ID lookup table |
| device_lut = xcalloc(device_count, sizeof(char *)); |
| |
| /* |
| * Loop through to create device to PCI busId lookup table |
| */ |
| for (i = 0; i < device_count; ++i) { |
| nvmlDevice_t device; |
| |
| if (!_nvml_get_handle(i, &device)) |
| continue; |
| |
| memset(&pci_info, 0, sizeof(pci_info)); |
| _nvml_get_device_pci_info(&device, &pci_info); |
| device_lut[i] = xstrdup(pci_info.busId); |
| } |
| |
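| /* |
| * Unless CONF_FLAG_ECORE is set, build a bitmap of the CPUs slurmd actually |
| * uses (from its block map) so that efficiency cores can be masked out of |
| * each GPU's affinity below. |
| */ |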
| if (!(slurm_conf.conf_flags & CONF_FLAG_ECORE)) { |
| enabled_cpus_bits = bit_alloc(MAX_CPUS); |
| for (i = 0; i < conf->block_map_size; i++) { |
| bit_set(enabled_cpus_bits, conf->block_map[i]); |
| } |
| } |
| |
| /* |
| * Loop through all the GPUs on the system and add to gres_list_system |
| */ |
| for (i = 0; i < device_count; ++i) { |
| nvmlDevice_t device; |
| char uuid[NVML_DEVICE_UUID_BUFFER_SIZE] = {0}; |
| unsigned int minor_number = 0; |
| unsigned long cpu_set[CPU_SET_SIZE] = {0}; |
| char *cpu_aff_mac_range = NULL; |
| char *device_file = NULL; |
| char *nvlinks = NULL; |
| char device_name[NVML_DEVICE_NAME_BUFFER_SIZE] = {0}; |
| bool mig_mode = false, added_mig = false; |
| gres_slurmd_conf_t gres_slurmd_conf = { |
| .config_flags = |
| GRES_CONF_ENV_NVML | GRES_CONF_AUTODETECT, |
| .count = 1, |
| .cpu_cnt = node_config->cpu_cnt, |
| .name = "gpu", |
| }; |
| |
| if (!_nvml_get_handle(i, &device)) { |
| error("Creating null GRES GPU record"); |
| add_gres_to_list(gres_list_system, &gres_slurmd_conf); |
| continue; |
| } |
| |
| #if HAVE_MIG_SUPPORT |
| mig_mode = _nvml_is_device_mig(&device); |
| #endif |
| |
| memset(&pci_info, 0, sizeof(pci_info)); |
| _nvml_get_device_name(&device, device_name, |
| NVML_DEVICE_NAME_BUFFER_SIZE); |
| _nvml_get_device_uuid(&device, uuid, |
| NVML_DEVICE_UUID_BUFFER_SIZE); |
| _nvml_get_device_pci_info(&device, &pci_info); |
| _nvml_get_device_minor_number(&device, &minor_number); |
| if (minor_number == NO_VAL) |
| continue; |
| |
| _nvml_get_device_affinity(&device, CPU_SET_SIZE, cpu_set); |
| |
| // Convert from nvml cpu bitmask to slurm bitstr_t (machine fmt) |
| gres_slurmd_conf.cpus_bitmap = bit_alloc(MAX_CPUS); |
| _set_cpu_set_bitstr(gres_slurmd_conf.cpus_bitmap, |
| cpu_set, CPU_SET_SIZE); |
| |
| if (enabled_cpus_bits) { |
| /* |
| * Mask out E-cores that may be included from nvml's cpu |
| * affinity bitstring. |
| */ |
| bit_and(gres_slurmd_conf.cpus_bitmap, |
| enabled_cpus_bits); |
| } |
| |
| // Convert from bitstr_t to cpu range str |
| cpu_aff_mac_range = bit_fmt_full(gres_slurmd_conf.cpus_bitmap); |
| |
| // Convert cpu range str from machine to abstract(slurm) format |
| if (node_config->xcpuinfo_mac_to_abs(cpu_aff_mac_range, |
| &gres_slurmd_conf.cpus)) { |
| error(" Conversion from machine to abstract failed"); |
| FREE_NULL_BITMAP(gres_slurmd_conf.cpus_bitmap); |
| xfree(cpu_aff_mac_range); |
| continue; |
| } |
| |
| nvlinks = _nvml_get_nvlink_info(&device, i, device_lut, |
| device_count); |
| xstrfmtcat(device_file, "/dev/nvidia%u", minor_number); |
| |
| debug2("GPU index %u:", i); |
| debug2(" Name: %s", device_name); |
| debug2(" UUID: %s", uuid); |
| debug2(" PCI Domain/Bus/Device: %u:%u:%u", pci_info.domain, |
| pci_info.bus, pci_info.device); |
| debug2(" PCI Bus ID: %s", pci_info.busId); |
| debug2(" NVLinks: %s", nvlinks); |
| debug2(" Device File (minor number): %s", device_file); |
| if (minor_number != i) |
| debug("Note: GPU index %u is different from minor " |
| "number %u", i, minor_number); |
| debug2(" CPU Affinity Range - Machine: %s", |
| cpu_aff_mac_range); |
| debug2(" Core Affinity Range - Abstract: %s", |
| gres_slurmd_conf.cpus); |
| debug2(" MIG mode: %s", mig_mode ? "enabled" : "disabled"); |
| |
| if (mig_mode) { |
| #if HAVE_MIG_SUPPORT |
| unsigned int max_mig_count; |
| unsigned int mig_count = 0; |
| char *tmp_device_name = xstrdup(device_name); |
| char *tok = xstrchr(tmp_device_name, '-'); |
| if (tok) { |
| /* |
| * Clear everything after the first '-' so that |
| * only the real GPU type remains, since the MIG |
| * profile name will be appended as a suffix. |
| */ |
| tok[0] = '\0'; |
| } |
| _nvml_get_max_mig_device_count(&device, &max_mig_count); |
| |
| /* Count number of actual MIGs */ |
| for (unsigned int j = 0; j < max_mig_count; j++) { |
| nvmlDevice_t mig; |
| if (_nvml_get_mig_handle(&device, j, &mig)) { |
| /* |
| * Assume MIG indexes start at 0 and are |
| * contiguous |
| */ |
| xassert(j == mig_count); |
| mig_count++; |
| } else |
| break; |
| } |
| debug2(" MIG count: %u", mig_count); |
| if (mig_count == 0) |
| error("MIG mode is enabled, but no MIG devices were found. Please either create MIG instances, disable MIG mode, remove AutoDetect=nvml, or remove GPUs from the configuration completely."); |
| |
| for (unsigned int j = 0; j < mig_count; j++) { |
| nvml_mig_t nvml_mig = { 0 }; |
| nvml_mig.files = xstrdup(device_file); |
| nvml_mig.profile_name = |
| xstrdup(tmp_device_name); |
| |
| /* If the MIG device exists, log it and fill in its files */ |
| if (_handle_mig(&device, minor_number, j, |
| uuid, &nvml_mig) != |
| SLURM_SUCCESS) { |
| _free_nvml_mig_members(&nvml_mig); |
| continue; |
| } |
| |
| /* |
| * Add MIG device to GRES list. MIG does not |
| * support NVLinks. CPU affinity, CPU count, and |
| * device name will be the same as non-MIG GPU. |
| */ |
| gres_slurmd_conf.file = nvml_mig.files; |
| gres_slurmd_conf.type_name = |
| nvml_mig.profile_name; |
| gres_slurmd_conf.unique_id = nvml_mig.unique_id; |
| gres_slurmd_conf.config_flags |= |
| GRES_CONF_GLOBAL_INDEX; |
| added_mig = true; |
| |
| add_gres_to_list(gres_list_system, |
| &gres_slurmd_conf); |
| _free_nvml_mig_members(&nvml_mig); |
| } |
| xfree(tmp_device_name); |
| #endif |
| } |
| |
| if (!added_mig) { |
| gres_slurmd_conf.file = device_file; |
| gres_slurmd_conf.links = nvlinks; |
| gres_slurmd_conf.type_name = device_name; |
| |
| add_gres_to_list(gres_list_system, &gres_slurmd_conf); |
| } |
| |
| // Print out possible memory frequencies for this device |
| _nvml_print_freqs(&device, LOG_LEVEL_DEBUG2); |
| |
| FREE_NULL_BITMAP(gres_slurmd_conf.cpus_bitmap); |
| xfree(gres_slurmd_conf.cpus); |
| xfree(cpu_aff_mac_range); |
| xfree(device_file); |
| xfree(nvlinks); |
| } |
| |
| FREE_NULL_BITMAP(enabled_cpus_bits); |
| |
| /* |
| * Free lookup table |
| */ |
| for (i = 0; i < device_count; ++i) |
| xfree(device_lut[i]); |
| xfree(device_lut); |
| |
| info("%u GPU system device(s) detected", device_count); |
| return gres_list_system; |
| } |
| |
| static char *_get_nvml_func_str(void *fname) |
| { |
| if (fname == nvmlDeviceGetComputeRunningProcesses) |
| return "Compute"; |
| return "Graphics"; |
| } |
| |
| static int _get_nvml_process_info(nvmlReturn_t (*get_proc)(nvmlDevice_t, |
| unsigned int *, |
| nvmlProcessInfo_t *), |
| nvmlDevice_t device, pid_t pid, |
| acct_gather_data_t *data) |
| { |
| nvmlReturn_t rc; |
| nvmlProcessInfo_t *proc_info; |
| unsigned int proc_cnt = 0; |
| |
| /* |
| * Get the number of "[Compute|Graphics]" processes. If there are no |
| * processes, proc_cnt will be 0 and rc == NVML_SUCCESS; if there are |
| * processes, proc_cnt will be set and rc == NVML_ERROR_INSUFFICIENT_SIZE. |
| */ |
| rc = get_proc(device, &proc_cnt, NULL); |
| |
| if ((rc != NVML_SUCCESS) && (rc != NVML_ERROR_INSUFFICIENT_SIZE)) { |
| error("NVML: Failed to get %s running process count(%d): %s", |
| _get_nvml_func_str(get_proc), rc, nvmlErrorString(rc)); |
| return SLURM_ERROR; |
| } |
| |
| if (proc_cnt) { |
| proc_info = xcalloc(proc_cnt, sizeof(*proc_info)); |
| rc = get_proc(device, &proc_cnt, proc_info); |
| if (rc != NVML_SUCCESS) { |
| if (rc == NVML_ERROR_INSUFFICIENT_SIZE) { |
| log_flag(JAG, "NVML: Failed to get %s running procs(%d): %s. New processes started in between calls, accounting not gathered during this interval", |
| _get_nvml_func_str(get_proc), |
| rc, nvmlErrorString(rc)); |
| } else { |
| error("NVML: Failed to get %s running procs(%d): %s", |
| _get_nvml_func_str(get_proc), |
| rc, nvmlErrorString(rc)); |
| } |
| xfree(proc_info); |
| return SLURM_ERROR; |
| } |
| for (int i = 0; i < proc_cnt; i++) { |
| if (proc_info[i].pid != pid) |
| continue; |
| /* usedGpuMemory is reported in bytes */ |
| data[gpumem_pos].size_read += proc_info[i].usedGpuMemory; |
| break; |
| } |
| xfree(proc_info); |
| log_flag(JAG, "pid %d has GPUUtil=%lu and MemMB=%lu", |
| pid, data[gpuutil_pos].size_read, |
| data[gpumem_pos].size_read / 1048576); |
| } |
| return SLURM_SUCCESS; |
| } |
| |
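| /* |
| * Accumulate the GPU memory used by this pid, from both the compute and the |
| * graphics process lists of the device, into data[gpumem_pos]. |
| */ |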
| static int _get_gpumem(nvmlDevice_t device, pid_t pid, acct_gather_data_t *data) |
| { |
| if (_get_nvml_process_info(nvmlDeviceGetComputeRunningProcesses, device, |
| pid, data) != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| |
| if (_get_nvml_process_info(nvmlDeviceGetGraphicsRunningProcesses, |
| device, pid, data) != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _get_gpuutil(nvmlDevice_t device, pid_t pid, |
| acct_gather_data_t *data) |
| { |
| nvmlReturn_t rc; |
| nvmlProcessUtilizationSample_t *proc_util; |
| unsigned int cnt = 0; |
| |
| /* |
| * Sending NULL will fill in cnt with the number of processes so we can |
| * use that to allocate the array correctly afterwards. A rc of |
| * NVML_SUCCESS means no processes yet. |
| */ |
| rc = nvmlDeviceGetProcessUtilization(device, NULL, &cnt, |
| data[gpuutil_pos].last_time); |
| if (rc == NVML_SUCCESS || !cnt) |
| return SLURM_SUCCESS; |
| |
| if (rc != NVML_ERROR_INSUFFICIENT_SIZE) { |
| error("NVML: Failed to get process count for gpu utilization(%d): %s", |
| rc, nvmlErrorString(rc)); |
| return SLURM_ERROR; |
| } |
| |
| proc_util = xcalloc(cnt, sizeof(*proc_util)); |
| rc = nvmlDeviceGetProcessUtilization(device, proc_util, &cnt, |
| data[gpuutil_pos].last_time); |
| |
| if (rc == NVML_ERROR_NOT_FOUND) { |
| debug2("Couldn't find pid %d, probably hasn't started yet or has already finished", |
| pid); |
| xfree(proc_util); |
| return SLURM_SUCCESS; |
| #if HAVE_MIG_SUPPORT |
| } else if ((rc == NVML_ERROR_NOT_SUPPORTED) && |
| _nvml_is_device_mig(&device)) { |
| /* |
| * NOTE: At the moment you can not query MIGs for |
| * utilization. This will probably be fixed in the |
| * future and hopefully this will start working. |
| */ |
| debug2("On MIG-enabled GPUs, querying process utilization is not currently supported."); |
| #endif |
| } else if (rc != NVML_SUCCESS) { |
| error("NVML: Failed to get usage(%d): %s", rc, |
| nvmlErrorString(rc)); |
| xfree(proc_util); |
| return SLURM_ERROR; |
| } |
| |
| for (int i = 0; i < cnt; i++) { |
| if (proc_util[i].pid != pid) |
| continue; |
| data[gpuutil_pos].last_time = proc_util[i].timeStamp; |
| data[gpuutil_pos].size_read += proc_util[i].smUtil; |
| break; |
| } |
| xfree(proc_util); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int init(void) |
| { |
| if (running_in_slurmstepd()) { |
| gpu_get_tres_pos(&gpumem_pos, &gpuutil_pos); |
| } |
| |
| debug("%s: %s loaded", __func__, plugin_name); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern void fini(void) |
| { |
| _nvml_shutdown(); |
| |
| debug("%s: unloading %s", __func__, plugin_name); |
| } |
| |
| extern list_t *gpu_p_get_system_gpu_list(node_config_load_t *node_config) |
| { |
| list_t *gres_list_system = NULL; |
| |
| if (!(gres_list_system = _get_system_gpu_list_nvml(node_config))) |
| error("System GPU detection failed"); |
| |
| return gres_list_system; |
| } |
| |
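| /* |
| * Set the GPU clock frequencies for the step. |
| * |
| * usable_gpus (IN) Bitmap of the GPUs allocated to the step |
| * tres_freq (IN) TRES frequency spec; only the "gpu:" portion (up to the |
| * next ';') is used. E.g. (illustrative) "cpu:medium;gpu:low" |
| * yields a GPU frequency request of "low". |
| */ |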
| extern void gpu_p_step_hardware_init(bitstr_t *usable_gpus, char *tres_freq) |
| { |
| char *freq = NULL; |
| char *tmp = NULL; |
| |
| xassert(tres_freq); |
| xassert(usable_gpus); |
| |
| if (!usable_gpus) |
| return; /* Job allocated no GPUs */ |
| if (!tres_freq) |
| return; /* No TRES frequency spec */ |
| |
| if (!(tmp = strstr(tres_freq, "gpu:"))) |
| return; /* No GPU frequency spec */ |
| |
| freq = xstrdup(tmp + 4); |
| if ((tmp = strchr(freq, ';'))) |
| tmp[0] = '\0'; |
| |
| // Save a copy of the GPUs affected, so we can reset things afterwards |
| FREE_NULL_BITMAP(saved_gpus); |
| saved_gpus = bit_copy(usable_gpus); |
| |
| _nvml_init(); |
| // Set the frequency of each GPU index specified in the bitstr |
| _set_freq(usable_gpus, freq); |
| xfree(freq); |
| } |
| |
| extern void gpu_p_step_hardware_fini(void) |
| { |
| if (!saved_gpus) |
| return; |
| |
| // Reset the frequencies back to the hardware default |
| _reset_freq(saved_gpus); |
| FREE_NULL_BITMAP(saved_gpus); |
| _nvml_shutdown(); |
| } |
| |
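| /* |
| * Test helper: convert a keyword-encoded CPU set into a Slurm CPU range |
| * string. cpu_range must start with '~' and be one of ~zero, ~max, ~one, |
| * ~three, ~half, or ~X... (one fully-set unsigned long per 'X'). |
| */ |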
| extern char *gpu_p_test_cpu_conv(char *cpu_range) |
| { |
| unsigned long cpu_set[CPU_SET_SIZE]; |
| bitstr_t *cpu_aff_mac_bitstr; |
| int i; |
| char *result; |
| info("%s: cpu_range: %s", __func__, cpu_range); |
| |
| if (!cpu_range) { |
| error("cpu_range is null"); |
| return xstrdup(""); |
| } |
| |
| if (cpu_range[0] != '~') { |
| error("cpu_range doesn't start with `~`!"); |
| return xstrdup(""); |
| } |
| |
| // Initialize cpu_set to 0 |
| for (i = 0; i < CPU_SET_SIZE; ++i) { |
| cpu_set[i] = 0; |
| } |
| |
| if (xstrcmp(cpu_range, "~zero") == 0) { |
| // nothing |
| } else if (xstrcmp(cpu_range, "~max") == 0) { |
| for (i = 0; i < CPU_SET_SIZE; ++i) { |
| cpu_set[i] = -1UL; |
| } |
| } else if (xstrcmp(cpu_range, "~one") == 0) { |
| cpu_set[0] = 1; |
| } else if (xstrcmp(cpu_range, "~three") == 0) { |
| cpu_set[0] = 3; |
| } else if (xstrcmp(cpu_range, "~half") == 0) { |
| cpu_set[0] = 0xff00; |
| } else if (cpu_range[1] == 'X') { |
| /* |
| * Put in all -1's for each X |
| * Limit to CPU_SET_SIZE |
| */ |
| int count = MIN(strlen(&cpu_range[1]), CPU_SET_SIZE); |
| for (i = 0; i < count; ++i) { |
| cpu_set[i] = -1UL; |
| } |
| for (i = count; i < CPU_SET_SIZE; ++i) { |
| cpu_set[i] = 0; |
| } |
| } else { |
| error("Unknown test keyword"); |
| return xstrdup(""); |
| } |
| |
| // Print out final cpu set |
| for (i = 0; i < CPU_SET_SIZE; ++i) { |
| if ((signed) cpu_set[i] == -1) |
| printf("X"); |
| else { |
| if (cpu_set[i] > 9) |
| printf("(%lu)", cpu_set[i]); |
| else |
| printf("%lu", cpu_set[i]); |
| } |
| } |
| printf("\n"); |
| |
| cpu_aff_mac_bitstr = bit_alloc(MAX_CPUS); |
| // Convert from nvml cpu bitmask to slurm bitstr_t (machine fmt) |
| _set_cpu_set_bitstr(cpu_aff_mac_bitstr, cpu_set, CPU_SET_SIZE); |
| |
| // Convert from bitstr_t to cpu range str |
| result = bit_fmt_full(cpu_aff_mac_bitstr); |
| |
| FREE_NULL_BITMAP(cpu_aff_mac_bitstr); |
| return result; |
| } |
| |
| extern int gpu_p_energy_read(uint32_t dv_ind, gpu_status_t *gpu) |
| { |
| return SLURM_SUCCESS; |
| } |
| |
| extern int gpu_p_usage_read(pid_t pid, acct_gather_data_t *data) |
| { |
| uint32_t device_count = 0; |
| bool track_gpumem, track_gpuutil; |
| |
| track_gpumem = (gpumem_pos != -1); |
| track_gpuutil = (gpuutil_pos != -1); |
| |
| if (!track_gpuutil && !track_gpumem) { |
| debug2("%s: We are not tracking TRES gpuutil/gpumem", __func__); |
| return SLURM_SUCCESS; |
| } |
| |
| _nvml_init(); |
| gpu_p_get_device_count(&device_count); |
| |
| data[gpumem_pos].size_read = 0; |
| data[gpuutil_pos].size_read = 0; |
| |
| for (int i = 0; i < device_count; i++) { |
| nvmlDevice_t device; |
| |
| if (!_nvml_get_handle(i, &device)) |
| continue; |
| |
| if (track_gpumem) |
| _get_gpumem(device, pid, data); |
| if (track_gpuutil) |
| _get_gpuutil(device, pid, data); |
| |
| log_flag(JAG, "pid %d has GPUUtil=%lu and MemMB=%lu", |
| pid, |
| data[gpuutil_pos].size_read, |
| data[gpumem_pos].size_read / 1048576); |
| } |
| |
| return SLURM_SUCCESS; |
| } |