| /*****************************************************************************\ |
| * gpu_rsmi.c - Support rsmi interface to an AMD GPU. |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * Copyright (c) 2019, Advanced Micro Devices, Inc. All rights reserved. |
| * Written by Advanced Micro Devices, |
| * who borrowed heavily from SLURM gpu and nvml plugin. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #define _GNU_SOURCE |
| |
| #include <dlfcn.h> |
| #include <rocm_smi/rocm_smi.h> |
| |
| #include "../common/gpu_common.h" |
| |
| #ifdef HAVE_NUMA |
| # include <numa.h> |
| #endif |
| |
/*
 * Copy of the GPU bitmap last passed to gpu_p_step_hardware_init(). It is
 * kept so gpu_p_step_hardware_fini() knows which GPUs to reset, and is NULL
 * when no frequencies have been set.
 */

static bitstr_t *saved_gpus;

/*
 * Buffer size large enough for RSMI string
 * (used for the driver/library version, device name and device brand buffers)
 */
#define RSMI_STRING_BUFFER_SIZE 80

/*
 * ROCM release version >= 6.0.0 required for gathering usage.
 * Compared against the major number reported by rsmi_version_get().
 */
#define RSMI_REQ_VERSION_USAGE 6
| |
/*
 * PCI information about a GPU device.
 *
 * The 64-bit id filled in by rsmi_dev_pci_id_get() is stored in bdfid; the
 * anonymous bit-field struct overlays the same storage so the individual
 * domain/bus/device/function parts can be read by name. The declaration order
 * of the bit-fields is flipped on big-endian builds (SLURM_BIGENDIAN).
 */
typedef struct rsmiPciInfo_st {
	union {
		struct {
#ifdef SLURM_BIGENDIAN
			uint64_t domain : 32;	/* PCI domain */
			uint64_t reserved : 16;	/* unused */
			uint64_t bus : 8;	/* PCI bus */
			uint64_t device : 5;	/* PCI device */
			uint64_t function : 3;	/* PCI function */
#else
			uint64_t function : 3;	/* PCI function */
			uint64_t device : 5;	/* PCI device */
			uint64_t bus : 8;	/* PCI bus */
			uint64_t reserved : 16;	/* unused */
			uint64_t domain : 32;	/* PCI domain */
#endif
		};
		uint64_t bdfid;	/* whole id as a single 64-bit value */
	};
} rsmiPciInfo_t;
| |
| /* |
| * These variables are required by the generic plugin interface. If they |
| * are not found in the plugin, the plugin loader will ignore it. |
| * |
| * plugin_name - A string giving a human-readable description of the |
| * plugin. There is no maximum length, but the symbol must refer to |
| * a valid string. |
| * |
| * plugin_type - A string suggesting the type of the plugin or its |
| * applicability to a particular form of data or method of data handling. |
| * If the low-level plugin API is used, the contents of this string are |
| * unimportant and may be anything. Slurm uses the higher-level plugin |
| * interface which requires this string to be of the form |
| * |
| * <application>/<method> |
| * |
| * where <application> is a description of the intended application of |
| * the plugin (e.g., "auth" for Slurm authentication) and <method> is a |
| * description of how this plugin satisfies that application. Slurm will |
| * only load authentication plugins if the plugin_type string has a prefix |
| * of "auth/". |
| * |
| * plugin_version - an unsigned 32-bit integer containing the Slurm version |
| * (major.minor.micro combined into a single number). |
| */ |
/* Symbols required by the generic plugin interface (see comment above) */
const char plugin_name[] = "GPU RSMI plugin";
const char plugin_type[] = "gpu/rsmi";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;

/*
 * Indexes into the acct_gather_data_t array used by gpu_p_usage_read().
 * They are filled in by gpu_get_tres_pos() from init() when running in
 * slurmstepd; -1 means the corresponding TRES is not tracked.
 */
static int gpumem_pos = -1;
static int gpuutil_pos = -1;

/*
 * Cleared by _rsmi_get_version() when the reported major version is lower
 * than RSMI_REQ_VERSION_USAGE; gpu_p_usage_read() then gathers nothing.
 */
static bool get_usage = true;

/* Defined further down; declared here because _rsmi_init() calls them */
static void _rsmi_get_version(char *version, unsigned int len);
static void _rsmi_get_driver(char *driver, unsigned int len);
| |
| /* |
| * Initialize the rsmi library. |
| */ |
| static void _rsmi_init() |
| { |
| static pid_t init_pid = 0; |
| pid_t my_pid = conf->pid ? conf->pid : getpid(); |
| rsmi_status_t rsmi_rc; |
| const char *status_string; |
| char version[RSMI_STRING_BUFFER_SIZE]; |
| char driver[RSMI_STRING_BUFFER_SIZE]; |
| |
| if (init_pid == my_pid) /* Already inited */ |
| return; |
| |
| init_pid = my_pid; |
| |
| DEF_TIMERS; |
| START_TIMER; |
| rsmi_rc = rsmi_init(0); |
| END_TIMER; |
| debug3("rsmi_init() took %ld microseconds", DELTA_TIMER); |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_status_string(rsmi_rc, &status_string); |
| error("Failed to initialize rsmi: %s", |
| status_string); |
| } else |
| debug2("Successfully initialized rsmi"); |
| |
| _rsmi_get_driver(driver, RSMI_STRING_BUFFER_SIZE); |
| _rsmi_get_version(version, RSMI_STRING_BUFFER_SIZE); |
| debug("AMD Graphics Driver Version: %s", driver); |
| debug("RSMI Library Version: %s", version); |
| } |
| |
| extern int init(void) |
| { |
| if (running_in_slurmstepd()) { |
| gpu_get_tres_pos(&gpumem_pos, &gpuutil_pos); |
| } |
| |
| debug("%s: %s loaded", __func__, plugin_name); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern void fini(void) |
| { |
| debug("%s: unloading %s", __func__, plugin_name); |
| |
| rsmi_shut_down(); |
| } |
| |
| /* |
| * Get all possible memory frequencies for the device |
| * |
| * dv_ind (IN) The device index |
| * mem_freqs_size (IN/OUT) The size of the mem_freqs array; this will be |
| * overwritten with the number of memory freqs found. |
| * mem_freqs (OUT) The possible memory frequencies in MHz. |
| * |
| * Return true if successful, false if not. |
| */ |
| static bool _rsmi_get_mem_freqs(uint32_t dv_ind, uint32_t *mem_freqs_size, |
| uint32_t *mem_freqs) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc; |
| rsmi_frequencies_t rsmi_freqs; |
| |
| DEF_TIMERS; |
| START_TIMER; |
| rsmi_rc = rsmi_dev_gpu_clk_freq_get( |
| dv_ind, RSMI_CLK_TYPE_MEM, &rsmi_freqs); |
| END_TIMER; |
| debug3("rsmi_dev_gpu_clk_freq_get() took %ld microseconds", |
| DELTA_TIMER); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get memory frequencies error: %s", |
| status_string); |
| return false; |
| } |
| |
| *mem_freqs_size = rsmi_freqs.num_supported; |
| for (int i = 0; i < *mem_freqs_size; i++) |
| mem_freqs[i] = rsmi_freqs.frequency[i]/1000000; |
| |
| return true; |
| } |
| |
| /* |
| * Get all possible graphics frequencies for the device |
| * |
| * dv_ind (IN) The device index |
| * gfx_freqs_size (IN/OUT) The size of the gfx_freqs array; this will |
| * be overwritten with the number of graphics freqs found. |
| * gfx_freqs (OUT) The possible graphics frequencies in MHz. |
| * |
| * Return true if successful, false if not. |
| */ |
| static bool _rsmi_get_gfx_freqs(uint32_t dv_ind, uint32_t *gfx_freqs_size, |
| uint32_t *gfx_freqs) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc; |
| rsmi_frequencies_t rsmi_freqs; |
| |
| DEF_TIMERS; |
| START_TIMER; |
| rsmi_rc = rsmi_dev_gpu_clk_freq_get( |
| dv_ind, RSMI_CLK_TYPE_SYS, &rsmi_freqs); |
| END_TIMER; |
| debug3("rsmi_dev_gpu_clk_freq_get() took %ld microseconds", |
| DELTA_TIMER); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get graphics frequencies error: %s", |
| status_string); |
| return false; |
| } |
| |
| *gfx_freqs_size = rsmi_freqs.num_supported; |
| for (int i = 0; i < *gfx_freqs_size; i++) |
| gfx_freqs[i] = rsmi_freqs.frequency[i]/1000000; |
| |
| return true; |
| } |
| |
| /* |
| * Print out all possible memory and graphics frequencies for the given device. |
| * If there are more than FREQS_CONCISE frequencies, prints a summary instead |
| * |
| * dv_ind (IN) The device index |
| * l (IN) The log level at which to print |
| */ |
| static void _rsmi_print_freqs(uint32_t dv_ind, log_level_t l) |
| { |
| uint32_t mem_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0}; |
| uint32_t gfx_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0}; |
| uint32_t size = RSMI_MAX_NUM_FREQUENCIES; |
| |
| if (!_rsmi_get_mem_freqs(dv_ind, &size, mem_freqs)) |
| return; |
| |
| qsort(mem_freqs, size, sizeof(uint32_t), |
| slurm_sort_uint32_list_desc); |
| if ((size > 1) && (mem_freqs[0] <= mem_freqs[(size)-1])) { |
| error("%s: memory frequencies are not stored in descending order!", |
| __func__); |
| return; |
| } |
| |
| gpu_common_print_freqs(mem_freqs, size, l, "GPU Memory", 0); |
| |
| size = RSMI_MAX_NUM_FREQUENCIES; |
| if (!_rsmi_get_gfx_freqs(dv_ind, &size, gfx_freqs)) |
| return; |
| |
| qsort(gfx_freqs, size, sizeof(uint32_t), |
| slurm_sort_uint32_list_desc); |
| if ((size > 1) && (gfx_freqs[0] <= gfx_freqs[(size)-1])) { |
| error("%s: Graphics frequencies are not stored in descending order!", |
| __func__); |
| return; |
| } |
| |
| gpu_common_print_freqs(gfx_freqs, size, l, "GPU Graphics", 0); |
| } |
| |
| /* |
| * Get the nearest valid memory and graphics frequencies |
| * Return bit masks indicating the indices of the |
| * frequencies that are to be enabled (1) and disabled (0). |
| * |
| * dv_ind (IN) the device index |
| * mem_freq (IN/OUT) requested/nearest valid memory frequency |
| * mem_bitmask (OUT) bit mask for the nearest valid memory frequency |
| * gfx_freq (IN/OUT) requested/nearest valid graphics frequency |
| * gfx_bitmask (OUT) bit mask for the nearest valid graphics frequency |
| */ |
| static void _rsmi_get_nearest_freqs(uint32_t dv_ind, uint32_t *mem_freq, |
| uint64_t *mem_bitmask, uint32_t *gfx_freq, |
| uint64_t *gfx_bitmask) |
| { |
| uint32_t mem_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0}; |
| uint32_t mem_freqs_sort[RSMI_MAX_NUM_FREQUENCIES] = {0}; |
| uint32_t mem_freqs_size = RSMI_MAX_NUM_FREQUENCIES; |
| |
| uint32_t gfx_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0}; |
| uint32_t gfx_freqs_sort[RSMI_MAX_NUM_FREQUENCIES] = {0}; |
| uint32_t gfx_freqs_size = RSMI_MAX_NUM_FREQUENCIES; |
| |
| // Get the memory frequencies |
| if (!_rsmi_get_mem_freqs(dv_ind, &mem_freqs_size, mem_freqs)) |
| return; |
| |
| memcpy(mem_freqs_sort, mem_freqs, mem_freqs_size*sizeof(uint32_t)); |
| qsort(mem_freqs_sort, mem_freqs_size, sizeof(uint32_t), |
| slurm_sort_uint32_list_desc); |
| if ((mem_freqs_size > 1) && |
| (mem_freqs_sort[0] <= mem_freqs_sort[(mem_freqs_size)-1])) { |
| error("%s: memory frequencies are not stored in descending order!", |
| __func__); |
| return; |
| } |
| |
| // Set the nearest valid memory frequency for the requested frequency |
| gpu_common_get_nearest_freq(mem_freq, mem_freqs_size, mem_freqs_sort); |
| |
| // convert the frequency to bit mask |
| for (uint64_t i = 0; i < mem_freqs_size; i++) |
| if (*mem_freq == mem_freqs[i]) { |
| *mem_bitmask = (1 << i); |
| break; |
| } |
| |
| // Get the graphics frequencies |
| if (!_rsmi_get_gfx_freqs(dv_ind, &gfx_freqs_size, gfx_freqs)) |
| return; |
| |
| memcpy(gfx_freqs_sort, gfx_freqs, gfx_freqs_size*sizeof(uint32_t)); |
| qsort(gfx_freqs_sort, gfx_freqs_size, sizeof(uint32_t), |
| slurm_sort_uint32_list_desc); |
| if ((gfx_freqs_size > 1) && |
| (gfx_freqs_sort[0] <= gfx_freqs_sort[(gfx_freqs_size)-1])) { |
| error("%s: graphics frequencies are not stored in descending order!", |
| __func__); |
| return; |
| } |
| |
| // Set the nearest valid graphics frequency for the requested frequency |
| gpu_common_get_nearest_freq(gfx_freq, gfx_freqs_size, gfx_freqs_sort); |
| |
| // convert the frequency to bit mask |
| for (uint64_t i = 0; i < gfx_freqs_size; i++) |
| if (*gfx_freq == gfx_freqs[i]) { |
| *gfx_bitmask = (1 << i); |
| break; |
| } |
| } |
| |
| /* |
| * Set the memory and graphics clock frequencies for the GPU |
| * |
| * dv_ind (IN) The device index |
| * mem_bitmask (IN) bit mask for the memory frequency. |
| * gfx_bitmask (IN) bit mask for the graphics frequency. |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _rsmi_set_freqs(uint32_t dv_ind, uint64_t mem_bitmask, |
| uint64_t gfx_bitmask) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc; |
| |
| DEF_TIMERS; |
| START_TIMER; |
| rsmi_rc = rsmi_dev_gpu_clk_freq_set( |
| dv_ind, RSMI_CLK_TYPE_MEM, mem_bitmask); |
| END_TIMER; |
| debug3("rsmi_dev_gpu_clk_freq_set(0x%lx) for memory took %ld microseconds", |
| mem_bitmask, DELTA_TIMER); |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to set memory frequency GPU %u error: %s", |
| dv_ind, status_string); |
| return false; |
| } |
| |
| START_TIMER; |
| rsmi_rc = rsmi_dev_gpu_clk_freq_set(dv_ind, |
| RSMI_CLK_TYPE_SYS, gfx_bitmask); |
| debug3("rsmi_dev_gpu_clk_freq_set(0x%lx) for graphics took %ld microseconds", |
| gfx_bitmask, DELTA_TIMER); |
| END_TIMER; |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to set graphic frequency GPU %u error: %s", |
| dv_ind, status_string); |
| return false; |
| } |
| return true; |
| } |
| |
| /* |
| * Reset the memory and graphics clock frequencies for the GPU to the same |
| * default frequencies that are used after system reboot or driver reload. This |
| * default cannot be changed. |
| * |
| * dv_ind (IN) The device index |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _rsmi_reset_freqs(uint32_t dv_ind) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc; |
| |
| DEF_TIMERS; |
| |
| START_TIMER; |
| rsmi_rc = rsmi_dev_perf_level_set(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO); |
| END_TIMER; |
| debug3("rsmi_dev_perf_level_set() took %ld microseconds", |
| DELTA_TIMER); |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to reset frequencies error: %s", |
| status_string); |
| return false; |
| } |
| return true; |
| } |
| |
| /* |
| * Get the memory or graphics clock frequency that the GPU is currently running |
| * at |
| * |
| * dv_ind (IN) The device index |
| * type (IN) The clock type to query. Either RSMI_CLK_TYPE_SYS or |
| * RSMI_CLK_TYPE_MEM. |
| * |
| * Returns the clock frequency in MHz if successful, or 0 if not |
| */ |
| static uint32_t _rsmi_get_freq(uint32_t dv_ind, rsmi_clk_type_t type) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc; |
| rsmi_frequencies_t rsmi_freqs; |
| char *type_str = "unknown"; |
| |
| DEF_TIMERS; |
| |
| switch (type) { |
| case RSMI_CLK_TYPE_SYS: |
| type_str = "graphics"; |
| break; |
| case RSMI_CLK_TYPE_MEM: |
| type_str = "memory"; |
| break; |
| default: |
| error("%s: Unsupported clock type", __func__); |
| break; |
| } |
| |
| START_TIMER; |
| rsmi_rc = rsmi_dev_gpu_clk_freq_get(dv_ind, type, &rsmi_freqs); |
| END_TIMER; |
| debug3("rsmi_dev_gpu_clk_freq_get(%s) took %ld microseconds", |
| type_str, DELTA_TIMER); |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get the GPU frequency type %s, error: %s", |
| type_str, status_string); |
| return 0; |
| } |
| return (rsmi_freqs.frequency[rsmi_freqs.current]/1000000); |
| } |
| |
| static uint32_t _rsmi_get_gfx_freq(uint32_t dv_ind) |
| { |
| return _rsmi_get_freq(dv_ind, RSMI_CLK_TYPE_SYS); |
| } |
| |
| static uint32_t _rsmi_get_mem_freq(uint32_t dv_ind) |
| { |
| return _rsmi_get_freq(dv_ind, RSMI_CLK_TYPE_MEM); |
| } |
| |
| /* |
| * Reset the frequencies of each GPU in the step to the hardware default |
| * NOTE: RSMI must be initialized beforehand |
| * |
| * gpus (IN) A bitmap specifying the GPUs on which to operate. |
| */ |
| static void _reset_freq(bitstr_t *gpus) |
| { |
| int gpu_len = bit_size(gpus); |
| int i = -1, count = 0, count_set = 0; |
| bool freq_reset = false; |
| |
| // Reset the frequency of each device allocated to the step |
| for (i = 0; i < gpu_len; i++) { |
| if (!bit_test(gpus, i)) |
| continue; |
| count++; |
| |
| debug2("Memory frequency before reset: %u", |
| _rsmi_get_mem_freq(i)); |
| debug2("Graphics frequency before reset: %u", |
| _rsmi_get_gfx_freq(i)); |
| freq_reset = _rsmi_reset_freqs(i); |
| debug2("Memory frequency after reset: %u", |
| _rsmi_get_mem_freq(i)); |
| debug2("Graphics frequency after reset: %u", |
| _rsmi_get_gfx_freq(i)); |
| |
| if (freq_reset) { |
| log_flag(GRES, "Successfully reset GPU[%d]", i); |
| count_set++; |
| } else { |
| log_flag(GRES, "Failed to reset GPU[%d]", i); |
| } |
| } |
| |
| if (count_set != count) { |
| log_flag(GRES, "%s: Could not reset frequencies for all GPUs %d/%d total GPUs", |
| __func__, count_set, count); |
| fprintf(stderr, "Could not reset frequencies for all GPUs %d/%d total GPUs\n", |
| count_set, count); |
| } |
| } |
| |
| /* |
| * Set the frequencies of each GPU specified for the step |
| * NOTE: RSMI must be initialized beforehand |
| * |
| * gpus (IN) A bitmap specifying the GPUs on which to operate. |
| * gpu_freq (IN) The frequencies to set each of the GPUs to. If a NULL or |
| * empty memory or graphics frequency is specified, then GpuFreqDef |
| * will be consulted, which defaults to "high,memory=high" if not |
| * set. |
| */ |
| static void _set_freq(bitstr_t *gpus, char *gpu_freq) |
| { |
| bool verbose_flag = false; |
| int gpu_len = 0; |
| int i = -1, count = 0, count_set = 0; |
| unsigned int gpu_freq_num = 0, mem_freq_num = 0; |
| bool freq_set = false, freq_logged = false; |
| char *tmp = NULL; |
| bool task_cgroup = false; |
| bool constrained_devices = false; |
| bool cgroups_active = false; |
| |
| // Parse frequency information |
| debug2("_parse_gpu_freq(%s)", gpu_freq); |
| gpu_common_parse_gpu_freq(gpu_freq, &gpu_freq_num, &mem_freq_num, |
| &verbose_flag); |
| if (verbose_flag) |
| debug2("verbose_flag ON"); |
| |
| tmp = gpu_common_freq_value_to_string(mem_freq_num); |
| debug2("Requested GPU memory frequency: %s", tmp); |
| xfree(tmp); |
| tmp = gpu_common_freq_value_to_string(gpu_freq_num); |
| debug2("Requested GPU graphics frequency: %s", tmp); |
| xfree(tmp); |
| |
| if (!mem_freq_num && !gpu_freq_num) { |
| debug2("%s: No frequencies to set", __func__); |
| return; |
| } |
| |
| // Check if GPUs are constrained by cgroups |
| cgroup_conf_init(); |
| if (slurm_cgroup_conf.constrain_devices) |
| constrained_devices = true; |
| |
| // Check if task/cgroup plugin is loaded |
| if (xstrstr(slurm_conf.task_plugin, "cgroup")) |
| task_cgroup = true; |
| |
| // If both of these are true, then GPUs will be constrained |
| if (constrained_devices && task_cgroup) { |
| cgroups_active = true; |
| gpu_len = bit_set_count(gpus); |
| debug2("%s: cgroups are configured. Using LOCAL GPU IDs", |
| __func__); |
| } else { |
| gpu_len = bit_size(gpus); |
| debug2("%s: cgroups are NOT configured. Assuming GLOBAL GPU IDs", |
| __func__); |
| } |
| |
| // Set the frequency of each device allocated to the step |
| for (i = 0; i < gpu_len; i++) { |
| char *sep = ""; |
| uint64_t mem_bitmask = 0, gpu_bitmask = 0; |
| unsigned int gpu_freq = gpu_freq_num, mem_freq = mem_freq_num; |
| |
| // Only check the global GPU bitstring if not using cgroups |
| if (!cgroups_active && !bit_test(gpus, i)) { |
| debug2("Passing over RSMI device %u", i); |
| continue; |
| } |
| count++; |
| |
| debug2("Setting frequency of RSMI device %u", i); |
| _rsmi_get_nearest_freqs(i, &mem_freq, &mem_bitmask, |
| &gpu_freq, &gpu_bitmask); |
| |
| debug2("Memory frequency before set: %u", |
| _rsmi_get_mem_freq(i)); |
| debug2("Graphics frequency before set: %u", |
| _rsmi_get_gfx_freq(i)); |
| freq_set = _rsmi_set_freqs(i, mem_bitmask, gpu_bitmask); |
| debug2("Memory frequency after set: %u", |
| _rsmi_get_mem_freq(i)); |
| debug2("Graphics frequency after set: %u", |
| _rsmi_get_gfx_freq(i)); |
| |
| if (mem_freq) { |
| xstrfmtcat(tmp, "%smemory_freq:%u", sep, mem_freq); |
| sep = ","; |
| } |
| if (gpu_freq) { |
| xstrfmtcat(tmp, "%sgraphics_freq:%u", sep, gpu_freq); |
| } |
| |
| if (freq_set) { |
| log_flag(GRES, "Successfully set GPU[%d] %s", i, tmp); |
| count_set++; |
| } else { |
| log_flag(GRES, "Failed to set GPU[%d] %s", i, tmp); |
| } |
| |
| if (verbose_flag && !freq_logged) { |
| fprintf(stderr, "GpuFreq=%s\n", tmp); |
| freq_logged = true; /* Just log for first GPU */ |
| } |
| xfree(tmp); |
| } |
| |
| if (count_set != count) { |
| log_flag(GRES, "%s: Could not set frequencies for all GPUs %d/%d total GPUs", |
| __func__, count_set, count); |
| fprintf(stderr, "Could not set frequencies for all GPUs %d/%d total GPUs\n", |
| count_set, count); |
| } |
| } |
| |
| /* |
| * Get the version of the AMD Graphics driver |
| * |
| * driver (OUT) A string to return version of AMD GPU driver |
| * len (OUT) Length for version of AMD GPU driver |
| */ |
| static void _rsmi_get_driver(char *driver, unsigned int len) |
| { |
| rsmi_version_str_get(RSMI_SW_COMP_DRIVER, driver, len); |
| } |
| |
| /* |
| * Get the version of the ROCM-SMI library |
| * |
| * version (OUT) A string to return version of RSMI |
| * len (OUT) Length for version of RSMI |
| */ |
| static void _rsmi_get_version(char *version, unsigned int len) |
| { |
| const char *status_string; |
| rsmi_version_t rsmi_version; |
| rsmi_status_t rsmi_rc = rsmi_version_get(&rsmi_version); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get the version error: %s", |
| status_string); |
| version[0] = '\0'; |
| } else { |
| snprintf(version, len, "%s", rsmi_version.build); |
| if (rsmi_version.major < RSMI_REQ_VERSION_USAGE) { |
| get_usage = false; |
| error("%s: GPU usage accounting disabled. RSMI version >= 6.0.0 required.", |
| __func__); |
| } |
| } |
| } |
| |
| /* |
| * Get the total # of GPUs in the system |
| * |
| * device_count (OUT) Number of available GPU devices |
| */ |
| extern void gpu_p_get_device_count(uint32_t *device_count) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc = rsmi_num_monitor_devices(device_count); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get device count: %s", status_string); |
| *device_count = 0; |
| } |
| } |
| |
| /* |
| * Get the name of the GPU |
| * |
| * dv_ind (IN) The device index |
| * device_name (OUT) Name of GPU devices |
| * size (OUT) Size of name |
| */ |
| static void _rsmi_get_device_name(uint32_t dv_ind, char *device_name, |
| unsigned int size) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc = rsmi_dev_name_get(dv_ind, device_name, size); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get name of the GPU: %s", status_string); |
| } |
| gpu_common_underscorify_tolower(device_name); |
| } |
| |
| /* |
| * Get the brand of the GPU |
| * |
| * dv_ind (IN) The device index |
| * device_brand (OUT) Brand of GPU devices |
| * size (OUT) Size of name |
| */ |
| static void _rsmi_get_device_brand(uint32_t dv_ind, char *device_brand, |
| unsigned int size) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc = rsmi_dev_brand_get(dv_ind, device_brand, size); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get brand of the GPU: %s", |
| status_string); |
| } |
| gpu_common_underscorify_tolower(device_brand); |
| } |
| |
| /* |
| * Retrieves minor number of the render device. Each AMD GPU will have a device node file |
| * in form /dev/dri/renderD[minor_number]. |
| * |
| * dv_ind (IN) The device index |
| * minor (OUT) minor number of device node |
| */ |
| static void _rsmi_get_device_minor_number(uint32_t dv_ind, |
| unsigned int *minor) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc = rsmi_dev_drm_render_minor_get(dv_ind, minor); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get minor number of GPU: %s", |
| status_string); |
| } |
| } |
| |
| /* |
| * Get the PCI Info of the GPU |
| * |
| * dv_ind (IN) The device index |
| * pci (OUT) PCI Info of GPU devices |
| */ |
| static void _rsmi_get_device_pci_info(uint32_t dv_ind, rsmiPciInfo_t *pci) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc = rsmi_dev_pci_id_get(dv_ind, &(pci->bdfid)); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get PCI Info of the GPU: %s", |
| status_string); |
| } |
| } |
| |
| /* |
| * Get the Unique ID of the GPU |
| * |
| * dv_ind (IN) The device index |
| * id (OUT) Unique ID of GPU devices |
| */ |
| static void _rsmi_get_device_unique_id(uint32_t dv_ind, uint64_t *id) |
| { |
| const char *status_string; |
| rsmi_status_t rsmi_rc = rsmi_dev_unique_id_get(dv_ind, id); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get Unique ID of the GPU: %s", |
| status_string); |
| } |
| } |
| |
| static bitstr_t *_rsmi_get_device_cpu_mask(uint32_t dv_ind) |
| { |
| bitstr_t *cpu_aff_mac_bitstr = NULL; |
| #ifdef HAVE_NUMA |
| uint32_t nnid = 1; |
| uint16_t maxcpus = conf->sockets * conf->cores * conf->threads; |
| struct bitmask *collective; |
| rsmi_status_t rsmi_rc = rsmi_topo_get_numa_node_number(dv_ind, &nnid); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| const char *status_string; |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get numa affinity of the GPU: %s", |
| status_string); |
| return NULL; |
| } |
| |
| collective = numa_allocate_cpumask(); |
| if (maxcpus > collective->size) { |
| error("Size mismatch!!!! %d %lu", maxcpus, collective->size); |
| numa_free_cpumask(collective); |
| return NULL; |
| } |
| |
| /* |
| * FIXME: This is a hack (copied from task/affinity/numa.c to make it |
| * work like NUMA v2, but for the time being we are stuck on |
| * v1. (numa_node_to_cpus will multiple the size by 8 and the collective |
| * is already at the correct size) |
| */ |
| if (numa_node_to_cpus(nnid, collective->maskp, collective->size / 8)) { |
| error("numa_node_to_cpus: %m"); |
| numa_free_cpumask(collective); |
| return NULL; |
| } |
| |
| /* Convert the collective to a slurm bitstr_t */ |
| cpu_aff_mac_bitstr = bit_alloc(maxcpus); |
| for (int i = 0; i < maxcpus; i++) { |
| if (!numa_bitmask_isbitset(collective, i)) |
| continue; |
| |
| bit_set(cpu_aff_mac_bitstr, i); |
| } |
| |
| numa_free_cpumask(collective); |
| #endif |
| return cpu_aff_mac_bitstr; |
| } |
| |
| /* |
| * Creates and returns a gres conf list of detected AMD gpus on the node. |
| * If an error occurs, return NULL |
| * Caller is responsible for freeing the list. |
| * |
| * If the AMD ROCM-SMI API exists, then query GPU info, |
| * so the user doesn't need to specify manually in gres.conf. |
| * |
| * node_config (IN/OUT) pointer of node_config_load_t passed down |
| */ |
| static list_t *_get_system_gpu_list_rsmi(node_config_load_t *node_config) |
| { |
| uint32_t i, device_count = 0; |
| list_t *gres_list_system = list_create(destroy_gres_slurmd_conf); |
| |
| _rsmi_init(); |
| |
| gpu_p_get_device_count(&device_count); |
| debug2("Device count: %d", device_count); |
| |
| // Loop through all the GPUs on the system and add to gres_list_system |
| for (i = 0; i < device_count; ++i) { |
| unsigned int minor_number = 0; |
| char device_name[RSMI_STRING_BUFFER_SIZE] = {0}; |
| char device_brand[RSMI_STRING_BUFFER_SIZE] = {0}; |
| rsmiPciInfo_t pci_info; |
| uint64_t uuid = 0; |
| char *cpu_aff_mac_range = NULL; |
| gres_slurmd_conf_t gres_slurmd_conf = { |
| .config_flags = |
| GRES_CONF_ENV_RSMI | GRES_CONF_AUTODETECT, |
| .count = 1, |
| .cpu_cnt = node_config->cpu_cnt, |
| .cpus_bitmap = _rsmi_get_device_cpu_mask(i), |
| .name = "gpu", |
| }; |
| |
| if (gres_slurmd_conf.cpus_bitmap) { |
| cpu_aff_mac_range = bit_fmt_full( |
| gres_slurmd_conf.cpus_bitmap); |
| |
| /* |
| * Convert cpu range str from machine to abstract(slurm) |
| * format |
| */ |
| if (node_config->xcpuinfo_mac_to_abs( |
| cpu_aff_mac_range, |
| &gres_slurmd_conf.cpus)) { |
| error("Conversion from machine to abstract failed"); |
| FREE_NULL_BITMAP(gres_slurmd_conf.cpus_bitmap); |
| xfree(cpu_aff_mac_range); |
| continue; |
| } |
| } |
| |
| _rsmi_get_device_name(i, device_name, RSMI_STRING_BUFFER_SIZE); |
| _rsmi_get_device_brand(i, device_brand, |
| RSMI_STRING_BUFFER_SIZE); |
| _rsmi_get_device_minor_number(i, &minor_number); |
| pci_info.bdfid = 0; |
| _rsmi_get_device_pci_info(i, &pci_info); |
| _rsmi_get_device_unique_id(i, &uuid); |
| |
| /* Use links to record PCI bus ID order */ |
| gres_slurmd_conf.links = |
| gres_links_create_empty(i, device_count); |
| |
| xstrfmtcat(gres_slurmd_conf.file, |
| "/dev/dri/renderD%u", minor_number); |
| |
| debug2("GPU index %u:", i); |
| debug2(" Name: %s", device_name); |
| debug2(" Brand/Type: %s", device_brand); |
| debug2(" UUID: %lx", uuid); |
| debug2(" PCI Domain/Bus/Device/Function: %u:%u:%u.%u", |
| pci_info.domain, |
| pci_info.bus, pci_info.device, pci_info.function); |
| debug2(" Links: %s", gres_slurmd_conf.links); |
| debug2(" Device File (minor number): %s", |
| gres_slurmd_conf.file); |
| if (minor_number != i+128) |
| debug("Note: GPU index %u is different from minor # %u", |
| i, minor_number); |
| debug2(" CPU Affinity Range - Machine: %s", |
| cpu_aff_mac_range); |
| debug2(" Core Affinity Range - Abstract: %s", |
| gres_slurmd_conf.cpus); |
| |
| // Print out possible memory frequencies for this device |
| _rsmi_print_freqs(i, LOG_LEVEL_DEBUG2); |
| |
| /* If no brand found use device_name as type name */ |
| if (device_brand[0]) |
| gres_slurmd_conf.type_name = device_brand; |
| else |
| gres_slurmd_conf.type_name = device_name; |
| |
| add_gres_to_list(gres_list_system, &gres_slurmd_conf); |
| |
| FREE_NULL_BITMAP(gres_slurmd_conf.cpus_bitmap); |
| xfree(cpu_aff_mac_range); |
| xfree(gres_slurmd_conf.cpus); |
| xfree(gres_slurmd_conf.file); |
| xfree(gres_slurmd_conf.links); |
| } |
| |
| info("%u GPU system device(s) detected", device_count); |
| return gres_list_system; |
| } |
| |
| extern list_t *gpu_p_get_system_gpu_list(node_config_load_t *node_config) |
| { |
| list_t *gres_list_system = _get_system_gpu_list_rsmi(node_config); |
| |
| if (!gres_list_system) |
| error("System GPU detection failed"); |
| |
| return gres_list_system; |
| } |
| |
| extern void gpu_p_step_hardware_init(bitstr_t *usable_gpus, char *tres_freq) |
| { |
| char *freq = NULL; |
| char *tmp = NULL; |
| |
| xassert(tres_freq); |
| xassert(usable_gpus); |
| |
| if (!usable_gpus) |
| return; /* Job allocated no GPUs */ |
| if (!tres_freq) |
| return; /* No TRES frequency spec */ |
| |
| tmp = strstr(tres_freq, "gpu:"); |
| if (!tmp) |
| return; /* No GPU frequency spec */ |
| |
| freq = xstrdup(tmp + 4); |
| tmp = strchr(freq, ';'); |
| if (tmp) |
| tmp[0] = '\0'; |
| |
| // Save a copy of the GPUs affected, so we can reset things afterwards |
| FREE_NULL_BITMAP(saved_gpus); |
| saved_gpus = bit_copy(usable_gpus); |
| |
| _rsmi_init(); |
| // Set the frequency of each GPU index specified in the bitstr |
| _set_freq(usable_gpus, freq); |
| xfree(freq); |
| |
| } |
| |
| extern void gpu_p_step_hardware_fini(void) |
| { |
| if (!saved_gpus) |
| return; |
| |
| // Reset the frequencies back to the hardware default |
| _reset_freq(saved_gpus); |
| FREE_NULL_BITMAP(saved_gpus); |
| rsmi_shut_down(); |
| } |
| |
/*
 * CPU range conversion test hook. This plugin performs no conversion: the
 * argument is ignored and NULL is always returned.
 *
 * cpu_range (IN) unused
 */
extern char *gpu_p_test_cpu_conv(char *cpu_range)
{
	char *converted = NULL;

	(void) cpu_range;
	return converted;
}
| |
| /* |
| * gpu_p_energy_read read current average watts and update last_update_watt |
| * |
| * dv_ind (IN) The device index |
| * energy (IN) A pointer to gpu_status_t structure |
| */ |
| extern int gpu_p_energy_read(uint32_t dv_ind, gpu_status_t *gpu) |
| { |
| const char *status_string; |
| uint64_t curr_milli_watts; |
| rsmi_status_t rsmi_rc = rsmi_dev_power_ave_get( |
| dv_ind, 0, &curr_milli_watts); |
| |
| if (rsmi_rc != RSMI_STATUS_SUCCESS) { |
| rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); |
| error("RSMI: Failed to get power: %s", status_string); |
| gpu->energy.current_watts = NO_VAL; |
| return SLURM_ERROR; |
| } |
| |
| gpu->last_update_watt = curr_milli_watts/1000000; |
| gpu->previous_update_time = gpu->last_update_time; |
| gpu->last_update_time = time(NULL); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int gpu_p_usage_read(pid_t pid, acct_gather_data_t *data) |
| { |
| const char *status_string; |
| rsmi_process_info_t proc = {0}; |
| rsmi_status_t rc; |
| bool track_gpumem, track_gpuutil; |
| |
| track_gpumem = (gpumem_pos != -1); |
| track_gpuutil = (gpuutil_pos != -1); |
| |
| if (!track_gpuutil && !track_gpumem) { |
| debug2("%s: We are not tracking TRES gpuutil/gpumem", __func__); |
| return SLURM_SUCCESS; |
| } |
| |
| _rsmi_init(); |
| |
| /* |
| * If version < RSMI_REQ_VERSION_USAGE get_usage will be set to |
| * false, so we won't set gpumem_pos and gpuutil_pos which |
| * effectively disables gpu accounting. |
| */ |
| if (!get_usage) { |
| debug2("%s: ROCM release version is < 6.0.0 which is required for gathering usage. Not gathering usage.", __func__); |
| return SLURM_SUCCESS; |
| } |
| |
| rc = rsmi_compute_process_info_by_pid_get(pid, &proc); |
| |
| if (rc == RSMI_STATUS_NOT_FOUND) { |
| debug2("Couldn't find pid %d, probably hasn't started yet or has already finished", |
| pid); |
| return SLURM_SUCCESS; |
| } else if (rc != RSMI_STATUS_SUCCESS) { |
| (void) rsmi_status_string(rc, &status_string); |
| error("RSMI: Failed to get usage(%d): %s", rc, status_string); |
| return SLURM_ERROR; |
| } |
| |
| if (track_gpuutil) |
| data[gpuutil_pos].size_read = proc.cu_occupancy; |
| |
| if (track_gpumem) |
| data[gpumem_pos].size_read = proc.vram_usage; |
| |
| log_flag(JAG, "pid %d has GPUUtil=%lu and MemMB=%lu", |
| pid, |
| data[gpuutil_pos].size_read, |
| data[gpumem_pos].size_read / 1048576); |
| |
| return SLURM_SUCCESS; |
| } |