| /*****************************************************************************\ |
| * gpu_oneapi.c - Support oneAPI interface to an Intel GPU. |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * Copyright (C) 2022 Intel Corporation |
| * Written by Kemp Ke <kemp.ke@intel.com> |
| * Based on gpu_nvml.c, written by Danny Auble <da@schedmd.com> |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #define _GNU_SOURCE |
| |
| #include <dirent.h> |
| #include <dlfcn.h> |
| #include <limits.h> |
| #include <regex.h> |
| #include <sys/types.h> |
| #include <ze_api.h> |
| #include <zes_api.h> |
| |
| #include "src/plugins/gpu/common/gpu_common.h" |
| #include "src/common/strlcpy.h" |
| #include "src/common/xregex.h" |
| |
/* Capacities of the fixed-size buffers used by this plugin */
#define MAX_GPU_NUM 256
#define MAX_NUM_FREQUENCIES 256
#define CPU_LINE_SIZE 256
#define CARD_NAME_LEN 256

/* Largest number of CPUs a device affinity mask can describe */
#define MAX_CPUS 0x8000

/* Size of one affinity mask word (a uint64_t), in bytes and in bits */
#define ULONG_BYTES (sizeof(uint64_t))
#define ULONG_BITS (8 * ULONG_BYTES)

/*
 * Number of uint64_t words in a bitmask array able to represent MAX_CPUS
 * CPUs, i.e. MAX_CPUS / ULONG_BITS rounded up to a whole word.
 * E.g. 130 CPUs would need (130 + 63) / 64 = 3 words (integer division).
 */
#define CPU_SET_SIZE (((MAX_CPUS) + ULONG_BITS - 1) / ULONG_BITS)
| |
| static bitstr_t *saved_gpus; |
| |
| const char plugin_name[] = "GPU oneAPI plugin"; |
| const char plugin_type[] = "gpu/oneapi"; |
| const uint32_t plugin_version = SLURM_VERSION_NUMBER; |
| |
| /* Duplicated from NVML plugin */ |
| static void _set_cpu_set_bitstr(bitstr_t *cpu_set_bitstr, |
| uint64_t *cpu_set, |
| uint32_t cpu_set_size) |
| { |
| int j, k, b; |
| int bit_cur; |
| int bitstr_bits = (int) bit_size(cpu_set_bitstr); |
| int cpu_set_bits = (cpu_set_size * ULONG_BITS); |
| |
| /* If this fails, then something went horribly wrong */ |
| if (bitstr_bits != cpu_set_bits) |
| fatal("%s: bitstr_bits != cpu_set_bits", __func__); |
| |
| bit_cur = bitstr_bits - 1; |
| |
| /* Iterate through each cpu_set long int */ |
| for (j = cpu_set_size - 1; j >= 0; --j) { |
| /* Iterate through the bytes of the jth ulong bitmask */ |
| char *bitmask = (char *) &cpu_set[j]; |
| #ifdef SLURM_BIGENDIAN |
| for (k = 0; k < ULONG_BYTES; ++k) { |
| #else |
| for (k = ULONG_BYTES - 1; k >= 0; --k) { |
| #endif |
| unsigned char byte = bitmask[k]; |
| unsigned char mask; |
| /* If byte is zero, nothing to set */ |
| if (byte == 0) { |
| bit_cur -= 8; |
| continue; |
| } |
| |
| /* |
| * Test each bit of byte, from MSB to LSB. |
| * Set if needed. |
| */ |
| mask = 0x80; |
| for (b = 0; b < 8; ++b) { |
| if (byte & mask) |
| bit_set(cpu_set_bitstr, bit_cur); |
| mask >>= 1; |
| bit_cur--; |
| } |
| xassert(mask == 0x00); |
| } |
| } |
| |
| xassert(bit_cur == -1); |
| if (bit_set_count(cpu_set_bitstr) == 0) |
| fatal("%s: cpu_set_bitstr is empty! No CPU affinity for device", |
| __func__); |
| } |
| |
| /* |
| * Initialize the oneapi library. |
| */ |
| static ze_result_t _oneapi_init() |
| { |
| static pid_t init_pid = 0; |
| pid_t my_pid = conf->pid ? conf->pid : getpid(); |
| ze_result_t oneapi_rc; |
| |
| if (init_pid == my_pid) /* Already inited */ |
| return ZE_RESULT_SUCCESS; |
| |
| init_pid = my_pid; |
| |
| setenv("ZES_ENABLE_SYSMAN", "1", 1); |
| setenv("ZE_FLAT_DEVICE_HIERARCHY", "COMPOSITE", 1); |
| setenv("ZE_ENABLE_PCI_ID_DEVICE_ORDER", "1", 1); |
| |
| DEF_TIMERS; |
| START_TIMER; |
| oneapi_rc = zeInit(0); |
| END_TIMER; |
| debug3("zeInit() took %ld microseconds", DELTA_TIMER); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to initialize oneapi: 0x%x", oneapi_rc); |
| } else |
| debug2("Successfully initialized oneapi"); |
| |
| return oneapi_rc; |
| } |
| |
| /* |
| * Print GPU driver version and API version |
| * |
| * driver (IN) The driver handle |
| * |
| */ |
| static void _oneapi_print_driver_info(ze_driver_handle_t driver) |
| { |
| ze_driver_properties_t driver_prop; |
| ze_api_version_t api_version; |
| ze_result_t oneapi_rc; |
| |
| /* Print driver version */ |
| oneapi_rc = zeDriverGetProperties(driver, &driver_prop); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) |
| error("Failed to get driver properties: 0x%x", oneapi_rc); |
| else |
| debug("Systems Graphics Driver Version: %u", |
| driver_prop.driverVersion); |
| |
| /* Print API version */ |
| oneapi_rc = zeDriverGetApiVersion(driver, &api_version); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get driver API version: 0x%x", oneapi_rc); |
| } else { |
| /* |
| * The value is encoded as a 16-bit major and 16-bit minor |
| * part. Split apart when printing. |
| */ |
| debug("Supported Driver API Version: %u.%u", api_version >> 16, |
| api_version & 0x0000ffff); |
| } |
| } |
| |
| /* |
| * Get all of GPU device handles |
| * |
| * gpu_handles (IN/OUT) The device handles |
| * gpu_size (IN/OUT) The size of the gpu_handles array. This will |
| * be overwritten with the number of device handles found. |
| * print_version (IN) Print driver version and device count information |
| * |
| */ |
| static void _oneapi_get_device_handles(ze_device_handle_t *gpu_handles, |
| uint32_t *gpu_size, |
| bool print_version) |
| { |
| ze_result_t oneapi_rc; |
| uint32_t driver_count = 0; |
| int gpu_count = 0; |
| uint32_t device_count = 0; |
| ze_driver_handle_t *all_drivers = NULL; |
| ze_device_handle_t *all_devices = NULL; |
| ze_device_properties_t device_properties; |
| bool gpu_driver = false; |
| |
| /* Get driver count */ |
| oneapi_rc = zeDriverGet(&driver_count, NULL); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get driver count: 0x%x", oneapi_rc); |
| return; |
| } |
| |
| /* Get drivers */ |
| all_drivers = xcalloc(driver_count, sizeof(ze_driver_handle_t)); |
| oneapi_rc = zeDriverGet(&driver_count, all_drivers); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get driver: 0x%x", oneapi_rc); |
| return; |
| } |
| |
| for (int i = 0; i < driver_count; i++) { |
| /* Get device count */ |
| gpu_driver = false; |
| device_count = 0; |
| oneapi_rc = zeDeviceGet(all_drivers[i], &device_count, NULL); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get device count: 0x%x", oneapi_rc); |
| continue; |
| } |
| |
| /* Get devices */ |
| all_devices = xcalloc(device_count, |
| sizeof(ze_device_handle_t)); |
| oneapi_rc = zeDeviceGet(all_drivers[i], &device_count, |
| all_devices); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get device: 0x%x", oneapi_rc); |
| continue; |
| } |
| |
| for (int j = 0; j < device_count; j++) { |
| /* Get device properties */ |
| oneapi_rc = zeDeviceGetProperties(all_devices[j], |
| &device_properties); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get device property: 0x%x", |
| oneapi_rc); |
| continue; |
| } |
| |
| /* Filter non-GPU devices */ |
| if (ZE_DEVICE_TYPE_GPU != device_properties.type) |
| continue; |
| gpu_driver = true; |
| |
| /* |
| * If the number of GPU exceeds the buffer length, |
| * return the limited number of devices |
| */ |
| if (gpu_count + 1 > *gpu_size) |
| break; |
| |
| gpu_handles[gpu_count++] = all_devices[j]; |
| |
| } |
| |
| xfree(all_devices); |
| |
| if (print_version && gpu_driver) |
| _oneapi_print_driver_info(all_drivers[i]); |
| } |
| |
| if (print_version) |
| debug2("Device count: %d", gpu_count); |
| |
| xfree(all_drivers); |
| *gpu_size = gpu_count; |
| } |
| |
| /* |
| * Get available clocks of a frequency handle |
| * |
| * freq_handle (IN) the frequency handle |
| * freqs (IN/OUT) array of frequencies in units of MHz and sorted from |
| * slowest to fastest. if freq_count is less than the number of |
| * frequencies that are available, then only that number of |
| * frequencies will be returned |
| * freq_count (IN/OUT) pointer to the size of freqs. |
| * if freq_count is greater than the number of frequencies |
| * that are available, then it will be updated with the correct |
| * number of frequencies. |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _oneapi_get_available_clocks(zes_freq_handle_t freq_handle, |
| uint32_t *freqs, uint32_t *freq_count) |
| { |
| double *clocks = NULL; |
| ze_result_t oneapi_rc; |
| |
| xassert(*freq_count > 0); |
| |
| /* Get available clocks */ |
| clocks = xcalloc(*freq_count, sizeof(double)); |
| oneapi_rc = zesFrequencyGetAvailableClocks(freq_handle, freq_count, |
| clocks); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get available clocks: 0x%x", oneapi_rc); |
| xfree(clocks); |
| return false; |
| } |
| |
| for (int i = 0; i < *freq_count; i++) |
| freqs[i] = (uint32_t) clocks[i]; |
| |
| xfree(clocks); |
| return true; |
| } |
| |
| /* |
| * Get the nearest valid frequencies |
| * |
| * freq_handle (IN) the frequency handle |
| * freq (IN/OUT) requested/nearest valid frequency |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _oneapi_get_nearest_freq(zes_freq_handle_t freq_handle, |
| uint32_t *freq) |
| { |
| uint32_t freqs[MAX_NUM_FREQUENCIES] = {0}; |
| uint32_t freqs_sort[MAX_NUM_FREQUENCIES] = {0}; |
| uint32_t freqs_size = MAX_NUM_FREQUENCIES; |
| |
| /* Get available clocks */ |
| if (!_oneapi_get_available_clocks(freq_handle, freqs, &freqs_size)) |
| return false; |
| |
| memcpy(freqs_sort, freqs, freqs_size * sizeof(uint32_t)); |
| qsort(freqs_sort, freqs_size, sizeof(uint32_t), |
| slurm_sort_uint32_list_desc); |
| |
| /* Set the nearest valid frequency for the requested frequency */ |
| gpu_common_get_nearest_freq(freq, freqs_size, freqs_sort); |
| return true; |
| } |
| |
| /* |
| * Print frequency information |
| * |
| * freq_prop (IN) The pointer of the frequency property |
| * l (IN) The log level at which to print |
| * |
| * Returns true if successful, false if not |
| */ |
| static void _oneapi_print_freq_info(zes_freq_properties_t *freq_prop, |
| log_level_t l) |
| { |
| if ((freq_prop->type != ZES_FREQ_DOMAIN_GPU) && |
| (freq_prop->type != ZES_FREQ_DOMAIN_MEMORY)) |
| return; |
| |
| log_var(l, "%s frequency min: %u, max: %u, onSubdevice: %s, subdeviceId: %d, canControl: %s", |
| freq_prop->type == ZES_FREQ_DOMAIN_GPU ? "Graphics" : "Memory", |
| (uint32_t) freq_prop->min, |
| (uint32_t) freq_prop->max, |
| freq_prop->onSubdevice ? "true" : "false", |
| freq_prop->subdeviceId, |
| freq_prop->canControl ? "true" : "false"); |
| } |
| |
| /* |
| * Print out all possible memory and graphics frequencies for the given device |
| * |
| * device (IN) The device handle |
| * l (IN) The log level at which to print |
| * |
| * Returns true if successful, false if not |
| * |
| * NOTE: Intel GPU supports tiles. One GPU may have two tiles, so the |
| * frequencies of all of tiles needs to be printed. |
| */ |
| static void _oneapi_print_freqs(ze_device_handle_t device, log_level_t l) |
| { |
| zes_freq_handle_t freq_handles[MAX_NUM_FREQUENCIES]; |
| uint32_t freq_handle_size = MAX_NUM_FREQUENCIES; |
| zes_freq_properties_t freq_prop; |
| ze_result_t oneapi_rc; |
| |
| /* Get all of frequency handles */ |
| oneapi_rc = zesDeviceEnumFrequencyDomains((zes_device_handle_t)device, |
| &freq_handle_size, |
| freq_handles); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to enumerate frequency domains: 0x%x", |
| oneapi_rc); |
| return; |
| } |
| |
| /* Loop all of frequency handles and print frequency */ |
| for (int i = 0; i < freq_handle_size; i++) { |
| uint32_t freqs[MAX_NUM_FREQUENCIES] = {0}; |
| uint32_t freqs_size = MAX_NUM_FREQUENCIES; |
| |
| /* Get available clocks */ |
| if (!_oneapi_get_available_clocks(freq_handles[i], freqs, |
| &freqs_size)) |
| continue; |
| qsort(freqs, freqs_size, sizeof(uint32_t), |
| slurm_sort_uint32_list_desc); |
| |
| /* Get frequency property */ |
| oneapi_rc = zesFrequencyGetProperties(freq_handles[i], |
| &freq_prop); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get freq properties: 0x%x", |
| oneapi_rc); |
| continue; |
| } |
| |
| _oneapi_print_freq_info(&freq_prop, l); |
| |
| if (freq_prop.type == ZES_FREQ_DOMAIN_GPU) |
| gpu_common_print_freqs(freqs, freqs_size, l, |
| "GPU Graphics", 8); |
| else if (freq_prop.type == ZES_FREQ_DOMAIN_MEMORY) |
| gpu_common_print_freqs(freqs, freqs_size, l, |
| "GPU Memory", 8); |
| else |
| log_var(l, "Unsupported frequency domain: %u", |
| freq_prop.type); |
| } |
| } |
| |
| /* |
| * Print current frequency range |
| * |
| * freq_handler (IN) the frequency handler |
| * freq_type (IN) the frequency type |
| * |
| */ |
| static void _oneapi_print_freq_range(zes_freq_handle_t freq_handler, |
| uint32_t freq_type) |
| { |
| zes_freq_range_t freq_range; |
| ze_result_t oneapi_rc; |
| |
| if (freq_type != ZES_FREQ_DOMAIN_GPU && |
| freq_type != ZES_FREQ_DOMAIN_MEMORY) |
| return; |
| |
| oneapi_rc = zesFrequencyGetRange(freq_handler, &freq_range); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get frequency range"); |
| return; |
| } |
| |
| debug2("%s frequency: %u~%u", |
| freq_type == ZES_FREQ_DOMAIN_GPU ? "Graphics" : |
| "Memory", (uint32_t)freq_range.min, |
| (uint32_t)freq_range.max); |
| } |
| |
| /* |
| * Set frequency for the GPU |
| * |
| * device (IN) The device handle |
| * reset (IN) If true, the device will be reset to default frequencies |
| * gpu_freq_num (IN) The gpu frequency code. It will be ingorned |
| if reset is true. |
| * mem_freq_num (IN) The memory frequency code. It will be ingorned |
| if reset is true. |
| * freq_msg (OUT) Frequency log message and must be freed by the caller |
| * |
| * Returns true if successful, false if not |
| * |
| * NOTE: Intel GPU supports tiles. One GPU may have two tiles, so all of tiles |
| * need to be set with the frequencies. |
| */ |
| static bool _oneapi_set_freqs(ze_device_handle_t device, |
| bool reset, |
| unsigned int gpu_freq_num, |
| unsigned int mem_freq_num, |
| char **freq_msg) |
| { |
| uint32_t freq_handle_size = MAX_NUM_FREQUENCIES; |
| zes_freq_handle_t freq_handles[MAX_NUM_FREQUENCIES]; |
| zes_freq_properties_t freq_prop; |
| zes_freq_range_t freq_range; |
| ze_result_t oneapi_rc; |
| unsigned int freq = 0; |
| |
| /* Get all of frequency handles */ |
| oneapi_rc = zesDeviceEnumFrequencyDomains((zes_device_handle_t)device, |
| &freq_handle_size, |
| freq_handles); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get freq domains: 0x%x", oneapi_rc); |
| return false; |
| } |
| |
| /* Loop all of frequency handles and set range of frequency */ |
| for (int i = 0; i < freq_handle_size; i++) { |
| /* Get frequency property */ |
| oneapi_rc = zesFrequencyGetProperties(freq_handles[i], |
| &freq_prop); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get freq properties: 0x%x", |
| oneapi_rc); |
| return false; |
| } |
| |
| /* |
| * If the frequency is not GPU or memory frequency or it cannot |
| * be controlled, ignore it |
| */ |
| if (((freq_prop.type != ZES_FREQ_DOMAIN_GPU) && |
| (freq_prop.type != ZES_FREQ_DOMAIN_MEMORY)) || |
| !freq_prop.canControl) { |
| debug2("Unsupported frequency. domain: %u, onSubdevice: %u, subdeviceId: %d, canControl:%s", |
| freq_prop.type, freq_prop.onSubdevice, |
| freq_prop.subdeviceId, |
| freq_prop.canControl ? "true" : "false"); |
| continue; |
| } |
| |
| if (!reset) { |
| /* Get nearest frequency */ |
| freq = (freq_prop.type == ZES_FREQ_DOMAIN_GPU) ? |
| gpu_freq_num : mem_freq_num; |
| if (!_oneapi_get_nearest_freq(freq_handles[i], |
| &freq)) { |
| error("Failed to get nearest freq: %u", freq); |
| return false; |
| } |
| freq_range.max = freq_range.min = freq; |
| } else { |
| /* |
| * "-1" means the device will be set to the default |
| * frequencies |
| */ |
| freq_range.max = freq_range.min = -1; |
| } |
| |
| /* Print frequency before setting */ |
| debug2("Before %s frequency", reset ? "reset" : "set"); |
| _oneapi_print_freq_range(freq_handles[i], freq_prop.type); |
| |
| /* Set frequency range with a fixed value */ |
| oneapi_rc = zesFrequencySetRange(freq_handles[i], &freq_range); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to set frequency range: %f~%f, error:0x%x", |
| freq_range.min, freq_range.max, oneapi_rc); |
| return false; |
| } |
| |
| /* Print frequency after setting */ |
| debug2("After %s frequency", reset ? "reset" : "set"); |
| _oneapi_print_freq_range(freq_handles[i], freq_prop.type); |
| |
| if (freq_msg) { |
| if (*freq_msg) |
| xstrcat(*freq_msg, ","); |
| if (freq_prop.type == ZES_FREQ_DOMAIN_GPU) |
| xstrfmtcat(*freq_msg, "graphics_freq:%u", |
| freq); |
| else |
| xstrfmtcat(*freq_msg, "memory_freq:%u", freq); |
| } |
| } |
| |
| return true; |
| } |
| |
| /* |
| * Reset the frequencies for the GPU to the same default frequencies |
| * that are used after system reboot or driver reload. This default |
| * cannot be changed. |
| * |
| * device (IN) The device handle |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _oneapi_reset_freqs(ze_device_handle_t device) |
| { |
| if (!_oneapi_set_freqs(device, true, 0, 0, NULL)) { |
| error("Failed to reset frequencies"); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| /* |
| * Reset the frequencies of each GPU in the step to the hardware default |
| * |
| * gpus (IN) A bitmap specifying the GPUs on which to operate |
| */ |
| static void _reset_freq(bitstr_t *gpus) |
| { |
| int gpu_len = bit_size(gpus); |
| int count = 0, count_set = 0; |
| bool freq_reset = false; |
| ze_device_handle_t all_devices[MAX_GPU_NUM]; |
| uint32_t gpu_num = MAX_GPU_NUM; |
| |
| /* Get all of device handles */ |
| _oneapi_get_device_handles(all_devices, &gpu_num, false); |
| if (gpu_num == 0) { |
| error("Failed to get devices!"); |
| return; |
| } |
| |
| /* |
| * If the gpu length is greater than the total GPU number, |
| * use the total GPU number |
| */ |
| if (gpu_len > gpu_num) |
| gpu_len = gpu_num; |
| |
| /* Reset the frequency of each device allocated to the step */ |
| for (int i = 0; i < gpu_len; i++) { |
| if (!bit_test(gpus, i)) |
| continue; |
| count++; |
| |
| /* Reset frequency to the default value */ |
| freq_reset = _oneapi_reset_freqs(all_devices[i]); |
| |
| if (freq_reset) { |
| log_flag(GRES, "Successfully reset GPU[%d]", i); |
| count_set++; |
| } else { |
| log_flag(GRES, "Failed to reset GPU[%d]", i); |
| } |
| } |
| |
| if (count_set != count) { |
| log_flag(GRES, "%s: Could not reset frequencies for all GPUs %d/%d total GPUs", |
| __func__, count_set, count); |
| fprintf(stderr, "Could not reset frequencies for all GPUs %d/%d total GPUs\n", |
| count_set, count); |
| } |
| } |
| |
| /* |
| * Set the frequencies of each GPU specified for the step |
| * |
| * gpus (IN) A bitmap specifying the GPUs on which to operate. |
| * gpu_freq (IN) The frequencies to set each of the GPUs to. If a NULL or |
| * empty memory or graphics frequency is specified, then |
| GpuFreqDef will be consulted, which defaults to |
| "high,memory=high" if not set. |
| */ |
| static void _set_freq(bitstr_t *gpus, char *gpu_freq) |
| { |
| bool verbose_flag = false; |
| int gpu_len = 0; |
| int count = 0, count_set = 0; |
| unsigned int gpu_freq_num = 0, mem_freq_num = 0; |
| bool freq_set = false, freq_logged = false; |
| char *tmp = NULL; |
| bool task_cgroup = false; |
| bool constrained_devices = false; |
| bool cgroups_active = false; |
| ze_device_handle_t all_devices[MAX_GPU_NUM]; |
| uint32_t gpu_num = MAX_GPU_NUM; |
| |
| /* |
| * Parse frequency information |
| */ |
| debug2("_parse_gpu_freq(%s)", gpu_freq); |
| gpu_common_parse_gpu_freq(gpu_freq, &gpu_freq_num, &mem_freq_num, |
| &verbose_flag); |
| if (verbose_flag) |
| debug2("verbose_flag ON"); |
| |
| tmp = gpu_common_freq_value_to_string(mem_freq_num); |
| debug2("Requested GPU memory frequency: %s", tmp); |
| xfree(tmp); |
| tmp = gpu_common_freq_value_to_string(gpu_freq_num); |
| debug2("Requested GPU graphics frequency: %s", tmp); |
| xfree(tmp); |
| |
| if (!mem_freq_num && !gpu_freq_num) { |
| debug2("%s: No frequencies to set", __func__); |
| return; |
| } |
| |
| /* Check if GPUs are constrained by cgroups */ |
| cgroup_conf_init(); |
| if (slurm_cgroup_conf.constrain_devices) |
| constrained_devices = true; |
| |
| /* Check if task/cgroup plugin is loaded */ |
| if (xstrstr(slurm_conf.task_plugin, "cgroup")) |
| task_cgroup = true; |
| |
| /* If both of these are true, then GPUs will be constrained */ |
| if (constrained_devices && task_cgroup) { |
| cgroups_active = true; |
| gpu_len = bit_set_count(gpus); |
| debug2("%s: cgroups are configured. Using LOCAL GPU IDs", |
| __func__); |
| } else { |
| gpu_len = bit_size(gpus); |
| debug2("%s: cgroups are NOT configured. Assuming GLOBAL GPU IDs", |
| __func__); |
| } |
| |
| /* Get all of device handles */ |
| _oneapi_get_device_handles(all_devices, &gpu_num, false); |
| if (gpu_num == 0) { |
| error("Failed to get devices!"); |
| return; |
| } |
| |
| if (gpu_len > gpu_num) |
| gpu_len = gpu_num; |
| |
| /* Set the frequency of each device allocated to the step */ |
| for (int i = 0; i < gpu_len; i++) { |
| /* Only check the global GPU bitstring if not using cgroups */ |
| if (!cgroups_active && !bit_test(gpus, i)) { |
| debug2("Passing over oneAPI device %u", i); |
| continue; |
| } |
| count++; |
| |
| freq_set = _oneapi_set_freqs(all_devices[i], false, |
| gpu_freq_num, mem_freq_num, |
| &tmp); |
| if (freq_set) { |
| log_flag(GRES, "Successfully set GPU[%d] %s", i, tmp); |
| count_set++; |
| } else { |
| log_flag(GRES, "Failed to set GPU[%d] %s", i, tmp); |
| } |
| |
| if (verbose_flag && !freq_logged) { |
| fprintf(stderr, "GpuFreq=%s\n", tmp); |
| freq_logged = true; /* Just log for first GPU */ |
| } |
| xfree(tmp); |
| } |
| |
| if (count_set != count) { |
| log_flag(GRES, "%s: Could not set frequencies for all GPUs %d/%d total GPUs", |
| __func__, count_set, count); |
| fprintf(stderr, "Could not set frequencies for all GPUs %d/%d total GPUs\n", |
| count_set, count); |
| } |
| } |
| |
| /* |
| * Set the cpu affinity mask |
| * |
| * cpu (IN) The index of the CPU |
| * cpu_set: [IN/out] An array reference in which to return a bitmask of |
| * CPUs. 64 CPUs per uint64_t on 64-bit machines, 32 on |
| * 32-bit machines. For example, on 32-bit machines, |
| * if processors 0, 1, 32, and 33 are ideal for the device |
| * and cpuSetSize == 2, result[0] = 0x3, result[1] = 0x3. |
| * size [IN] The size of the cpu set buffer |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _oneapi_set_cpu_affinity_mask(int cpu, |
| uint64_t *cpu_set, |
| uint32_t size) |
| { |
| uint32_t count; |
| uint32_t model; |
| |
| if (cpu < 0) |
| return false; |
| |
| count = cpu / ULONG_BITS; |
| if ((count + 1) > size) { |
| error("cpu set size is not enough: %u", size); |
| return false; |
| } |
| |
| model = cpu % ULONG_BITS; |
| cpu_set[count] = cpu_set[count] | (0x01UL << model); |
| return true; |
| } |
| |
| /* |
| * Read the cpu affinity mask |
| * |
| * file (IN) The full path of cpu list file |
| * For example, /sys/class/drm/card1/device/local_cpulist |
| * cpu_set: [IN/out] An array reference in which to return a bitmask of |
| * CPUs. 64 CPUs per uint64_t on 64-bit machines, 32 on |
| * 32-bit machines. For example, on 32-bit machines, |
| * if processors 0, 1, 32, and 33 are ideal for the device |
| * and cpuSetSize == 2, result[0] = 0x3, result[1] = 0x3. |
| * size [IN] The size of the cpu set buffer |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _oneapi_read_cpu_affinity_list(const char *file, |
| uint64_t *cpu_set, |
| uint32_t size) |
| { |
| char line[CPU_LINE_SIZE] = {'\0'}; |
| char *save_ptr = line, *tok = NULL; |
| int min_cpu = -1, max_cpu = -1; |
| FILE *fp = NULL; |
| int pos = -1; |
| |
| debug2("Read file: %s", file); |
| |
| fp = fopen(file, "r"); |
| if (fp == NULL) { |
| error("Failed to read the file: %s", file); |
| return false; |
| } |
| |
| /* Example format: "0-27,56-83" */ |
| if (fgets(line, sizeof(line), fp) != NULL) { |
| debug2("line is: %s", line); |
| while ((tok = strtok_r(save_ptr, ",", &save_ptr)) != NULL) { |
| /* Split CPU range from string like "0-27" */ |
| debug2("tok is :%s", tok); |
| pos = strcspn(tok, "-"); |
| if (pos > 0 && pos < strlen(tok)) { |
| min_cpu = atoi(tok); |
| max_cpu = atoi(tok + pos + 1); |
| } else if (pos > 0 && pos == strlen(tok)) { |
| max_cpu = min_cpu = atoi(tok); |
| } else { |
| continue; |
| } |
| |
| debug2("cpu range is: %d~%d", min_cpu, max_cpu); |
| |
| /* Set CPU bit mask */ |
| for (int i = min_cpu; i <= max_cpu; i++) |
| _oneapi_set_cpu_affinity_mask(i, cpu_set, |
| size); |
| } |
| } |
| |
| fclose(fp); |
| return true; |
| } |
| |
| |
| /* |
| * Get device card name under folder "/sys/class/drm" |
| * There are no APIs to get minor number of Intel GPU at the moment, so we |
| * have to read BDF information from PCI and map it according to the |
| * device file symlinks under the folder "/sys/class/drm". |
| * |
| * domain (IN) From PCI BDF |
| * bus (IN) From PCI BDF |
| * device (IN) From PCI BDF |
| * function (IN) From PCI BDF |
| * name (IN/OUT) The device name |
| * len (IN) The length of the device name buffer |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _oneapi_get_device_name(uint32_t domain, uint32_t bus, |
| uint32_t device, uint32_t function, |
| char *name, uint32_t len) |
| { |
| static const char *card_reg_string = "renderD[0-9]+$"; |
| const char *search_path = "/sys/class/drm"; |
| char device_pattern[PATH_MAX] = {'\0'}; |
| char path[PATH_MAX] = {'\0'}; |
| char real_path[PATH_MAX] = {'\0'}; |
| DIR *dir = NULL; |
| struct dirent *dp = NULL; |
| regex_t search_reg; |
| regex_t card_reg; |
| regmatch_t reg_match; |
| char *matched = NULL; |
| bool ret = false; |
| int rc; |
| |
| /* |
| * Build search pattern to search strings like |
| * "../../devices/pci0000:89/0000:89:02.0/0000:8a:00.0 |
| * /0000:8b:01.0/0000:8c:00.0/drm/renderD0" |
| */ |
| snprintf(device_pattern, sizeof(device_pattern), |
| "/%04x:%02x:%02x.%0x/%s", |
| domain, bus, device, function, card_reg_string); |
| if ((rc = regcomp(&search_reg, device_pattern, REG_EXTENDED))) { |
| dump_regex_error(rc, &search_reg, |
| "Device file regex \"%s\" compilation failed", |
| device_pattern); |
| return false; |
| } |
| |
| if ((rc = regcomp(&card_reg, card_reg_string, REG_EXTENDED))) { |
| dump_regex_error(rc, &card_reg, |
| "Card regex \"%s\" compilation failed", |
| card_reg_string); |
| regfree(&search_reg); |
| return false; |
| } |
| |
| /* Open the device folder */ |
| if ((dir = opendir(search_path)) == NULL) { |
| error("Failed to open the folder: %s", search_path); |
| regfree(&card_reg); |
| regfree(&search_reg); |
| return false; |
| } |
| |
| /* Loop all of symlink files */ |
| while (((dp = readdir(dir))) != NULL) { |
| /* If the file is folder, ignore it */ |
| if (!strncmp(dp->d_name, ".", 1) || |
| !strncmp(dp->d_name, "..", 2)) |
| continue; |
| |
| /* Read the symlinks */ |
| snprintf(path, sizeof(path), "%s/%s", search_path, dp->d_name); |
| memset(real_path, 0, PATH_MAX); |
| if (readlink(path, real_path, PATH_MAX) < 0) |
| continue; |
| debug2("Read symblink file: %s with real path: %s", |
| path, real_path); |
| |
| /* Check file path match */ |
| if (regexec(&search_reg, real_path, 1, ®_match, 0) == |
| REG_NOMATCH) |
| continue; |
| |
| /* Check card name match */ |
| if (regexec(&card_reg, real_path, 1, ®_match, 0) == |
| REG_NOMATCH) |
| continue; |
| |
| /* BDF string matches, so it should be the devie file name */ |
| matched = xstrndup(real_path + reg_match.rm_so, (size_t) |
| (reg_match.rm_eo - reg_match.rm_so)); |
| snprintf(name, len, "%s", matched); |
| xfree(matched); |
| |
| debug2("Device name is: %s", name); |
| |
| ret = true; |
| break; |
| } |
| |
| regfree(&card_reg); |
| regfree(&search_reg); |
| closedir(dir); |
| |
| return ret; |
| } |
| |
| /* |
| * Get device affinity |
| * |
| * device_name (IN) The device name under folder "/sys/class/drm" |
| * cpu_set: [IN/out] An array reference in which to return a bitmask of |
| * CPUs. 64 CPUs per uint64_t on 64-bit machines, 32 on |
| * 32-bit machines. For example, on 32-bit machines, |
| * if processors 0, 1, 32, and 33 are ideal for the device |
| * and cpuSetSize == 2, result[0] = 0x3, result[1] = 0x3. |
| * size [IN] The size of the cpu set buffer |
| * |
| * Returns true if successful, false if not |
| */ |
| static bool _oneapi_get_device_affinity(const char *device_name, |
| uint64_t *cpu_set, |
| uint32_t size) |
| { |
| const char *search_path = "/sys/class/drm"; |
| const char *cpu_list_sub_path = "device/local_cpulist"; |
| char path[PATH_MAX] = {'\0'}; |
| |
| snprintf(path, sizeof(path), "%s/%s/%s", search_path, device_name, |
| cpu_list_sub_path); |
| return _oneapi_read_cpu_affinity_list(path, cpu_set, size); |
| } |
| |
| extern int init(void) |
| { |
| debug("loading"); |
| |
| return SLURM_SUCCESS; |
| } |
| |
/*
 * Plugin unload hook. Only logs the event.
 */
extern void fini(void)
{
	debug("unloading");
}
| |
| /* |
| * Creates and returns a gres conf list of detected Intel gpus on the node. |
| * If an error occurs, return NULL |
| * Caller is responsible for freeing the list. |
| * |
| * If the Intel oneAPI exists, then query GPU info, |
| * so the user doesn't need to specify manually in gres.conf. |
| * |
| * node_config (IN/OUT) pointer of node_config_load_t passed down |
| */ |
| static list_t *_get_system_gpu_list_oneapi(node_config_load_t *node_config) |
| { |
| char device_file[PATH_MAX]; |
| char card_name[CARD_NAME_LEN]; |
| ze_device_handle_t all_devices[MAX_GPU_NUM]; |
| ze_device_properties_t device_props; |
| zes_device_handle_t zes_handle; |
| zes_pci_properties_t pci; |
| ze_result_t oneapi_rc; |
| uint32_t gpu_num = MAX_GPU_NUM; |
| uint64_t cpu_set[CPU_SET_SIZE] = {0}; |
| char *cpu_aff_mac_range = NULL; |
| int i; |
| |
| list_t *gres_list_system = list_create(destroy_gres_slurmd_conf); |
| |
| if (_oneapi_init() != ZE_RESULT_SUCCESS) { |
| return gres_list_system; |
| } |
| |
| /* Get all of device handles */ |
| _oneapi_get_device_handles(all_devices, &gpu_num, true); |
| if (gpu_num == 0) { |
| error("Failed to get devices!"); |
| return gres_list_system ; |
| } |
| |
| /* Loop all of GPU device handles */ |
| for (i = 0; i < gpu_num; i++) { |
| gres_slurmd_conf_t gres_slurmd_conf = { |
| .config_flags = |
| GRES_CONF_ENV_ONEAPI | GRES_CONF_AUTODETECT, |
| .count = 1, |
| .cpu_cnt = node_config->cpu_cnt, |
| .name = "gpu", |
| }; |
| |
| /* Get PCI properties */ |
| zes_handle = (zes_device_handle_t)all_devices[i]; |
| oneapi_rc = zesDevicePciGetProperties(zes_handle, &pci); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| error("Failed to get pci info: 0x%x", oneapi_rc); |
| continue; |
| } |
| |
| /* Get device card name */ |
| if (!_oneapi_get_device_name(pci.address.domain, |
| pci.address.bus, |
| pci.address.device, |
| pci.address.function, |
| card_name, CARD_NAME_LEN)) { |
| error("Failed to get device card name for GPU: %u", i); |
| continue; |
| } |
| |
| /* Get device file */ |
| snprintf(device_file, PATH_MAX, "/dev/dri/%s", card_name); |
| |
| /* Get device affinity */ |
| memset(cpu_set, 0, sizeof(uint64_t) * CPU_SET_SIZE); |
| if (!_oneapi_get_device_affinity(card_name, cpu_set, |
| CPU_SET_SIZE)) { |
| error("Failed to get device affinity for GPU: %u", i); |
| continue; |
| } |
| |
| /* Convert from cpu bitmask to slurm bitstr_t (machine fmt) */ |
| gres_slurmd_conf.cpus_bitmap = bit_alloc(MAX_CPUS); |
| _set_cpu_set_bitstr(gres_slurmd_conf.cpus_bitmap, |
| cpu_set, CPU_SET_SIZE); |
| |
| /* Convert from bitstr_t to cpu range str */ |
| cpu_aff_mac_range = bit_fmt_full(gres_slurmd_conf.cpus_bitmap); |
| |
| /* |
| * Convert cpu range str from machine to abstract (slurm) format |
| */ |
| if (node_config->xcpuinfo_mac_to_abs(cpu_aff_mac_range, |
| &gres_slurmd_conf.cpus)) { |
| error("Conversion from machine to abstract failed"); |
| FREE_NULL_BITMAP(gres_slurmd_conf.cpus_bitmap); |
| xfree(cpu_aff_mac_range); |
| continue; |
| } |
| |
| /* Use links to record PCI bus ID order */ |
| gres_slurmd_conf.links = gres_links_create_empty(i, gpu_num); |
| |
| /* Get device properties */ |
| oneapi_rc = zeDeviceGetProperties(all_devices[i], |
| &device_props); |
| gpu_common_underscorify_tolower(device_props.name); |
| if (oneapi_rc != ZE_RESULT_SUCCESS) { |
| info("Failed to get device property: 0x%x", oneapi_rc); |
| FREE_NULL_BITMAP(gres_slurmd_conf.cpus_bitmap); |
| xfree(cpu_aff_mac_range); |
| xfree(gres_slurmd_conf.links); |
| continue; |
| } |
| |
| debug2("GPU index %u:", i); |
| debug2(" Name: %s", device_props.name); |
| debug2(" DeviceId: %u", device_props.deviceId); |
| debug2(" PCI Domain/Bus/Device/Function: %u:%u:%u:%u", |
| pci.address.domain, pci.address.bus, |
| pci.address.device, pci.address.function); |
| debug2(" Links: %s", gres_slurmd_conf.links); |
| debug2(" Device File: %s", device_file); |
| debug2(" CPU Affinity Range - Machine: %s", |
| cpu_aff_mac_range); |
| debug2(" Core Affinity Range - Abstract: %s", |
| gres_slurmd_conf.cpus); |
| |
| /* Print out possible frequencies for this device */ |
| _oneapi_print_freqs(all_devices[i], LOG_LEVEL_DEBUG2); |
| |
| gres_slurmd_conf.type_name = device_props.name; |
| gres_slurmd_conf.file = device_file; |
| |
| /* Add the GPU to list */ |
| add_gres_to_list(gres_list_system, &gres_slurmd_conf); |
| |
| FREE_NULL_BITMAP(gres_slurmd_conf.cpus_bitmap); |
| xfree(cpu_aff_mac_range); |
| xfree(gres_slurmd_conf.cpus); |
| xfree(gres_slurmd_conf.links); |
| } |
| |
| return gres_list_system; |
| } |
| |
| extern list_t *gpu_p_get_system_gpu_list(node_config_load_t *node_config) |
| { |
| xassert(node_config); |
| |
| list_t *gres_list_system = _get_system_gpu_list_oneapi(node_config); |
| if (!gres_list_system) |
| error("System GPU detection failed"); |
| |
| return gres_list_system; |
| } |
| |
| extern void gpu_p_step_hardware_init(bitstr_t *usable_gpus, char *tres_freq) |
| { |
| debug2("enter gpu_p_step_hardware_init()"); |
| |
| char *freq = NULL; |
| char *tmp = NULL; |
| |
| xassert(tres_freq); |
| xassert(usable_gpus); |
| |
| if (!usable_gpus) |
| return; /* Job allocated no GPUs */ |
| if (!tres_freq) |
| return; /* No TRES frequency spec */ |
| |
| tmp = strstr(tres_freq, "gpu:"); |
| if (!tmp) |
| return; /* No GPU frequency spec */ |
| |
| freq = xstrdup(tmp + 4); |
| tmp = strchr(freq, ';'); |
| if (tmp) |
| tmp[0] = '\0'; |
| |
| /* |
| * Save a copy of the GPUs affected, so we can reset things afterwards |
| */ |
| FREE_NULL_BITMAP(saved_gpus); |
| saved_gpus = bit_copy(usable_gpus); |
| |
| if (_oneapi_init() != ZE_RESULT_SUCCESS) { |
| return; |
| } |
| |
| /* Set the frequency of each GPU index specified in the bitstr */ |
| _set_freq(usable_gpus, freq); |
| xfree(freq); |
| |
| debug2("exit gpu_p_step_hardware_init() normally"); |
| } |
| |
| extern void gpu_p_step_hardware_fini(void) |
| { |
| debug2("enter gpu_p_step_hardware_fini()"); |
| |
| if (!saved_gpus) |
| return; |
| |
| /* Reset the frequencies back to the hardware default */ |
| _reset_freq(saved_gpus); |
| FREE_NULL_BITMAP(saved_gpus); |
| |
| debug2("exit gpu_p_step_hardware_fini() normally"); |
| } |
| |
/*
 * This implementation performs no conversion: cpu_range is ignored and NULL
 * is always returned.
 */
extern char *gpu_p_test_cpu_conv(char *cpu_range)
{
	(void) cpu_range;

	return NULL;
}
| |
| extern void gpu_p_get_device_count(uint32_t *device_count) |
| { |
| ze_device_handle_t all_devices[MAX_GPU_NUM]; |
| uint32_t gpu_num = MAX_GPU_NUM; |
| |
| _oneapi_get_device_handles(all_devices, &gpu_num, false); |
| if (gpu_num == 0) { |
| error("Failed to get device count!"); |
| *device_count = 0; |
| } else { |
| *device_count = gpu_num; |
| } |
| } |
| |
| extern int gpu_p_energy_read(uint32_t dv_ind, gpu_status_t *gpu) |
| { |
| return SLURM_SUCCESS; |
| } |
| |
| extern int gpu_p_usage_read(pid_t pid, acct_gather_data_t *data) |
| { |
| return SLURM_SUCCESS; |
| } |