blob: 76bf41329e88c75c5c3f17e40f81749858162c68 [file]
/*****************************************************************************\
* gpu.c - driver for gpu plugin
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <dlfcn.h>
#include "src/common/assoc_mgr.h"
#include "src/common/list.h"
#include "src/common/plugin.h"
#include "src/common/xmalloc.h"
#include "src/interfaces/gpu.h"
/*
 * Function pointers provided by the loaded gpu plugin.
 *
 * This table is handed to plugin_context_create() together with syms[], so
 * the fields must stay in the same order as the symbol names listed in syms[].
 * Each field is invoked through the gpu_g_*() wrapper of the same name.
 */
typedef struct slurm_ops {
/* Called by gpu_g_get_system_gpu_list() */
list_t *(*get_system_gpu_list) (node_config_load_t *node_conf);
/* Called by gpu_g_step_hardware_init() */
void (*step_hardware_init) (bitstr_t *usable_gpus,
char *tres_freq);
/* Called by gpu_g_step_hardware_fini() */
void (*step_hardware_fini) (void);
/* Called by gpu_g_test_cpu_conv() */
char *(*test_cpu_conv) (char *cpu_range);
/* Called by gpu_g_energy_read() */
int (*energy_read) (uint32_t dv_ind, gpu_status_t *gpu);
/* Called by gpu_g_get_device_count() */
void (*get_device_count) (uint32_t *device_count);
/* Called by gpu_g_usage_read() */
int (*usage_read) (pid_t pid, acct_gather_data_t *data);
} slurm_ops_t;
/*
 * Names of the symbols looked up in the gpu plugin.
 *
 * These strings must be kept in the same order as the fields
 * declared for slurm_ops_t: entry N here corresponds to field N there.
 */
static const char *syms[] = {
"gpu_p_get_system_gpu_list",
"gpu_p_step_hardware_init",
"gpu_p_step_hardware_fini",
"gpu_p_test_cpu_conv",
"gpu_p_energy_read",
"gpu_p_get_device_count",
"gpu_p_usage_read",
};
/* Local variables */
/* Plugin function table, passed to plugin_context_create() to be filled in */
static slurm_ops_t ops;
/* Context of the currently loaded gpu plugin, NULL when none is loaded */
static plugin_context_t *g_context = NULL;
/* Taken by gpu_plugin_init()/gpu_plugin_fini() around changes to g_context */
static pthread_mutex_t g_context_lock = PTHREAD_MUTEX_INITIALIZER;
/* dlopen() handle of the vendor GPU library, NULL when none is open */
static void *ext_lib_handle = NULL;
/*
 * Parses fake_gpus_file for fake GPU devices and adds them to gres_list_system
 *
 * The file format is:
 * <type>|<sys_cpu_count>|<cpu_range>|<links>|<device_file>[|<unique_id>[|<flags>]]
 *
 * Each line represents a single GPU device. Therefore, <device_file> can't
 * specify more than one file (i.e. ranges like [1-2] won't work).
 *
 * Lines are read into a 256-byte buffer, so a line (including its newline)
 * must fit in 255 characters. Longer lines are split by fgets() and the
 * remainder is parsed as if it were a separate line.
 *
 * Blank lines and lines starting with `#` are ignored.
 *
 * If `(null)` is specified for a field, then the value will be left NULL or 0.
 *
 * If a <cpu_range> starts with `~` (e.g. `~F0F0`), the token is handed to
 * gpu_g_test_cpu_conv() and the string it returns is used as the cpu range.
 * This is to test converting the cpu mask from NVML to Slurm.
 *
 * A line with fewer than 5 or more than 7 fields is reported with error() and
 * skipped; no device is added for it.
 *
 * IN/OUT gres_list_system - list that receives one record per valid line
 * IN fake_gpus_file - path of the file to parse
 */
static void _add_fake_gpus_from_file(list_t *gres_list_system,
				     char *fake_gpus_file)
{
	char buffer[256];
	int line_number = 0;
	FILE *f = fopen(fake_gpus_file, "r");

	if (f == NULL) {
		error("Unable to read \"%s\": %m", fake_gpus_file);
		return;
	}

	/* Loop through each line of the file */
	while (fgets(buffer, sizeof(buffer), f)) {
		char *save_ptr = NULL;
		char *tok;
		int i = 0;
		gres_slurmd_conf_t gres_slurmd_conf = {
			.count = 1,
			.name = "gpu",
		};

		line_number++;

		/* Strip the trailing newline (and CR) left by fgets() */
		buffer[strcspn(buffer, "\r\n")] = '\0';

		/* Ignore blank lines and comment lines */
		if (!buffer[0] || buffer[0] == '#')
			continue;

		debug("%s", buffer);

		/* Field index i selects which member each token fills in */
		tok = strtok_r(buffer, "|", &save_ptr);
		while (tok) {
			/* "(null)" keeps the member at its zero value */
			if (xstrcmp(tok, "(null)") == 0) {
				i++;
				tok = strtok_r(NULL, "|", &save_ptr);
				continue;
			}

			switch (i) {
			case 0:
				gres_slurmd_conf.type_name = xstrdup(tok);
				break;
			case 1:
				gres_slurmd_conf.cpu_cnt = atoi(tok);
				break;
			case 2:
				if (tok[0] == '~')
					/* accommodate special tests */
					gres_slurmd_conf.cpus =
						gpu_g_test_cpu_conv(tok);
				else
					gres_slurmd_conf.cpus = xstrdup(tok);
				break;
			case 3:
				gres_slurmd_conf.links = xstrdup(tok);
				break;
			case 4:
				gres_slurmd_conf.file = xstrdup(tok);
				break;
			case 5:
				gres_slurmd_conf.unique_id = xstrdup(tok);
				break;
			case 6:
				gres_slurmd_conf.config_flags =
					gres_flags_parse(tok, NULL, NULL);
				break;
			default:
				error("Malformed line: too many data fields");
				break;
			}
			i++;
			tok = strtok_r(NULL, "|", &save_ptr);
		}

		if ((i < 5) || (i > 7)) {
			/*
			 * Do not register a device built from a line that
			 * failed to parse; only release what was collected.
			 */
			error("Line #%d in fake_gpus.conf failed to parse! Make sure that the line has no empty tokens and that the format is <type>|<sys_cpu_count>|<cpu_range>|<links>|<device_file>[|<unique_id>[|<flags>]]",
			      line_number);
		} else {
			if (!(gres_slurmd_conf.config_flags &
			      GRES_CONF_ENV_SET))
				warning("Line #%d in fake_gpus.conf has no env flags set. Real GPU plugins always set vendor env flags (e.g. nvidia_gpu_env).",
					line_number);

			gres_slurmd_conf.cpus_bitmap =
				bit_alloc(gres_slurmd_conf.cpu_cnt);
			if (bit_unfmt(gres_slurmd_conf.cpus_bitmap,
				      gres_slurmd_conf.cpus))
				fatal("bit_unfmt() failed for CPU range: %s",
				      gres_slurmd_conf.cpus);

			add_gres_to_list(gres_list_system, &gres_slurmd_conf);
			FREE_NULL_BITMAP(gres_slurmd_conf.cpus_bitmap);
		}

		/* The per-line strings are released after every line */
		xfree(gres_slurmd_conf.cpus);
		xfree(gres_slurmd_conf.file);
		xfree(gres_slurmd_conf.type_name);
		xfree(gres_slurmd_conf.links);
		xfree(gres_slurmd_conf.unique_id);
	}
	fclose(f);
}
/*
 * Build a GPU list from fake_gpus.conf, if that file exists.
 *
 * RET a new list filled by _add_fake_gpus_from_file() when stat() on the
 * fake_gpus.conf path succeeds (the list may be empty), otherwise NULL.
 */
static list_t *gpu_get_fake_system_list(void)
{
	struct stat st;
	list_t *fake_list;
	char *conf_path = get_extra_conf_path("fake_gpus.conf");

	if (stat(conf_path, &st) < 0) {
		/* No fake GPU file: nothing to override */
		xfree(conf_path);
		return NULL;
	}

	info("Adding fake system GPU data from %s", conf_path);
	fake_list = list_create(destroy_gres_slurmd_conf);
	_add_fake_gpus_from_file(fake_list, conf_path);

	xfree(conf_path);
	return fake_list;
}
/*
 * Pick the gpu plugin type matching the current autodetect flags, dlopen()ing
 * the vendor library where one is needed.
 *
 * RET the plugin type as a string literal (never to be freed). Falls back to
 * "gpu/generic" when no flag matches, when the required library can not be
 * loaded, or when support for it was not compiled in.
 */
static char *_get_gpu_type(void)
{
	/*
	 * The vendor .so is dlopen()ed here to verify it exists on this node.
	 *
	 * NOTE: The libraries are opened generically on purpose, so that the
	 * lib the card is actually running is used regardless of what was
	 * given to configure. This dlopen is what loads the symbols the
	 * plugin will use.
	 *
	 * It is also done outside of the plugins on purpose: heterogeneous
	 * systems, where not every node has cards, must still be able to run
	 * slurmd everywhere with a single gres.conf file.
	 */
	uint32_t flags = gres_get_autodetect_flags();

	if (flags & GRES_AUTODETECT_GPU_NVML) {
#ifdef HAVE_NVML
		(void) dlerror();
		ext_lib_handle = dlopen("libnvidia-ml.so",
					RTLD_NOW | RTLD_GLOBAL);
		if (!ext_lib_handle)
			ext_lib_handle = dlopen("libnvidia-ml.so.1",
						RTLD_NOW | RTLD_GLOBAL);
		if (ext_lib_handle)
			return "gpu/nvml";
		info("We were configured with nvml functionality, but that lib wasn't found on the system. Attempted loading libnvidia-ml.so and libnvidia-ml.so.1 without success. Last error is: %s",
		     dlerror());
#else
		info("We were configured to autodetect nvml functionality, but we weren't able to find that lib when Slurm was configured.");
#endif
	} else if (flags & GRES_AUTODETECT_GPU_RSMI) {
#ifdef HAVE_RSMI
		(void) dlerror();
		ext_lib_handle = dlopen("librocm_smi64.so",
					RTLD_NOW | RTLD_GLOBAL);
		if (ext_lib_handle)
			return "gpu/rsmi";
		info("Configured with rsmi, but that lib wasn't found. %s",
		     dlerror());
#else
		info("Configured with rsmi, but rsmi isn't enabled during the build.");
#endif
	} else if (flags & GRES_AUTODETECT_GPU_ONEAPI) {
#ifdef HAVE_ONEAPI
		(void) dlerror();
		ext_lib_handle = dlopen("libze_loader.so",
					RTLD_NOW | RTLD_GLOBAL);
		if (ext_lib_handle)
			return "gpu/oneapi";
		info("Configured with oneAPI, but that lib wasn't found. %s",
		     dlerror());
#else
		info("Configured with oneAPI, but oneAPI isn't enabled during the build.");
#endif
	} else if (flags & GRES_AUTODETECT_GPU_NRT) {
		return "gpu/nrt";
	} else if (flags & GRES_AUTODETECT_GPU_NVIDIA) {
		return "gpu/nvidia";
	}

	/* Nothing usable was selected above */
	return "gpu/generic";
}
/*
 * Close the vendor library handle and destroy the plugin context, if present.
 * The name indicates the caller is expected to hold g_context_lock.
 *
 * RET the result of plugin_context_destroy(), or SLURM_SUCCESS when there was
 * no context to destroy.
 */
static int _gpu_clear_plugin_locked(void)
{
	int status = SLURM_SUCCESS;

	/* The vendor library is released before the plugin context */
	if (ext_lib_handle)
		dlclose(ext_lib_handle);
	ext_lib_handle = NULL;

	if (!g_context)
		return status;

	status = plugin_context_destroy(g_context);
	g_context = NULL;

	return status;
}
/*
 * Create the gpu plugin context for the type chosen by _get_gpu_type() and
 * store it in g_context (left NULL on failure).
 *
 * IN error_on_failure - if true, a failure to create the context is logged
 *	with error(); if false (e.g. while probing candidate plugins) it is
 *	only logged with debug().
 * RET the plugin type string that was tried. It comes from _get_gpu_type()
 *	and must not be freed.
 */
static char *_gpu_create_plugin_context(bool error_on_failure)
{
	const char *plugin_type = "gpu";
	char *type = _get_gpu_type();

	g_context = plugin_context_create(
		plugin_type, type, (void **)&ops, syms, sizeof(syms));

	if (!g_context) {
		if (error_on_failure)
			error("cannot create %s context for %s",
			      plugin_type, type);
		else
			debug("cannot create %s context for %s",
			      plugin_type, type);
	}

	return type;
}
/*
 * Try init'ing GPU plugins in order until hardware is found (Autodetect=full).
 *
 * For every entry of the probe order the GPU autodetect flag is set, a plugin
 * context is created and the plugin is asked for its GPU list. The first
 * plugin reporting a non-empty list is kept loaded. If none does, the plugin
 * state is cleared and the GPU autodetect flag is reset to
 * GRES_AUTODETECT_UNSET.
 *
 * IN node_conf - passed through to gpu_g_get_system_gpu_list()
 */
static void _gpu_plugin_init_full(node_config_load_t *node_conf)
{
	static const uint32_t probe_order[] = {
		GRES_AUTODETECT_GPU_NVML,
		GRES_AUTODETECT_GPU_NVIDIA,
		GRES_AUTODETECT_GPU_RSMI,
		GRES_AUTODETECT_GPU_ONEAPI,
		GRES_AUTODETECT_GPU_NRT,
	};

	xassert(!g_context);

	for (int idx = 0; idx < ARRAY_SIZE(probe_order); idx++) {
		char *plugin_name;
		list_t *detected = NULL;

		gres_autodetect_flags_set_gpu(probe_order[idx]);
		plugin_name = _gpu_create_plugin_context(false);

		if (g_context) {
			detected = gpu_g_get_system_gpu_list(node_conf);
			if (detected && list_count(detected)) {
				/* Keep this plugin loaded */
				verbose("%s: autodetect=full selected %s",
					__func__, plugin_name);
				FREE_NULL_LIST(detected);
				return;
			}
			FREE_NULL_LIST(detected);
		}

		/* No context or no devices: unload and try the next one */
		(void) _gpu_clear_plugin_locked();
	}

	gres_autodetect_flags_set_gpu(GRES_AUTODETECT_UNSET);
}
/*
 * Load the gpu plugin if it is not loaded yet.
 *
 * When called with a node_conf whose in_slurmd is set and the autodetect
 * flags contain GRES_AUTODETECT_GPU_FULL, the candidate plugins are probed
 * first via _gpu_plugin_init_full(). If no context exists afterwards, the
 * plugin chosen by the current autodetect flags is loaded.
 *
 * IN node_conf - node configuration, may be NULL
 * RET SLURM_SUCCESS if a plugin context exists on return, else SLURM_ERROR
 */
extern int gpu_plugin_init(node_config_load_t *node_conf)
{
	int rc;

	slurm_mutex_lock(&g_context_lock);

	if (!g_context && node_conf && node_conf->in_slurmd &&
	    (gres_get_autodetect_flags() & GRES_AUTODETECT_GPU_FULL)) {
		/*
		 * If the probe finds no devices it leaves g_context NULL and
		 * resets the GPU autodetect flag, so the creation below runs.
		 */
		_gpu_plugin_init_full(node_conf);
	}

	if (!g_context)
		_gpu_create_plugin_context(true);

	/* Decide the result while the lock still protects g_context */
	if (g_context) {
		rc = SLURM_SUCCESS;
	} else {
		/*
		 * No context could be created: release a vendor library that
		 * _get_gpu_type() may have opened so the handle isn't leaked.
		 */
		(void) _gpu_clear_plugin_locked();
		rc = SLURM_ERROR;
	}

	slurm_mutex_unlock(&g_context_lock);

	return rc;
}
/*
 * Unload the gpu plugin and close the vendor library handle.
 *
 * The state is inspected and cleared entirely under g_context_lock. The
 * cleanup also runs when no plugin context exists, so a vendor library that
 * was opened without a context being created is still closed.
 *
 * RET the result of plugin_context_destroy(), or SLURM_SUCCESS when no
 * context was loaded.
 */
extern int gpu_plugin_fini(void)
{
	int rc;

	slurm_mutex_lock(&g_context_lock);
	rc = _gpu_clear_plugin_locked();
	slurm_mutex_unlock(&g_context_lock);

	return rc;
}
extern void gpu_get_tres_pos(int *gpumem_pos, int *gpuutil_pos)
{
static int loc_gpumem_pos = -1;
static int loc_gpuutil_pos = -1;
static bool inited = false;
if (!inited) {
slurmdb_tres_rec_t tres_rec;
memset(&tres_rec, 0, sizeof(slurmdb_tres_rec_t));
tres_rec.type = "gres";
tres_rec.name = "gpuutil";
loc_gpuutil_pos = assoc_mgr_find_tres_pos(&tres_rec, false);
tres_rec.name = "gpumem";
loc_gpumem_pos = assoc_mgr_find_tres_pos(&tres_rec, false);
inited = true;
}
if (gpumem_pos)
*gpumem_pos = loc_gpumem_pos;
if (gpuutil_pos)
*gpuutil_pos = loc_gpuutil_pos;
}
/*
 * Get the list of GPUs on this system.
 *
 * A list built from fake_gpus.conf takes precedence over the plugin. Without
 * one, the plugin is only queried when node_conf is NULL or has in_slurmd set.
 *
 * IN node_conf - node configuration, may be NULL
 * RET the fake list, the list returned by the plugin, or NULL when there is
 * no fake list and node_conf says we are not running in slurmd.
 */
extern list_t *gpu_g_get_system_gpu_list(node_config_load_t *node_conf)
{
	list_t *gpu_list = gpu_get_fake_system_list();

	/* No fake override; only probe real hardware from slurmd. */
	if (!gpu_list && (!node_conf || node_conf->in_slurmd)) {
		xassert(g_context);
		gpu_list = ops.get_system_gpu_list(node_conf);
	}

	return gpu_list;
}
/*
 * Forward to the loaded plugin's gpu_p_step_hardware_init().
 * A plugin context must exist (asserted).
 *
 * IN usable_gpus - passed unchanged to the plugin
 * IN tres_freq - passed unchanged to the plugin
 */
extern void gpu_g_step_hardware_init(bitstr_t *usable_gpus, char *tres_freq)
{
	xassert(g_context);

	ops.step_hardware_init(usable_gpus, tres_freq);
}
/*
 * Forward to the loaded plugin's gpu_p_step_hardware_fini().
 * A plugin context must exist (asserted).
 */
extern void gpu_g_step_hardware_fini(void)
{
	xassert(g_context);

	ops.step_hardware_fini();
}
/*
 * Forward to the loaded plugin's gpu_p_test_cpu_conv().
 * A plugin context must exist (asserted).
 *
 * IN cpu_range - passed unchanged to the plugin
 * RET whatever the plugin returns
 */
extern char *gpu_g_test_cpu_conv(char *cpu_range)
{
	xassert(g_context);

	return ops.test_cpu_conv(cpu_range);
}
/*
 * Forward to the loaded plugin's gpu_p_energy_read().
 * A plugin context must exist (asserted).
 *
 * IN dv_ind - passed unchanged to the plugin
 * IN/OUT gpu - passed unchanged to the plugin
 * RET whatever the plugin returns
 */
extern int gpu_g_energy_read(uint32_t dv_ind, gpu_status_t *gpu)
{
	xassert(g_context);

	return ops.energy_read(dv_ind, gpu);
}
/*
 * Forward to the loaded plugin's gpu_p_get_device_count().
 * A plugin context must exist (asserted).
 *
 * OUT device_count - passed unchanged to the plugin
 */
extern void gpu_g_get_device_count(uint32_t *device_count)
{
	xassert(g_context);

	ops.get_device_count(device_count);
}
/*
 * Forward to the loaded plugin's gpu_p_usage_read().
 * A plugin context must exist (asserted).
 *
 * IN pid - passed unchanged to the plugin
 * IN/OUT data - passed unchanged to the plugin
 * RET whatever the plugin returns
 */
extern int gpu_g_usage_read(pid_t pid, acct_gather_data_t *data)
{
	xassert(g_context);

	return ops.usage_read(pid, data);
}