blob: ecdbbeefe88576035efa68c08130d01be01cba97 [file] [log] [blame] [edit]
/*****************************************************************************\
* gpu.c - driver for gpu plugin
*****************************************************************************
* Copyright (C) 2019 SchedMD LLC
* Written by Danny Auble <da@schedmd.com>
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <dlfcn.h>
#include "src/common/gpu.h"
#include "src/common/plugin.h"
/*
 * GPU plugin operations. The table is handed to plugin_context_create()
 * together with syms[], so the fields here must stay in the same order as
 * the symbol names listed in syms[].
 */
typedef struct slurm_ops {
	/* gpu_p_reconfig */
	void (*reconfig)(void);
	/* gpu_p_get_system_gpu_list */
	List (*get_system_gpu_list)(node_config_load_t *node_conf);
	/* gpu_p_step_hardware_init */
	void (*step_hardware_init)(bitstr_t *usable_gpus, char *tres_freq);
	/* gpu_p_step_hardware_fini */
	void (*step_hardware_fini)(void);
	/* gpu_p_test_cpu_conv */
	char *(*test_cpu_conv)(char *cpu_range);
} slurm_ops_t;
/*
 * Names of the plugin symbols, one per slurm_ops_t field.
 * Keep the entries in exactly the same order as the fields declared in
 * slurm_ops_t.
 */
static const char *syms[] = {
	"gpu_p_reconfig",
	"gpu_p_get_system_gpu_list",
	"gpu_p_step_hardware_init",
	"gpu_p_step_hardware_fini",
	"gpu_p_test_cpu_conv",
};
/* Local variables */
/* Function pointer table for the gpu plugin; passed to plugin_context_create() */
static slurm_ops_t ops;
/* Context returned by plugin_context_create(); NULL while no plugin is loaded */
static plugin_context_t *g_context = NULL;
/* Held while g_context is created or destroyed */
static pthread_mutex_t g_context_lock = PTHREAD_MUTEX_INITIALIZER;
/* Set by gpu_plugin_init() once g_context exists; cleared by gpu_plugin_fini() */
static bool init_run = false;
/*
 * Pick the gpu plugin type from the configured autodetect flags.
 *
 * For NVML or RSMI autodetection the vendor library is dlopen()ed first to
 * confirm it exists on this node. A missing library, or a build without
 * support for the requested library, is reported through fatal(). The
 * dlopen() handle is not closed here.
 *
 * Returns a string literal ("gpu/nvml", "gpu/rsmi" or "gpu/generic"); the
 * caller must not free or modify it.
 */
static char *_get_gpu_type(void)
{
	const int dl_flags = RTLD_NOW | RTLD_GLOBAL;
	const uint32_t detect = gres_get_autodetect_types();

	if (detect & GRES_AUTODETECT_NVML) {
#ifdef HAVE_NVML
		if (dlopen("libnvidia-ml.so", dl_flags))
			return "gpu/nvml";
		fatal("We were configured with nvml functionality, but that lib wasn't found on the system.");
#else
		fatal("We were configured to autodetect nvml functionality, but we weren't able to find that lib when Slurm was configured.");
#endif
	} else if (detect & GRES_AUTODETECT_RSMI) {
#ifdef HAVE_RSMI
		if (dlopen("librocm_smi64.so", dl_flags))
			return "gpu/rsmi";
		fatal("Configured with rsmi, but that lib wasn't found.");
#else
		fatal("Configured with rsmi, but rsmi isn't enabled during the build.");
#endif
	}

	/* No vendor autodetection requested: use the generic plugin. */
	return "gpu/generic";
}
/*
* Initialize the GRES plugins.
*
* Returns a Slurm errno.
*/
extern int gpu_plugin_init(void)
{
int retval = SLURM_SUCCESS;
char *plugin_type = "gpu";
char *type = NULL;
if (init_run && g_context)
return retval;
slurm_mutex_lock(&g_context_lock);
if (g_context)
goto done;
type = _get_gpu_type();
g_context = plugin_context_create(
plugin_type, type, (void **)&ops, syms, sizeof(syms));
if (!g_context) {
error("cannot create %s context for %s", plugin_type, type);
retval = SLURM_ERROR;
goto done;
}
init_run = true;
done:
slurm_mutex_unlock(&g_context_lock);
return retval;
}
/*
 * Unload the gpu plugin and release its context.
 *
 * g_context is tested while holding g_context_lock, so two concurrent
 * callers cannot both pass the test and have the second one hand an
 * already-cleared (NULL) context to plugin_context_destroy().
 *
 * Returns SLURM_SUCCESS when no plugin is loaded, otherwise the return
 * code of plugin_context_destroy().
 */
extern int gpu_plugin_fini(void)
{
	int rc = SLURM_SUCCESS;

	slurm_mutex_lock(&g_context_lock);
	if (g_context) {
		init_run = false;
		rc = plugin_context_destroy(g_context);
		g_context = NULL;
	}
	slurm_mutex_unlock(&g_context_lock);

	return rc;
}
/*
 * Forward a reconfigure request to the gpu plugin.
 * Does nothing if gpu_plugin_init() fails.
 */
extern void gpu_g_reconfig(void)
{
	if (gpu_plugin_init() >= 0)
		ops.reconfig();
}
/*
 * Ask the gpu plugin for its system GPU list.
 *
 * node_conf is passed through to the plugin unchanged.
 *
 * Returns the plugin's result, or NULL if gpu_plugin_init() fails.
 */
extern List gpu_g_get_system_gpu_list(node_config_load_t *node_conf)
{
	List gpus = NULL;

	if (gpu_plugin_init() >= 0)
		gpus = ops.get_system_gpu_list(node_conf);

	return gpus;
}
/*
 * Forward step hardware initialization to the gpu plugin.
 *
 * usable_gpus and tres_freq are passed through to the plugin unchanged.
 * Does nothing if gpu_plugin_init() fails.
 */
extern void gpu_g_step_hardware_init(bitstr_t *usable_gpus, char *tres_freq)
{
	if (gpu_plugin_init() >= 0)
		ops.step_hardware_init(usable_gpus, tres_freq);
}
/*
 * Forward step hardware teardown to the gpu plugin.
 * Does nothing if gpu_plugin_init() fails.
 */
extern void gpu_g_step_hardware_fini(void)
{
	if (gpu_plugin_init() >= 0)
		ops.step_hardware_fini();
}
/*
 * Forward a cpu range conversion test to the gpu plugin.
 *
 * cpu_range is passed through to the plugin unchanged.
 *
 * Returns the plugin's result, or NULL if gpu_plugin_init() fails.
 */
extern char *gpu_g_test_cpu_conv(char *cpu_range)
{
	char *converted = NULL;

	if (gpu_plugin_init() >= 0)
		converted = ops.test_cpu_conv(cpu_range);

	return converted;
}