blob: c4a005d960c3caa4cf5ae50f886e5e701b7852b1 [file] [log] [blame]
/*****************************************************************************\
* switch_nvidia_imex.c
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include "src/common/slurm_xlator.h"
#include "src/common/bitstring.h"
#include "src/common/list.h"
#include "src/common/pack.h"
#include "src/common/run_in_daemon.h"
#include "src/common/xstring.h"
#include "src/interfaces/gres.h"
#include "src/interfaces/switch.h"
#include "src/plugins/switch/nvidia_imex/imex_device.h"
/*
 * job_list is walked by switch_p_job_start() to find channels that are
 * already assigned to jobs. On Apple platforms it is declared as a weak
 * import; everywhere else it is declared here without an initializer.
 * NOTE(review): presumably this resolves to the job list owned by the
 * daemon that loads the plugin - confirm against the plugin loader.
 */
#if defined(__APPLE__)
extern list_t *job_list __attribute__((weak_import));
#else
list_t *job_list;
#endif
/*
* These variables are required by the generic plugin interface. If they
* are not found in the plugin, the plugin loader will ignore it.
*
* plugin_name - a string giving a human-readable description of the
* plugin. There is no maximum length, but the symbol must refer to
* a valid string.
*
* plugin_type - a string suggesting the type of the plugin or its
* applicability to a particular form of data or method of data handling.
* If the low-level plugin API is used, the contents of this string are
* unimportant and may be anything. Slurm uses the higher-level plugin
* interface which requires this string to be of the form
*
* <application>/<method>
*
* where <application> is a description of the intended application of
* the plugin (e.g., "switch" for Slurm switch) and <method> is a description
* of how this plugin satisfies that application. Slurm will only load
* a switch plugin if the plugin_type string has a prefix of "switch/".
*
* plugin_version - an unsigned 32-bit integer containing the Slurm version
* (major.minor.micro combined into a single number).
*/
const char plugin_name[] = "switch NVIDIA IMEX plugin";
const char plugin_type[] = "switch/nvidia_imex";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
/* Numeric identifier of this plugin, taken from SWITCH_PLUGIN_NVIDIA_IMEX. */
const uint32_t plugin_id = SWITCH_PLUGIN_NVIDIA_IMEX;
/* Marker written into switch_info_t.magic; verified by xassert() on pack. */
#define SWITCH_INFO_MAGIC 0xFF00FF00
/*
 * Record attached to a job or step that carries its IMEX channel number.
 * Instances in this file are built by _create_info().
 */
typedef struct {
uint32_t magic; /* set to SWITCH_INFO_MAGIC by _create_info() */
uint32_t channel; /* channel number held by the job/step */
} switch_info_t;
/*
 * Number of channels tracked in imex_channels. Defaults to 2048 and may be
 * replaced by the "imex_channel_count=" option in slurm_conf.switch_param
 * (see _setup_controller()).
 */
static uint32_t channel_count = 2048;
/*
 * Bitmap with one bit per channel; a set bit means the channel is in use.
 * Stays NULL until _setup_controller() allocates it.
 */
static bitstr_t *imex_channels = NULL;
/*
 * Build a fresh switch_info_t for the given channel.
 *
 * IN channel - channel number to store in the record
 * RET record obtained from xmalloc() with magic set to SWITCH_INFO_MAGIC
 */
static switch_info_t *_create_info(uint32_t channel)
{
	switch_info_t *info = xmalloc(sizeof(*info));

	info->channel = channel;
	info->magic = SWITCH_INFO_MAGIC;

	return info;
}
static void _setup_controller(void)
{
char *tmp_str = NULL;
if ((tmp_str = conf_get_opt_str(slurm_conf.switch_param,
"imex_channel_count="))) {
channel_count = atoi(tmp_str);
xfree(tmp_str);
}
log_flag(SWITCH, "managing %u channels", channel_count);
imex_channels = bit_alloc(channel_count);
bit_set(imex_channels, 0);
}
/*
 * Plugin load hook. Dispatches on the daemon the plugin is running in:
 * slurmctld sets up channel tracking, slurmd and slurmstepd defer to
 * slurmd_init() and stepd_init() respectively.
 *
 * RET SLURM_SUCCESS in slurmctld or any other context, otherwise the result
 *     of slurmd_init() / stepd_init()
 */
extern int init(void)
{
	if (running_in_slurmctld()) {
		_setup_controller();
		return SLURM_SUCCESS;
	}

	if (running_in_slurmd())
		return slurmd_init();

	if (running_in_slurmstepd())
		return stepd_init();

	return SLURM_SUCCESS;
}
/*
 * Plugin unload hook. Performs no work; in particular the imex_channels
 * bitmap allocated by _setup_controller() is not released here.
 */
extern void fini(void)
{
return;
}
/*
 * State-save hook. Nothing is written by this plugin.
 *
 * RET always SLURM_SUCCESS
 */
extern int switch_p_save(void)
{
/*
 * Skip managing our own state file, just recover the allocations
 * data from the job_list after restart.
 */
return SLURM_SUCCESS;
}
/*
 * list_for_each() callback used by switch_p_job_start(): if the job already
 * carries a channel, set the matching bit in imex_channels.
 *
 * IN x - job_record_t to inspect
 * IN arg - unused
 * RET always 1
 */
static int _mark_used(void *x, void *arg)
{
	job_record_t *job_ptr = x;
	switch_info_t *info = job_ptr->switch_jobinfo;

	/* Job has no channel recorded: nothing to mark */
	if (!info)
		return 1;

	/* Channels beyond the bitmap cannot be tracked; report and move on */
	if (info->channel >= channel_count) {
		error("%s: channel %u outside of tracked range, ignoring",
		      plugin_type, info->channel);
		return 1;
	}

	debug("marking channel %u used by %pJ",
	      info->channel, job_ptr);
	bit_set(imex_channels, info->channel);

	return 1;
}
/*
 * State-restore hook. Does nothing; channel usage is rebuilt lazily in
 * switch_p_job_start() instead.
 *
 * IN recover - unused
 * RET always SLURM_SUCCESS
 */
extern int switch_p_restore(bool recover)
{
/*
 * FIXME: this is run too soon at slurmctld startup to be used here.
 * See switch_p_job_start() for the current workaround.
 */
return SLURM_SUCCESS;
}
/*
 * Serialize a job's channel into buffer as a single 32-bit value.
 * A NULL switch_info is encoded as NO_VAL. Nothing is packed when
 * protocol_version is below SLURM_MIN_PROTOCOL_VERSION.
 *
 * IN switch_info - record to pack, may be NULL
 * IN/OUT buffer - destination buffer
 * IN protocol_version - protocol version of the peer
 */
extern void switch_p_pack_jobinfo(switch_info_t *switch_info, buf_t *buffer,
				  uint16_t protocol_version)
{
	uint32_t logged_channel = NO_VAL;

	if (switch_info)
		logged_channel = switch_info->channel;

	log_flag(SWITCH, "channel %u", logged_channel);

	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION)
		return;

	if (switch_info) {
		xassert(switch_info->magic == SWITCH_INFO_MAGIC);
		pack32(switch_info->channel, buffer);
	} else {
		pack32(NO_VAL, buffer);
	}
}
/*
 * Deserialize a job's channel from buffer.
 *
 * OUT switch_info - set to a new record from _create_info() when a channel
 *                   other than NO_VAL was read, otherwise left NULL
 * IN/OUT buffer - source buffer
 * IN protocol_version - protocol version of the peer; nothing is read when
 *                       it is below SLURM_MIN_PROTOCOL_VERSION
 * RET SLURM_SUCCESS, or SLURM_ERROR if the unpack fails
 */
extern int switch_p_unpack_jobinfo(switch_info_t **switch_info, buf_t *buffer,
				   uint16_t protocol_version)
{
	uint32_t channel = NO_VAL;

	/* Cleared up front so the error path also leaves the output NULL */
	*switch_info = NULL;

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		/* jumps to unpack_error on failure */
		safe_unpack32(&channel, buffer);
	}

	*switch_info = (channel == NO_VAL) ? NULL : _create_info(channel);

	log_flag(SWITCH, "channel %u", channel);

	return SLURM_SUCCESS;

unpack_error:
	error("%s: unpack error", __func__);
	return SLURM_ERROR;
}
/*
 * Used to free switch_jobinfo when switch_p_job_complete can't be used.
 *
 * Only the record is released; unlike switch_p_job_complete() the channel's
 * bit in imex_channels is left untouched.
 *
 * IN/OUT job_ptr - job whose switch_jobinfo is freed and reset to NULL
 */
extern void switch_p_free_jobinfo(job_record_t *job_ptr)
{
xfree(job_ptr->switch_jobinfo);
job_ptr->switch_jobinfo = NULL;
}
/*
 * Give a step its own copy of the owning job's channel record.
 *
 * OUT switch_step - receives a new record from _create_info() when the
 *                   step's job has a channel; not written otherwise
 * IN step_layout - unused
 * IN step_ptr - step being built
 * RET always SLURM_SUCCESS
 */
extern int switch_p_build_stepinfo(switch_info_t **switch_step,
				   slurm_step_layout_t *step_layout,
				   step_record_t *step_ptr)
{
	switch_info_t *jobinfo = NULL;

	if (step_ptr->job_ptr)
		jobinfo = step_ptr->job_ptr->switch_jobinfo;

	if (!jobinfo) {
		log_flag(SWITCH, "no channel for %pS", step_ptr);
		return SLURM_SUCCESS;
	}

	*switch_step = _create_info(jobinfo->channel);
	log_flag(SWITCH, "using channel %u for %pS",
		 jobinfo->channel, step_ptr);

	return SLURM_SUCCESS;
}
/*
 * Copy a step's channel record.
 *
 * IN orig - record to copy; when NULL, dest is not written
 * OUT dest - receives a new record holding the same channel as orig
 */
extern void switch_p_duplicate_stepinfo(switch_info_t *orig,
					switch_info_t **dest)
{
	if (!orig)
		return;

	*dest = _create_info(orig->channel);
}
/*
 * Release a step's channel record.
 *
 * IN switch_step - record to free with xfree()
 */
extern void switch_p_free_stepinfo(switch_info_t *switch_step)
{
xfree(switch_step);
}
/*
 * Serialize a step's channel record. Steps and jobs share the same record
 * type, so this forwards to switch_p_pack_jobinfo().
 *
 * IN switch_step - record to pack, may be NULL
 * IN/OUT buffer - destination buffer
 * IN protocol_version - protocol version of the peer
 */
extern void switch_p_pack_stepinfo(switch_info_t *switch_step, buf_t *buffer,
uint16_t protocol_version)
{
switch_p_pack_jobinfo(switch_step, buffer, protocol_version);
}
/*
 * Deserialize a step's channel record by forwarding to
 * switch_p_unpack_jobinfo().
 *
 * OUT switch_step - receives the unpacked record, or NULL
 * IN/OUT buffer - source buffer
 * IN protocol_version - protocol version of the peer
 * RET result of switch_p_unpack_jobinfo()
 */
extern int switch_p_unpack_stepinfo(switch_info_t **switch_step, buf_t *buffer,
uint16_t protocol_version)
{
return switch_p_unpack_jobinfo(switch_step, buffer, protocol_version);
}
/*
 * Pre-initialization hook. No-op for this plugin.
 *
 * IN step - unused
 * RET always SLURM_SUCCESS
 */
extern int switch_p_job_preinit(stepd_step_rec_t *step)
{
return SLURM_SUCCESS;
}
/*
 * Set up the step's IMEX channel via setup_imex_channel(channel, true).
 *
 * The setup is skipped (with an error logged) when
 * slurm_conf.job_container_plugin contains "tmpfs"; in that configuration
 * the channel is left to switch_p_fs_init().
 *
 * IN step - step being launched
 * RET SLURM_SUCCESS when skipped or when the step has no channel, otherwise
 *     the result of setup_imex_channel()
 */
extern int switch_p_job_init(stepd_step_rec_t *step)
{
	switch_info_t *info = NULL;

	if (xstrcasestr(slurm_conf.job_container_plugin, "tmpfs")) {
		error("%s: %s: skipping due incompatibility with job_container/tmpfs",
		      plugin_type, __func__);
		return SLURM_SUCCESS;
	}

	if (!step->switch_step || !step->switch_step->data)
		return SLURM_SUCCESS;

	info = step->switch_step->data;
	if (info->channel == NO_VAL)
		return SLURM_SUCCESS;

	return setup_imex_channel(info->channel, true);
}
/*
 * Post-completion hook. No-op for this plugin.
 *
 * IN step - unused
 * RET always SLURM_SUCCESS
 */
extern int switch_p_job_postfini(stepd_step_rec_t *step)
{
return SLURM_SUCCESS;
}
/*
 * Task attach hook. No-op for this plugin; every argument is ignored.
 *
 * RET always SLURM_SUCCESS
 */
extern int switch_p_job_attach(switch_info_t *stepinfo, char ***env,
uint32_t nodeid, uint32_t procid,
uint32_t nnodes, uint32_t nprocs, uint32_t rank)
{
return SLURM_SUCCESS;
}
/*
 * Step completion hook. No-op for this plugin; the channel is released per
 * job in switch_p_job_complete(), not per step.
 *
 * IN stepinfo - unused
 * IN nodelist - unused
 * RET always SLURM_SUCCESS
 */
extern int switch_p_job_step_complete(switch_info_t *stepinfo, char *nodelist)
{
return SLURM_SUCCESS;
}
/*
 * Assign the lowest free channel to a starting job.
 *
 * On the first call the whole job_list is scanned with _mark_used() so that
 * channels already held by existing jobs are flagged in imex_channels before
 * anything new is handed out. If no channel greater than 0 is free, an error
 * is logged and the job is left without a switch_jobinfo.
 *
 * IN/OUT job_ptr - job being started; switch_jobinfo is set on success
 */
extern void switch_p_job_start(job_record_t *job_ptr)
{
	static bool first_alloc = true;
	int channel;

	/*
	 * FIXME: this is hacked in here as switch_p_restore() is called
	 * before the job_list has been repopulated. Instead, before we
	 * allocate any new channels, scan the job_list to work out which
	 * are already in use.
	 */
	if (first_alloc) {
		list_for_each(job_list, _mark_used, NULL);
		first_alloc = false;
	}

	channel = bit_ffc(imex_channels);

	/* Bit 0 is always set, so any usable result is strictly positive */
	if (channel <= 0) {
		error("%s: %s: no channel available",
		      plugin_type, __func__);
		return;
	}

	debug("allocating channel %d to %pJ", channel, job_ptr);
	bit_set(imex_channels, channel);
	job_ptr->switch_jobinfo = _create_info(channel);
}
/*
 * Return a finished job's channel to the pool.
 *
 * When the job's channel is inside the tracked range its bit is cleared in
 * imex_channels and the job's switch_jobinfo is freed. A channel outside the
 * range is only reported; the record is left attached to the job.
 *
 * IN/OUT job_ptr - job that completed
 */
extern void switch_p_job_complete(job_record_t *job_ptr)
{
	switch_info_t *info = job_ptr->switch_jobinfo;

	if (!info)
		return;

	if (info->channel >= channel_count) {
		error("%s: %s: channel %u outside of tracked range, ignoring release",
		      plugin_type, __func__, info->channel);
		return;
	}

	debug("marking channel %u released by %pJ",
	      info->channel, job_ptr);
	bit_clear(imex_channels, info->channel);
	xfree(job_ptr->switch_jobinfo);
}
/*
 * Set up the step's IMEX channel via setup_imex_channel(channel, false).
 *
 * IN step - step being launched
 * RET SLURM_SUCCESS when the step has no channel, otherwise the result of
 *     setup_imex_channel()
 */
extern int switch_p_fs_init(stepd_step_rec_t *step)
{
	switch_info_t *info = NULL;

	if (!step->switch_step || !step->switch_step->data)
		return SLURM_SUCCESS;

	info = step->switch_step->data;
	if (info->channel == NO_VAL)
		return SLURM_SUCCESS;

	return setup_imex_channel(info->channel, false);
}
/*
 * Build a channel record for a job's extern step from the job's own record.
 *
 * OUT stepinfo - receives a new record from _create_info() when the job has
 *                a channel; not written otherwise
 * IN job_ptr - job owning the extern step
 */
extern void switch_p_extern_stepinfo(switch_info_t **stepinfo,
				     job_record_t *job_ptr)
{
	switch_info_t *jobinfo = job_ptr->switch_jobinfo;

	if (!jobinfo)
		return;

	*stepinfo = _create_info(jobinfo->channel);
	log_flag(SWITCH, "using channel %u for %pJ",
		 jobinfo->channel, job_ptr);
}
/*
 * Extern step teardown hook. Intentionally empty for this plugin.
 *
 * IN job_id - unused
 */
extern void switch_p_extern_step_fini(int job_id)
{
/* not supported */
}