blob: 63ac7a910c4fc2f294fbef7fb96376a1b0f61ada [file] [log] [blame] [edit]
/*****************************************************************************\
* slurm_mpi.c - Generic mpi selector for slurm
*****************************************************************************
* Copyright (C) 2002 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Mark Grondona <grondo1@llnl.gov>.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <stdlib.h>
#include <unistd.h>
#include "src/common/env.h"
#include "src/common/macros.h"
#include "src/common/plugin.h"
#include "src/common/plugrack.h"
#include "src/common/slurm_mpi.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#define _DEBUG 0
typedef struct slurm_mpi_ops {
int (*slurmstepd_prefork)(const stepd_step_rec_t *job,
char ***env);
int (*slurmstepd_init) (const mpi_plugin_task_info_t *job,
char ***env);
mpi_plugin_client_state_t *
(*client_prelaunch) (const mpi_plugin_client_info_t *job,
char ***env);
int (*client_fini) (mpi_plugin_client_state_t *);
} slurm_mpi_ops_t;
/*
* These strings must be kept in the same order as the fields
* declared for slurm_mpi_ops_t.
*/
static const char *syms[] = {
"p_mpi_hook_slurmstepd_prefork",
"p_mpi_hook_slurmstepd_task",
"p_mpi_hook_client_prelaunch",
"p_mpi_hook_client_fini"
};
static slurm_mpi_ops_t ops;
static plugin_context_t *g_context = NULL;
static pthread_mutex_t context_lock = PTHREAD_MUTEX_INITIALIZER;
static bool init_run = false;
#if _DEBUG
/* Debugging information is invaluable to debug heterogeneous step support */
static inline void _log_env(char **env)
{
#if _DEBUG > 1
int i;
if (!env)
return;
for (i = 0; env[i]; i++)
info("%s", env[i]);
#endif
}
static void _log_step_rec(const stepd_step_rec_t *job)
{
int i;
info("STEPD_STEP_REC");
info("job_id:%u step_id:%u", job->jobid, job->stepid);
info("ntasks:%u nnodes:%u node_id:%u", job->ntasks, job->nnodes,
job->nodeid);
info("node_tasks:%u", job->node_tasks);
for (i = 0; i < job->node_tasks; i ++)
info("gtid[%d]:%u", i, job->task[i]->gtid);
for (i = 0; i < job->nnodes; i++)
info("task_cnts[%d]:%u", i, job->task_cnts[i]);
if ((job->het_job_id != 0) && (job->het_job_id != NO_VAL)) {
info("het_job_id:%u step_id:%u", job->het_jobid, job->stepid);
info("het_job_ntasks:%u het_job_nnodes:%u", job->het_job_ntasks,
job->het_job_nnodes);
info("het_job_node_offset:%u het_job_task_offset:%u",
job->het_job_offset, job->het_job_task_offset);
for (i = 0; i < job->het_job_nnodes; i++)
info("het_job_task_cnts[%d]:%u", i,
job->het_job_task_cnts[i]);
info("het_job_node_list:%s", job->het_job_node_list);
}
}
static void _log_mpi_rec(const mpi_plugin_client_info_t *job)
{
slurm_step_layout_t *layout = job->step_layout;
int i, j;
info("MPI_PLUGIN_CLIENT_INFO");
info("job_id:%u step_id:%u", job->jobid, job->stepid);
if ((job->het_job_id != 0) && (job->het_job_id != NO_VAL)) {
info("het_job_id:%u step_id:%u", job->het_job_id, job->stepid);
}
if (layout) {
info("node_cnt:%u task_cnt:%u", layout->node_cnt,
layout->task_cnt);
info("node_list:%s", layout->node_list);
info("plane_size:%u task_dist:%u", layout->plane_size,
layout->task_dist);
for (i = 0; i < layout->node_cnt; i++) {
info("tasks[%d]:%u", i, layout->tasks[i]);
for (j = 0; j < layout->tasks[i]; j++) {
info("tids[%d][%d]:%u", i, j,
layout->tids[i][j]);
}
}
}
}
static void _log_task_rec(const mpi_plugin_task_info_t *job)
{
info("MPI_PLUGIN_TASK_INFO");
info("job_id:%u step_id:%u", job->jobid, job->stepid);
info("nnodes:%u node_id:%u", job->nnodes, job->nodeid);
info("ntasks:%u local_tasks:%u", job->ntasks, job->ltasks);
info("global_task_id:%u local_task_id:%u", job->gtaskid, job->ltaskid);
}
#endif
int _mpi_init (char *mpi_type)
{
int retval = SLURM_SUCCESS;
char *plugin_type = "mpi";
char *type = NULL;
int got_default = 0;
if (init_run && g_context)
return retval;
slurm_mutex_lock( &context_lock );
if ( g_context )
goto done;
if (mpi_type == NULL) {
mpi_type = slurm_get_mpi_default();
got_default = 1;
} else if (!xstrcmp(mpi_type, "openmpi")) {
/*
* The openmpi plugin has been equivalent to none for a while.
* Translate so we can discard that duplicated no-op plugin.
*/
mpi_type = "none";
}
if (mpi_type == NULL) {
error("No MPI default set.");
retval = SLURM_ERROR;
goto done;
}
if (!xstrcmp(mpi_type, "list")) {
char *plugin_dir;
plugrack_t *mpi_rack = plugrack_create("mpi");
plugin_dir = slurm_get_plugin_dir();
plugrack_read_dir(mpi_rack, plugin_dir);
plugrack_print_all_plugin(mpi_rack);
exit(0);
}
setenvf(NULL, "SLURM_MPI_TYPE", "%s", mpi_type);
type = xstrdup_printf("mpi/%s", mpi_type);
g_context = plugin_context_create(
plugin_type, type, (void **)&ops, syms, sizeof(syms));
if (!g_context) {
error("cannot create %s context for %s", plugin_type, type);
retval = SLURM_ERROR;
goto done;
}
init_run = true;
done:
xfree(type);
if (got_default)
xfree(mpi_type);
slurm_mutex_unlock( &context_lock );
return retval;
}
int mpi_hook_slurmstepd_init (char ***env)
{
char *mpi_type = getenvp (*env, "SLURM_MPI_TYPE");
#if _DEBUG
info("IN %s mpi_type:%s", __func__, mpi_type);
_log_env(*env);
#else
debug("mpi type = %s", mpi_type);
#endif
if (_mpi_init(mpi_type) == SLURM_ERROR)
return SLURM_ERROR;
/*
* Unset env var so that "none" doesn't exist in salloc'ed env, but
* still keep it in srun if not none.
*/
if (!xstrcmp(mpi_type, "none"))
unsetenvp (*env, "SLURM_MPI_TYPE");
return SLURM_SUCCESS;
}
int mpi_hook_slurmstepd_prefork (const stepd_step_rec_t *job, char ***env)
{
#if _DEBUG
info("IN %s", __func__);
_log_env(*env);
_log_step_rec(job);
#endif
if (mpi_hook_slurmstepd_init(env) == SLURM_ERROR)
return SLURM_ERROR;
return (*(ops.slurmstepd_prefork))(job, env);
}
int mpi_hook_slurmstepd_task (const mpi_plugin_task_info_t *job, char ***env)
{
#if _DEBUG
info("IN %s", __func__);
_log_task_rec(job);
_log_env(*env);
#endif
if (mpi_hook_slurmstepd_init(env) == SLURM_ERROR)
return SLURM_ERROR;
return (*(ops.slurmstepd_init))(job, env);
}
int mpi_hook_client_init (char *mpi_type)
{
#if _DEBUG
info("IN %s mpi_type:%s", __func__, mpi_type);
#else
debug("mpi type = %s", mpi_type);
#endif
if (_mpi_init(mpi_type) == SLURM_ERROR)
return SLURM_ERROR;
return SLURM_SUCCESS;
}
mpi_plugin_client_state_t *
mpi_hook_client_prelaunch(const mpi_plugin_client_info_t *job, char ***env)
{
mpi_plugin_client_state_t *rc;
#if _DEBUG
info("IN %s", __func__);
_log_env(*env);
_log_mpi_rec(job);
#endif
if (_mpi_init(NULL) < 0)
return NULL;
rc = (*(ops.client_prelaunch))(job, env);
#if _DEBUG
_log_env(*env);
#endif
return rc;
}
int mpi_hook_client_fini (mpi_plugin_client_state_t *state)
{
#if _DEBUG
info("IN %s", __func__);
#endif
if (_mpi_init(NULL) < 0)
return SLURM_ERROR;
return (*(ops.client_fini))(state);
}
int mpi_fini (void)
{
int rc;
if (!g_context)
return SLURM_SUCCESS;
init_run = false;
rc = plugin_context_destroy(g_context);
return rc;
}