blob: 4b7e823e0cc80058c69becd35d147d5caf49b0ab [file] [log] [blame]
/*****************************************************************************\
** mpi_pmix.c - Main plugin callbacks for PMIx support in Slurm
*****************************************************************************
* Copyright (C) 2014-2015 Artem Polyakov. All rights reserved.
* Copyright (C) 2015-2020 Mellanox Technologies. All rights reserved.
* Written by Artem Y. Polyakov <artpol84@gmail.com, artemp@mellanox.com>.
* Copyright (C) 2020 Siberian State University of Telecommunications
* and Information Sciences (SibSUTIS).
* All rights reserved.
* Written by Boris Bochkarev <boris-bochkaryov@yandex.ru>.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <fcntl.h>
#include <signal.h>
#include <sys/types.h>
#include <dlfcn.h>
#include "pmixp_common.h"
#include "pmixp_server.h"
#include "pmixp_debug.h"
#include "pmixp_agent.h"
#include "pmixp_info.h"
#include "pmixp_client.h"
/* Required Slurm and mpi plugin symbols: */
const char plugin_name[] = "PMIx plugin";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
/*
 * Plugin type/id are selected from the PMIx library major version that
 * was detected at configure time (HAVE_PMIX_VER), so one source file
 * builds each of the version-specific mpi/pmix_vN plugins.
 */
#if (HAVE_PMIX_VER == 2)
const char plugin_type[] = "mpi/pmix_v2";
const uint32_t plugin_id = MPI_PLUGIN_PMIX2;
#elif (HAVE_PMIX_VER == 3)
const char plugin_type[] = "mpi/pmix_v3";
const uint32_t plugin_id = MPI_PLUGIN_PMIX3;
#elif (HAVE_PMIX_VER == 4)
const char plugin_type[] = "mpi/pmix_v4";
const uint32_t plugin_id = MPI_PLUGIN_PMIX4;
#elif (HAVE_PMIX_VER == 5)
const char plugin_type[] = "mpi/pmix_v5";
const uint32_t plugin_id = MPI_PLUGIN_PMIX5;
#elif (HAVE_PMIX_VER == 6)
const char plugin_type[] = "mpi/pmix_v6";
const uint32_t plugin_id = MPI_PLUGIN_PMIX6;
#endif
/*
 * Protect and signal availability of process_mapping: in a het job only
 * the component with task offset 0 builds the mapping, the others block
 * on setup_cond in mpi_p_client_prelaunch() until setup_done is set.
 */
static pthread_mutex_t setup_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t setup_cond = PTHREAD_COND_INITIALIZER;
static bool setup_done = false;
/* dlopen() handle for libpmix.so.2, set in init(), closed in fini() */
void *libpmix_plug = NULL;
/* Packed task placement string exported via PMIXP_SLURM_MAPPING_ENV */
char *process_mapping = NULL;
/* Keys this plugin recognizes in mpi.conf */
s_p_options_t pmix_options[] = {
	{"PMIxCliTmpDirBase", S_P_STRING},
	{"PMIxCollFence", S_P_STRING},
	{"PMIxDebug", S_P_UINT32},
	{"PMIxDirectConn", S_P_BOOLEAN},
	{"PMIxDirectConnEarly", S_P_BOOLEAN},
	{"PMIxDirectConnUCX", S_P_BOOLEAN},
	{"PMIxDirectSameArch", S_P_BOOLEAN},
	{"PMIxEnv", S_P_STRING},
	{"PMIxFenceBarrier", S_P_BOOLEAN},
	{"PMIxNetDevicesUCX", S_P_STRING},
	{"PMIxShareServerTopology", S_P_BOOLEAN},
	{"PMIxTimeout", S_P_UINT32},
	{"PMIxTlsUCX", S_P_STRING},
	{NULL}
};
/* Parsed mpi.conf settings for this plugin */
slurm_pmix_conf_t slurm_pmix_conf;
/*
 * Release the dlopen() handle obtained from _libpmix_open().
 * The handle must be non-NULL.
 */
static void _libpmix_close(void *lib_plug)
{
	xassert(lib_plug);
	(void) dlclose(lib_plug);
}
/*
 * dlopen() the PMIx client library and verify that its major version
 * matches the one this plugin was built against (HAVE_PMIX_VER).
 *
 * Returns the library handle on success, NULL on failure (load error
 * or version mismatch).
 */
static void *_libpmix_open(void)
{
	void *lib_plug = NULL;
	char *full_path = NULL;

	/*
	 * x_ac_pmix.m4 determines if we should use full path or relative path
	 * Note that this implies no fall-through to relative libraries
	 */
#ifdef PMIXP_LIBPATH
	xstrfmtcat(full_path, "%s/", PMIXP_LIBPATH);
#endif
	xstrfmtcat(full_path, "libpmix.so.2");

	lib_plug = dlopen(full_path, RTLD_LAZY | RTLD_GLOBAL);
	if (!lib_plug) {
		/* Surface the loader's reason - a bare failure is hard to
		 * debug without it */
		PMIXP_ERROR("pmi/pmix: dlopen(%s) failed: %s",
			    full_path, dlerror());
	}
	xfree(full_path);

	if (lib_plug && (HAVE_PMIX_VER != pmixp_lib_get_version())) {
		/* Fixed garbled message ("... loaded %d was loaded ...") */
		PMIXP_ERROR("pmi/pmix: incorrect PMIx library version loaded: %d was loaded, but version %d is required",
			    pmixp_lib_get_version(), (int)HAVE_PMIX_VER);
		_libpmix_close(lib_plug);
		lib_plug = NULL;
	}

	return lib_plug;
}
/*
 * Set every mpi.conf knob of this plugin to its built-in default.
 * Does not free anything - see _reset_pmix_conf() for that.
 */
static void _init_pmix_conf(void)
{
	/* string options default to unset */
	slurm_pmix_conf.cli_tmpdir_base = NULL;
	slurm_pmix_conf.coll_fence = NULL;
	slurm_pmix_conf.env = NULL;
	slurm_pmix_conf.ucx_netdevices = NULL;
	slurm_pmix_conf.ucx_tls = NULL;

	/* boolean options: only direct connections default to on */
	slurm_pmix_conf.direct_conn = true;
	slurm_pmix_conf.direct_conn_early = false;
	slurm_pmix_conf.direct_conn_ucx = false;
	slurm_pmix_conf.direct_samearch = false;
	slurm_pmix_conf.fence_barrier = false;
	slurm_pmix_conf.share_topology = false;

	/* numeric options */
	slurm_pmix_conf.debug = 0;
	slurm_pmix_conf.timeout = PMIXP_TIMEOUT_DEFAULT;
}
/*
 * Drop any values picked up from mpi.conf: free the parser-allocated
 * strings, then restore all built-in defaults.
 */
static void _reset_pmix_conf(void)
{
	xfree(slurm_pmix_conf.cli_tmpdir_base);
	xfree(slurm_pmix_conf.coll_fence);
	xfree(slurm_pmix_conf.env);
	xfree(slurm_pmix_conf.ucx_netdevices);
	xfree(slurm_pmix_conf.ucx_tls);

	/* xfree() NULLed the pointers; this resets the scalar fields too */
	_init_pmix_conf();
}
/*
 * Plugin entry point: load the PMIx library and initialize the
 * configuration defaults.
 */
extern int init(void)
{
	if (!(libpmix_plug = _libpmix_open())) {
		PMIXP_ERROR("pmi/pmix: can not load PMIx library");
		return SLURM_ERROR;
	}

	_init_pmix_conf();
	debug("%s loaded", plugin_name);
	return SLURM_SUCCESS;
}
extern void fini(void)
{
PMIXP_DEBUG("%s: call fini()", pmixp_info_hostname());
pmixp_agent_stop();
pmixp_stepd_finalize();
_libpmix_close(libpmix_plug);
_reset_pmix_conf();
}
/*
 * slurmstepd callback before forking tasks: bring up the PMIx server
 * side for this step and start the agent thread.  Batch steps are
 * skipped.  On failure the whole job step is killed.
 */
extern int mpi_p_slurmstepd_prefork(const stepd_step_rec_t *step, char ***env)
{
	int rc;

	pmixp_debug_hang(0);
	PMIXP_DEBUG("start");

	if (step->batch)
		return SLURM_SUCCESS;

	rc = pmixp_stepd_init(step, env);
	if (rc != SLURM_SUCCESS) {
		PMIXP_ERROR("pmixp_stepd_init() failed");
	} else {
		rc = pmixp_agent_start();
		if (rc != SLURM_SUCCESS)
			PMIXP_ERROR("pmixp_agent_start() failed");
	}

	if (rc != SLURM_SUCCESS) {
		/* Abort the whole job if error! */
		slurm_kill_job_step((slurm_step_id_t *) &step->step_id,
				    SIGKILL, 0);
	}
	return rc;
}
/*
 * Per-task callback in slurmstepd: let the PMIx library generate the
 * "KEY=VALUE" environment entries for this task and merge them into
 * the task environment.  The array returned by pmixp_lib_setup_fork()
 * is malloc()'d by PMIx, so plain free() is used here.
 */
extern int mpi_p_slurmstepd_task(const mpi_task_info_t *mpi_task, char ***env)
{
	char **fork_env = NULL;

	pmixp_debug_hang(0);
	PMIXP_DEBUG("Patch environment for task %d", mpi_task->gtaskid);

	pmixp_lib_setup_fork(mpi_task->gtaskid, pmixp_info_namespace(),
			     &fork_env);
	if (fork_env) {
		for (char **ep = fork_env; *ep; ep++) {
			char *eq = strchr(*ep, '=');
			if (eq) {
				/* split "KEY=VALUE" in place */
				*eq = '\0';
				env_array_overwrite(env, (const char *)*ep,
						    eq + 1);
			}
			free(*ep);
		}
		free(fork_env);
	}
	return SLURM_SUCCESS;
}
/*
 * srun-side callback before launching the step: start the abort agent
 * and publish the process mapping to the environment.
 *
 * For heterogeneous jobs this may be entered by several component
 * threads; only the one with task offset 0 builds the mapping, the
 * rest wait on setup_cond until it is published.
 *
 * Returns a non-NULL opaque token on success, NULL on error.
 */
extern mpi_plugin_client_state_t *
mpi_p_client_prelaunch(const mpi_step_info_t *mpi_step, char ***env)
{
	uint32_t nnodes, ntasks, **tids;
	uint16_t *task_cnt;
	int ret;

	if ((ret = pmixp_abort_agent_start(env)) != SLURM_SUCCESS) {
		PMIXP_ERROR("pmixp_abort_agent_start() failed %d", ret);
		return NULL;
	}

	PMIXP_DEBUG("setup process mapping in srun");
	/*
	 * Producer side: a regular step, or the first het-job component.
	 * Build the packed mapping, then flag completion and wake any
	 * waiting components.
	 */
	if ((mpi_step->het_job_id == NO_VAL) ||
	    (mpi_step->het_job_task_offset == 0)) {
		nnodes = mpi_step->step_layout->node_cnt;
		ntasks = mpi_step->step_layout->task_cnt;
		task_cnt = mpi_step->step_layout->tasks;
		tids = mpi_step->step_layout->tids;
		process_mapping = pack_process_mapping(nnodes, ntasks,
						       task_cnt, tids);
		slurm_mutex_lock(&setup_mutex);
		setup_done = true;
		slurm_cond_broadcast(&setup_cond);
		slurm_mutex_unlock(&setup_mutex);
	} else {
		/*
		 * Consumer side: later het-job components block until the
		 * producer has published process_mapping.  The while loop
		 * guards against spurious condvar wakeups.
		 */
		slurm_mutex_lock(&setup_mutex);
		while (!setup_done)
			slurm_cond_wait(&setup_cond, &setup_mutex);
		slurm_mutex_unlock(&setup_mutex);
	}

	/* NOTE(review): pack_process_mapping() may have returned NULL;
	 * the abort agent is left running here and is presumably stopped
	 * via mpi_p_client_fini() - confirm against caller. */
	if (!process_mapping) {
		PMIXP_ERROR("Cannot create process mapping");
		return NULL;
	}
	setenvf(env, PMIXP_SLURM_MAPPING_ENV, "%s", process_mapping);

	/* only return NULL on error */
	return (void *)0xdeadbeef;
}
/*
 * srun-side teardown: release the process mapping built in
 * mpi_p_client_prelaunch() (only if setup completed) and stop the
 * abort agent, returning its status.
 */
extern int mpi_p_client_fini(void)
{
	slurm_mutex_lock(&setup_mutex);
	if (setup_done)
		xfree(process_mapping);
	slurm_mutex_unlock(&setup_mutex);

	return pmixp_abort_agent_stop();
}
/*
 * Register this plugin's mpi.conf keys with the common config parser.
 */
extern void mpi_p_conf_options(s_p_options_t **full_options, int *full_opt_cnt)
{
	transfer_s_p_options(full_options, pmix_options, full_opt_cnt);
}
/*
 * Apply a parsed mpi.conf table to slurm_pmix_conf.  Options missing
 * from the table keep the defaults restored by _reset_pmix_conf().
 */
extern void mpi_p_conf_set(s_p_hashtbl_t *tbl)
{
	_reset_pmix_conf();

	if (!tbl)
		return;

	s_p_get_string(&slurm_pmix_conf.cli_tmpdir_base,
		       "PMIxCliTmpDirBase", tbl);
	s_p_get_string(&slurm_pmix_conf.coll_fence, "PMIxCollFence", tbl);
	s_p_get_uint32(&slurm_pmix_conf.debug, "PMIxDebug", tbl);
	s_p_get_boolean(&slurm_pmix_conf.direct_conn, "PMIxDirectConn", tbl);
	s_p_get_boolean(&slurm_pmix_conf.direct_conn_early,
			"PMIxDirectConnEarly", tbl);
	s_p_get_boolean(&slurm_pmix_conf.direct_conn_ucx,
			"PMIxDirectConnUCX", tbl);
	s_p_get_boolean(&slurm_pmix_conf.direct_samearch,
			"PMIxDirectSameArch", tbl);
	s_p_get_string(&slurm_pmix_conf.env, "PMIxEnv", tbl);
	s_p_get_boolean(&slurm_pmix_conf.fence_barrier,
			"PMIxFenceBarrier", tbl);
	s_p_get_string(&slurm_pmix_conf.ucx_netdevices,
		       "PMIxNetDevicesUCX", tbl);
	s_p_get_boolean(&slurm_pmix_conf.share_topology,
			"PMIxShareServerTopology", tbl);
	s_p_get_uint32(&slurm_pmix_conf.timeout, "PMIxTimeout", tbl);
	s_p_get_string(&slurm_pmix_conf.ucx_tls, "PMIxTlsUCX", tbl);
}
/*
 * Serialize the current slurm_pmix_conf back into a config hash table.
 * String options are emitted only when set; booleans and integers are
 * always emitted.
 */
extern s_p_hashtbl_t *mpi_p_conf_get(void)
{
	s_p_hashtbl_t *tbl = s_p_hashtbl_create(pmix_options);
	char *tmp;

	/* local shorthand for boolean option rendering */
#define YESNO(flag) ((flag) ? "yes" : "no")

	if (slurm_pmix_conf.cli_tmpdir_base)
		s_p_parse_pair(tbl, "PMIxCliTmpDirBase",
			       slurm_pmix_conf.cli_tmpdir_base);
	if (slurm_pmix_conf.coll_fence)
		s_p_parse_pair(tbl, "PMIxCollFence",
			       slurm_pmix_conf.coll_fence);

	tmp = xstrdup_printf("%u", slurm_pmix_conf.debug);
	s_p_parse_pair(tbl, "PMIxDebug", tmp);
	xfree(tmp);

	s_p_parse_pair(tbl, "PMIxDirectConn",
		       YESNO(slurm_pmix_conf.direct_conn));
	s_p_parse_pair(tbl, "PMIxDirectConnEarly",
		       YESNO(slurm_pmix_conf.direct_conn_early));
	s_p_parse_pair(tbl, "PMIxDirectConnUCX",
		       YESNO(slurm_pmix_conf.direct_conn_ucx));
	s_p_parse_pair(tbl, "PMIxDirectSameArch",
		       YESNO(slurm_pmix_conf.direct_samearch));

	if (slurm_pmix_conf.env)
		s_p_parse_pair(tbl, "PMIxEnv", slurm_pmix_conf.env);

	s_p_parse_pair(tbl, "PMIxFenceBarrier",
		       YESNO(slurm_pmix_conf.fence_barrier));

	if (slurm_pmix_conf.ucx_netdevices)
		s_p_parse_pair(tbl, "PMIxNetDevicesUCX",
			       slurm_pmix_conf.ucx_netdevices);

	s_p_parse_pair(tbl, "PMIxShareServerTopology",
		       YESNO(slurm_pmix_conf.share_topology));

	tmp = xstrdup_printf("%u", slurm_pmix_conf.timeout);
	s_p_parse_pair(tbl, "PMIxTimeout", tmp);
	xfree(tmp);

	if (slurm_pmix_conf.ucx_tls)
		s_p_parse_pair(tbl, "PMIxTlsUCX", slurm_pmix_conf.ucx_tls);

#undef YESNO

	return tbl;
}
/*
 * Build the key/value list used to display this plugin's configuration
 * (e.g. "scontrol show config").  All options are listed, set or not.
 */
extern list_t *mpi_p_conf_get_printable(void)
{
	list_t *conf_list = list_create(destroy_config_key_pair);

	add_key_pair(conf_list, "PMIxCliTmpDirBase", "%s",
		     slurm_pmix_conf.cli_tmpdir_base);
	add_key_pair(conf_list, "PMIxCollFence", "%s",
		     slurm_pmix_conf.coll_fence);
	add_key_pair(conf_list, "PMIxDebug", "%u", slurm_pmix_conf.debug);
	add_key_pair_bool(conf_list, "PMIxDirectConn",
			  slurm_pmix_conf.direct_conn);
	add_key_pair_bool(conf_list, "PMIxDirectConnEarly",
			  slurm_pmix_conf.direct_conn_early);
	add_key_pair_bool(conf_list, "PMIxDirectConnUCX",
			  slurm_pmix_conf.direct_conn_ucx);
	add_key_pair_bool(conf_list, "PMIxDirectSameArch",
			  slurm_pmix_conf.direct_samearch);
	add_key_pair(conf_list, "PMIxEnv", "%s", slurm_pmix_conf.env);
	add_key_pair_bool(conf_list, "PMIxFenceBarrier",
			  slurm_pmix_conf.fence_barrier);
	add_key_pair(conf_list, "PMIxNetDevicesUCX", "%s",
		     slurm_pmix_conf.ucx_netdevices);
	add_key_pair_bool(conf_list, "PMIxShareServerTopology",
			  slurm_pmix_conf.share_topology);
	add_key_pair(conf_list, "PMIxTimeout", "%u", slurm_pmix_conf.timeout);
	add_key_pair(conf_list, "PMIxTlsUCX", "%s", slurm_pmix_conf.ucx_tls);

	return conf_list;
}