/*****************************************************************************\
** pmix_client_v2.c - PMIx v2 client communication code
*****************************************************************************
* Copyright (C) 2014-2015 Artem Polyakov. All rights reserved.
* Copyright (C) 2015-2020 Mellanox Technologies. All rights reserved.
* Written by Artem Polyakov <artpol84@gmail.com, artemp@mellanox.com>,
* Boris Karasev <karasev.b@gmail.com, boriska@mellanox.com>.
* Copyright (C) 2020 Siberian State University of Telecommunications
* and Information Sciences (SibSUTIS).
* All rights reserved.
* Written by Boris Bochkarev <boris-bochkaryov@yandex.ru>.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "pmixp_common.h"
#include "pmixp_state.h"
#include "pmixp_io.h"
#include "pmixp_nspaces.h"
#include "pmixp_debug.h"
#include "pmixp_coll.h"
#include "pmixp_server.h"
#include "pmixp_dmdx.h"
#include "pmixp_client.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <pmix_server.h>
/*
 * _client_connected - server-module hook for a client connection.
 *
 * No work is performed here; the function only reports success using the
 * convention that matches the PMIx version Slurm was built against. The
 * proc and server_object arguments are not used.
 *
 * Before PMIx v6, a return of PMIX_SUCCESS was always expected.
 * (Note: PMIx v3.1.0rc2+ would also treat PMIX_OPERATION_SUCCEEDED the
 * same as PMIX_SUCCESS. Also cbfunc was always NULL before PMIx v6)
 *
 * Starting in PMIx v6, PMIX_SUCCESS and PMIX_OPERATION_SUCCEEDED are
 * treated differently. In v6+ only return PMIX_SUCCESS to indicate
 * cbfunc was called.
 *
 * Also, guarding PMIX_OPERATION_SUCCEEDED in PMIx v6+ allows Slurm to
 * compile against PMIx v2 - v3.0.0 since it didn't exist yet.
 */
static int _client_connected(const pmix_proc_t *proc, void *server_object,
			     pmix_op_cbfunc_t cbfunc, void *cbdata)
{
#if (HAVE_PMIX_VER >= 6)
	if (cbfunc) {
		/* Call the callback function. The status will be set in cbdata */
		cbfunc(PMIX_SUCCESS, cbdata);
		return PMIX_SUCCESS;
	}

	/* No callback supplied: report that the operation completed inline */
	return PMIX_OPERATION_SUCCEEDED;
#else
	return PMIX_SUCCESS;
#endif
}
/*
 * _errhandler_reg_callbk - completion callback for the event-handler
 * registration issued in pmixp_lib_init().
 *
 * status         - result of the registration request
 * errhandler_ref - reference assigned to the registered handler
 * cbdata         - caller data (unused)
 *
 * The callback only logs the outcome. The reference is a size_t, so it is
 * printed with %zu rather than being narrowed to int.
 */
static void _errhandler_reg_callbk(pmix_status_t status,
				   size_t errhandler_ref, void *cbdata)
{
	PMIXP_DEBUG("Error handler registration callback is called with status=%d, ref=%zu",
		    status, errhandler_ref);
}
/*
 * _client_finalized - server-module hook for a client finalizing.
 *
 * Nothing is done on the Slurm side. If a completion callback was supplied
 * it is invoked with PMIX_SUCCESS; the function itself always returns
 * PMIX_SUCCESS. The proc and server_object arguments are not used.
 */
static pmix_status_t _client_finalized(const pmix_proc_t *proc,
				       void *server_object,
				       pmix_op_cbfunc_t cbfunc,
				       void *cbdata)
{
	/* don't do anything by now */
	if (cbfunc)
		cbfunc(PMIX_SUCCESS, cbdata);

	return PMIX_SUCCESS;
}
/*
 * _abort_fn - server-module hook for a client abort request.
 *
 * status         - exit status supplied by the aborting client
 * msg            - optional message from the client; may be NULL
 * cbfunc, cbdata - completion callback and its data, handed to
 *                  pmixp_lib_abort()
 *
 * pmix_proc, server_object, pmix_procs and nprocs are not used here.
 *
 * Returns PMIX_SUCCESS if pmixp_lib_abort() reports SLURM_SUCCESS,
 * PMIX_ERROR otherwise.
 */
static pmix_status_t _abort_fn(const pmix_proc_t *pmix_proc, void *server_object,
			       int status, const char msg[],
			       pmix_proc_t pmix_procs[], size_t nprocs,
			       pmix_op_cbfunc_t cbfunc, void *cbdata)
{
	/* Passing NULL to "%s" is undefined behavior, so substitute a marker */
	PMIXP_DEBUG("called: status = %d, msg = %s", status,
		    msg ? msg : "(null)");

	if (pmixp_lib_abort(status, cbfunc, cbdata) != SLURM_SUCCESS)
		return PMIX_ERROR;

	return PMIX_SUCCESS;
}
/*
 * _fencenb_fn - server-module hook for a non-blocking fence.
 *
 * procs_v2, nprocs - participants of the fence
 * info, ninfo      - directives; only PMIX_COLLECT_DATA is inspected
 * data, ndata      - local contribution, forwarded to pmixp_lib_fence()
 * cbfunc, cbdata   - completion callback and its data, forwarded as well
 *
 * The participant list is copied into a temporary array that is released
 * as soon as pmixp_lib_fence() returns.
 *
 * Returns PMIX_SUCCESS if pmixp_lib_fence() reports SLURM_SUCCESS and
 * PMIX_ERROR otherwise, matching the mapping used by _abort_fn() and
 * _dmodex_fn().
 * NOTE(review): assumes pmixp_lib_fence() returns Slurm status codes like
 * the other pmixp_lib_* helpers used in this file - confirm.
 */
static pmix_status_t _fencenb_fn(const pmix_proc_t procs_v2[], size_t nprocs,
				 const pmix_info_t info[], size_t ninfo,
				 char *data, size_t ndata,
				 pmix_modex_cbfunc_t cbfunc, void *cbdata)
{
	int ret;
	size_t i;
	pmix_proc_t *procs;
	bool collect = false;

	PMIXP_DEBUG("called");

	procs = xmalloc(sizeof(*procs) * nprocs);
	for (i = 0; i < nprocs; i++) {
		procs[i].rank = procs_v2[i].rank;
		strlcpy(procs[i].nspace, procs_v2[i].nspace,
			sizeof(procs[i].nspace));
	}

	/* check the info keys */
	if (info) {
		for (i = 0; i < ninfo; i++) {
			if (strncmp(info[i].key, PMIX_COLLECT_DATA,
				    PMIX_MAX_KEYLEN))
				continue;
			/*
			 * Honor the value carried by the attribute instead of
			 * its mere presence: a boolean attribute without a
			 * value (PMIX_UNDEF) counts as true, otherwise the
			 * boolean flag decides. An explicit "false" must not
			 * turn data collection on.
			 */
			collect = (PMIX_UNDEF == info[i].value.type) ||
				  ((PMIX_BOOL == info[i].value.type) &&
				   info[i].value.data.flag);
			break;
		}
	}

	ret = pmixp_lib_fence(procs, nprocs, collect, data, ndata,
			      cbfunc, cbdata);
	xfree(procs);

	return (SLURM_SUCCESS == ret) ? PMIX_SUCCESS : PMIX_ERROR;
}
/*
 * _dmodex_fn - server-module hook for a direct modex request.
 *
 * Hands the namespace and rank of the requested process, together with the
 * completion callback and its data, to pmixp_dmdx_get(). The info array is
 * not inspected.
 *
 * Returns PMIX_SUCCESS if pmixp_dmdx_get() reports SLURM_SUCCESS,
 * PMIX_ERROR otherwise.
 */
static pmix_status_t _dmodex_fn(const pmix_proc_t *proc,
				const pmix_info_t info[], size_t ninfo,
				pmix_modex_cbfunc_t cbfunc, void *cbdata)
{
	PMIXP_DEBUG("called");

	if (pmixp_dmdx_get(proc->nspace, proc->rank, cbfunc, cbdata) !=
	    SLURM_SUCCESS)
		return PMIX_ERROR;

	return PMIX_SUCCESS;
}
/*
 * _job_control - server-module hook for job control requests.
 *
 * Not implemented: logs the call and returns PMIX_ERR_NOT_SUPPORTED.
 * None of the arguments are used and cbfunc is never invoked.
 */
static pmix_status_t _job_control(const pmix_proc_t *proct,
				  const pmix_proc_t targets[],
				  size_t ntargets,
				  const pmix_info_t directives[],
				  size_t ndirs,
				  pmix_info_cbfunc_t cbfunc,
				  void *cbdata)
{
	PMIXP_DEBUG("called");

	return PMIX_ERR_NOT_SUPPORTED;
}
/*
 * _publish_fn - server-module hook for publish requests.
 *
 * Not implemented: logs the call and returns PMIX_ERR_NOT_SUPPORTED.
 * None of the arguments are used and cbfunc is never invoked.
 */
static pmix_status_t _publish_fn(const pmix_proc_t *proc,
				 const pmix_info_t info[],
				 size_t ninfo,
				 pmix_op_cbfunc_t cbfunc,
				 void *cbdata)
{
	PMIXP_DEBUG("called");

	return PMIX_ERR_NOT_SUPPORTED;
}
/*
 * _lookup_fn - server-module hook for lookup requests.
 *
 * Not implemented: logs the call and returns PMIX_ERR_NOT_SUPPORTED.
 * None of the arguments are used and cbfunc is never invoked.
 */
static pmix_status_t _lookup_fn(const pmix_proc_t *proc,
				char **keys,
				const pmix_info_t info[],
				size_t ninfo,
				pmix_lookup_cbfunc_t cbfunc,
				void *cbdata)
{
	PMIXP_DEBUG("called");

	return PMIX_ERR_NOT_SUPPORTED;
}
/*
 * _unpublish_fn - server-module hook for unpublish requests.
 *
 * Not implemented: logs the call and returns PMIX_ERR_NOT_SUPPORTED.
 * None of the arguments are used and cbfunc is never invoked.
 */
static pmix_status_t _unpublish_fn(const pmix_proc_t *proc,
				   char **keys,
				   const pmix_info_t info[],
				   size_t ninfo,
				   pmix_op_cbfunc_t cbfunc,
				   void *cbdata)
{
	PMIXP_DEBUG("called");

	return PMIX_ERR_NOT_SUPPORTED;
}
/*
 * _spawn_fn - server-module hook for spawn requests.
 *
 * Not implemented: logs the call and returns PMIX_ERR_NOT_SUPPORTED.
 * None of the arguments are used and cbfunc is never invoked.
 */
static pmix_status_t _spawn_fn(const pmix_proc_t *proc,
			       const pmix_info_t job_info[],
			       size_t ninfo,
			       const pmix_app_t apps[],
			       size_t napps,
			       pmix_spawn_cbfunc_t cbfunc,
			       void *cbdata)
{
	PMIXP_DEBUG("called");

	return PMIX_ERR_NOT_SUPPORTED;
}
/*
 * _connect_fn - server-module hook for connect requests.
 *
 * Not implemented: logs the call and returns PMIX_ERR_NOT_SUPPORTED.
 * None of the arguments are used and cbfunc is never invoked.
 */
static pmix_status_t _connect_fn(const pmix_proc_t procs[],
				 size_t nprocs,
				 const pmix_info_t info[],
				 size_t ninfo,
				 pmix_op_cbfunc_t cbfunc,
				 void *cbdata)
{
	PMIXP_DEBUG("called");

	return PMIX_ERR_NOT_SUPPORTED;
}
/*
 * _disconnect_fn - server-module hook for disconnect requests.
 *
 * Not implemented: logs the call and returns PMIX_ERR_NOT_SUPPORTED.
 * None of the arguments are used and cbfunc is never invoked.
 */
static pmix_status_t _disconnect_fn(const pmix_proc_t procs[],
				    size_t nprocs,
				    const pmix_info_t info[],
				    size_t ninfo,
				    pmix_op_cbfunc_t cbfunc,
				    void *cbdata)
{
	PMIXP_DEBUG("called");

	return PMIX_ERR_NOT_SUPPORTED;
}
/*
 * _errhandler - event handler registered by pmixp_lib_init().
 *
 * Logs the status and the namespace/rank of the event source, then sends
 * SIGKILL to the job step identified by pmixp_info_step_id().
 *
 * Only status and source are consulted; the registration id, the info and
 * results arrays, cbfunc and cbdata are left untouched (cbfunc is not
 * invoked).
 */
static void _errhandler(size_t evhdlr_registration_id,
			pmix_status_t status,
			const pmix_proc_t *source,
			pmix_info_t info[],
			size_t ninfo,
			pmix_info_t *results,
			size_t nresults,
			pmix_event_notification_cbfunc_fn_t cbfunc,
			void *cbdata)
{
	/* TODO: do something more sophisticated here */
	/*
	 * FIXME: use proper specificator for nranges
	 * (inherited note; no "nranges" argument exists in this handler)
	 */
	PMIXP_ERROR("Error handler invoked: status = %d, source = [%s:%d]",
		    (int) status, source->nspace, source->rank);

	slurm_kill_job_step(pmixp_info_step_id(), SIGKILL, 0);
}
/*
 * Server-module callback table passed to PMIx_server_init() by
 * pmixp_lib_init().
 */
static pmix_server_module_t slurm_pmix_cb = {
	/* client lifecycle */
	.client_connected = _client_connected,
	.client_finalized = _client_finalized,
	.abort = _abort_fn,

	/* data exchange */
	.fence_nb = _fencenb_fn,
	.direct_modex = _dmodex_fn,

	/* handlers that only return PMIX_ERR_NOT_SUPPORTED */
	.publish = _publish_fn,
	.lookup = _lookup_fn,
	.unpublish = _unpublish_fn,
	.spawn = _spawn_fn,
	.connect = _connect_fn,
	.disconnect = _disconnect_fn,
	.job_control = _job_control,
};
/*
 * pmixp_lib_init - initialize the PMIx server library.
 *
 * Builds the attribute array handed to PMIx_server_init():
 *  - PMIX_USERID (job uid) when built against PMIx < 5,
 *  - PMIX_SERVER_TMPDIR when that attribute is defined,
 *  - PMIX_SERVER_SHARE_TOPOLOGY on PMIx v4+ when share_topology is set.
 * After a successful initialization the event handler _errhandler() is
 * registered.
 *
 * The attribute array is released on both the success and the failure
 * path.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR if PMIx_server_init() fails.
 */
int pmixp_lib_init(void)
{
	pmix_info_t *kvp = NULL;
	pmix_status_t rc;

#if (HAVE_PMIX_VER < 5)
	uint32_t jobuid = pmixp_info_jobuid();

	PMIXP_KVP_ADD(kvp, PMIX_USERID, &jobuid, PMIX_UINT32);
#endif

#ifdef PMIX_SERVER_TMPDIR
	PMIXP_KVP_ADD(kvp, PMIX_SERVER_TMPDIR,
		      pmixp_info_tmpdir_lib(), PMIX_STRING);
#endif

#if (HAVE_PMIX_VER > 3)
	/*
	 * (PMIx v4+) If share_topology is true tell the server to make the
	 * HWLOC topology it grabs available to clients via job-level key-value
	 * pairs. This results in the following keys being stored in PMIx's GDS:
	 * PMIX_HWLOC_XML_V2, PMIX_HWLOC_XML_V1, and PMIX_LOCAL_TOPO
	 *
	 * Also instruct the server NOT to share the topology via shared memory
	 * due to permission issues.
	 */
	if (slurm_pmix_conf.share_topology) {
		PMIXP_KVP_ADD(kvp, PMIX_SERVER_SHARE_TOPOLOGY,
			      &slurm_pmix_conf.share_topology, PMIX_BOOL);
		setenv("PMIX_MCA_pmix_hwloc_hole_kind", "none", 1);
	}
#endif

	/* setup the server library */
	rc = PMIx_server_init(&slurm_pmix_cb, kvp, PMIXP_INFO_SIZE(kvp));
	if (PMIX_SUCCESS != rc) {
		/* Log first so the release below cannot disturb errno */
		PMIXP_ERROR_STD("PMIx_server_init failed with error %d\n", rc);
		/* The array stays owned by us; do not leak it on failure */
		PMIXP_FREE_KEY(kvp);
		return SLURM_ERROR;
	}

	PMIXP_FREE_KEY(kvp);

	/* register the errhandler */
	PMIx_Register_event_handler(NULL, 0, NULL, 0, _errhandler,
				    _errhandler_reg_callbk, NULL);

	return SLURM_SUCCESS;
}
/*
 * pmixp_lib_finalize - shut down the PMIx server library.
 *
 * Returns SLURM_SUCCESS if PMIx_server_finalize() reports PMIX_SUCCESS,
 * SLURM_ERROR otherwise.
 */
int pmixp_lib_finalize(void)
{
	if (PMIx_server_finalize() == PMIX_SUCCESS)
		return SLURM_SUCCESS;

	return SLURM_ERROR;
}