| /*****************************************************************************\ |
| ** pmix_client_v2.c - PMIx v2 client communication code |
| ***************************************************************************** |
| * Copyright (C) 2014-2015 Artem Polyakov. All rights reserved. |
| * Copyright (C) 2015-2020 Mellanox Technologies. All rights reserved. |
| * Written by Artem Polyakov <artpol84@gmail.com, artemp@mellanox.com>, |
| * Boris Karasev <karasev.b@gmail.com, boriska@mellanox.com>. |
| * Copyright (C) 2020 Siberian State University of Telecommunications |
| * and Information Sciences (SibSUTIS). |
| * All rights reserved. |
| * Written by Boris Bochkarev <boris-bochkaryov@yandex.ru>. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "pmixp_common.h" |
| #include "pmixp_state.h" |
| #include "pmixp_io.h" |
| #include "pmixp_nspaces.h" |
| #include "pmixp_debug.h" |
| #include "pmixp_coll.h" |
| #include "pmixp_server.h" |
| #include "pmixp_dmdx.h" |
| #include "pmixp_client.h" |
| |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <pmix_server.h> |
| |
| static int _client_connected(const pmix_proc_t *proc, void *server_object, |
| pmix_op_cbfunc_t cbfunc, void *cbdata) |
| { |
| /* |
| * Before PMIx v5.0.8, a return of PMIX_SUCCESS was always excepted. |
| * (Note: PMIx v3.1.0rc2+ would also treat PMIX_OPERATION_SUCCEEDED the |
| * same as PMIX_SUCCESS. Also cbfunc was always NULL before v5.0.8) |
| * |
| * Starting in PMIx v5.0.8, PMIX_SUCCESS and PMIX_OPERATION_SUCCEEDED |
| * are treated differently. In v5+ only return PMIX_SUCCESS to |
| * indicate cbfunc was called. |
| * |
| * Also, guarding PMIX_OPERATION_SUCCEEDED in PMIx v5+ allows Slurm to |
| * compile against PMIx v2 - v3.0.0 since it didn't exist yet. |
| */ |
| #if (HAVE_PMIX_VER >= 5) |
| if (!cbfunc) |
| return PMIX_OPERATION_SUCCEEDED; |
| |
| /* Call the callback function. The status will be set in cbdata */ |
| cbfunc(PMIX_SUCCESS, cbdata); |
| #endif |
| |
| return PMIX_SUCCESS; |
| } |
| |
| static void _errhandler_reg_callbk(pmix_status_t status, |
| size_t errhandler_ref, void *cbdata) |
| { |
| PMIXP_DEBUG("Error handler registration callback is called with status=%d, ref=%d", |
| status, (int)errhandler_ref); |
| } |
| |
| static pmix_status_t _client_finalized(const pmix_proc_t *proc, |
| void *server_object, |
| pmix_op_cbfunc_t cbfunc, |
| void *cbdata) |
| { |
| /* don'n do anything by now */ |
| if (NULL != cbfunc) { |
| cbfunc(PMIX_SUCCESS, cbdata); |
| } |
| return PMIX_SUCCESS; |
| } |
| |
| static pmix_status_t _abort_fn(const pmix_proc_t *pmix_proc, void *server_object, |
| int status, const char msg[], |
| pmix_proc_t pmix_procs[], size_t nprocs, |
| pmix_op_cbfunc_t cbfunc, void *cbdata) |
| { |
| PMIXP_DEBUG("called: status = %d, msg = %s", status, msg); |
| if (pmixp_lib_abort(status, cbfunc, cbdata) != SLURM_SUCCESS) |
| return PMIX_ERROR; |
| |
| return PMIX_SUCCESS; |
| } |
| |
| static pmix_status_t _fencenb_fn(const pmix_proc_t procs_v2[], size_t nprocs, |
| const pmix_info_t info[], size_t ninfo, |
| char *data, size_t ndata, |
| pmix_modex_cbfunc_t cbfunc, void *cbdata) |
| { |
| PMIXP_DEBUG("called"); |
| int ret; |
| size_t i; |
| pmix_proc_t *procs = xmalloc(sizeof(*procs) * nprocs); |
| bool collect = false; |
| |
| for (i = 0; i < nprocs; i++) { |
| procs[i].rank = procs_v2[i].rank; |
| strlcpy(procs[i].nspace, procs_v2[i].nspace, |
| sizeof(procs[i].nspace)); |
| } |
| /* check the info keys */ |
| if (info) { |
| for (i = 0; i < ninfo; i++) { |
| if (0 == strncmp(info[i].key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN)) { |
| collect = true; |
| break; |
| } |
| } |
| } |
| ret = pmixp_lib_fence(procs, nprocs, collect, data, ndata, cbfunc, cbdata); |
| xfree(procs); |
| |
| return ret; |
| } |
| |
| static pmix_status_t _dmodex_fn(const pmix_proc_t *proc, |
| const pmix_info_t info[], size_t ninfo, |
| pmix_modex_cbfunc_t cbfunc, void *cbdata) |
| { |
| int rc; |
| PMIXP_DEBUG("called"); |
| |
| rc = pmixp_dmdx_get(proc->nspace, proc->rank, cbfunc, cbdata); |
| |
| return (SLURM_SUCCESS == rc) ? PMIX_SUCCESS : PMIX_ERROR; |
| } |
| |
| static pmix_status_t _job_control(const pmix_proc_t *proct, |
| const pmix_proc_t targets[], size_t ntargets, |
| const pmix_info_t directives[], size_t ndirs, |
| pmix_info_cbfunc_t cbfunc, void *cbdata) |
| { |
| PMIXP_DEBUG("called"); |
| return PMIX_ERR_NOT_SUPPORTED; |
| } |
| |
| static pmix_status_t _publish_fn(const pmix_proc_t *proc, |
| const pmix_info_t info[], size_t ninfo, |
| pmix_op_cbfunc_t cbfunc, void *cbdata) |
| { |
| PMIXP_DEBUG("called"); |
| return PMIX_ERR_NOT_SUPPORTED; |
| } |
| |
| static pmix_status_t _lookup_fn(const pmix_proc_t *proc, char **keys, |
| const pmix_info_t info[], size_t ninfo, |
| pmix_lookup_cbfunc_t cbfunc, void *cbdata) |
| { |
| PMIXP_DEBUG("called"); |
| return PMIX_ERR_NOT_SUPPORTED; |
| } |
| |
| static pmix_status_t _unpublish_fn(const pmix_proc_t *proc, char **keys, |
| const pmix_info_t info[], size_t ninfo, |
| pmix_op_cbfunc_t cbfunc, void *cbdata) |
| { |
| PMIXP_DEBUG("called"); |
| return PMIX_ERR_NOT_SUPPORTED; |
| } |
| |
| static pmix_status_t _spawn_fn(const pmix_proc_t *proc, |
| const pmix_info_t job_info[], size_t ninfo, |
| const pmix_app_t apps[], size_t napps, |
| pmix_spawn_cbfunc_t cbfunc, void *cbdata) |
| { |
| PMIXP_DEBUG("called"); |
| return PMIX_ERR_NOT_SUPPORTED; |
| } |
| |
| static pmix_status_t _connect_fn(const pmix_proc_t procs[], size_t nprocs, |
| const pmix_info_t info[], size_t ninfo, |
| pmix_op_cbfunc_t cbfunc, void *cbdata) |
| { |
| PMIXP_DEBUG("called"); |
| return PMIX_ERR_NOT_SUPPORTED; |
| } |
| |
| static pmix_status_t _disconnect_fn(const pmix_proc_t procs[], size_t nprocs, |
| const pmix_info_t info[], size_t ninfo, |
| pmix_op_cbfunc_t cbfunc, void *cbdata) |
| { |
| PMIXP_DEBUG("called"); |
| return PMIX_ERR_NOT_SUPPORTED; |
| } |
| |
| static void _errhandler(size_t evhdlr_registration_id, |
| pmix_status_t status, |
| const pmix_proc_t *source, |
| pmix_info_t info[], size_t ninfo, |
| pmix_info_t *results, size_t nresults, |
| pmix_event_notification_cbfunc_fn_t cbfunc, |
| void *cbdata) |
| { |
| /* TODO: do something more sophisticated here */ |
| /* FIXME: use proper specificator for nranges */ |
| PMIXP_ERROR("Error handler invoked: status = %d, source = [%s:%d]", |
| (int) status, source->nspace, source->rank); |
| slurm_kill_job_step(pmixp_info_step_id(), SIGKILL, 0); |
| } |
| |
| static pmix_server_module_t slurm_pmix_cb = { |
| .client_connected = _client_connected, |
| .client_finalized = _client_finalized, |
| .abort = _abort_fn, |
| .fence_nb = _fencenb_fn, |
| .direct_modex = _dmodex_fn, |
| .publish = _publish_fn, |
| .lookup = _lookup_fn, |
| .unpublish = _unpublish_fn, |
| .spawn = _spawn_fn, |
| .connect = _connect_fn, |
| .disconnect = _disconnect_fn, |
| .job_control = _job_control |
| }; |
| |
| int pmixp_lib_init(void) |
| { |
| pmix_info_t *kvp = NULL; |
| pmix_status_t rc; |
| |
| #if (HAVE_PMIX_VER < 5) |
| uint32_t jobuid = pmixp_info_jobuid(); |
| |
| PMIXP_KVP_ADD(kvp, PMIX_USERID, &jobuid, PMIX_UINT32); |
| #endif |
| |
| #ifdef PMIX_SERVER_TMPDIR |
| PMIXP_KVP_ADD(kvp, PMIX_SERVER_TMPDIR, |
| pmixp_info_tmpdir_lib(), PMIX_STRING); |
| #endif |
| |
| #if (HAVE_PMIX_VER > 3) |
| /* |
| * (PMIx v4+) If share_topology is true tell the server to make the |
| * HWLOC topology it grabs available to clients via job-level key-value |
| * pairs. This results in the following keys being stored in PMIx's GDS: |
| * PMIX_HWLOC_XML_V2, PMIX_HWLOC_XML_V1, and PMIX_LOCAL_TOPO |
| * |
| * Also instruct the server NOT to share the topology via shared memory |
| * due to permission issues. |
| */ |
| if (slurm_pmix_conf.share_topology) { |
| PMIXP_KVP_ADD(kvp, PMIX_SERVER_SHARE_TOPOLOGY, |
| &slurm_pmix_conf.share_topology, PMIX_BOOL); |
| setenv("PMIX_MCA_pmix_hwloc_hole_kind", "none", 1); |
| } |
| #endif |
| |
| /* setup the server library */ |
| if (PMIX_SUCCESS != (rc = PMIx_server_init(&slurm_pmix_cb, kvp, |
| PMIXP_INFO_SIZE(kvp)))) { |
| PMIXP_ERROR_STD("PMIx_server_init failed with error %d\n", rc); |
| return SLURM_ERROR; |
| } |
| |
| PMIXP_FREE_KEY(kvp); |
| /* register the errhandler */ |
| PMIx_Register_event_handler(NULL, 0, NULL, 0, _errhandler, |
| _errhandler_reg_callbk, NULL); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| int pmixp_lib_finalize(void) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| if (PMIX_SUCCESS != PMIx_server_finalize()) { |
| rc = SLURM_ERROR; |
| } |
| return rc; |
| } |