blob: 9e9d11dea81baf9b26825cb48ee4b89ae8002fbe [file] [log] [blame]
/*****************************************************************************\
** pmix_coll.h - PMIx collective primitives
*****************************************************************************
* Copyright (C) 2014-2015 Artem Polyakov. All rights reserved.
* Copyright (C) 2015-2017 Mellanox Technologies. All rights reserved.
* Written by Artem Polyakov <artpol84@gmail.com, artemp@mellanox.com>,
* Boris Karasev <karasev.b@gmail.com, boriska@mellanox.com>.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef PMIXP_COLL_H
#define PMIXP_COLL_H
#include "pmixp_common.h"
#include "pmixp_debug.h"
#define PMIXP_COLL_DEBUG 1
#define PMIXP_COLL_RING_CTX_NUM 3
typedef enum {
PMIXP_COLL_TYPE_FENCE_TREE = 0,
PMIXP_COLL_TYPE_FENCE_RING,
/* reserve coll fence ids up to 15 */
PMIXP_COLL_TYPE_FENCE_MAX = 15,
PMIXP_COLL_TYPE_CONNECT,
PMIXP_COLL_TYPE_DISCONNECT
} pmixp_coll_type_t;
inline static char *
pmixp_coll_type2str(pmixp_coll_type_t type) {
switch(type) {
case PMIXP_COLL_TYPE_FENCE_TREE:
return "COLL_FENCE_TREE";
case PMIXP_COLL_TYPE_FENCE_RING:
return "COLL_FENCE_RING";
case PMIXP_COLL_TYPE_FENCE_MAX:
return "COLL_FENCE_MAX";
default:
return "COLL_FENCE_UNK";
}
}
typedef enum {
PMIXP_COLL_CPERF_TREE = PMIXP_COLL_TYPE_FENCE_TREE,
PMIXP_COLL_CPERF_RING = PMIXP_COLL_TYPE_FENCE_RING,
PMIXP_COLL_CPERF_MIXED = PMIXP_COLL_TYPE_FENCE_MAX,
PMIXP_COLL_CPERF_BARRIER
} pmixp_coll_cperf_mode_t;
inline static char *
pmixp_coll_cperf_mode2str(pmixp_coll_cperf_mode_t mode) {
switch(mode) {
case PMIXP_COLL_CPERF_RING:
return "PMIXP_COLL_CPERF_RING";
case PMIXP_COLL_CPERF_TREE:
return "PMIXP_COLL_CPERF_TREE";
case PMIXP_COLL_CPERF_MIXED:
return "PMIXP_COLL_CPERF_MIXED";
case PMIXP_COLL_CPERF_BARRIER:
return "PMIXP_COLL_CPERF_BARRIER";
default:
return "PMIXP_COLL_CPERF_UNK";
}
}
int pmixp_hostset_from_ranges(const pmix_proc_t *procs, size_t nprocs,
hostlist_t **hl_out);
/* PMIx Tree collective */
typedef enum {
PMIXP_COLL_TREE_SYNC,
PMIXP_COLL_TREE_COLLECT,
PMIXP_COLL_TREE_UPFWD,
PMIXP_COLL_TREE_UPFWD_WSC, /* Wait for the upward Send Complete */
PMIXP_COLL_TREE_UPFWD_WPC, /* Wait for Parent Contrib */
PMIXP_COLL_TREE_DOWNFWD,
} pmixp_coll_tree_state_t;
inline static char *
pmixp_coll_tree_state2str(pmixp_coll_tree_state_t state)
{
switch (state) {
case PMIXP_COLL_TREE_SYNC:
return "COLL_SYNC";
case PMIXP_COLL_TREE_COLLECT:
return "COLL_COLLECT";
case PMIXP_COLL_TREE_UPFWD:
return "COLL_UPFWD";
case PMIXP_COLL_TREE_UPFWD_WSC:
return "COLL_UPFWD_WAITSND";
case PMIXP_COLL_TREE_UPFWD_WPC:
return "COLL_UPFWD_WAITPRNT";
case PMIXP_COLL_TREE_DOWNFWD:
return "COLL_DOWNFWD";
default:
return "COLL_UNKNOWN";
}
}
typedef enum {
PMIXP_COLL_TREE_SND_NONE,
PMIXP_COLL_TREE_SND_ACTIVE,
PMIXP_COLL_TREE_SND_DONE,
PMIXP_COLL_TREE_SND_FAILED,
} pmixp_coll_tree_sndstate_t;
inline static char *
pmixp_coll_tree_sndstatus2str(pmixp_coll_tree_sndstate_t state)
{
switch (state) {
case PMIXP_COLL_TREE_SND_NONE:
return "COLL_SND_NONE";
case PMIXP_COLL_TREE_SND_ACTIVE:
return "COLL_SND_ACTIVE";
case PMIXP_COLL_TREE_SND_DONE:
return "COLL_SND_DONE";
case PMIXP_COLL_TREE_SND_FAILED:
return "COLL_SND_FAILED";
default:
return "COLL_UNKNOWN";
}
}
typedef enum {
PMIXP_COLL_REQ_PROGRESS,
PMIXP_COLL_REQ_SKIP,
PMIXP_COLL_REQ_FAILURE
} pmixp_coll_req_state_t;
/* tree coll struct */
typedef struct {
/* general information */
pmixp_coll_tree_state_t state;
/* tree topology */
char *prnt_host;
int prnt_peerid;
char *root_host;
int root_peerid;
int chldrn_cnt;
hostlist_t *all_chldrn_hl;
char *chldrn_str;
int *chldrn_ids;
/* collective state */
bool contrib_local;
uint32_t contrib_children;
bool *contrib_chld;
pmixp_coll_tree_sndstate_t ufwd_status;
bool contrib_prnt;
uint32_t dfwd_cb_cnt, dfwd_cb_wait;
pmixp_coll_tree_sndstate_t dfwd_status;
/* collective data */
buf_t *ufwd_buf, *dfwd_buf;
size_t serv_offs, dfwd_offset, ufwd_offset;
} pmixp_coll_tree_t;
/* PMIx Ring collective */
typedef enum {
PMIXP_COLL_RING_SYNC,
PMIXP_COLL_RING_PROGRESS,
PMIXP_COLL_RING_FINALIZE,
} pmixp_ring_state_t;
struct pmixp_coll_s;
typedef struct {
/* ptr to coll data */
struct pmixp_coll_s *coll;
/* context data */
bool in_use;
uint32_t seq;
bool contrib_local;
uint32_t contrib_prev;
uint32_t forward_cnt;
bool *contrib_map;
pmixp_ring_state_t state;
buf_t *ring_buf;
} pmixp_coll_ring_ctx_t;
/* coll ring struct */
typedef struct {
/* next node id */
int next_peerid;
/* coll contexts data */
pmixp_coll_ring_ctx_t ctx_array[PMIXP_COLL_RING_CTX_NUM];
/* buffer pool to ensure parallel sends of ring data */
list_t *fwrd_buf_pool;
list_t *ring_buf_pool;
} pmixp_coll_ring_t;
typedef struct {
uint32_t type;
uint32_t contrib_id;
uint32_t seq;
uint32_t hop_seq;
uint32_t nodeid;
size_t msgsize;
} pmixp_coll_ring_msg_hdr_t;
typedef struct {
size_t size;
char *ptr;
uint32_t contrib_id;
uint32_t hop_seq;
} pmixp_coll_msg_ring_data_t;
inline static char *
pmixp_coll_ring_state2str(pmixp_ring_state_t state)
{
switch (state) {
case PMIXP_COLL_RING_SYNC:
return "COLL_RING_SYNC";
case PMIXP_COLL_RING_PROGRESS:
return "PMIXP_COLL_RING_PROGRESS";
case PMIXP_COLL_RING_FINALIZE:
return "PMIXP_COLL_RING_FINILIZE";
default:
return "COLL_RING_UNKNOWN";
}
}
/* General coll struct */
typedef struct pmixp_coll_s {
#ifndef NDEBUG
#define PMIXP_COLL_STATE_MAGIC 0xC011CAFE
int magic;
#endif
/* element-wise lock */
pthread_mutex_t lock;
/* collective state */
uint32_t seq;
/* general information */
pmixp_coll_type_t type;
/* PMIx collective id */
struct {
pmix_proc_t *procs;
size_t nprocs;
} pset;
int my_peerid;
int peers_cnt;
#ifdef PMIXP_COLL_DEBUG
hostlist_t *peers_hl;
#endif
/* libpmix callback data */
void *cbfunc;
void *cbdata;
/* timestamp for stale collectives detection */
time_t ts, ts_next;
/* coll states */
union {
pmixp_coll_tree_t tree;
pmixp_coll_ring_t ring;
} state;
} pmixp_coll_t;
/* tree coll functions*/
int pmixp_coll_tree_init(pmixp_coll_t *coll, hostlist_t **hl);
void pmixp_coll_tree_free(pmixp_coll_tree_t *tree);
pmixp_coll_t *pmixp_coll_tree_from_cbdata(void *cbdata);
int pmixp_coll_tree_local(pmixp_coll_t *coll, char *data, size_t size,
void *cbfunc, void *cbdata);
int pmixp_coll_tree_child(pmixp_coll_t *coll, uint32_t nodeid,
uint32_t seq, buf_t *buf);
int pmixp_coll_tree_parent(pmixp_coll_t *coll, uint32_t nodeid,
uint32_t seq, buf_t *buf);
void pmixp_coll_tree_bcast(pmixp_coll_t *coll);
bool pmixp_coll_tree_progress(pmixp_coll_t *coll, char *fwd_node,
void **data, uint64_t size);
int pmixp_coll_tree_unpack(buf_t *buf, pmixp_coll_type_t *type,
int *nodeid, pmix_proc_t **r,
size_t *nr);
void pmixp_coll_tree_reset_if_to(pmixp_coll_t *coll, time_t ts);
int pmixp_coll_check(pmixp_coll_t *coll, uint32_t seq);
/* ring coll functions */
int pmixp_coll_ring_init(pmixp_coll_t *coll, hostlist_t **hl);
void pmixp_coll_ring_free(pmixp_coll_ring_t *coll_ring);
int pmixp_coll_ring_check(pmixp_coll_t *coll, pmixp_coll_ring_msg_hdr_t *hdr);
int pmixp_coll_ring_local(pmixp_coll_t *coll, char *data, size_t size,
void *cbfunc, void *cbdata);
int pmixp_coll_ring_neighbor(pmixp_coll_t *coll, pmixp_coll_ring_msg_hdr_t *hdr,
buf_t *buf);
void pmixp_coll_ring_reset(pmixp_coll_ring_ctx_t *coll);
int pmixp_coll_ring_unpack(buf_t *buf, pmixp_coll_type_t *type,
pmixp_coll_ring_msg_hdr_t *ring_hdr,
pmix_proc_t **r, size_t *nr);
void pmixp_coll_ring_reset_if_to(pmixp_coll_t *coll, time_t ts);
pmixp_coll_ring_ctx_t *pmixp_coll_ring_ctx_select(pmixp_coll_t *coll,
const uint32_t seq);
pmixp_coll_t *pmixp_coll_ring_from_cbdata(void *cbdata);
void pmixp_coll_ring_free(pmixp_coll_ring_t *ring);
/* common coll func */
static inline void pmixp_coll_sanity_check(pmixp_coll_t *coll)
{
xassert(NULL != coll);
xassert(coll->magic == PMIXP_COLL_STATE_MAGIC);
}
int pmixp_coll_init(pmixp_coll_t *coll, pmixp_coll_type_t type,
const pmix_proc_t *procs, size_t nprocs);
int pmixp_coll_contrib_local(pmixp_coll_t *coll, pmixp_coll_type_t type,
char *data, size_t ndata,
void *cbfunc, void *cbdata);
void pmixp_coll_free(pmixp_coll_t *coll);
void pmixp_coll_localcb_nodata(pmixp_coll_t *coll, int status);
int pmixp_coll_belong_chk(const pmix_proc_t *procs, size_t nprocs);
void pmixp_coll_log(pmixp_coll_t *coll);
void pmixp_coll_ring_log(pmixp_coll_t *coll);
void pmixp_coll_tree_log(pmixp_coll_t *coll);
#endif /* PMIXP_COLL_RING_H */