/*****************************************************************************\
* slurm_route.c - route plugin functions.
*****************************************************************************
* Copyright (C) 2014 Bull S. A. S.
* Bull, Rue Jean Jaures, B.P.68, 78340, Les Clayes-sous-Bois.
*
* Written by Rod Schultz <rod.schultz@bull.com>
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/param.h> /* MAXPATHLEN */
#include "slurm/slurm.h"
#include "src/common/log.h"
#include "src/common/forward.h"
#include "src/common/node_conf.h"
#include "src/common/plugrack.h"
#include "src/common/read_config.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_route.h"
#include "src/common/timers.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
strong_alias(route_split_hostlist_treewidth,
slurm_route_split_hostlist_treewidth);
typedef struct slurm_route_ops {
int (*split_hostlist) (hostlist_t hl,
hostlist_t** sp_hl,
int* count, uint16_t tree_width);
int (*reconfigure) (void);
slurm_addr_t* (*next_collector) (bool* is_collector);
slurm_addr_t* (*next_collector_backup) (void);
} slurm_route_ops_t;
/*
* Must be synchronized with slurm_route_ops_t above.
*/
static const char *syms[] = {
"route_p_split_hostlist",
"route_p_reconfigure",
"route_p_next_collector",
"route_p_next_collector_backup"
};
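/*
 * Illustrative sketch (not part of this file): a route plugin is expected to
 * export the symbols listed in syms[] above.  A minimal plugin, with the
 * hypothetical plugin_type "route/example", might simply delegate to the
 * common helpers defined later in this file, roughly as follows:
 *
 *	const char plugin_name[] = "route example plugin";
 *	const char plugin_type[] = "route/example";
 *	const uint32_t plugin_version = SLURM_VERSION_NUMBER;
 *
 *	extern int route_p_split_hostlist(hostlist_t hl, hostlist_t **sp_hl,
 *					  int *count, uint16_t tree_width)
 *	{
 *		return route_split_hostlist_treewidth(hl, sp_hl, count,
 *						      tree_width);
 *	}
 *
 *	extern int route_p_reconfigure(void)
 *	{
 *		return SLURM_SUCCESS;
 *	}
 *
 *	extern slurm_addr_t *route_p_next_collector(bool *is_collector)
 *	{
 *		return route_next_collector(is_collector);
 *	}
 *
 *	extern slurm_addr_t *route_p_next_collector_backup(void)
 *	{
 *		return route_next_collector_backup(1);
 *	}
 */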
static slurm_route_ops_t ops;
static plugin_context_t *g_context = NULL;
static pthread_mutex_t g_context_lock = PTHREAD_MUTEX_INITIALIZER;
static bool init_run = false;
static uint32_t debug_flags = 0;
static uint16_t g_tree_width;
static bool this_is_collector = false; /* this node is a collector node */
static slurm_addr_t *msg_collect_node = NULL; /* address of node to aggregate
messages from this node */
/* addresses of backup nodes to aggregate messages from this node */
static uint32_t msg_backup_cnt = 0;
static slurm_addr_t **msg_collect_backup = NULL;
/* _get_all_nodes creates a hostlist containing all the nodes in the
* node_record_table.
*
 * Caller must destroy the returned hostlist.
*/
static hostlist_t _get_all_nodes( void )
{
int i;
hostlist_t nodes = hostlist_create(NULL);
for (i = 0; i < node_record_count; i++) {
hostlist_push_host(nodes, node_record_table_ptr[i].name);
}
return nodes;
}
/*
 * _set_collectors calls the split_hostlist API on the hostlist of all nodes
 * to choose the node to be used as a collector for unsolicited node
 * aggregation.
 *
 * If this node is a forwarding node (first node in any hostlist),
 * then its collector and backup are the ControlMachine and its backup.
 *
 * Otherwise, we find the hostlist containing this node.
 * The forwarding node in that hostlist becomes the collector and the next
 * node which is not this node becomes the backup.
 * That list is then split again and we search the resulting lists for one
 * in which this node is the forwarding node. If found, we set the collector
 * and backup; otherwise this process is repeated.
 */
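/*
 * Illustrative example (hypothetical node names, TreeWidth 2): with nodes
 * n[1-6], the first split might yield the sublists n[1-3] and n[4-6].
 * Nodes n1 and n4 head their sublists, so their collector is the
 * controller itself.  For n5, the sublist n[4-6] is selected: per the
 * rules above, n4 becomes its collector and n6 its backup, and the
 * remainder of that sublist is split again until n5 appears at the head
 * of a sublist.
 */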
static void _set_collectors(char *this_node_name)
{
slurm_ctl_conf_t *conf;
hostlist_t nodes;
hostlist_t *hll = NULL;
uint32_t backup_cnt;
char *parent = NULL, **backup;
char addrbuf[32];
int i, j, f = -1;
int hl_count = 0;
uint16_t parent_port;
uint16_t backup_port;
bool ctldparent = true;
char *tmp = NULL;
#ifdef HAVE_FRONT_END
return; /* on a FrontEnd system this would never be useful. */
#endif
if (!running_in_slurmd())
return; /* Only compute nodes have collectors */
/*
 * Set up the initial iteration: the collector is the controller
 * and the full node list is split.
 */
xassert(this_node_name);
conf = slurm_conf_lock();
nodes = _get_all_nodes();
backup_cnt = conf->control_cnt;
backup = xmalloc(sizeof(char *) * backup_cnt);
if (conf->slurmctld_addr) {
parent = strdup(conf->slurmctld_addr);
backup_cnt = 1;
} else
parent = strdup(conf->control_addr[0]);
for (i = 0; i < backup_cnt; i++) {
if (conf->control_addr[i])
backup[i] = xstrdup(conf->control_addr[i]);
else
backup[i] = NULL;
}
msg_backup_cnt = backup_cnt + 2;
msg_collect_backup = xmalloc(sizeof(slurm_addr_t *) * msg_backup_cnt);
parent_port = conf->slurmctld_port;
backup_port = parent_port;
slurm_conf_unlock();
while (1) {
if (route_g_split_hostlist(nodes, &hll, &hl_count, 0)) {
error("unable to split forward hostlist");
goto clean; /* collector addrs remain NULL */
}
/* Find which hostlist contains this node */
for (i = 0; i < hl_count; i++) {
f = hostlist_find(hll[i], this_node_name);
if (f != -1)
break;
}
if (i == hl_count) {
fatal("ROUTE -- %s not found in node_record_table",
this_node_name);
}
if (f == 0) {
/*
 * We are a forwarded-to node,
 * so our parent is "parent".
 */
if (hostlist_count(hll[i]) > 1)
this_is_collector = true;
xfree(msg_collect_node);
msg_collect_node = xmalloc(sizeof(slurm_addr_t));
if (ctldparent) {
slurm_set_addr(msg_collect_node, parent_port,
parent);
} else {
slurm_conf_get_addr(parent, msg_collect_node, 0);
msg_collect_node->sin_port = htons(parent_port);
}
if (debug_flags & DEBUG_FLAG_ROUTE) {
slurm_print_slurm_addr(msg_collect_node,
addrbuf, 32);
info("ROUTE -- message collector (%s) address is %s",
parent, addrbuf);
}
msg_backup_cnt = 0;
xfree(msg_collect_backup[0]);
for (i = 1; (i < backup_cnt) && backup[i]; i++) {
msg_backup_cnt = i;
msg_collect_backup[i-1] =
xmalloc(sizeof(slurm_addr_t));
if (ctldparent) {
slurm_set_addr(msg_collect_backup[i-1],
backup_port, backup[i]);
} else {
slurm_conf_get_addr(backup[i],
msg_collect_backup[i-1], 0);
msg_collect_backup[i-1]->sin_port =
htons(backup_port);
}
if (debug_flags & DEBUG_FLAG_ROUTE) {
slurm_print_slurm_addr(
msg_collect_backup[i-1],
addrbuf, 32);
info("ROUTE -- message collector backup[%d] (%s) "
"address is %s",
i, backup[i], addrbuf);
}
}
if ((i == 1) && (debug_flags & DEBUG_FLAG_ROUTE))
info("ROUTE -- no message collector backup");
goto clean;
}
/*
 * We are not a forwarding node; the first node in this list
 * will split the forward_list further.
 * We also know that the forwarding node is not a controller.
 *
 * Clean up the parent context.
 */
ctldparent = false;
hostlist_destroy(nodes);
nodes = hostlist_copy(hll[i]);
for (j = 0; j < hl_count; j++) {
hostlist_destroy(hll[j]);
}
xfree(hll);
/* set our parent, backup, and continue search */
for (i = 0; i < backup_cnt; i++)
xfree(backup[i]);
if (parent)
free(parent);
parent = hostlist_shift(nodes);
tmp = hostlist_nth(nodes, 0);
backup[0] = xstrdup(tmp);
free(tmp);
tmp = NULL;
if (xstrcmp(backup[0], this_node_name) == 0) {
xfree(backup[0]);
if (hostlist_count(nodes) > 1) {
tmp = hostlist_nth(nodes, 1);
backup[0] = xstrdup(tmp);
free(tmp);
tmp = NULL;
}
}
parent_port = slurm_conf_get_port(parent);
if (backup[0])
backup_port = slurm_conf_get_port(backup[0]);
else
backup_port = 0;
}
clean:
if (debug_flags & DEBUG_FLAG_ROUTE) {
slurm_print_slurm_addr(msg_collect_node, addrbuf, 32);
xstrfmtcat(tmp, "ROUTE -- %s is a %s node (parent:%s",
this_node_name,
this_is_collector ? "collector" : "leaf", addrbuf);
for (i = 0; (i < backup_cnt) && msg_collect_backup[i];
i++) {
slurm_print_slurm_addr(msg_collect_backup[i],
addrbuf, 32);
xstrfmtcat(tmp, " backup[%d]:%s", i, addrbuf);
}
info("%s)", tmp);
xfree(tmp);
}
hostlist_destroy(nodes);
if (parent)
free(parent);
for (i = 0; i < backup_cnt; i++)
xfree(backup[i]);
xfree(backup);
for (i = 0; i < hl_count; i++) {
hostlist_destroy(hll[i]);
}
xfree(hll);
}
extern int route_init(char *node_name)
{
int retval = SLURM_SUCCESS;
char *plugin_type = "route";
char *type = NULL;
if (init_run && g_context)
return retval;
slurm_mutex_lock(&g_context_lock);
if (g_context)
goto done;
type = slurm_get_route_plugin();
g_context = plugin_context_create(
plugin_type, type, (void **)&ops, syms, sizeof(syms));
if (!g_context) {
error("cannot create %s context for %s", plugin_type, type);
retval = SLURM_ERROR;
goto done;
}
g_tree_width = slurm_get_tree_width();
debug_flags = slurm_get_debug_flags();
init_run = true;
_set_collectors(node_name);
done:
slurm_mutex_unlock(&g_context_lock);
xfree(type);
return retval;
}
extern int route_fini(void)
{
int i, rc;
if (!g_context)
return SLURM_SUCCESS;
init_run = false;
rc = plugin_context_destroy(g_context);
g_context = NULL;
xfree(msg_collect_node);
for (i = 0; i < msg_backup_cnt; i++)
xfree(msg_collect_backup[i]);
xfree(msg_collect_backup);
msg_backup_cnt = 0;
return rc;
}
/*
 * route_g_split_hostlist - logic to split an input hostlist into
 * a set of hostlists to forward to.
 *
 * IN: hl - hostlist_t - list of every node to send message to
 * will be empty on return, which is the same behavior
 * as the similar code replaced in forward.c
 * IN: tree_width - uint16_t - message fanout; if zero, the configured
 * TreeWidth is used
 * OUT: sp_hl - hostlist_t** - the array of hostlists that will be allocated
 * OUT: count - int* - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: each created hostlist must be freed independently by the caller
 * using hostlist_destroy().
 * Note: the hostlist_t array itself must be xfree()'d.
 */
extern int route_g_split_hostlist(hostlist_t hl,
hostlist_t** sp_hl,
int* count, uint16_t tree_width)
{
int rc;
int j, nnodes, nnodex;
char *buf;
nnodes = nnodex = 0;
if (route_init(NULL) != SLURM_SUCCESS)
return SLURM_ERROR;
if (debug_flags & DEBUG_FLAG_ROUTE) {
/* nnodes has to be set here as hl is empty after the
 * split_hostlist call. */
nnodes = hostlist_count(hl);
buf = hostlist_ranged_string_xmalloc(hl);
info("ROUTE: split_hostlist: hl=%s tree_width %u",
buf, tree_width);
xfree(buf);
}
rc = (*(ops.split_hostlist))(hl, sp_hl, count,
tree_width ? tree_width : g_tree_width);
if (debug_flags & DEBUG_FLAG_ROUTE) {
/* Sanity check to make sure all nodes in msg list are in
* a child list */
nnodex = 0;
for (j = 0; j < *count; j++) {
nnodex += hostlist_count((*sp_hl)[j]);
}
if (nnodex != nnodes) { /* CLANG false positive */
info("ROUTE: number of nodes in split lists (%d)"
" is not equal to number in input list (%d)",
nnodex, nnodes);
}
}
return rc;
}
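/*
 * Usage sketch (illustrative only, hypothetical node names): how a caller
 * such as the forwarding logic might split a message hostlist.  Passing a
 * tree_width of zero selects the configured TreeWidth.
 *
 *	hostlist_t hl = hostlist_create("node[01-32]");
 *	hostlist_t *sp_hl = NULL;
 *	int count = 0, i;
 *
 *	if (route_g_split_hostlist(hl, &sp_hl, &count, 0) == SLURM_SUCCESS) {
 *		for (i = 0; i < count; i++) {
 *			char *buf = hostlist_ranged_string_xmalloc(sp_hl[i]);
 *			info("forwarding to sublist[%d]: %s", i, buf);
 *			xfree(buf);
 *			hostlist_destroy(sp_hl[i]);
 *		}
 *		xfree(sp_hl);
 *	}
 *	hostlist_destroy(hl);
 */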
/*
 * route_g_reconfigure - reset the route plugin state during a reconfigure
*
* RET: SLURM_SUCCESS - int
*/
extern int route_g_reconfigure(void)
{
if (route_init(NULL) != SLURM_SUCCESS)
return SLURM_ERROR;
debug_flags = slurm_get_debug_flags();
g_tree_width = slurm_get_tree_width();
return (*(ops.reconfigure))();
}
/*
* route_g_next_collector - return address of next collector
*
 * OUT: is_collector - bool* - set to indicate whether this node is a collector
*
* RET: slurm_addr_t* - address of node to send messages to be aggregated.
*/
extern slurm_addr_t* route_g_next_collector(bool *is_collector)
{
if (route_init(NULL) != SLURM_SUCCESS)
return NULL;
return (*(ops.next_collector))(is_collector);
}
/*
* route_g_next_collector_backup
*
* RET: slurm_addr_t* - address of backup node to send messages to be aggregated.
*/
extern slurm_addr_t* route_g_next_collector_backup(void)
{
if (route_init(NULL) != SLURM_SUCCESS)
return NULL;
return (*(ops.next_collector_backup))();
}
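/*
 * Usage sketch (illustrative only): how a slurmd-side caller might look up
 * the address to which messages should be sent for aggregation.  If the
 * primary collector proves unreachable, such a caller could retry with
 * route_g_next_collector_backup().
 *
 *	bool is_collector = false;
 *	char addrbuf[32];
 *	slurm_addr_t *addr = route_g_next_collector(&is_collector);
 *
 *	if (addr) {
 *		slurm_print_slurm_addr(addr, addrbuf, sizeof(addrbuf));
 *		info("%s node, aggregating via %s",
 *		     is_collector ? "collector" : "leaf", addrbuf);
 *	}
 */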
/*
* route_split_hostlist_treewidth - logic to split an input hostlist into
* a set of hostlists to forward to.
*
* This is the default behavior. It is implemented here as there are cases
* where the topology version also needs to split the message list based
* on TreeWidth.
*
 * IN: hl - hostlist_t - list of every node to send message to
 * will be empty on return, which is the same behavior
 * as the similar code replaced in forward.c
 * IN: tree_width - uint16_t - message fanout; if zero, the configured
 * TreeWidth is used
 * OUT: sp_hl - hostlist_t** - the array of hostlists that will be allocated
 * OUT: count - int* - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: each created hostlist must be freed independently by the caller
 * using hostlist_destroy().
 * Note: the hostlist_t array itself must be xfree()'d.
*/
extern int route_split_hostlist_treewidth(hostlist_t hl,
hostlist_t** sp_hl,
int* count, uint16_t tree_width)
{
int host_count;
int *span = NULL;
char *name = NULL;
char *buf;
int nhl = 0;
int j;
if (!tree_width)
tree_width = g_tree_width;
host_count = hostlist_count(hl);
span = set_span(host_count, tree_width);
*sp_hl = xmalloc(tree_width * sizeof(hostlist_t));
while ((name = hostlist_shift(hl))) {
(*sp_hl)[nhl] = hostlist_create(name);
free(name);
for (j = 0; j < span[nhl]; j++) {
name = hostlist_shift(hl);
if (!name) {
break;
}
hostlist_push_host((*sp_hl)[nhl], name);
free(name);
}
if (debug_flags & DEBUG_FLAG_ROUTE) {
buf = hostlist_ranged_string_xmalloc((*sp_hl)[nhl]);
debug("ROUTE: ... sublist[%d] %s", nhl, buf);
xfree(buf);
}
nhl++;
}
xfree(span);
*count = nhl;
return SLURM_SUCCESS;
}
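/*
 * Illustrative example (hypothetical host names): with tree_width = 2, a
 * list such as "h[1-6]" is divided into two sublists, each headed by the
 * node that forwards to the rest of that sublist, e.g. roughly "h[1-3]"
 * and "h[4-6]" depending on how set_span() balances the counts.
 *
 *	hostlist_t hl = hostlist_create("h[1-6]");
 *	hostlist_t *sp_hl = NULL;
 *	int count = 0, i;
 *
 *	route_split_hostlist_treewidth(hl, &sp_hl, &count, 2);
 *	for (i = 0; i < count; i++)
 *		hostlist_destroy(sp_hl[i]);
 *	xfree(sp_hl);
 *	hostlist_destroy(hl);
 */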
/*
 * route_next_collector - get the collector node address for this node
 *
 * OUT: is_collector - bool* - set to indicate whether this node is a collector
*
* RET: slurm_addr_t* - address of node to send messages to be aggregated.
*/
extern slurm_addr_t* route_next_collector(bool *is_collector)
{
*is_collector = this_is_collector;
return msg_collect_node;
}
/*
* route_next_collector_backup - get collector backup address based on offset
*
 * IN: backup_inx - int - backup server index (between 1 and msg_backup_cnt-1)
* RET: slurm_addr_t* - address of backup node to send messages to be aggregated
*/
extern slurm_addr_t* route_next_collector_backup(int backup_inx)
{
if ((backup_inx <= 0) || (backup_inx >= msg_backup_cnt))
return NULL;
return msg_collect_backup[backup_inx];
}