blob: e13f046cd33d06909c42eba7114c437143d9d609 [file] [log] [blame]
/*****************************************************************************\
* sackd_mgr.c - sackd (login) node manager
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <sys/socket.h>
#include "src/common/fetch_config.h"
#include "src/common/macros.h"
#include "src/common/slurm_protocol_pack.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/interfaces/conn.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/slurmctld.h"
typedef struct {
char *hostname;
char *nodeaddr;
time_t last_update;
uint16_t port;
uint16_t protocol_version;
} sackd_node_t;
static list_t *sackd_nodes = NULL;
static pthread_mutex_t sackd_lock = PTHREAD_MUTEX_INITIALIZER;
static void _destroy_sackd_node(void *x)
{
sackd_node_t *node = x;
if (!node)
return;
xfree(node->hostname);
xfree(node->nodeaddr);
xfree(node);
}
/*
* protocol_version is only here because of pack_list().
* There's no need to ever pack an old version though.
*/
static void _pack_node(void *x, uint16_t protocol_version, buf_t *buffer)
{
sackd_node_t *node = x;
xassert(protocol_version == SLURM_PROTOCOL_VERSION);
pack16(node->protocol_version, buffer);
pack64(node->last_update, buffer);
packstr(node->hostname, buffer);
packstr(node->nodeaddr, buffer);
pack16(node->port, buffer);
}
static int _unpack_node(void **x, uint16_t protocol_version, buf_t *buffer)
{
uint64_t time_tmp;
sackd_node_t *node = xmalloc(sizeof(*node));
if (protocol_version >= SLURM_25_05_PROTOCOL_VERSION) {
safe_unpack16(&node->protocol_version, buffer);
safe_unpack64(&time_tmp, buffer);
node->last_update = time_tmp;
safe_unpackstr(&node->hostname, buffer);
safe_unpackstr(&node->nodeaddr, buffer);
safe_unpack16(&node->port, buffer);
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
safe_unpack16(&node->protocol_version, buffer);
safe_unpack64(&time_tmp, buffer);
node->last_update = time_tmp;
safe_unpackstr(&node->hostname, buffer);
safe_unpackstr(&node->nodeaddr, buffer);
}
*x = node;
return SLURM_SUCCESS;
unpack_error:
_destroy_sackd_node(node);
return SLURM_ERROR;
}
static int _find_sackd_node(void *x, void *arg)
{
sackd_node_t *node = x;
char *hostname = arg;
return !xstrcmp(node->hostname, hostname);
}
static void _update_sackd_node(sackd_node_t *node, slurm_msg_t *msg)
{
slurm_addr_t *addr = &msg->address;
node->last_update = time(NULL);
node->protocol_version = msg->protocol_version;
/* Resolve IP of slurmd if needed */
xfree(node->nodeaddr);
if (addr->ss_family == AF_UNSPEC) {
int fd = conn_g_get_fd(msg->tls_conn);
(void) slurm_get_peer_addr(fd, addr);
}
if (addr->ss_family != AF_UNSPEC) {
node->nodeaddr = xmalloc(INET6_ADDRSTRLEN);
slurm_get_ip_str(addr, node->nodeaddr, INET6_ADDRSTRLEN);
} else {
node->nodeaddr = xstrdup(node->hostname);
}
}
extern void sackd_mgr_dump_state(buf_t *buffer, uint16_t protocol_version)
{
slurm_mutex_lock(&sackd_lock);
slurm_pack_list(sackd_nodes, _pack_node, buffer,
SLURM_PROTOCOL_VERSION);
debug("%s: saved state of %d nodes",
__func__, list_count(sackd_nodes));
slurm_mutex_unlock(&sackd_lock);
}
extern int sackd_mgr_load_state(buf_t *buffer, uint16_t protocol_version)
{
int rc;
slurm_mutex_lock(&sackd_lock);
FREE_NULL_LIST(sackd_nodes);
rc = slurm_unpack_list(&sackd_nodes, _unpack_node, _destroy_sackd_node,
buffer, protocol_version);
debug("%s: restored state of %d nodes",
__func__, list_count(sackd_nodes));
slurm_mutex_unlock(&sackd_lock);
return rc;
}
extern void sackd_mgr_fini(void)
{
debug("%s", __func__);
slurm_mutex_lock(&sackd_lock);
slurm_mutex_unlock(&sackd_lock);
}
extern void sackd_mgr_add_node(slurm_msg_t *msg, uint16_t port)
{
sackd_node_t *node = NULL;
char *auth_host = auth_g_get_host(msg);
if (!port)
port = slurm_conf.slurmd_port;
slurm_mutex_lock(&sackd_lock);
if (!sackd_nodes)
sackd_nodes = list_create(_destroy_sackd_node);
if ((node = list_find_first(sackd_nodes, _find_sackd_node,
auth_host))) {
debug("%s: updating existing record for %s:%hu",
__func__, auth_host, port);
node->port = port;
_update_sackd_node(node, msg);
xfree(auth_host);
} else {
debug("%s: adding record for %s:%hu",
__func__, auth_host, port);
node = xmalloc(sizeof(*node));
node->hostname = auth_host;
node->port = port;
_update_sackd_node(node, msg);
list_append(sackd_nodes, node);
}
slurm_mutex_unlock(&sackd_lock);
}
static int _each_sackd_node(void *x, void *arg)
{
sackd_node_t *node = x;
agent_arg_t *args = xmalloc(sizeof(*args));
args->addr = xmalloc(sizeof(slurm_addr_t));
slurm_set_addr(args->addr, node->port, node->nodeaddr);
args->msg_args = new_config_response(false);
args->msg_type = REQUEST_RECONFIGURE_SACKD;
args->hostlist = hostlist_create(node->hostname);
args->node_count = 1;
args->protocol_version = node->protocol_version;
args->retry = 0;
set_agent_arg_r_uid(args, slurm_conf.slurm_user_id);
agent_queue_request(args);
return 0;
}
extern void sackd_mgr_push_reconfig(void)
{
int count = 0;
slurm_mutex_lock(&sackd_lock);
if (!sackd_nodes) {
slurm_mutex_unlock(&sackd_lock);
return;
}
count = list_for_each(sackd_nodes, _each_sackd_node, NULL);
debug("%s: triggered reconfig for %d nodes", __func__, count);
slurm_mutex_unlock(&sackd_lock);
}
extern void sackd_mgr_remove_node(char *node)
{
debug("%s: removing %s", __func__, node);
slurm_mutex_lock(&sackd_lock);
if (sackd_nodes)
list_delete_first(sackd_nodes, _find_sackd_node, node);
slurm_mutex_unlock(&sackd_lock);
}