blob: 9dc1dfd2eb62723c3df9320ff11d4017503ba0f2 [file] [log] [blame]
/*****************************************************************************\
* heartbeat.c
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "src/common/fd.h"
#include "src/common/xstring.h"
#include "src/slurmctld/heartbeat.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/state_save.h"
/*
* Write to a file at a frequent interval to demonstrate that the primary
* is still alive and active, and could thus change the contents of
* StateSaveLocation at any point in time. This is monitored in the backup
* and will prevent the backup controller from assuming control in periods
* of high load (as this thread does not depend on any other locks within
* slurmctld) or if the network path between primary <-> backup is lost but
* the path to the StateSaveLocation storage remains intact.
*
* Will only run if a BackupController is setup, otherwise this is a no-op
* and no thread will be launched.
*/
/*
 * State shared between the heartbeat thread and heartbeat_start() /
 * heartbeat_stop(). heart_beating is only read or written while
 * heartbeat_mutex is held; heartbeat_cond is signalled by heartbeat_stop()
 * to cut the thread's timed wait short.
 */
static pthread_cond_t heartbeat_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t heartbeat_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool heart_beating = false;

static void *_heartbeat_thread(void *no_data);
static void *_heartbeat_thread(void *no_data)
{
/*
* The frequency needs to be faster than slurmctld_timeout,
* or the backup controller may try to assume control.
* One-fourth is very conservative, one-half should be sufficient.
* Have it happen at least every 30 seconds if the timeout is quite
* large.
*/
int beat = MIN(slurm_conf.slurmctld_timeout / 4, 30);
time_t now;
uint64_t nl;
struct timespec ts = {0, 0};
char *reg_file, *new_file;
int fd;
debug("Heartbeat thread started, beating every %d seconds.", beat);
slurm_mutex_lock(&heartbeat_mutex);
while (heart_beating) {
now = time(NULL);
ts.tv_sec = now + beat;
debug3("Heartbeat at %ld", now);
/*
* Rebuild file path each beat just in case someone changes
* StateSaveLocation and runs reconfigure.
*/
reg_file = xstrdup_printf("%s/heartbeat",
slurm_conf.state_save_location);
new_file = xstrdup_printf("%s.new", reg_file);
fd = open(new_file, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0600);
if (fd < 0) {
error("%s: heartbeat file creation failed to %s.",
__func__, new_file);
goto delay;
}
nl = HTON_uint64(((uint64_t) now));
if (write(fd, &nl, sizeof(uint64_t)) != sizeof(uint64_t)) {
error("%s: heartbeat write failed to %s.",
__func__, new_file);
close(fd);
(void) unlink(new_file);
goto delay;
}
nl = HTON_uint64(((uint64_t) backup_inx));
if (write(fd, &nl, sizeof(uint64_t)) != sizeof(uint64_t)) {
error("%s: heartbeat write failed to %s.",
__func__, new_file);
close(fd);
(void) unlink(new_file);
goto delay;
}
if (fsync_and_close(fd, "heartbeat")) {
(void) unlink(new_file);
goto delay;
}
/* shuffle files around */
(void) unlink(reg_file);
if (link(new_file, reg_file))
debug("%s: unable to create link for %s -> %s, %m",
__func__, new_file, reg_file);
(void) unlink(new_file);
delay:
xfree(reg_file);
xfree(new_file);
slurm_cond_timedwait(&heartbeat_cond, &heartbeat_mutex, &ts);
}
slurm_mutex_unlock(&heartbeat_mutex);
return NULL;
}
/*
 * Launch the heartbeat thread when at least two controllers are configured
 * (slurm_conf.control_cnt >= 2). Otherwise only a debug message is logged
 * and no thread is created.
 */
void heartbeat_start(void)
{
	if (slurm_conf.control_cnt >= 2) {
		/*
		 * The new thread takes heartbeat_mutex before it first tests
		 * heart_beating, so it cannot see the flag until it has been
		 * set and the lock released here.
		 */
		slurm_mutex_lock(&heartbeat_mutex);
		slurm_thread_create_detached(_heartbeat_thread, NULL);
		heart_beating = true;
		slurm_mutex_unlock(&heartbeat_mutex);
		return;
	}

	debug("No backup controllers, not launching heartbeat.");
}
/*
 * Ask the heartbeat thread to exit: clear heart_beating and signal
 * heartbeat_cond so the thread's timed wait ends early. Does nothing beyond
 * taking and releasing the lock if the flag is already clear.
 */
void heartbeat_stop(void)
{
	slurm_mutex_lock(&heartbeat_mutex);

	if (!heart_beating) {
		slurm_mutex_unlock(&heartbeat_mutex);
		return;
	}

	heart_beating = false;
	slurm_cond_signal(&heartbeat_cond);

	slurm_mutex_unlock(&heartbeat_mutex);
}
#define OPEN_RETRIES 3
time_t get_last_heartbeat(int *server_inx)
{
char *file;
int fd = -1, i;
uint64_t value;
uint64_t inx;
file = xstrdup_printf("%s/heartbeat",
slurm_conf.state_save_location);
/*
* Retry the open() in case the primary is rearranging things
* at the moment. Once opened, our handle should persist during
* the shuffle, as the contents are left intact.
*/
for (i = 0; (i < OPEN_RETRIES) && (fd < 0); i++) {
if (i) {
debug("%s: sleeping before attempt %d to open heartbeat",
__func__, i);
usleep(100000);
}
fd = open(file, O_RDONLY);
}
if (fd < 0) {
error("%s: heartbeat open attempt failed from %s.",
__func__, file);
xfree(file);
return 0;
}
if (read(fd, &value, sizeof(uint64_t)) != sizeof(uint64_t)) {
error("%s: heartbeat read failed from %s.",
__func__, file);
value = 0;
}
if (read(fd, &inx, sizeof(uint64_t)) != sizeof(uint64_t)) {
error("%s: heartbeat read failed from %s.",
__func__, file);
} else if (server_inx) {
*server_inx = NTOH_uint64(inx);
}
close(fd);
xfree(file);
return (time_t) NTOH_uint64(value);
}