| /*****************************************************************************\ |
| * heartbeat.c |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #define _GNU_SOURCE |
| |
| #include <fcntl.h> |
| #include <pthread.h> |
| #include <time.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| |
| #include "src/common/fd.h" |
| #include "src/common/xstring.h" |
| #include "src/slurmctld/heartbeat.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmctld/state_save.h" |
| |
| /* |
| * Write to a file at a frequent interval to demonstrate that the primary |
| * is still alive and active, and could thus change the contents of |
| * StateSaveLocation at any point in time. This is monitored in the backup |
| * and will prevent the backup controller from assuming control in periods |
| * of high load (as this thread does not depend on any other locks within |
| * slurmctld) or if the network path between primary <-> backup is lost but |
| * the path to the StateSaveLocation storage remains intact. |
| * |
| * Will only run if a BackupController is setup, otherwise this is a no-op |
| * and no thread will be launched. |
| */ |
| |
/* Thread body; defined below and launched by heartbeat_start(). */
static void *_heartbeat_thread(void *no_data);

/*
 * heartbeat_mutex guards heart_beating. heartbeat_cond is what the
 * heartbeat thread waits on between beats; heartbeat_stop() signals it
 * so the thread notices the cleared flag without waiting out the delay.
 */
static pthread_mutex_t heartbeat_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t heartbeat_cond = PTHREAD_COND_INITIALIZER;

/* True while the heartbeat thread is expected to keep running. */
static bool heart_beating = false;
| |
| static void *_heartbeat_thread(void *no_data) |
| { |
| /* |
| * The frequency needs to be faster than slurmctld_timeout, |
| * or the backup controller may try to assume control. |
| * One-fourth is very conservative, one-half should be sufficient. |
| * Have it happen at least every 30 seconds if the timeout is quite |
| * large. |
| */ |
| int beat = MIN(slurm_conf.slurmctld_timeout / 4, 30); |
| time_t now; |
| uint64_t nl; |
| struct timespec ts = {0, 0}; |
| char *reg_file, *new_file; |
| int fd; |
| |
| debug("Heartbeat thread started, beating every %d seconds.", beat); |
| |
| slurm_mutex_lock(&heartbeat_mutex); |
| while (heart_beating) { |
| now = time(NULL); |
| ts.tv_sec = now + beat; |
| |
| debug3("Heartbeat at %ld", now); |
| /* |
| * Rebuild file path each beat just in case someone changes |
| * StateSaveLocation and runs reconfigure. |
| */ |
| reg_file = xstrdup_printf("%s/heartbeat", |
| slurm_conf.state_save_location); |
| new_file = xstrdup_printf("%s.new", reg_file); |
| |
| fd = open(new_file, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0600); |
| if (fd < 0) { |
| error("%s: heartbeat file creation failed to %s.", |
| __func__, new_file); |
| goto delay; |
| } |
| |
| nl = HTON_uint64(((uint64_t) now)); |
| if (write(fd, &nl, sizeof(uint64_t)) != sizeof(uint64_t)) { |
| error("%s: heartbeat write failed to %s.", |
| __func__, new_file); |
| close(fd); |
| (void) unlink(new_file); |
| goto delay; |
| } |
| |
| nl = HTON_uint64(((uint64_t) backup_inx)); |
| if (write(fd, &nl, sizeof(uint64_t)) != sizeof(uint64_t)) { |
| error("%s: heartbeat write failed to %s.", |
| __func__, new_file); |
| close(fd); |
| (void) unlink(new_file); |
| goto delay; |
| } |
| |
| if (fsync_and_close(fd, "heartbeat")) { |
| (void) unlink(new_file); |
| goto delay; |
| } |
| |
| /* shuffle files around */ |
| (void) unlink(reg_file); |
| if (link(new_file, reg_file)) |
| debug("%s: unable to create link for %s -> %s, %m", |
| __func__, new_file, reg_file); |
| (void) unlink(new_file); |
| |
| delay: |
| xfree(reg_file); |
| xfree(new_file); |
| slurm_cond_timedwait(&heartbeat_cond, &heartbeat_mutex, &ts); |
| } |
| slurm_mutex_unlock(&heartbeat_mutex); |
| |
| return NULL; |
| } |
| |
| void heartbeat_start(void) |
| { |
| if (slurm_conf.control_cnt < 2) { |
| debug("No backup controllers, not launching heartbeat."); |
| return; |
| } |
| |
| slurm_mutex_lock(&heartbeat_mutex); |
| slurm_thread_create_detached(_heartbeat_thread, NULL); |
| heart_beating = true; |
| slurm_mutex_unlock(&heartbeat_mutex); |
| } |
| |
| void heartbeat_stop(void) |
| { |
| slurm_mutex_lock(&heartbeat_mutex); |
| if (heart_beating) { |
| heart_beating = false; |
| slurm_cond_signal(&heartbeat_cond); |
| } |
| slurm_mutex_unlock(&heartbeat_mutex); |
| } |
| |
| #define OPEN_RETRIES 3 |
| |
| time_t get_last_heartbeat(int *server_inx) |
| { |
| char *file; |
| int fd = -1, i; |
| uint64_t value; |
| uint64_t inx; |
| |
| file = xstrdup_printf("%s/heartbeat", |
| slurm_conf.state_save_location); |
| |
| /* |
| * Retry the open() in case the primary is rearranging things |
| * at the moment. Once opened, our handle should persist during |
| * the shuffle, as the contents are left intact. |
| */ |
| for (i = 0; (i < OPEN_RETRIES) && (fd < 0); i++) { |
| if (i) { |
| debug("%s: sleeping before attempt %d to open heartbeat", |
| __func__, i); |
| usleep(100000); |
| } |
| fd = open(file, O_RDONLY); |
| } |
| |
| if (fd < 0) { |
| error("%s: heartbeat open attempt failed from %s.", |
| __func__, file); |
| xfree(file); |
| return 0; |
| } |
| |
| if (read(fd, &value, sizeof(uint64_t)) != sizeof(uint64_t)) { |
| error("%s: heartbeat read failed from %s.", |
| __func__, file); |
| value = 0; |
| } |
| if (read(fd, &inx, sizeof(uint64_t)) != sizeof(uint64_t)) { |
| error("%s: heartbeat read failed from %s.", |
| __func__, file); |
| } else if (server_inx) { |
| *server_inx = NTOH_uint64(inx); |
| } |
| |
| close(fd); |
| xfree(file); |
| |
| return (time_t) NTOH_uint64(value); |
| } |