blob: 13e61adb9238d2a5fe03ad4861a636574cd3fb06 [file] [edit]
/*****************************************************************************\
* backup.c - backup slurm dbd
*****************************************************************************
* Copyright (C) 2009 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Danny Auble <da@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.schedmd.com/slurmdocs/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <sys/poll.h>
#include "src/common/xmalloc.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/fd.h"
#include "src/common/log.h"
#include "src/common/slurmdbd_defs.h"
#include "src/slurmdbd/backup.h"
bool primary_resumed = false;
bool backup = false;
bool have_control = false;
static slurm_fd_t slurmdbd_fd = -1;
/* Open a connection to the Slurm DBD and set slurmdbd_fd */
static void _open_slurmdbd_fd(slurm_addr_t dbd_addr)
{
if (dbd_addr.sin_port == 0) {
error("sin_port == 0 in the slurmdbd backup");
return;
}
slurmdbd_fd = slurm_open_msg_conn(&dbd_addr);
if (slurmdbd_fd >= 0)
fd_set_nonblocking(slurmdbd_fd);
}
/* Close the SlurmDbd connection */
static void _close_slurmdbd_fd(void)
{
if (slurmdbd_fd >= 0) {
close(slurmdbd_fd);
slurmdbd_fd = -1;
}
}
/* Reopen the Slurm DBD connection due to some error */
static void _reopen_slurmdbd_fd(slurm_addr_t dbd_addr)
{
_close_slurmdbd_fd();
_open_slurmdbd_fd(dbd_addr);
}
/* run_backup - this is the backup controller, it should run in standby
* mode, assuming control when the primary controller stops responding */
extern void run_backup(void)
{
slurm_addr_t dbd_addr;
primary_resumed = false;
/* get a connection */
slurm_set_addr(&dbd_addr, slurmdbd_conf->dbd_port,
slurmdbd_conf->dbd_host);
if (dbd_addr.sin_port == 0)
error("Unable to locate SlurmDBD host %s:%u",
slurmdbd_conf->dbd_host, slurmdbd_conf->dbd_port);
else
_open_slurmdbd_fd(dbd_addr);
/* repeatedly ping Primary */
while (!shutdown_time) {
bool writeable = fd_writeable(slurmdbd_fd);
//info("%d %d", have_control, writeable);
if (have_control && writeable) {
info("Primary has come back");
primary_resumed = true;
shutdown_threads();
have_control = false;
break;
} else if (!have_control && !writeable) {
have_control = true;
info("Taking Control");
break;
}
sleep(1);
if (!writeable)
_reopen_slurmdbd_fd(dbd_addr);
}
_close_slurmdbd_fd();
return;
}