/*****************************************************************************\
 * backup.c - backup slurm controller
 *****************************************************************************
 * Copyright (C) 2002-2007 The Regents of the University of California.
 * Copyright (C) 2008-2010 Lawrence Livermore National Security.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Morris Jette <jette@llnl.gov>, et. al.
 * CODE-OCEC-09-009. All rights reserved.
 *
 * This file is part of Slurm, a resource management program.
 * For details, see <https://slurm.schedmd.com/>.
 * Please also read the included file: DISCLAIMER.
 *
 * Slurm is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * In addition, as a special exception, the copyright holders give permission
 * to link the code of portions of this program with the OpenSSL library under
 * certain conditions as described in each individual source file, and
 * distribute linked combinations including the two. You must obey the GNU
 * General Public License in all respects for all of the code used other than
 * OpenSSL. If you modify file(s) with this exception, you may extend this
 * exception to your version of the file(s), but you are not obligated to do
 * so. If you do not wish to do so, delete this exception statement from your
 * version. If you delete this exception statement from all source files in
 * the program, then also delete it here.
 *
 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Slurm; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#include "config.h"

#include <errno.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/stat.h>

#include "slurm/slurm_errno.h"

#include "src/common/daemonize.h"
#include "src/common/log.h"
#include "src/common/macros.h"
#include "src/common/xstring.h"

#include "src/conmgr/conmgr.h"

#include "src/interfaces/accounting_storage.h"
#include "src/interfaces/auth.h"
#include "src/interfaces/priority.h"
#include "src/interfaces/select.h"
#include "src/interfaces/switch.h"

#include "src/slurmctld/agent.h"
#include "src/slurmctld/heartbeat.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/read_config.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/trigger_mgr.h"

#define _DEBUG 0
#define SHUTDOWN_WAIT 2 /* Time to wait for primary server shutdown */

static void _backup_reconfig(void);
static int _shutdown_primary_controller(int wait_time);
static void * _trigger_slurmctld_event(void *arg);

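/*
 * Arguments handed to each _ping_ctld_thread: the index of the controller
 * to ping and how to reach it.
 */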
typedef struct ping_struct {
        int backup_inx;
        char *control_addr;
        char *control_machine;
        uint32_t slurmctld_port;
} ping_struct_t;

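/*
 * Result of pinging one controller: the time at which it reports having
 * assumed control (zero if never) and whether it responded at all.
 */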
typedef struct {
        time_t control_time;
        bool responding;
} ctld_ping_t;

/* Local variables */
static ctld_ping_t *ctld_ping = NULL;
static bool dump_core = false;
static time_t last_controller_response;
static pthread_mutex_t ping_mutex = PTHREAD_MUTEX_INITIALIZER;
static volatile bool takeover = false;
static pthread_cond_t shutdown_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t shutdown_mutex = PTHREAD_MUTEX_INITIALIZER;
static int shutdown_rc = SLURM_SUCCESS;
static int shutdown_thread_cnt = 0;
static int shutdown_timeout = 0;

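/*
 * backup_on_sighup - reload the slurm.conf parameters in response to a
 * SIGHUP received while running in backup mode
 */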
extern void backup_on_sighup(void)
{
        slurmctld_lock_t config_write_lock = {
                .conf = WRITE_LOCK,
                .job = WRITE_LOCK,
                .node = WRITE_LOCK,
                .part = WRITE_LOCK,
        };

        /*
         * XXX - need to shut down the scheduler
         * plugin, re-read the configuration, and then
         * restart the (possibly new) plugin.
         */
        lock_slurmctld(config_write_lock);
        _backup_reconfig();
        unlock_slurmctld(config_write_lock);
}

/*
 * run_backup - this is the backup controller; it runs in standby mode,
 * assuming control when the primary controller stops responding
 */
void run_backup(void)
{
        int i;
        time_t last_ping = 0;
        slurmctld_lock_t config_read_lock = {
                READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
        slurmctld_lock_t config_write_lock = {
                WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };

        info("slurmctld running in background mode");
        takeover = false;
        last_controller_response = time(NULL);

        /* default: don't resume if shutdown */
        slurmctld_config.resume_backup = false;

        /* It is now ok to tell the primary I am done (if I ever had control) */
        slurm_mutex_lock(&slurmctld_config.backup_finish_lock);
        slurm_cond_broadcast(&slurmctld_config.backup_finish_cond);
        slurm_mutex_unlock(&slurmctld_config.backup_finish_lock);

        slurm_thread_create_detached(_trigger_slurmctld_event, NULL);

        /* wait for the heartbeat file to exist before starting */
        while (!get_last_heartbeat(NULL) &&
               (slurmctld_config.shutdown_time == 0)) {
                warning("Waiting for heartbeat file to exist...");
                sleep(1);
        }

        for (i = 0; ((i < 5) && (slurmctld_config.shutdown_time == 0)); i++) {
                sleep(1); /* Give the primary slurmctld set-up time */
        }

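        /*
         * The primary should be up by now; start accepting incoming RPC
         * connections on this backup.
         */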
        listeners_unquiesce();

        /* repeatedly ping ControlMachine */
        while (slurmctld_config.shutdown_time == 0) {
                sleep(1);
                /* Lock of slurm_conf below not important */
                if (slurm_conf.slurmctld_timeout && (takeover == false) &&
                    ((time(NULL) - last_ping) <
                     (slurm_conf.slurmctld_timeout / 3)))
                        continue;

                last_ping = time(NULL);
                if (ping_controllers(false) == SLURM_SUCCESS)
                        last_controller_response = time(NULL);
                else if (takeover) {
                        /*
                         * In takeover mode, take control as soon as the
                         * primary no longer responds.
                         */
                        break;
                } else {
                        char *abort_msg = NULL;
                        bool abort_takeover = false;
                        static time_t prev_heartbeat = 0;
                        time_t use_time, last_heartbeat;
                        int server_inx = -1;

                        last_heartbeat = get_last_heartbeat(&server_inx);
                        debug("%s: last_heartbeat %ld from server %d",
                              __func__, last_heartbeat, server_inx);

                        use_time = last_controller_response;
                        if (server_inx > backup_inx) {
                                info("Lower priority slurmctld is currently primary (%d > %d)",
                                     server_inx, backup_inx);
                        } else if (last_heartbeat > last_controller_response) {
                                /* Race condition for time stamps */
                                debug("Last message to the controller was at %ld,"
                                      " but the last heartbeat was written at %ld,"
                                      " trusting the filesystem instead of the network"
                                      " and not asserting control at this time.",
                                      last_controller_response, last_heartbeat);
                                use_time = last_heartbeat;
                        }

                        if (!last_heartbeat) {
                                /*
                                 * Failed to read the heartbeat file, abort
                                 * takeover because the StateSaveLocation is
                                 * broken.
                                 */
                                abort_takeover = true;
                                abort_msg = "Not taking control. Primary slurmctld is unresponsive, but heartbeat file could not be read. Something is wrong with your StateSaveLocation.";
                        } else if (!prev_heartbeat) {
                                /*
                                 * Need at least one loop to detect if the
                                 * primary is still running.
                                 */
                                abort_takeover = true;
                                abort_msg = "Not taking control. Primary slurmctld is unresponsive, but not yet able to determine if primary may actually be running.";
                        } else if (last_heartbeat != prev_heartbeat) {
                                /*
                                 * If the primary is unresponsive but the
                                 * heartbeat is getting updated, consider the
                                 * controller still "working" and abort the
                                 * takeover.
                                 */
                                abort_takeover = true;
                                abort_msg = "Not taking control. Primary slurmctld is unresponsive, but is still updating the heartbeat file. Check for clock skew.";
                        }

                        prev_heartbeat = last_heartbeat;

                        if (((time(NULL) - use_time) >
                             slurm_conf.slurmctld_timeout)) {
                                if (!abort_takeover) {
                                        prev_heartbeat = 0;
                                        break;
                                }
                                error("%s", abort_msg);
                        }
                }
        }

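        /*
         * Done pinging; stop accepting new RPC connections while we either
         * shut down or take over as the primary.
         */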
        listeners_quiesce();

        if (slurmctld_config.shutdown_time != 0) {
                /*
                 * Since pidfile is created as user root (its owner is
                 * changed to SlurmUser) SlurmUser may not be able to
                 * remove it, so this is not necessarily an error.
                 * No longer need slurm_conf lock after above join.
                 */
                if (unlink(slurm_conf.slurmctld_pidfile) < 0)
                        verbose("Unable to remove pidfile '%s': %m",
                                slurm_conf.slurmctld_pidfile);

                info("BackupController terminating");
                log_fini();
                if (dump_core)
                        abort();
                else
                        exit(0);
        }

        lock_slurmctld(config_read_lock);
        error("ControlMachine %s not responding, BackupController%d %s taking over",
              slurm_conf.control_machine[0], backup_inx,
              slurmctld_config.node_name_short);
        unlock_slurmctld(config_read_lock);

        backup_slurmctld_restart();
        trigger_primary_ctld_fail();
        trigger_backup_ctld_as_ctrl();

        pthread_kill(pthread_self(), SIGTERM);

        /*
         * Explicitly shut down the agent. The agent may have shut down in
         * whole or in part once slurmctld_config.shutdown_time was set.
         * Remove any doubt about its state here.
         */
        agent_fini();

        /*
         * The job list needs to be freed before we run ctld_assoc_mgr_init();
         * it should already be empty at this point.
         */
        lock_slurmctld(config_write_lock);
        job_fini();

        /*
         * The backup is now done shutting down, reset shutdown_time before
         * re-initializing.
         */
        slurmctld_config.shutdown_time = (time_t) 0;

        init_job_conf();
        unlock_slurmctld(config_write_lock);

        /*
         * Init the agent here so it comes up at roughly the same point as in
         * a normal startup.
         */
        agent_init();

        /* Calls assoc_mgr_init() */
        ctld_assoc_mgr_init();

        /*
         * priority_g_init() needs to be called after assoc_mgr_init()
         * and before read_slurm_conf() because jobs could be killed
         * during read_slurm_conf() and call priority_g_job_end().
         */
        if (priority_g_init() != SLURM_SUCCESS)
                fatal("failed to initialize priority plugin");

        /* clear old state and read new state */
        lock_slurmctld(config_write_lock);
        if (switch_g_restore(true)) {
                error("failed to restore switch state");
                abort();
        }
        if (read_slurm_conf(2)) { /* Recover all state */
                error("Unable to recover slurm state");
                abort();
        }
        configless_update();
        if (conf_includes_list) {
                /*
                 * Clear included files so that subsequent conf
                 * parsings refill it with updated information.
                 */
                list_flush(conf_includes_list);
        }
        select_g_select_nodeinfo_set_all();
        unlock_slurmctld(config_write_lock);
}

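/*
 * conmgr callback: a new RPC connection has been accepted while running as
 * the backup. The connection pointer is returned so it is passed back as
 * the arg to the other backup callbacks.
 */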
extern void *on_backup_connection(conmgr_fd_t *con, void *arg)
{
        debug3("%s: [%s] BACKUP: New RPC connection",
               __func__, conmgr_fd_get_name(con));

        return con;
}

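/* conmgr callback: a backup-mode RPC connection has been closed */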
extern void on_backup_finish(conmgr_fd_t *con, void *arg)
{
        xassert(arg == con);

        debug3("%s: [%s] BACKUP: finish RPC connection",
               __func__, conmgr_fd_get_name(con));
}

/* Process an RPC sent to the backup controller */
extern int on_backup_msg(conmgr_fd_t *con, slurm_msg_t *msg, void *arg)
{
        int error_code = SLURM_SUCCESS;
        bool send_rc = true;

        xassert(arg == con);

        if (!msg->auth_ids_set)
                fatal_abort("this should never happen");

        log_flag(PROTOCOL, "%s: [%s] Received opcode %s from uid %u",
                 __func__, conmgr_fd_get_name(con), rpc_num2string(msg->msg_type),
                 msg->auth_uid);

        if (msg->msg_type != REQUEST_PING) {
                bool super_user = false;

                if (validate_slurm_user(msg->auth_uid))
                        super_user = true;

                if (super_user && (msg->msg_type == REQUEST_SHUTDOWN)) {
                        info("Performing background RPC: REQUEST_SHUTDOWN");
                        pthread_kill(pthread_self(), SIGTERM);
                } else if (super_user &&
                           (msg->msg_type == REQUEST_TAKEOVER)) {
                        info("Performing background RPC: REQUEST_TAKEOVER");
                        if (get_last_heartbeat(NULL)) {
                                _shutdown_primary_controller(SHUTDOWN_WAIT);
                                takeover = true;
                                error_code = SLURM_SUCCESS;
                        } else {
                                error_code = ESLURM_TAKEOVER_NO_HEARTBEAT;
                        }
                } else if (super_user &&
                           (msg->msg_type == REQUEST_CONTROL)) {
                        debug3("Ignoring RPC: REQUEST_CONTROL");
                        error_code = ESLURM_DISABLED;
                        last_controller_response = time(NULL);
                } else if (msg->msg_type == REQUEST_CONTROL_STATUS) {
                        slurm_rpc_control_status(msg);
                        send_rc = false;
                } else if (msg->msg_type == REQUEST_CONFIG) {
                        /*
                         * Config was requested from the wrong controller.
                         * This usually indicates a misconfiguration, so
                         * return an error that redirects the client to the
                         * correct controller.
                         */
                        error("REQUEST_CONFIG received while in standby.");
                        error_code = ESLURM_IN_STANDBY_USE_BACKUP;
                } else {
                        error("Invalid RPC received %s while in standby mode",
                              rpc_num2string(msg->msg_type));
                        error_code = ESLURM_IN_STANDBY_MODE;
                }
        }
        if (send_rc)
                slurm_send_rc_msg(msg, error_code);

        slurm_free_msg(msg);
        return SLURM_SUCCESS;
}

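/*
 * Thread to send REQUEST_CONTROL_STATUS to one controller and record, under
 * ping_mutex, whether it responded and when it reports having taken control.
 */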
static void *_ping_ctld_thread(void *arg)
{
        ping_struct_t *ping = (ping_struct_t *) arg;
        slurm_msg_t req, resp;
        control_status_msg_t *control_msg;
        time_t control_time = (time_t) 0;
        bool responding = false;

        slurm_msg_t_init(&req);
        slurm_set_addr(&req.address, ping->slurmctld_port, ping->control_addr);
        req.msg_type = REQUEST_CONTROL_STATUS;
        slurm_msg_set_r_uid(&req, SLURM_AUTH_UID_ANY);
        if (slurm_send_recv_node_msg(&req, &resp, 0) == SLURM_SUCCESS) {
                switch (resp.msg_type) {
                case RESPONSE_CONTROL_STATUS:
                        control_msg = (control_status_msg_t *) resp.data;
                        if (ping->backup_inx != control_msg->backup_inx) {
                                error("%s: BackupController# index mismatch (%d != %u) from host %s",
                                      __func__, ping->backup_inx,
                                      control_msg->backup_inx,
                                      ping->control_machine);
                        }
                        control_time = control_msg->control_time;
                        responding = true;
                        break;
                default:
                        error("%s: Unknown response message %u from host %s",
                              __func__, resp.msg_type, ping->control_machine);
                        break;
                }
                slurm_free_msg_data(resp.msg_type, resp.data);
                if (resp.auth_cred)
                        auth_g_destroy(resp.auth_cred);
        }

        slurm_mutex_lock(&ping_mutex);
        if (responding) {
                ctld_ping[ping->backup_inx].control_time = control_time;
                ctld_ping[ping->backup_inx].responding = true;
        }
        slurm_mutex_unlock(&ping_mutex);

        xfree(ping->control_addr);
        xfree(ping->control_machine);
        xfree(ping);

        return NULL;
}

/*
 * Ping the other control nodes: all of them if we are the active controller,
 * otherwise only those with higher priority than this backup.
 * RET SLURM_SUCCESS if an active or available controller is found
 */
extern int ping_controllers(bool active_controller)
{
        int i, ping_target_cnt;
        ping_struct_t *ping;
        pthread_t *ping_tids;
        /* Locks: Read configuration */
        slurmctld_lock_t config_read_lock = {
                READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
        bool active_ctld = false, avail_ctld = false;

        if (active_controller)
                ping_target_cnt = slurm_conf.control_cnt;
        else
                ping_target_cnt = backup_inx;

        ctld_ping = xcalloc(ping_target_cnt, sizeof(ctld_ping_t));
        ping_tids = xcalloc(ping_target_cnt, sizeof(pthread_t));

        for (i = 0; i < ping_target_cnt; i++) {
                ctld_ping[i].control_time = (time_t) 0;
                ctld_ping[i].responding = false;
        }

        lock_slurmctld(config_read_lock);
        for (i = 0; i < ping_target_cnt; i++) {
                if (i == backup_inx) /* Avoid pinging ourselves */
                        continue;

                ping = xmalloc(sizeof(ping_struct_t));
                ping->backup_inx = i;
                ping->control_addr = xstrdup(slurm_conf.control_addr[i]);
                ping->control_machine = xstrdup(slurm_conf.control_machine[i]);
                ping->slurmctld_port = slurm_conf.slurmctld_port;
                slurm_thread_create(&ping_tids[i], _ping_ctld_thread, ping);
        }
        unlock_slurmctld(config_read_lock);

        for (i = 0; i < ping_target_cnt; i++) {
                if (i == backup_inx) /* Avoid pinging ourselves */
                        continue;
                slurm_thread_join(ping_tids[i]);
        }
        xfree(ping_tids);

        for (i = 0; i < ping_target_cnt; i++) {
                if (i == backup_inx) /* Avoid pinging ourselves */
                        continue;
                if (ctld_ping[i].control_time) {
                        /*
                         * Higher priority slurmctld is already in
                         * primary mode
                         */
                        active_ctld = true;
                }
                if (ctld_ping[i].responding) {
                        /*
                         * Higher priority slurmctld is available to
                         * enter primary mode
                         */
                        avail_ctld = true;
                } else if (active_controller) {
                        trigger_backup_ctld_fail(i);
                }
        }

        xfree(ctld_ping);
        if (active_ctld || avail_ctld)
                return SLURM_SUCCESS;
        return SLURM_ERROR;
}

/*
 * Reload the slurm.conf parameters without any processing
 * of the node, partition, or state information.
 * Specifically, we don't want to purge batch scripts based
 * upon old job state information.
 * This is a stripped down version of read_slurm_conf(0).
 */
static void _backup_reconfig(void)
{
        slurm_conf_reinit(NULL);
        update_logging();
        slurm_conf.last_update = time(NULL);
}

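/*
 * Thread to ask one other controller to either shut down (REQUEST_SHUTDOWN)
 * or relinquish control (REQUEST_CONTROL), recording the result under
 * shutdown_mutex.
 */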
static void *_shutdown_controller(void *arg)
{
        int shutdown_inx, rc = SLURM_SUCCESS, rc2 = SLURM_SUCCESS;
        slurm_msg_t req;
        bool do_shutdown = false;
        shutdown_arg_t *shutdown_arg;
        shutdown_msg_t shutdown_msg;

        shutdown_arg = (shutdown_arg_t *)arg;
        shutdown_inx = shutdown_arg->index;
        do_shutdown = shutdown_arg->shutdown;
        xfree(arg);

        slurm_msg_t_init(&req);
        slurm_msg_set_r_uid(&req, slurm_conf.slurm_user_id);
        slurm_set_addr(&req.address, slurm_conf.slurmctld_port,
                       slurm_conf.control_addr[shutdown_inx]);
        if (do_shutdown) {
                req.msg_type = REQUEST_SHUTDOWN;
                shutdown_msg.options = SLURMCTLD_SHUTDOWN_CTLD;
                req.data = &shutdown_msg;
        } else {
                req.msg_type = REQUEST_CONTROL;
        }
        if (slurm_send_recv_rc_msg_only_one(&req, &rc2, shutdown_timeout) < 0) {
                error("%s: send/recv(%s): %m",
                      __func__, slurm_conf.control_machine[shutdown_inx]);
                rc = SLURM_ERROR;
        } else if (rc2 == ESLURM_DISABLED) {
                debug("primary controller responding");
        } else if (rc2 == SLURM_SUCCESS) {
                debug("primary controller has relinquished control");
        } else {
                error("%s(%s): %s",
                      __func__, slurm_conf.control_machine[shutdown_inx],
                      slurm_strerror(rc2));
                rc = SLURM_ERROR;
        }

        slurm_mutex_lock(&shutdown_mutex);
        if (rc != SLURM_SUCCESS)
                shutdown_rc = rc;
        shutdown_thread_cnt--;
        slurm_cond_signal(&shutdown_cond);
        slurm_mutex_unlock(&shutdown_mutex);
        return NULL;
}

/*
 * Tell the primary controller and all other possible controller daemons to
 * relinquish control; the primary control_machine must suspend operation.
 * Based on _shutdown_backup_controller from controller.c
 * wait_time - How long to wait for primary controller to write state, seconds.
 * RET 0 or an error code
 * NOTE: READ lock_slurmctld config before entry (or be single-threaded)
 */
static int _shutdown_primary_controller(int wait_time)
{
        int i;
        shutdown_arg_t *shutdown_arg;

        if (shutdown_timeout == 0) {
                shutdown_timeout = slurm_conf.msg_timeout / 2;
                shutdown_timeout = MAX(shutdown_timeout, 2); /* 2 sec min */
                shutdown_timeout = MIN(shutdown_timeout, CONTROL_TIMEOUT);
                shutdown_timeout *= 1000; /* sec to msec */
        }

        if ((slurm_conf.control_addr[0] == NULL) ||
            (slurm_conf.control_addr[0][0] == '\0')) {
                error("%s: no primary controller to shutdown", __func__);
                return SLURM_ERROR;
        }

        shutdown_rc = SLURM_SUCCESS;
        for (i = 0; i < slurm_conf.control_cnt; i++) {
                if (i == backup_inx)
                        continue; /* No message to self */

                shutdown_arg = xmalloc(sizeof(*shutdown_arg));
                shutdown_arg->index = i;
                /*
                 * We need to send an actual REQUEST_SHUTDOWN to the
                 * non-primary ctlds so they shut down cleanly and do not
                 * contend for the primary position; otherwise "takeover"
                 * results in contention among the backups for the primary
                 * position.
                 */
                if (i < backup_inx)
                        shutdown_arg->shutdown = true;
                slurm_thread_create_detached(_shutdown_controller,
                                             shutdown_arg);
                slurm_mutex_lock(&shutdown_mutex);
                shutdown_thread_cnt++;
                slurm_mutex_unlock(&shutdown_mutex);
        }

        slurm_mutex_lock(&shutdown_mutex);
        while (shutdown_thread_cnt != 0) {
                slurm_cond_wait(&shutdown_cond, &shutdown_mutex);
        }
        slurm_mutex_unlock(&shutdown_mutex);

        /*
         * FIXME: Ideally the REQUEST_CONTROL RPC does not return until all
         * other activity has ceased and the state has been saved. That is
         * not presently the case (it returns when no other work is pending,
         * so the state save should occur right away). We sleep for a while
         * here to give the primary controller time to shut down.
         */
        if (wait_time)
                sleep(wait_time);

        return shutdown_rc;
}

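/*
 * Notify the trigger manager that this backup slurmctld has resumed
 * operation.
 */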
static void *_trigger_slurmctld_event(void *arg)
{
        trigger_info_t ti;

        memset(&ti, 0, sizeof(ti));
        ti.res_id = "*";
        ti.res_type = TRIGGER_RES_TYPE_SLURMCTLD;
        ti.trig_type = TRIGGER_TYPE_BU_CTLD_RES_OP;
        ti.control_inx = backup_inx;
        if (slurm_pull_trigger(&ti)) {
                error("%s: TRIGGER_TYPE_BU_CTLD_RES_OP send failure: %m",
                      __func__);
        } else {
                verbose("%s: TRIGGER_TYPE_BU_CTLD_RES_OP sent", __func__);
        }
        return NULL;
}