| /*****************************************************************************\ |
| * backup.c - backup slurm controller |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette@llnl.gov>, et. al. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include <errno.h> |
| #include <poll.h> |
| #include <pthread.h> |
| #include <signal.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/resource.h> |
| #include <sys/stat.h> |
| |
| #include "slurm/slurm_errno.h" |
| |
| #include "src/common/daemonize.h" |
| #include "src/common/log.h" |
| #include "src/common/macros.h" |
| #include "src/common/xsignal.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/interfaces/accounting_storage.h" |
| #include "src/interfaces/auth.h" |
| #include "src/interfaces/priority.h" |
| #include "src/interfaces/select.h" |
| #include "src/interfaces/switch.h" |
| |
| #include "src/slurmctld/agent.h" |
| #include "src/slurmctld/heartbeat.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/proc_req.h" |
| #include "src/slurmctld/read_config.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmctld/trigger_mgr.h" |
| |
| #define _DEBUG 0 |
| #define SHUTDOWN_WAIT 2 /* Time to wait for primary server shutdown */ |
| |
| static int _background_process_msg(slurm_msg_t * msg); |
| static void * _background_rpc_mgr(void *no_data); |
| static void * _background_signal_hand(void *no_data); |
| static void _backup_reconfig(void); |
| static int _shutdown_primary_controller(int wait_time); |
| static void * _trigger_slurmctld_event(void *arg); |
| |
/*
 * Argument block handed to one _ping_ctld_thread() instance.
 * The thread frees both strings and the structure itself.
 */
typedef struct ping_struct {
	int backup_inx;		/* index of the controller to ping */
	char *control_addr;	/* copy of that controller's address */
	char *control_machine;	/* copy of that controller's host name */
	uint32_t slurmctld_port;/* port the request is sent to */
} ping_struct_t;

/* Outcome of pinging one controller, indexed by controller index */
typedef struct {
	time_t control_time;	/* control_time from the reply, 0 if none */
	bool responding;	/* true once a control status reply arrived */
} ctld_ping_t;
| |
| /* Local variables */ |
| static ctld_ping_t * ctld_ping = NULL; |
| static bool dump_core = false; |
| static time_t last_controller_response; |
| static pthread_mutex_t ping_mutex = PTHREAD_MUTEX_INITIALIZER; |
| static volatile bool takeover = false; |
| static pthread_cond_t shutdown_cond = PTHREAD_COND_INITIALIZER; |
| static pthread_mutex_t shutdown_mutex = PTHREAD_MUTEX_INITIALIZER; |
| static int shutdown_rc = SLURM_SUCCESS; |
| static int shutdown_thread_cnt = 0; |
| static int shutdown_timeout = 0; |
| |
| /* |
| * Static list of signals to block in this process |
| * *Must be zero-terminated* |
| */ |
| static int backup_sigarray[] = { |
| SIGINT, SIGTERM, SIGCHLD, SIGUSR1, |
| SIGUSR2, SIGTSTP, SIGXCPU, SIGQUIT, |
| SIGPIPE, SIGALRM, SIGABRT, SIGHUP, 0 |
| }; |
| |
| /* |
| * run_backup - this is the backup controller, it should run in standby |
| * mode, assuming control when the primary controller stops responding |
| */ |
| void run_backup(void) |
| { |
| int i; |
| time_t last_ping = 0; |
| slurmctld_lock_t config_read_lock = { |
| READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; |
| slurmctld_lock_t config_write_lock = { |
| WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; |
| |
| info("slurmctld running in background mode"); |
| takeover = false; |
| last_controller_response = time(NULL); |
| |
| /* default: don't resume if shutdown */ |
| slurmctld_config.resume_backup = false; |
| |
| /* It is now ok to tell the primary I am done (if I ever had control) */ |
| slurm_mutex_lock(&slurmctld_config.backup_finish_lock); |
| slurm_cond_broadcast(&slurmctld_config.backup_finish_cond); |
| slurm_mutex_unlock(&slurmctld_config.backup_finish_lock); |
| |
| if (xsignal_block(backup_sigarray) < 0) |
| error("Unable to block signals"); |
| |
| /* |
| * create attached thread to process RPCs |
| */ |
| slurm_thread_create(&slurmctld_config.thread_id_rpc, |
| _background_rpc_mgr, NULL); |
| |
| /* |
| * create attached thread for signal handling |
| */ |
| slurm_thread_create(&slurmctld_config.thread_id_sig, |
| _background_signal_hand, NULL); |
| |
| slurm_thread_create_detached(_trigger_slurmctld_event, NULL); |
| |
| /* wait for the heartbeat file to exist before starting */ |
| while (!get_last_heartbeat(NULL) && |
| (slurmctld_config.shutdown_time == 0)) { |
| warning("Waiting for heartbeat file to exist..."); |
| sleep(1); |
| } |
| |
| for (i = 0; ((i < 5) && (slurmctld_config.shutdown_time == 0)); i++) { |
| sleep(1); /* Give the primary slurmctld set-up time */ |
| } |
| |
| /* repeatedly ping ControlMachine */ |
| while (slurmctld_config.shutdown_time == 0) { |
| sleep(1); |
| /* Lock of slurm_conf below not important */ |
| if (slurm_conf.slurmctld_timeout && (takeover == false) && |
| ((time(NULL) - last_ping) < |
| (slurm_conf.slurmctld_timeout / 3))) |
| continue; |
| |
| last_ping = time(NULL); |
| if (ping_controllers(false) == SLURM_SUCCESS) |
| last_controller_response = time(NULL); |
| else if (takeover) { |
| /* |
| * in takeover mode, take control as soon as |
| * primary no longer respond |
| */ |
| break; |
| } else { |
| time_t use_time, last_heartbeat; |
| int server_inx = -1; |
| last_heartbeat = get_last_heartbeat(&server_inx); |
| debug("%s: last_heartbeat %ld from server %d", |
| __func__, last_heartbeat, server_inx); |
| |
| use_time = last_controller_response; |
| if (server_inx > backup_inx) { |
| info("Lower priority slurmctld is currently primary (%d > %d)", |
| server_inx, backup_inx); |
| } else if (last_heartbeat > last_controller_response) { |
| /* Race condition for time stamps */ |
| debug("Last message to the controller was at %ld," |
| " but the last heartbeat was written at %ld," |
| " trusting the filesystem instead of the network" |
| " and not asserting control at this time.", |
| last_controller_response, last_heartbeat); |
| use_time = last_heartbeat; |
| } |
| |
| if (((time(NULL) - use_time) > |
| slurm_conf.slurmctld_timeout)) { |
| if (last_heartbeat) |
| break; |
| error("Not taking control. Heartbeat file could not be read and the primary slurmctld is unresponsive. Something is wrong with your StateSaveLocation."); |
| } |
| } |
| } |
| |
| if (slurmctld_config.shutdown_time != 0) { |
| /* |
| * Since pidfile is created as user root (its owner is |
| * changed to SlurmUser) SlurmUser may not be able to |
| * remove it, so this is not necessarily an error. |
| * No longer need slurm_conf lock after above join. |
| */ |
| if (unlink(slurm_conf.slurmctld_pidfile) < 0) |
| verbose("Unable to remove pidfile '%s': %m", |
| slurm_conf.slurmctld_pidfile); |
| |
| info("BackupController terminating"); |
| pthread_join(slurmctld_config.thread_id_sig, NULL); |
| log_fini(); |
| if (dump_core) |
| abort(); |
| else |
| exit(0); |
| } |
| |
| lock_slurmctld(config_read_lock); |
| error("ControlMachine %s not responding, BackupController%d %s taking over", |
| slurm_conf.control_machine[0], backup_inx, |
| slurmctld_config.node_name_short); |
| unlock_slurmctld(config_read_lock); |
| |
| backup_slurmctld_restart(); |
| trigger_primary_ctld_fail(); |
| trigger_backup_ctld_as_ctrl(); |
| |
| pthread_kill(slurmctld_config.thread_id_sig, SIGTERM); |
| pthread_join(slurmctld_config.thread_id_sig, NULL); |
| pthread_join(slurmctld_config.thread_id_rpc, NULL); |
| |
| /* |
| * Expressly shutdown the agent. The agent can in whole or in part |
| * shutdown once slutmctld_config.shutdown_time is set. Remove any |
| * doubt about its state here. |
| */ |
| agent_fini(); |
| |
| /* |
| * The job list needs to be freed before we run |
| * ctld_assoc_mgr_init, it should be empty here in the first place. |
| */ |
| lock_slurmctld(config_write_lock); |
| job_fini(); |
| |
| /* |
| * The backup is now done shutting down, reset shutdown_time before |
| * re-initializing. |
| */ |
| slurmctld_config.shutdown_time = (time_t) 0; |
| |
| init_job_conf(); |
| unlock_slurmctld(config_write_lock); |
| |
| /* |
| * Init the agent here so it comes up at roughly the same place as a |
| * normal startup. |
| */ |
| agent_init(); |
| |
| /* Calls assoc_mgr_init() */ |
| ctld_assoc_mgr_init(); |
| |
| /* |
| * priority_g_init() needs to be called after assoc_mgr_init() |
| * and before read_slurm_conf() because jobs could be killed |
| * during read_slurm_conf() and call priority_g_job_end(). |
| */ |
| if (priority_g_init() != SLURM_SUCCESS) |
| fatal("failed to initialize priority plugin"); |
| |
| /* clear old state and read new state */ |
| lock_slurmctld(config_write_lock); |
| if (switch_g_restore(slurm_conf.state_save_location, true)) { |
| error("failed to restore switch state"); |
| abort(); |
| } |
| if (read_slurm_conf(2, false)) { /* Recover all state */ |
| error("Unable to recover slurm state"); |
| abort(); |
| } |
| configless_update(); |
| if (conf_includes_list) { |
| /* |
| * clear included files so that subsequent conf |
| * parsings refill it with updated information. |
| */ |
| list_flush(conf_includes_list); |
| } |
| unlock_slurmctld(config_write_lock); |
| select_g_select_nodeinfo_set_all(); |
| |
| return; |
| } |
| |
| /* |
| * _background_signal_hand - Process daemon-wide signals for the |
| * backup controller |
| */ |
| static void *_background_signal_hand(void *no_data) |
| { |
| int sig, rc; |
| sigset_t set; |
| /* Locks: Write configuration, job, node, and partition */ |
| slurmctld_lock_t config_write_lock = { |
| WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; |
| |
| while (slurmctld_config.shutdown_time == 0) { |
| xsignal_sigset_create(backup_sigarray, &set); |
| rc = sigwait(&set, &sig); |
| if (rc == EINTR) |
| continue; |
| switch (sig) { |
| case SIGINT: /* kill -2 or <CTRL-C> */ |
| case SIGTERM: /* kill -15 */ |
| info("Terminate signal (SIGINT or SIGTERM) received"); |
| slurmctld_config.shutdown_time = time(NULL); |
| slurmctld_shutdown(); |
| return NULL; /* Normal termination */ |
| break; |
| case SIGHUP: /* kill -1 */ |
| info("Reconfigure signal (SIGHUP) received"); |
| /* |
| * XXX - need to shut down the scheduler |
| * plugin, re-read the configuration, and then |
| * restart the (possibly new) plugin. |
| */ |
| lock_slurmctld(config_write_lock); |
| _backup_reconfig(); |
| unlock_slurmctld(config_write_lock); |
| break; |
| case SIGABRT: /* abort */ |
| info("SIGABRT received"); |
| slurmctld_config.shutdown_time = time(NULL); |
| slurmctld_shutdown(); |
| dump_core = true; |
| return NULL; /* Normal termination */ |
| break; |
| case SIGUSR2: |
| info("Logrotate signal (SIGUSR2) received"); |
| update_logging(); |
| break; |
| default: |
| error("Invalid signal (%d) received", sig); |
| } |
| } |
| return NULL; |
| } |
| |
/*
 * _sig_handler - handler that deliberately does nothing.
 * It is installed for SIGUSR1 by _background_rpc_mgr() so that delivery of
 * the signal interrupts a blocking call instead of being acted upon.
 * IN signal - signal number, ignored
 */
static void _sig_handler(int signal)
{
	(void) signal;	/* intentionally unused */
}
| |
| /* |
| * _background_rpc_mgr - Read and process incoming RPCs to the background |
| * controller (that's us) |
| */ |
| static void *_background_rpc_mgr(void *no_data) |
| { |
| int newsockfd; |
| int fd_next = 0, i; |
| slurm_addr_t cli_addr; |
| slurm_msg_t msg; |
| |
| int sigarray[] = {SIGUSR1, 0}; |
| |
| debug3("_background_rpc_mgr pid = %lu", (unsigned long) getpid()); |
| |
| /* |
| * Prepare to catch SIGUSR1 to interrupt accept(). This signal is |
| * generated by the slurmctld signal handler thread upon receipt of |
| * SIGABRT, SIGINT, or SIGTERM. That thread does all processing of |
| * all signals. |
| */ |
| xsignal(SIGUSR1, _sig_handler); |
| xsignal_unblock(sigarray); |
| |
| /* |
| * Process incoming RPCs indefinitely |
| */ |
| while (slurmctld_config.shutdown_time == 0) { |
| if (poll(listen_fds, listen_nports, -1) == -1) { |
| if (errno != EINTR) |
| error("slurm_accept_msg_conn poll: %m"); |
| continue; |
| } |
| |
| /* find one to process */ |
| for (i = 0; i < listen_nports; i++) { |
| if (listen_fds[(fd_next + i) % listen_nports].revents) { |
| i = (fd_next + i) % listen_nports; |
| break; |
| } |
| } |
| fd_next = (i + 1) % listen_nports; |
| |
| if ((newsockfd = slurm_accept_msg_conn(listen_fds[i].fd, |
| &cli_addr)) |
| == SLURM_ERROR) { |
| if (errno != EINTR) |
| error("slurm_accept_msg_conn: %m"); |
| continue; |
| } |
| |
| log_flag(PROTOCOL, "%s: accept() connection from %pA", |
| __func__, &cli_addr); |
| |
| slurm_msg_t_init(&msg); |
| if (slurm_receive_msg(newsockfd, &msg, 0) != 0) |
| error("slurm_receive_msg: %m"); |
| else |
| _background_process_msg(&msg); |
| |
| slurm_free_msg_members(&msg); |
| |
| close(newsockfd); /* close new socket */ |
| } |
| |
| debug3("_background_rpc_mgr shutting down"); |
| return NULL; |
| } |
| |
| /* |
| * _background_process_msg - process an RPC to the backup_controller |
| */ |
| static int _background_process_msg(slurm_msg_t *msg) |
| { |
| int error_code = SLURM_SUCCESS; |
| bool send_rc = true; |
| |
| if (!msg->auth_ids_set) { |
| error("%s: received message without previously validated auth", |
| __func__); |
| return SLURM_ERROR; |
| } |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_PROTOCOL) { |
| char *p = rpc_num2string(msg->msg_type); |
| if (msg->conn) { |
| info("%s: received opcode %s from persist conn on (%s)%s uid %u", |
| __func__, p, msg->conn->cluster_name, |
| msg->conn->rem_host, msg->auth_uid); |
| } else { |
| slurm_addr_t cli_addr; |
| (void) slurm_get_peer_addr(msg->conn_fd, &cli_addr); |
| info("%s: received opcode %s from %pA uid %u", |
| __func__, p, &cli_addr, msg->auth_uid); |
| } |
| } |
| |
| if (msg->msg_type != REQUEST_PING) { |
| bool super_user = false; |
| |
| if (validate_slurm_user(msg->auth_uid)) |
| super_user = true; |
| |
| if (super_user && (msg->msg_type == REQUEST_SHUTDOWN)) { |
| info("Performing background RPC: REQUEST_SHUTDOWN"); |
| pthread_kill(slurmctld_config.thread_id_sig, SIGTERM); |
| } else if (super_user && |
| (msg->msg_type == REQUEST_TAKEOVER)) { |
| info("Performing background RPC: REQUEST_TAKEOVER"); |
| if (get_last_heartbeat(NULL)) { |
| _shutdown_primary_controller(SHUTDOWN_WAIT); |
| takeover = true; |
| error_code = SLURM_SUCCESS; |
| } else { |
| error_code = ESLURM_TAKEOVER_NO_HEARTBEAT; |
| } |
| } else if (super_user && |
| (msg->msg_type == REQUEST_CONTROL)) { |
| debug3("Ignoring RPC: REQUEST_CONTROL"); |
| error_code = ESLURM_DISABLED; |
| last_controller_response = time(NULL); |
| } else if (msg->msg_type == REQUEST_CONTROL_STATUS) { |
| slurm_rpc_control_status(msg); |
| send_rc = false; |
| } else if (msg->msg_type == REQUEST_CONFIG) { |
| /* |
| * Config was asked for from the wrong controller |
| * Assume there was a misconfiguration and redirect |
| * to the correct controller. This usually indicates a |
| * configuration issue. |
| */ |
| error("REQUEST_CONFIG received while in standby."); |
| error_code = ESLURM_IN_STANDBY_USE_BACKUP; |
| } else { |
| error("Invalid RPC received %s while in standby mode", |
| rpc_num2string(msg->msg_type)); |
| error_code = ESLURM_IN_STANDBY_MODE; |
| } |
| } |
| if (send_rc) |
| slurm_send_rc_msg(msg, error_code); |
| return error_code; |
| } |
| |
| static void *_ping_ctld_thread(void *arg) |
| { |
| ping_struct_t *ping = (ping_struct_t *) arg; |
| slurm_msg_t req, resp; |
| control_status_msg_t *control_msg; |
| time_t control_time = (time_t) 0; |
| bool responding = false; |
| |
| slurm_msg_t_init(&req); |
| slurm_set_addr(&req.address, ping->slurmctld_port, ping->control_addr); |
| req.msg_type = REQUEST_CONTROL_STATUS; |
| slurm_msg_set_r_uid(&req, SLURM_AUTH_UID_ANY); |
| if (slurm_send_recv_node_msg(&req, &resp, 0) == SLURM_SUCCESS) { |
| switch (resp.msg_type) { |
| case RESPONSE_CONTROL_STATUS: |
| control_msg = (control_status_msg_t *) resp.data; |
| if (ping->backup_inx != control_msg->backup_inx) { |
| error("%s: BackupController# index mismatch (%d != %u) from host %s", |
| __func__, ping->backup_inx, |
| control_msg->backup_inx, |
| ping->control_machine); |
| } |
| control_time = control_msg->control_time; |
| responding = true; |
| break; |
| default: |
| error("%s:, Unknown response message %u from host %s", |
| __func__, resp.msg_type, ping->control_machine); |
| break; |
| } |
| slurm_free_msg_data(resp.msg_type, resp.data); |
| if (resp.auth_cred) |
| auth_g_destroy(resp.auth_cred); |
| } |
| |
| slurm_mutex_lock(&ping_mutex); |
| if (responding) { |
| ctld_ping[ping->backup_inx].control_time = control_time; |
| ctld_ping[ping->backup_inx].responding = true; |
| } |
| slurm_mutex_unlock(&ping_mutex); |
| |
| xfree(ping->control_addr); |
| xfree(ping->control_machine); |
| xfree(ping); |
| |
| return NULL; |
| } |
| |
| /* |
| * Ping all higher-priority control nodes. |
| * RET SLURM_SUCCESS if a currently active controller is found |
| */ |
| extern int ping_controllers(bool active_controller) |
| { |
| int i, ping_target_cnt; |
| ping_struct_t *ping; |
| pthread_t *ping_tids; |
| /* Locks: Read configuration */ |
| slurmctld_lock_t config_read_lock = { |
| READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; |
| bool active_ctld = false, avail_ctld = false; |
| |
| if (active_controller) |
| ping_target_cnt = slurm_conf.control_cnt; |
| else |
| ping_target_cnt = backup_inx; |
| |
| ctld_ping = xcalloc(ping_target_cnt, sizeof(ctld_ping_t)); |
| ping_tids = xcalloc(ping_target_cnt, sizeof(pthread_t)); |
| |
| for (i = 0; i < ping_target_cnt; i++) { |
| ctld_ping[i].control_time = (time_t) 0; |
| ctld_ping[i].responding = false; |
| } |
| |
| lock_slurmctld(config_read_lock); |
| for (i = 0; i < ping_target_cnt; i++) { |
| if (i == backup_inx) /* Avoid pinging ourselves */ |
| continue; |
| |
| ping = xmalloc(sizeof(ping_struct_t)); |
| ping->backup_inx = i; |
| ping->control_addr = xstrdup(slurm_conf.control_addr[i]); |
| ping->control_machine = xstrdup(slurm_conf.control_machine[i]); |
| ping->slurmctld_port = slurm_conf.slurmctld_port; |
| slurm_thread_create(&ping_tids[i], _ping_ctld_thread, ping); |
| } |
| unlock_slurmctld(config_read_lock); |
| |
| for (i = 0; i < ping_target_cnt; i++) { |
| if (i == backup_inx) /* Avoid pinging ourselves */ |
| continue; |
| pthread_join(ping_tids[i], NULL); |
| } |
| xfree(ping_tids); |
| |
| for (i = 0; i < ping_target_cnt; i++) { |
| if (i == backup_inx) /* Avoid pinging ourselves */ |
| continue; |
| if (ctld_ping[i].control_time) { |
| /* |
| * Higher priority slurmctld is already in |
| * primary mode |
| */ |
| active_ctld = true; |
| } |
| if (ctld_ping[i].responding) { |
| /* |
| * Higher priority slurmctld is available to |
| * enter primary mode |
| */ |
| avail_ctld = true; |
| } else if (active_controller) { |
| trigger_backup_ctld_fail(i); |
| } |
| } |
| |
| xfree(ctld_ping); |
| if (active_ctld || avail_ctld) |
| return SLURM_SUCCESS; |
| return SLURM_ERROR; |
| } |
| |
| /* |
| * Reload the slurm.conf parameters without any processing |
| * of the node, partition, or state information. |
| * Specifically, we don't want to purge batch scripts based |
| * upon old job state information. |
| * This is a stripped down version of read_slurm_conf(0). |
| */ |
| static void _backup_reconfig(void) |
| { |
| slurm_conf_reinit(NULL); |
| update_logging(); |
| slurm_conf.last_update = time(NULL); |
| return; |
| } |
| |
| static void *_shutdown_controller(void *arg) |
| { |
| int shutdown_inx, rc = SLURM_SUCCESS, rc2 = SLURM_SUCCESS; |
| slurm_msg_t req; |
| bool do_shutdown = false; |
| shutdown_arg_t *shutdown_arg; |
| shutdown_msg_t shutdown_msg; |
| |
| shutdown_arg = (shutdown_arg_t *)arg; |
| shutdown_inx = shutdown_arg->index; |
| do_shutdown = shutdown_arg->shutdown; |
| xfree(arg); |
| |
| slurm_msg_t_init(&req); |
| slurm_msg_set_r_uid(&req, slurm_conf.slurm_user_id); |
| slurm_set_addr(&req.address, slurm_conf.slurmctld_port, |
| slurm_conf.control_addr[shutdown_inx]); |
| if (do_shutdown) { |
| req.msg_type = REQUEST_SHUTDOWN; |
| shutdown_msg.options = SLURMCTLD_SHUTDOWN_CTLD; |
| req.data = &shutdown_msg; |
| } else { |
| req.msg_type = REQUEST_CONTROL; |
| } |
| if (slurm_send_recv_rc_msg_only_one(&req, &rc2, shutdown_timeout) < 0) { |
| error("%s: send/recv(%s): %m", |
| __func__, slurm_conf.control_machine[shutdown_inx]); |
| rc = SLURM_ERROR; |
| } else if (rc2 == ESLURM_DISABLED) { |
| debug("primary controller responding"); |
| } else if (rc2 == SLURM_SUCCESS) { |
| debug("primary controller has relinquished control"); |
| } else { |
| error("%s(%s): %s", |
| __func__, slurm_conf.control_machine[shutdown_inx], |
| slurm_strerror(rc2)); |
| rc = SLURM_ERROR; |
| } |
| |
| slurm_mutex_lock(&shutdown_mutex); |
| if (rc != SLURM_SUCCESS) |
| shutdown_rc = rc; |
| shutdown_thread_cnt--; |
| slurm_cond_signal(&shutdown_cond); |
| slurm_mutex_unlock(&shutdown_mutex); |
| return NULL; |
| } |
| |
| /* |
| * Tell the primary controller and all other possible controller daemons to |
| * relinquish control, primary control_machine has to suspend operation |
| * Based on _shutdown_backup_controller from controller.c |
| * wait_time - How long to wait for primary controller to write state, seconds. |
| * RET 0 or an error code |
| * NOTE: READ lock_slurmctld config before entry (or be single-threaded) |
| */ |
| static int _shutdown_primary_controller(int wait_time) |
| { |
| int i; |
| shutdown_arg_t *shutdown_arg; |
| |
| if (shutdown_timeout == 0) { |
| shutdown_timeout = slurm_conf.msg_timeout / 2; |
| shutdown_timeout = MAX(shutdown_timeout, 2); /* 2 sec min */ |
| shutdown_timeout = MIN(shutdown_timeout, CONTROL_TIMEOUT); |
| shutdown_timeout *= 1000; /* sec to msec */ |
| } |
| |
| if ((slurm_conf.control_addr[0] == NULL) || |
| (slurm_conf.control_addr[0][0] == '\0')) { |
| error("%s: no primary controller to shutdown", __func__); |
| return SLURM_ERROR; |
| } |
| |
| shutdown_rc = SLURM_SUCCESS; |
| for (i = 0; i < slurm_conf.control_cnt; i++) { |
| if (i == backup_inx) |
| continue; /* No message to self */ |
| |
| shutdown_arg = xmalloc(sizeof(*shutdown_arg)); |
| shutdown_arg->index = i; |
| /* |
| * need to send actual REQUEST_SHUTDOWN to non-primary ctlds |
| * in order to have them properly shutdown and not contend |
| * for primary position, otherwise "takeover" results in |
| * contention among backups for primary position. |
| */ |
| if (i < backup_inx) |
| shutdown_arg->shutdown = true; |
| slurm_thread_create_detached(_shutdown_controller, |
| shutdown_arg); |
| slurm_mutex_lock(&shutdown_mutex); |
| shutdown_thread_cnt++; |
| slurm_mutex_unlock(&shutdown_mutex); |
| } |
| |
| slurm_mutex_lock(&shutdown_mutex); |
| while (shutdown_thread_cnt != 0) { |
| slurm_cond_wait(&shutdown_cond, &shutdown_mutex); |
| } |
| slurm_mutex_unlock(&shutdown_mutex); |
| |
| /* |
| * FIXME: Ideally the REQUEST_CONTROL RPC does not return until all |
| * other activity has ceased and the state has been saved. That is |
| * not presently the case (it returns when no other work is pending, |
| * so the state save should occur right away). We sleep for a while |
| * here and give the primary controller time to shutdown |
| */ |
| if (wait_time) |
| sleep(wait_time); |
| |
| return shutdown_rc; |
| } |
| |
| static void *_trigger_slurmctld_event(void *arg) |
| { |
| trigger_info_t ti; |
| |
| memset(&ti, 0, sizeof(ti)); |
| ti.res_id = "*"; |
| ti.res_type = TRIGGER_RES_TYPE_SLURMCTLD; |
| ti.trig_type = TRIGGER_TYPE_BU_CTLD_RES_OP; |
| ti.control_inx = backup_inx; |
| if (slurm_pull_trigger(&ti)) { |
| error("%s: TRIGGER_TYPE_BU_CTLD_RES_OP send failure: %m", |
| __func__); |
| } else { |
| verbose("%s: TRIGGER_TYPE_BU_CTLD_RES_OP sent", __func__); |
| } |
| return NULL; |
| } |