/*****************************************************************************\
* controller.c - main control machine daemon for slurm
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Copyright (C) SchedMD LLC.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>, Kevin Tew <tew1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#if HAVE_SYS_PRCTL_H
# include <sys/prctl.h>
#endif
#include <errno.h>
#include <getopt.h>
#include <grp.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>
#include "slurm/slurm_errno.h"
#include "src/common/assoc_mgr.h"
#include "src/common/daemonize.h"
#include "src/common/extra_constraints.h"
#include "src/common/fd.h"
#include "src/common/group_cache.h"
#include "src/common/hostlist.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/macros.h"
#include "src/common/pack.h"
#include "src/common/port_mgr.h"
#include "src/common/proc_args.h"
#include "src/common/read_config.h"
#include "src/common/ref.h"
#include "src/common/run_command.h"
#include "src/common/sluid.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_socket.h"
#include "src/common/slurm_rlimits_info.h"
#include "src/common/state_save.h"
#include "src/common/timers.h"
#include "src/common/track_script.h"
#include "src/common/uid.h"
#include "src/common/util-net.h"
#include "src/common/xstring.h"
#include "src/common/xsystemd.h"
#include "src/conmgr/conmgr.h"
#include "src/interfaces/accounting_storage.h"
#include "src/interfaces/acct_gather_profile.h"
#include "src/interfaces/auth.h"
#include "src/interfaces/burst_buffer.h"
#include "src/interfaces/certmgr.h"
#include "src/interfaces/cgroup.h"
#include "src/interfaces/conn.h"
#include "src/interfaces/gres.h"
#include "src/interfaces/hash.h"
#include "src/interfaces/job_submit.h"
#include "src/interfaces/jobacct_gather.h"
#include "src/interfaces/jobcomp.h"
#include "src/interfaces/mcs.h"
#include "src/interfaces/mpi.h"
#include "src/interfaces/node_features.h"
#include "src/interfaces/preempt.h"
#include "src/interfaces/prep.h"
#include "src/interfaces/priority.h"
#include "src/interfaces/sched_plugin.h"
#include "src/interfaces/select.h"
#include "src/interfaces/serializer.h"
#include "src/interfaces/site_factor.h"
#include "src/interfaces/switch.h"
#include "src/interfaces/topology.h"
#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/fed_mgr.h"
#include "src/slurmctld/gang.h"
#include "src/slurmctld/heartbeat.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/ping_nodes.h"
#include "src/slurmctld/power_save.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/rate_limit.h"
#include "src/slurmctld/read_config.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/rpc_queue.h"
#include "src/slurmctld/sackd_mgr.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/slurmscriptd.h"
#include "src/slurmctld/state_save.h"
#include "src/slurmctld/trigger_mgr.h"
#include "src/stepmgr/srun_comm.h"
#include "src/stepmgr/stepmgr.h"
decl_static_data(usage_txt);
#define SLURMCTLD_CONMGR_DEFAULT_MAX_CONNECTIONS 50
#define MIN_CHECKIN_TIME 3 /* Nodes have this number of seconds to
* check-in before we ping them */
#define SHUTDOWN_WAIT 2 /* Time to wait for backup server shutdown */
#define JOB_COUNT_INTERVAL 30 /* Time to update running job count */
#define DEV_TTY_PATH "/dev/tty"
#define DEV_NULL_PATH "/dev/null"
/**************************************************************************\
* To test for memory leaks, set MEMORY_LEAK_DEBUG to 1 using
* "configure --enable-memory-leak-debug" then execute
*
* $ valgrind --tool=memcheck --leak-check=yes --num-callers=40 \
* --leak-resolution=high ./slurmctld -Dc >valg.ctld.out 2>&1
*
* Then exercise the slurmctld functionality before executing
* > scontrol shutdown
*
* Note that --enable-memory-leak-debug will cause the daemon to
* unload the shared objects at exit, which prevents valgrind
* from displaying the stack where the eventual leaks may be.
* It is always best to test with and without --enable-memory-leak-debug.
*
* On some systems _keyvalue_regex_init() will generate two blocks "definitely
* lost", both of size zero.
* On some systems dlopen() will generate a small number of "definitely
* lost" blocks that are not cleared by dlclose().
* On some systems, pthread_create() will generate a small number of
* "possibly lost" blocks.
* Otherwise the report should be free of errors. Remember to reset
* MEMORY_LEAK_DEBUG to 0 for production use (non-seamless backup
* controller use).
\**************************************************************************/
uint32_t slurm_daemon = IS_SLURMCTLD;
/* Log to stderr and syslog until this process becomes a daemon */
log_options_t log_opts = LOG_OPTS_INITIALIZER;
/* Scheduler Log options */
log_options_t sched_log_opts = SCHEDLOG_OPTS_INITIALIZER;
/* Global variables */
bool preempt_send_user_signal = false;
uint16_t accounting_enforce = 0;
void * acct_db_conn = NULL;
int backup_inx;
int batch_sched_delay = 3;
bool cloud_dns = false;
uint32_t cluster_cpus = 0;
time_t control_time = 0;
bool disable_remote_singleton = false;
int max_depend_depth = 10;
time_t last_proc_req_start = 0;
uint32_t max_powered_nodes = NO_VAL;
bool ping_nodes_now = false;
pthread_cond_t purge_thread_cond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t purge_thread_lock = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t check_bf_running_lock = PTHREAD_MUTEX_INITIALIZER;
int sched_interval = 60;
slurmctld_config_t slurmctld_config = {0};
diag_stats_t slurmctld_diag_stats;
bool slurmctld_primary = true;
bool want_nodes_reboot = true;
int slurmctld_tres_cnt = 0;
slurmdb_cluster_rec_t *response_cluster_rec = NULL;
uint16_t running_cache = RUNNING_CACHE_STATE_NOTRUNNING;
pthread_mutex_t assoc_cache_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t assoc_cache_cond = PTHREAD_COND_INITIALIZER;
/* Local variables */
static pthread_t assoc_cache_thread = (pthread_t) 0;
static char binary[PATH_MAX];
static int bu_rc = SLURM_SUCCESS;
static int bu_thread_cnt = 0;
static pthread_cond_t bu_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t bu_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool daemonize = true;
static bool setwd = false;
static int debug_level = 0;
static char * debug_logfile = NULL;
static bool dump_core = false;
static int job_sched_cnt = 0;
static int main_argc = 0;
static char **main_argv = NULL;
static uint32_t max_server_threads = MAX_SERVER_THREADS;
static time_t next_stats_reset = 0;
static int new_nice = 0;
static bool original = true;
static int pidfd = -1;
/*
* 0 = use no saved state information
* 1 = recover saved job state,
* node DOWN/DRAIN state & reason information
* 2 = recover state saved from last shutdown
*/
static int recover = 1;
static pthread_mutex_t sched_cnt_mutex = PTHREAD_MUTEX_INITIALIZER;
static char * slurm_conf_filename;
static int reconfig_rc = SLURM_SUCCESS;
static bool reconfig = false;
static list_t *reconfig_reqs = NULL;
static pthread_mutex_t shutdown_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t shutdown_cond = PTHREAD_COND_INITIALIZER;
static bool under_systemd = false;
/* Array of listening sockets */
static struct {
pthread_mutex_t mutex;
int count;
int *fd;
conmgr_fd_t **cons;
bool standby_mode;
bool quiesced;
} listeners = {
.mutex = PTHREAD_MUTEX_INITIALIZER,
.quiesced = true,
};
static int _accounting_cluster_ready();
static int _accounting_mark_all_nodes_down(char *reason);
static void * _assoc_cache_mgr(void *no_data);
static int _controller_index(void);
static void _create_clustername_file(void);
static void _flush_rpcs(void);
static void _get_fed_updates();
static void _init_config(void);
static void _init_pidfile(void);
static int _init_tres(void);
static void _kill_old_slurmctld(void);
static void _open_ports(void);
static void _parse_commandline(int argc, char **argv);
static void _post_reconfig(void);
static void * _purge_files_thread(void *no_data);
static void *_acct_update_thread(void *no_data);
static void _remove_assoc(slurmdb_assoc_rec_t *rec);
static void _remove_qos(slurmdb_qos_rec_t *rec);
static void _restore_job_dependencies(void);
static void _run_primary_prog(bool primary_on);
static void _send_future_cloud_to_db();
static void _service_connection(conmgr_callback_args_t conmgr_args,
int input_fd, int output_fd, void *tls_conn,
void *arg);
static void _set_work_dir(void);
static int _shutdown_backup_controller(void);
static void * _slurmctld_background(void *no_data);
static void _test_thread_limit(void);
static int _try_to_reconfig(void);
static void _update_assoc(slurmdb_assoc_rec_t *rec);
static void _update_diag_job_state_counts(void);
static void _update_cluster_tres(void);
static void _update_nice(void);
static void _update_pidfile(void);
static void _update_qos(slurmdb_qos_rec_t *rec);
static void _usage(void);
static void _verify_clustername(void);
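/*
 * Send the reconfigure result to every queued reconfigure request and
 * release the associated message and connection.
 */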
static void _send_reconfig_replies(void)
{
slurm_msg_t *msg = NULL;
while ((msg = list_pop(reconfig_reqs))) {
/* Must avoid sending reply via msg->conmgr_con */
xassert(!msg->conmgr_con);
(void) slurm_send_rc_msg(msg, reconfig_rc);
conn_g_destroy(msg->tls_conn, true);
slurm_free_msg(msg);
}
}
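/*
 * Quiesce conmgr, attempt to re-exec a new slurmctld, then either hand
 * control to the new process (_exit()) or resume normal operation.
 */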
static void _attempt_reconfig(void)
{
info("Attempting to reconfigure");
/*
* Reconfigure requires all connections to be fully processed before
* continuing, as the file descriptors will be closed during fork() and
* the parent process will call _exit() instead of finishing their
* processing if the new slurmctld process starts successfully.
*/
conmgr_quiesce(__func__);
/*
* Send the RC to requesters now when in foreground mode, as slurmctld is
* about to call exec(), which will close the connections.
*/
if (!daemonize && !under_systemd)
_send_reconfig_replies();
reconfig_rc = _try_to_reconfig();
_send_reconfig_replies();
if (!reconfig_rc) {
info("Relinquishing control to new child");
_exit(0);
}
recover = 2;
/*
* Reconfigure failed, which means this process needs to resume
* processing connections.
*/
conmgr_unquiesce(__func__);
}
static void _on_sigint(conmgr_callback_args_t conmgr_args, void *arg)
{
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
info("Terminate signal SIGINT received");
slurmctld_shutdown();
}
static void _on_sigterm(conmgr_callback_args_t conmgr_args, void *arg)
{
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
info("Terminate signal SIGTERM received");
slurmctld_shutdown();
}
static void _on_sigchld(conmgr_callback_args_t conmgr_args, void *arg)
{
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
debug5("Caught SIGCHLD. Ignoring");
}
static void _on_sigquit(conmgr_callback_args_t conmgr_args, void *arg)
{
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
info("Terminate signal SIGQUIT received");
slurmctld_shutdown();
}
static void _on_sighup(conmgr_callback_args_t conmgr_args, void *arg)
{
bool standby_mode;
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
info("Reconfigure signal (SIGHUP) received");
slurm_mutex_lock(&listeners.mutex);
standby_mode = listeners.standby_mode;
slurm_mutex_unlock(&listeners.mutex);
if (standby_mode) {
backup_on_sighup();
return;
}
reconfig = true;
slurmctld_shutdown();
}
static void _on_sigusr1(conmgr_callback_args_t conmgr_args, void *arg)
{
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
debug5("Caught SIGUSR1. Ignoring.");
}
static void _on_sigusr2(conmgr_callback_args_t conmgr_args, void *arg)
{
static const slurmctld_lock_t conf_write_lock = {
.conf = WRITE_LOCK,
};
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
info("Logrotate signal (SIGUSR2) received");
lock_slurmctld(conf_write_lock);
update_logging();
if (slurmctld_primary)
slurmscriptd_update_log_level(slurm_conf.slurmctld_debug, true);
unlock_slurmctld(conf_write_lock);
/*
* This can happen when jobcomp hasn't been initialized yet, so call it
* here. It is a NOOP if it has already been initialized.
*/
if (slurmctld_primary && jobcomp_g_init() && jobcomp_g_set_location())
error("%s: JobComp set location operation failed on SIGUSR2",
__func__);
}
static void _on_sigpipe(conmgr_callback_args_t conmgr_args, void *arg)
{
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
debug5("Caught SIGPIPE. Ignoring.");
}
static void _on_sigxcpu(conmgr_callback_args_t conmgr_args, void *arg)
{
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
debug5("Caught SIGXCPU. Ignoring.");
}
static void _on_sigabrt(conmgr_callback_args_t conmgr_args, void *arg)
{
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
info("SIGABRT received");
slurmctld_shutdown();
dump_core = true;
}
static void _on_sigalrm(conmgr_callback_args_t conmgr_args, void *arg)
{
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED)
return;
debug5("Caught SIGALRM. Ignoring.");
}
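/* Register conmgr work for every signal handled by slurmctld */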
static void _register_signal_handlers(conmgr_callback_args_t conmgr_args,
void *arg)
{
conmgr_add_work_signal(SIGINT, _on_sigint, NULL);
conmgr_add_work_signal(SIGTERM, _on_sigterm, NULL);
conmgr_add_work_signal(SIGCHLD, _on_sigchld, NULL);
conmgr_add_work_signal(SIGQUIT, _on_sigquit, NULL);
conmgr_add_work_signal(SIGHUP, _on_sighup, NULL);
conmgr_add_work_signal(SIGUSR1, _on_sigusr1, NULL);
conmgr_add_work_signal(SIGUSR2, _on_sigusr2, NULL);
conmgr_add_work_signal(SIGPIPE, _on_sigpipe, NULL);
conmgr_add_work_signal(SIGXCPU, _on_sigxcpu, NULL);
conmgr_add_work_signal(SIGABRT, _on_sigabrt, NULL);
conmgr_add_work_signal(SIGALRM, _on_sigalrm, NULL);
}
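/*
 * Redirect stdin/stdout/stderr to /dev/null (and, on Linux when not
 * daemonizing, try to re-attach stdout/stderr to the controlling TTY).
 */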
static void _reopen_stdio(void)
{
int devnull = -1;
if ((devnull = open(DEV_NULL_PATH, O_RDWR)) < 0)
fatal_abort("Unable to open %s: %m", DEV_NULL_PATH);
dup2(devnull, STDIN_FILENO);
dup2(devnull, STDOUT_FILENO);
dup2(devnull, STDERR_FILENO);
if (devnull > STDERR_FILENO)
fd_close(&devnull);
#ifdef __linux__
if (isatty(STDOUT_FILENO) && !daemonize) {
int tty = -1;
if ((tty = open(DEV_TTY_PATH, O_WRONLY)) > 0 && isatty(tty)) {
dup2(tty, STDOUT_FILENO);
dup2(tty, STDERR_FILENO);
}
if (tty > STDERR_FILENO)
fd_close(&tty);
}
#endif /* __linux__ */
}
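/*
 * (Re-)open the connection to accounting storage, register this
 * slurmctld with it, and record/verify the cluster ID.
 */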
static void _init_db_conn(void)
{
int rc;
/*
* errno is used to get an error when establishing the persistent
* connection. Set errno to 0 here to avoid a previous value for
* errno causing us to think there was a problem.
* FIXME: Stop using errno for control flow here.
*/
errno = 0;
if (acct_db_conn)
acct_storage_g_close_connection(&acct_db_conn);
acct_db_conn = acct_storage_g_get_connection(
0, NULL, false, slurm_conf.cluster_name);
rc = clusteracct_storage_g_register_ctld(acct_db_conn,
slurm_conf.slurmctld_port);
if (rc & RC_AS_CLUSTER_ID) {
uint16_t id = rc & ~RC_AS_CLUSTER_ID;
if (slurm_conf.cluster_id && (id != slurm_conf.cluster_id)) {
fatal("CLUSTER ID MISMATCH.\n"
"slurmctld has been started with \"ClusterID=%u\" from the state files in StateSaveLocation, but the DBD thinks it should be \"%u\".\n"
"Running multiple clusters from a shared StateSaveLocation WILL CAUSE CORRUPTION.\n"
"Remove %s/clustername to override this safety check if this is intentional.",
slurm_conf.cluster_id, id,
slurm_conf.state_save_location);
} else if (!slurm_conf.cluster_id) {
slurm_conf.cluster_id = id;
_create_clustername_file();
} else {
clustername_existed = 1;
}
}
}
/*
* Retry connecting to the dbd and initializing assoc_mgr until success, or
* fatal on shutdown.
*/
static void _retry_init_db_conn(assoc_init_args_t *args)
{
while (true) {
struct timespec ts = timespec_now();
ts.tv_sec += 2;
slurm_mutex_lock(&shutdown_mutex);
slurm_cond_timedwait(&shutdown_cond, &shutdown_mutex, &ts);
slurm_mutex_unlock(&shutdown_mutex);
if (slurmctld_config.shutdown_time)
fatal("slurmdbd must be up at slurmctld start time");
error("Retrying initial connection to slurmdbd");
_init_db_conn();
if (!slurm_conf.cluster_id) {
error("Still don't know my ClusterID");
continue;
}
if (!assoc_mgr_init(acct_db_conn, args, errno))
break;
}
}
/* main - slurmctld main function, start various threads and process RPCs */
int main(int argc, char **argv)
{
int error_code;
struct timeval start, now;
struct stat stat_buf;
struct rlimit rlim;
/* Locks: Write configuration, job, node, and partition */
slurmctld_lock_t config_write_lock = {
WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
prep_callbacks_t prep_callbacks = {
.prolog_slurmctld = prep_prolog_slurmctld_callback,
.epilog_slurmctld = prep_epilog_slurmctld_callback,
};
bool backup_has_control = false;
bool slurmscriptd_mode = false;
char *conf_file;
stepmgr_ops_t stepmgr_ops = {0};
stepmgr_ops.agent_queue_request = agent_queue_request;
stepmgr_ops.find_job_array_rec = find_job_array_rec;
stepmgr_ops.find_job_record = find_job_record;
stepmgr_ops.job_config_fini = job_config_fini;
stepmgr_ops.last_job_update = &last_job_update;
stepmgr_init(&stepmgr_ops);
main_argc = argc;
main_argv = argv;
if (getenv("SLURMCTLD_RECONF"))
original = false;
if (getenv(SLURMSCRIPTD_MODE_ENV))
slurmscriptd_mode = true;
/*
* Make sure we have no extra open files which
* would be propagated to spawned tasks.
*/
if (original || slurmscriptd_mode) {
closeall(slurmscriptd_mode ? SLURMSCRIPT_CLOSEALL :
(STDERR_FILENO + 1));
}
if (slurmscriptd_mode)
_reopen_stdio();
/*
* Establish initial configuration
*/
_init_config();
_parse_commandline(argc, argv);
log_init(argv[0], log_opts, LOG_DAEMON, NULL);
sched_log_init(argv[0], sched_log_opts, LOG_DAEMON, NULL);
/*
* Must pass in an explicit filename to slurm_conf_init() to avoid
* the "configless" mode of operation kicking in if no file is
* currently available.
*/
if (!(conf_file = slurm_conf_filename))
if (!(conf_file = getenv("SLURM_CONF")))
conf_file = default_slurm_config_file;
slurm_conf_init(conf_file);
lock_slurmctld(config_write_lock);
update_logging();
unlock_slurmctld(config_write_lock);
if (slurmscriptd_mode) {
/* Cleanup env */
(void) unsetenv(SLURMSCRIPTD_MODE_ENV);
/* Execute in slurmscriptd mode. */
become_slurm_user();
slurmscriptd_run_slurmscriptd(argc, argv, binary);
}
if (original && under_systemd &&
(slurm_conf.slurm_user_id != getuid())) {
/*
* Sanity check that we are running as the SlurmUser.
* If not, call fatal() to prevent changing the permissions of the
* state save files and/or losing the state save.
*/
fatal("Running user ID does not match the SlurmUser. Check that SlurmUser in slurm.conf and User in the slurmctld unit file match.");
}
memset(&slurmctld_diag_stats, 0, sizeof(slurmctld_diag_stats));
/*
* Calculate speed of gettimeofday() for sdiag.
* Large delays indicate the Linux vDSO is not in use, which
* will lead to significant scheduler performance issues.
*/
gettimeofday(&start, NULL);
for (int i = 0; i < 1000; i++)
gettimeofday(&now, NULL);
slurmctld_diag_stats.latency = (now.tv_sec - start.tv_sec) * 1000000;
slurmctld_diag_stats.latency += now.tv_usec - start.tv_usec;
if (slurmctld_diag_stats.latency > 200)
error("High latency for 1000 calls to gettimeofday(): %d microseconds",
slurmctld_diag_stats.latency);
/*
* Verify that the cluster name from the conf matches the value in the
* spool dir; exit if inconsistent to protect the state files from
* corruption. This needs to be done before we kill the old slurmctld,
* just in case we fail.
*/
_verify_clustername();
_update_nice();
if (original)
_kill_old_slurmctld();
for (int i = 0; i < 3; i++)
fd_set_close_on_exec(i);
if (original && daemonize) {
if (xdaemon())
error("daemon(): %m");
sched_debug("slurmctld starting");
}
if (slurm_conf.slurmctld_params)
conmgr_set_params(slurm_conf.slurmctld_params);
conmgr_init(0, SLURMCTLD_CONMGR_DEFAULT_MAX_CONNECTIONS);
conmgr_add_work_fifo(_register_signal_handlers, NULL);
conmgr_run(false);
if (auth_g_init() != SLURM_SUCCESS)
fatal("failed to initialize auth plugin");
if (hash_g_init() != SLURM_SUCCESS)
fatal("failed to initialize hash plugin");
if (conn_g_init() != SLURM_SUCCESS)
fatal("Failed to initialize tls plugin");
if (certmgr_g_init() != SLURM_SUCCESS)
fatal("Failed to initialize certmgr plugin");
if (serializer_g_init() != SLURM_SUCCESS)
fatal("Failed to initialize serialization plugins.");
if (original && !under_systemd) {
/*
* Need to create pidfile here in case we setuid() below
* (init_pidfile() exits if it can't initialize pid file).
* On Linux we also need to make this setuid job explicitly
* able to write a core dump.
*/
_init_pidfile();
become_slurm_user();
}
reconfig_reqs = list_create(NULL);
rate_limit_init();
rpc_queue_init();
/* open ports must happen after become_slurm_user() */
_open_ports();
/*
* Create StateSaveLocation directory if necessary.
*/
set_slurmctld_state_loc();
if (daemonize || setwd)
_set_work_dir();
if (stat(slurm_conf.mail_prog, &stat_buf) != 0)
error("Configured MailProg is invalid");
if (!slurm_conf.accounting_storage_type) {
if (slurm_conf.job_acct_gather_type)
error("Job accounting information gathered, but not stored");
} else if (!slurm_conf.job_acct_gather_type)
info("Job accounting information stored, but details not gathered");
#ifdef PR_SET_DUMPABLE
if (prctl(PR_SET_DUMPABLE, 1) < 0)
debug ("Unable to set dumpable to 1");
#endif /* PR_SET_DUMPABLE */
/* Warn if the stack size is not unlimited */
if ((getrlimit(RLIMIT_STACK, &rlim) == 0) &&
(rlim.rlim_cur != RLIM_INFINITY))
info("Stack size set to %ld", rlim.rlim_max);
test_core_limit();
_test_thread_limit();
/*
* This creates a thread to listen to slurmscriptd, so this needs to
* happen after we block signals so that thread doesn't catch any
* signals.
*/
slurmscriptd_init(argv, binary);
if ((run_command_init(argc, argv, binary) != SLURM_SUCCESS) &&
binary[0])
fatal("%s: Unable to reliably execute %s", __func__, binary);
accounting_enforce = slurm_conf.accounting_storage_enforce;
if (slurm_with_slurmdbd()) {
/* we need job_list not to be NULL */
init_job_conf();
}
if (accounting_enforce && !slurm_with_slurmdbd()) {
accounting_enforce = 0;
slurm_conf.conf_flags &= (~CONF_FLAG_WCKEY);
slurm_conf.accounting_storage_enforce = 0;
error("You can not have AccountingStorageEnforce set for AccountingStorageType='%s'",
slurm_conf.accounting_storage_type);
}
info("slurmctld version %s started on cluster %s(%u)",
SLURM_VERSION_STRING, slurm_conf.cluster_name,
slurm_conf.cluster_id);
if ((error_code = gethostname_short(slurmctld_config.node_name_short,
HOST_NAME_MAX)))
fatal("getnodename_short error %s", slurm_strerror(error_code));
if ((error_code = gethostname(slurmctld_config.node_name_long,
HOST_NAME_MAX)))
fatal("getnodename error %s", slurm_strerror(error_code));
/* init job credential stuff */
if (cred_g_init() != SLURM_SUCCESS)
fatal("failed to initialize cred plugin");
/* Must set before plugins are loaded. */
backup_inx = _controller_index();
if (backup_inx == -1) {
error("This host (%s/%s) not a valid controller",
slurmctld_config.node_name_short,
slurmctld_config.node_name_long);
exit(1);
}
if (backup_inx > 0) {
slurmctld_primary = false;
if (xstrcasestr(slurm_conf.sched_params,
"no_backup_scheduling"))
slurmctld_config.scheduling_disabled = true;
}
if (!original && !slurmctld_primary) {
info("Restarted while operating as primary, resuming operation as primary.");
backup_has_control = true;
}
/*
* Initialize plugins.
* If running configuration test, report ALL failures.
*/
if (select_g_init() != SLURM_SUCCESS)
fatal("failed to initialize node selection plugin");
/* gres_init() must follow select_g_init() */
if (gres_init() != SLURM_SUCCESS)
fatal("failed to initialize gres plugin");
if (preempt_g_init() != SLURM_SUCCESS)
fatal("failed to initialize preempt plugin");
if (acct_gather_conf_init() != SLURM_SUCCESS)
fatal("failed to initialize acct_gather plugins");
if (jobacct_gather_init() != SLURM_SUCCESS)
fatal("failed to initialize jobacct_gather plugin");
if (job_submit_g_init(false) != SLURM_SUCCESS)
fatal("failed to initialize job_submit plugin");
if (prep_g_init(&prep_callbacks) != SLURM_SUCCESS)
fatal("failed to initialize prep plugin");
if (node_features_g_init() != SLURM_SUCCESS)
fatal("failed to initialize node_features plugin");
if (mpi_g_daemon_init() != SLURM_SUCCESS)
fatal("Failed to initialize MPI plugins.");
/* Fatal if we use extra_constraints without json serializer */
if (extra_constraints_enabled())
serializer_required(MIME_TYPE_JSON);
if (switch_g_init(true) != SLURM_SUCCESS)
fatal("Failed to initialize switch plugin");
/* Initialize licenses - serializer required to be initialized */
if (license_init(slurm_conf.licenses) != SLURM_SUCCESS)
fatal("Invalid Licenses value: %s", slurm_conf.licenses);
if (original && under_systemd)
xsystemd_change_mainpid(getpid());
while (1) {
bool reconfiguring = reconfig;
/* initialization for each primary<->backup switch */
slurmctld_config.shutdown_time = (time_t) 0;
slurmctld_config.resume_backup = false;
control_time = 0;
reconfig = false;
reconfig_rc = SLURM_SUCCESS;
agent_init();
/* start in primary or backup mode */
if (!slurmctld_primary && !backup_has_control) {
controller_fini_scheduling(); /* make sure scheduling is shut down */
_run_primary_prog(false);
if (acct_storage_g_init() != SLURM_SUCCESS)
fatal("failed to initialize accounting_storage plugin");
if (bb_g_init() != SLURM_SUCCESS)
fatal("failed to initialize burst buffer plugin");
slurm_mutex_lock(&listeners.mutex);
listeners.standby_mode = true;
slurm_mutex_unlock(&listeners.mutex);
/*
* run_backup() will never return unless it is time for
* standby to take control as backup controller
*/
run_backup();
slurm_mutex_lock(&listeners.mutex);
listeners.standby_mode = false;
slurm_mutex_unlock(&listeners.mutex);
(void) _shutdown_backup_controller();
} else {
if (acct_storage_g_init() != SLURM_SUCCESS)
fatal("failed to initialize accounting_storage plugin");
(void) _shutdown_backup_controller();
trigger_primary_ctld_res_ctrl();
ctld_assoc_mgr_init();
/*
* read_slurm_conf() will load the burst buffer state,
* so init the burst buffer plugin early.
*/
if (bb_g_init() != SLURM_SUCCESS)
fatal("failed to initialize burst_buffer plugin");
/* Now recover the remaining state information */
lock_slurmctld(config_write_lock);
if (switch_g_restore(recover))
fatal("failed to initialize switch plugin");
}
/*
* priority_g_init() needs to be called after assoc_mgr_init()
* and before read_slurm_conf() because jobs could be killed
* during read_slurm_conf() and call priority_g_job_end().
*/
if (priority_g_init() != SLURM_SUCCESS)
fatal("failed to initialize priority plugin");
if ((slurmctld_primary || backup_has_control) &&
!reconfiguring) {
if ((error_code = read_slurm_conf(recover))) {
fatal("read_slurm_conf reading %s: %s",
slurm_conf.slurm_conf,
slurm_strerror(error_code));
}
configless_update();
if (conf_includes_list) {
/*
* clear included files so that subsequent conf
* parsings refill it with updated information.
*/
list_flush(conf_includes_list);
}
}
priority_g_thread_start();
if (slurmctld_primary || backup_has_control) {
select_g_select_nodeinfo_set_all();
unlock_slurmctld(config_write_lock);
if (recover == 0) {
slurmctld_init_db = 1;
_accounting_mark_all_nodes_down("cold-start");
}
}
slurm_persist_conn_recv_server_init();
info("Running as primary controller");
if (!reconfiguring) {
_run_primary_prog(true);
control_time = time(NULL);
heartbeat_start();
if (!slurmctld_config.resume_backup && slurmctld_primary)
trigger_primary_ctld_res_op();
}
/* Set stepmgr_ops pointers now that the underlying objects exist */
stepmgr_ops.acct_db_conn = acct_db_conn;
stepmgr_ops.job_list = job_list;
stepmgr_ops.up_node_bitmap = up_node_bitmap;
_accounting_cluster_ready();
_send_future_cloud_to_db();
/*
* call after registering so that the current cluster's
* control_host and control_port will be filled in.
*/
fed_mgr_init(acct_db_conn);
_restore_job_dependencies();
sync_job_priorities();
if (mcs_g_init() != SLURM_SUCCESS)
fatal("failed to initialize mcs plugin");
/*
* create attached thread for state save
*/
slurm_thread_create(&slurmctld_config.thread_id_save,
slurmctld_state_save, NULL);
/*
* create attached thread for node power management
*/
power_save_init();
/*
* create attached thread for purging completed job files
*/
slurm_thread_create(&slurmctld_config.thread_id_purge_files,
_purge_files_thread, NULL);
/*
* create attached thread for processing accounting updates
*/
slurm_thread_create(&slurmctld_config.thread_id_acct_update,
_acct_update_thread, NULL);
/*
* If reconfiguring, we need to restart the gang scheduler.
* Otherwise, gang scheduling was already started by
* read_slurm_conf().
*/
if (controller_init_scheduling(reconfiguring) != SLURM_SUCCESS)
fatal("Failed to initialize the various schedulers");
if (!original && !reconfiguring) {
notify_parent_of_success();
if (!under_systemd)
_update_pidfile();
_post_reconfig();
}
/*
* process slurm background activities, could run as pthread
*/
_slurmctld_background(NULL);
controller_fini_scheduling(); /* Stop all scheduling */
rpc_queue_shutdown();
agent_fini();
/* termination of controller */
switch_g_save();
priority_g_fini();
shutdown_state_save();
slurm_mutex_lock(&purge_thread_lock);
slurm_cond_signal(&purge_thread_cond); /* wake up last time */
slurm_mutex_unlock(&purge_thread_lock);
slurm_thread_join(slurmctld_config.thread_id_purge_files);
slurm_thread_join(slurmctld_config.thread_id_save);
slurm_mutex_lock(&slurmctld_config.acct_update_lock);
slurm_cond_broadcast(&slurmctld_config.acct_update_cond);
slurm_mutex_unlock(&slurmctld_config.acct_update_lock);
slurm_thread_join(slurmctld_config.thread_id_acct_update);
/* kill all scripts run by the slurmctld */
track_script_flush();
slurmscriptd_flush();
run_command_shutdown();
bb_g_fini();
mcs_g_fini();
fed_mgr_fini();
ctld_assoc_mgr_fini();
/* Save any pending state save RPCs */
acct_storage_g_close_connection(&acct_db_conn);
acct_storage_g_fini();
slurm_persist_conn_recv_server_fini();
power_save_fini();
/* attempt reconfig here */
if (reconfig) {
_attempt_reconfig();
continue;
}
config_power_mgr_fini();
/* stop the heartbeat last */
heartbeat_stop();
/*
* Run SlurmctldPrimaryOffProg only if we are the primary
* (backup_inx == 0). The backup controllers (backup_inx > 0)
* already run it when dropping to standby mode.
*/
if (slurmctld_primary)
_run_primary_prog(false);
if (slurmctld_config.resume_backup == false)
break;
/* primary controller doesn't resume backup mode */
if (slurmctld_config.resume_backup && slurmctld_primary)
break;
/* The backup is now meant to relinquish control */
if (slurmctld_config.resume_backup && !slurmctld_primary)
backup_has_control = false;
recover = 2;
/*
* We need to re-initialize run_command after
* run_command_shutdown() was called. Pass NULL since we do
* not want to change the script launcher location.
*/
(void) run_command_init(0, NULL, NULL);
}
slurmscriptd_fini();
jobcomp_g_fini();
/*
* Since the pidfile is created as user root (its owner is
* later changed to SlurmUser), SlurmUser may not be able to
* remove it, so this is not necessarily an error.
*/
if (!under_systemd && (unlink(slurm_conf.slurmctld_pidfile) < 0)) {
verbose("Unable to remove pidfile '%s': %m",
slurm_conf.slurmctld_pidfile);
}
#ifdef MEMORY_LEAK_DEBUG
{
/*
* This should purge all allocated memory.
* Anything left over represents a leak.
*/
xassert(list_is_empty(reconfig_reqs));
FREE_NULL_LIST(reconfig_reqs);
agent_purge();
/* Purge our local data structures */
configless_clear();
job_fini();
part_fini(); /* part_fini() must precede node_fini() */
node_fini();
mpi_fini();
node_features_g_fini();
resv_fini();
trigger_fini();
assoc_mgr_fini(1);
reserve_port_config(NULL, NULL);
/* Some plugins are needed to purge job/node data structures,
* so unplug them after those data structures are purged */
gres_fini();
job_submit_g_fini(false);
prep_g_fini();
preempt_g_fini();
jobacct_gather_fini();
acct_gather_conf_destroy();
select_g_fini();
topology_g_destroy_config();
topology_g_fini();
auth_g_fini();
hash_g_fini();
conn_g_fini();
certmgr_g_fini();
switch_g_fini();
site_factor_g_fini();
/* purge remaining data structures */
group_cache_purge();
getnameinfo_cache_purge();
license_free();
FREE_NULL_LIST(slurmctld_config.acct_update_list);
cred_g_fini();
slurm_conf_destroy();
cluster_rec_free();
track_script_fini();
cgroup_conf_destroy();
usleep(500000);
serializer_g_fini();
bit_cache_fini();
}
#endif
conmgr_request_shutdown();
conmgr_fini();
rate_limit_shutdown();
log_fini();
sched_log_fini();
if (dump_core)
abort();
else
exit(0);
}
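/* list_find_first() callback: match a db event record by node name */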
static int _find_node_event(void *x, void *key)
{
slurmdb_event_rec_t *event = x;
char *node_name = key;
return !xstrcmp(event->node_name, node_name);
}
/*
* Create db down events for FUTURE and CLOUD+POWERED_DOWN nodes
*/
static void _send_future_cloud_to_db()
{
time_t now = time(NULL);
slurmdb_event_rec_t *event = NULL;
list_t *event_list = NULL;
bool check_db = !running_cache;
node_record_t *node_ptr;
for (int i = 0; (node_ptr = next_node(&i)); i++) {
if (!IS_NODE_FUTURE(node_ptr) &&
!IS_NODE_POWERED_DOWN(node_ptr))
continue;
/*
* If the DBD is up, then try to avoid making duplicate
* g_node_down() calls by reconciling with the db. If it's not
* up, just send the down events to preserve the startup time
* stamps.
*/
if (check_db && !event_list) {
slurmdb_event_cond_t event_cond = {0};
event_cond.event_type = SLURMDB_EVENT_NODE;
event_cond.cond_flags = SLURMDB_EVENT_COND_OPEN;
event_cond.cluster_list = list_create(xfree_ptr);
list_append(event_cond.cluster_list,
xstrdup(slurm_conf.cluster_name));
event_cond.format_list = list_create(NULL);
list_append(event_cond.format_list, "node_name");
event_cond.state_list = list_create(xfree_ptr);
list_append(event_cond.state_list,
xstrdup_printf("%u", NODE_STATE_FUTURE));
list_append(event_cond.state_list,
xstrdup_printf("%"PRIu64,
NODE_STATE_POWERED_DOWN));
event_list = acct_storage_g_get_events(acct_db_conn,
getuid(),
&event_cond);
if (!event_list)
check_db = false;
FREE_NULL_LIST(event_cond.cluster_list);
FREE_NULL_LIST(event_cond.format_list);
FREE_NULL_LIST(event_cond.state_list);
}
if (event_list &&
(event = list_find_first(event_list, _find_node_event,
node_ptr->name))) {
/* Open event record already exists, don't send again */
continue;
}
clusteracct_storage_g_node_down(
acct_db_conn, node_ptr, now,
IS_NODE_FUTURE(node_ptr) ? "Future" : "Powered down",
slurm_conf.slurm_user_id);
}
FREE_NULL_LIST(event_list);
}
/* initialization of common slurmctld configuration */
static void _init_config(void)
{
struct rlimit rlim;
rlimits_use_max_nofile();
if (getrlimit(RLIMIT_CORE, &rlim) == 0) {
rlim.rlim_cur = rlim.rlim_max;
(void) setrlimit(RLIMIT_CORE, &rlim);
}
if (getrlimit(RLIMIT_STACK, &rlim) == 0) {
/* slurmctld can spawn lots of pthreads.
* Set the (per thread) stack size to a
* more "reasonable" value to avoid running
* out of virtual memory and dying */
rlim.rlim_cur = rlim.rlim_max;
(void) setrlimit(RLIMIT_STACK, &rlim);
}
if (getrlimit(RLIMIT_DATA, &rlim) == 0) {
rlim.rlim_cur = rlim.rlim_max;
(void) setrlimit(RLIMIT_DATA, &rlim);
}
memset(&slurmctld_config, 0, sizeof(slurmctld_config_t));
FREE_NULL_LIST(slurmctld_config.acct_update_list);
slurmctld_config.acct_update_list =
list_create(slurmdb_destroy_update_object);
slurm_mutex_init(&slurmctld_config.acct_update_lock);
slurm_mutex_init(&slurmctld_config.thread_count_lock);
slurm_mutex_init(&slurmctld_config.backup_finish_lock);
slurm_mutex_lock(&slurmctld_config.acct_update_lock);
slurm_mutex_lock(&slurmctld_config.thread_count_lock);
slurm_mutex_lock(&slurmctld_config.backup_finish_lock);
slurm_cond_init(&slurmctld_config.acct_update_cond, NULL);
slurm_cond_init(&slurmctld_config.backup_finish_cond, NULL);
slurm_cond_init(&slurmctld_config.thread_count_cond, NULL);
slurmctld_config.boot_time = time(NULL);
slurmctld_config.resume_backup = false;
slurmctld_config.server_thread_count = 0;
slurmctld_config.shutdown_time = (time_t) 0;
slurmctld_config.thread_id_main = pthread_self();
slurmctld_config.scheduling_disabled = false;
slurmctld_config.submissions_disabled = false;
track_script_init();
slurmctld_config.thread_id_main = (pthread_t) 0;
slurm_mutex_unlock(&slurmctld_config.backup_finish_lock);
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
slurm_mutex_unlock(&slurmctld_config.acct_update_lock);
}
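/*
 * Re-exec slurmctld to apply a new configuration. Listening sockets,
 * the pidfile fd, and the auth fd are passed to the child via the
 * environment and inherited file descriptors. Unless running attached,
 * fork() first and wait for the child to report success over a pipe.
 */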
static int _try_to_reconfig(void)
{
extern char **environ;
char **child_env;
pid_t pid;
int to_parent[2] = {-1, -1};
int *skip_close = NULL, skip_index = 0, auth_fd = -1;
child_env = env_array_copy((const char **) environ);
setenvf(&child_env, "SLURMCTLD_RECONF", "1");
if (pidfd != -1) {
setenvf(&child_env, "SLURMCTLD_RECONF_PIDFD", "%d", pidfd);
fd_set_noclose_on_exec(pidfd);
}
slurm_mutex_lock(&listeners.mutex);
/*
* Need space in array for:
* - to_parent[1]
* - pidfd
* - auth fd
* - terminator (-1)
* - listeners.count number of listening sockets
*/
skip_close = xcalloc((listeners.count + 4), sizeof(*skip_close));
if (listeners.count) {
char *ports = NULL, *pos = NULL;
setenvf(&child_env, "SLURMCTLD_RECONF_LISTEN_COUNT", "%d",
listeners.count);
for (int i = 0; i < listeners.count; i++) {
xstrfmtcatat(ports, &pos, "%d,", listeners.fd[i]);
if (listeners.fd[i] >= 0) {
fd_set_noclose_on_exec(listeners.fd[i]);
skip_close[skip_index++] = listeners.fd[i];
}
}
setenvf(&child_env, "SLURMCTLD_RECONF_LISTEN_FDS", "%s", ports);
xfree(ports);
}
slurm_mutex_unlock(&listeners.mutex);
if ((auth_fd = auth_g_get_reconfig_fd(AUTH_PLUGIN_SLURM)) >= 0)
skip_close[skip_index++] = auth_fd;
for (int i = 0; i < 3; i++)
fd_set_noclose_on_exec(i);
if (!daemonize && !under_systemd) {
/*
* If in attached mode, the slurmctld does not fork() so it
* does not change its PID. The slurmctld needs to call
* slurmscriptd_fini() to reap the slurmscriptd PID, otherwise
* the slurmscriptd PID would be a defunct entry in the process
* table.
* For detached mode, the parent slurmctld needs to keep the
* slurmscriptd running in order to recover if the child
* slurmctld fails to start. If the child slurmctld starts
* successfully, then when the parent slurmctld shuts down the
* corresponding slurmscriptd is reparented to init, shuts itself
* down, and init will reap the PID for us.
*/
slurmscriptd_fini();
goto start_child;
}
if (pipe(to_parent))
fatal("%s: pipe() failed: %m", __func__);
setenvf(&child_env, "SLURMCTLD_RECONF_PARENT_FD", "%d", to_parent[1]);
if ((pid = fork()) < 0) {
fatal("%s: fork() failed: %m", __func__);
} else if (pid > 0) {
pid_t grandchild_pid;
int rc;
/*
* Close the write end of the pipe in this (parent) process so the
* read() below will return immediately if the child process fatal()s.
* Otherwise we'd be stuck here indefinitely, as the kernel would
* assume another internal thread might still write to the pipe.
*/
(void) close(to_parent[1]);
safe_read(to_parent[0], &grandchild_pid, sizeof(pid_t));
info("Relinquishing control to new slurmctld process");
/*
* Ensure child has exited.
* Grandchild should be owned by init.
*/
if (under_systemd) {
waitpid(pid, &rc, 0);
xsystemd_change_mainpid(grandchild_pid);
}
xfree(skip_close);
return SLURM_SUCCESS;
rwfail:
close(to_parent[0]);
env_array_free(child_env);
waitpid(pid, &rc, 0);
info("Resuming operation, reconfigure failed.");
xfree(skip_close);
return SLURM_ERROR;
}
start_child:
if (to_parent[1] >= 0)
skip_close[skip_index++] = to_parent[1];
if (pidfd >= 0)
skip_close[skip_index++] = pidfd;
skip_close[skip_index] = -1;
closeall_except(3, skip_close);
/*
* This second fork() ensures that the new grandchild's parent is init,
* which avoids a nuisance warning from systemd of:
* "Supervising process 123456 which is not our child. We'll most likely not notice when it exits"
*/
if (under_systemd) {
if ((pid = fork()) < 0)
fatal("fork() failed: %m");
else if (pid)
exit(0);
}
execve(binary, main_argv, child_env);
fatal("execv() failed: %m");
}
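/*
 * Tell the parent slurmctld that this re-exec'd process started
 * successfully by writing our PID to the inherited pipe.
 */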
extern void notify_parent_of_success(void)
{
char *parent_fd_env = getenv("SLURMCTLD_RECONF_PARENT_FD");
pid_t pid = getpid();
int fd = -1;
static bool notified = false;
if (original || !parent_fd_env || notified)
return;
notified = true;
fd = atoi(parent_fd_env);
info("child started successfully");
safe_write(fd, &pid, sizeof(pid_t));
(void) close(fd);
return;
rwfail:
error("failed to notify parent, may have two processes running now");
(void) close(fd);
}
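/*
 * Queue an incoming reconfigure request RPC and raise SIGHUP to start
 * the reconfigure; the reply is sent once the attempt completes.
 */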
extern void reconfigure_slurm(slurm_msg_t *msg)
{
xassert(msg);
list_append(reconfig_reqs, msg);
pthread_kill(pthread_self(), SIGHUP);
}
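/* Push the new configuration out to slurmd and sackd daemons */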
static void _post_reconfig(void)
{
if (running_configless) {
configless_update();
push_reconfig_to_slurmd();
sackd_mgr_push_reconfig();
} else {
msg_to_slurmd(REQUEST_RECONFIGURE);
}
}
/* Request that the job scheduler execute soon (typically within seconds) */
extern void queue_job_scheduler(void)
{
slurm_mutex_lock(&sched_cnt_mutex);
job_sched_cnt++;
slurm_mutex_unlock(&sched_cnt_mutex);
}
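/* conmgr callback: a listening socket was successfully registered */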
static void *_on_listen_connect(conmgr_fd_t *con, void *arg)
{
const int *i_ptr = arg;
const int i = *i_ptr;
int rc = EINVAL;
debug3("%s: [%s] Successfully opened RPC listener",
__func__, conmgr_fd_get_name(con));
slurm_mutex_lock(&listeners.mutex);
xassert(!listeners.cons[i]);
listeners.cons[i] = con;
if (!listeners.quiesced &&
(rc = conmgr_unquiesce_fd(listeners.cons[i])))
fatal_abort("%s: conmgr_unquiesce_fd(%s) failed: %s",
__func__, conmgr_fd_get_name(con),
slurm_strerror(rc));
slurm_mutex_unlock(&listeners.mutex);
return arg;
}
static void _on_listen_finish(conmgr_fd_t *con, void *arg)
{
int *i_ptr = arg;
const int i = *i_ptr;
debug3("%s: [%s] Closed RPC listener",
__func__, conmgr_fd_get_name(con));
slurm_mutex_lock(&listeners.mutex);
xassert(listeners.cons[i] == con);
listeners.cons[i] = NULL;
slurm_mutex_unlock(&listeners.mutex);
xfree(i_ptr);
}
static void *_on_primary_connection(conmgr_fd_t *con, void *arg)
{
debug3("%s: [%s] PRIMARY: New RPC connection",
__func__, conmgr_fd_get_name(con));
return con;
}
static void _on_primary_finish(conmgr_fd_t *con, void *arg)
{
debug3("%s: [%s] PRIMARY: RPC connection closed",
__func__, conmgr_fd_get_name(con));
}
/*
* Process incoming primary RPCs.
*
* WARNING: conmgr will read all available incoming data and could process
* multiple RPCs on a single connection, but the current RPC model of the
* controller is to have only 1 incoming RPC, then reply and close the
* connection. This is not ideal if the RPC handler tries to read from the
* extracted fd, since conmgr may have already read data the handler expects.
* This currently does not appear to be an issue, but it may become one until
* all of the RPC handlers are fully converted to conmgr.
*/
static int _on_primary_msg(conmgr_fd_t *con, slurm_msg_t *msg, void *arg)
{
int rc = SLURM_SUCCESS;
if (!msg->auth_ids_set)
fatal_abort("this should never happen");
log_flag(AUDIT_RPCS, "[%s] msg_type=%s uid=%u client=[%pA] protocol=%u",
conmgr_fd_get_name(con), rpc_num2string(msg->msg_type),
msg->auth_uid, &msg->address, msg->protocol_version);
/*
* Check msg against the rate limit. Tell client to retry in a second
* to minimize controller disruption.
*/
if (rate_limit_exceeded(msg)) {
rc = slurm_send_rc_msg(msg, SLURMCTLD_COMMUNICATIONS_BACKOFF);
slurm_free_msg(msg);
} else {
/*
* The fd will be extracted from conmgr, so the conmgr
* connection ref should be removed from msg first.
*/
conmgr_fd_free_ref(&msg->conmgr_con);
if ((rc = conmgr_queue_extract_con_fd(
con, _service_connection,
XSTRINGIFY(_service_connection), msg)))
error("%s: [%s] Extracting FDs failed: %s",
__func__, conmgr_fd_get_name(con),
slurm_strerror(rc));
}
return rc;
}
static void *_on_connection(conmgr_fd_t *con, void *arg)
{
bool standby_mode;
slurm_mutex_lock(&listeners.mutex);
standby_mode = listeners.standby_mode;
slurm_mutex_unlock(&listeners.mutex);
if (!standby_mode)
return _on_primary_connection(con, arg);
else
return on_backup_connection(con, arg);
}
static void _on_finish(conmgr_fd_t *con, void *arg)
{
bool standby_mode;
slurm_mutex_lock(&listeners.mutex);
standby_mode = listeners.standby_mode;
slurm_mutex_unlock(&listeners.mutex);
if (!standby_mode)
return _on_primary_finish(con, arg);
else
return on_backup_finish(con, arg);
}
static int _on_msg(conmgr_fd_t *con, slurm_msg_t *msg, int unpack_rc, void *arg)
{
bool standby_mode;
if ((unpack_rc == SLURM_PROTOCOL_AUTHENTICATION_ERROR) ||
!msg->auth_ids_set) {
/*
* Avoid closing the connection immediately on authentication
* failure; reply (with authentication disabled) to give the sender
* a hint to fix their authentication issue.
*/
msg->flags |= SLURM_NO_AUTH_CRED;
slurm_send_rc_msg(msg, SLURM_PROTOCOL_AUTHENTICATION_ERROR);
slurm_free_msg(msg);
return SLURM_SUCCESS;
} else if (unpack_rc) {
error("%s: [%s] rejecting malformed RPC and closing connection: %s",
__func__, conmgr_fd_get_name(con),
slurm_strerror(unpack_rc));
slurm_free_msg(msg);
return unpack_rc;
}
slurm_mutex_lock(&listeners.mutex);
standby_mode = listeners.standby_mode;
slurm_mutex_unlock(&listeners.mutex);
if (!standby_mode)
return _on_primary_msg(con, msg, arg);
else
return on_backup_msg(con, msg, arg);
}
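/* Stop accepting new connections on all RPC listener sockets */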
extern void listeners_quiesce(void)
{
slurm_mutex_lock(&listeners.mutex);
if (listeners.quiesced) {
slurm_mutex_unlock(&listeners.mutex);
return;
}
for (int i = 0; i < listeners.count; i++) {
int rc;
if (!listeners.cons[i])
continue;
/* This should always work */
if ((rc = conmgr_quiesce_fd(listeners.cons[i])))
fatal_abort("%s: conmgr_quiesce_fd(%s) failed: %s",
__func__,
conmgr_fd_get_name(listeners.cons[i]),
slurm_strerror(rc));
}
listeners.quiesced = true;
slurm_mutex_unlock(&listeners.mutex);
}
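/* Resume accepting new connections on all RPC listener sockets */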
extern void listeners_unquiesce(void)
{
slurm_mutex_lock(&listeners.mutex);
if (!listeners.quiesced) {
slurm_mutex_unlock(&listeners.mutex);
return;
}
for (int i = 0; i < listeners.count; i++) {
int rc;
if (!listeners.cons[i])
continue;
/* This should always work */
if ((rc = conmgr_unquiesce_fd(listeners.cons[i])))
fatal_abort("%s: conmgr_unquiesce_fd(%s) failed: %s",
__func__,
conmgr_fd_get_name(listeners.cons[i]),
slurm_strerror(rc));
}
listeners.quiesced = false;
slurm_mutex_unlock(&listeners.mutex);
}
/*
* _open_ports - Open all ports for the slurmctld to listen on.
*/
static void _open_ports(void)
{
static const conmgr_events_t events = {
.on_listen_connect = _on_listen_connect,
.on_listen_finish = _on_listen_finish,
.on_connection = _on_connection,
.on_msg = _on_msg,
.on_finish = _on_finish,
};
slurm_mutex_lock(&listeners.mutex);
/* initialize ports for RPCs */
if (original) {
if (!(listeners.count = slurm_conf.slurmctld_port_count))
fatal("slurmctld port count is zero");
listeners.fd = xcalloc(listeners.count, sizeof(*listeners.fd));
listeners.cons = xcalloc(listeners.count,
sizeof(*listeners.cons));
for (int i = 0; i < listeners.count; i++) {
listeners.fd[i] = slurm_init_msg_engine_port(
slurm_conf.slurmctld_port + i);
}
} else {
char *pos = getenv("SLURMCTLD_RECONF_LISTEN_FDS");
listeners.count = atoi(getenv("SLURMCTLD_RECONF_LISTEN_COUNT"));
listeners.fd = xcalloc(listeners.count, sizeof(*listeners.fd));
listeners.cons = xcalloc(listeners.count,
sizeof(*listeners.cons));
for (int i = 0; i < listeners.count; i++) {
listeners.fd[i] = strtol(pos, &pos, 10);
pos++; /* skip comma */
}
}
for (uint64_t i = 0; i < listeners.count; i++) {
static conmgr_con_flags_t flags =
(CON_FLAG_RPC_KEEP_BUFFER | CON_FLAG_QUIESCE |
CON_FLAG_WATCH_WRITE_TIMEOUT |
CON_FLAG_WATCH_READ_TIMEOUT |
CON_FLAG_WATCH_CONNECT_TIMEOUT);
int rc, *index_ptr;
index_ptr = xmalloc(sizeof(*index_ptr));
*index_ptr = i;
if (tls_enabled())
flags |= CON_FLAG_TLS_SERVER;
if ((rc = conmgr_process_fd_listen(listeners.fd[i],
CON_TYPE_RPC, &events, flags,
index_ptr))) {
if (rc == SLURM_COMMUNICATIONS_INVALID_FD)
fatal("%s: Unable to listen to file descriptors. Existing slurmctld process likely already is listening on the ports.",
__func__);
fatal("%s: unable to process fd:%d error:%s",
__func__, listeners.fd[i], slurm_strerror(rc));
}
}
slurm_mutex_unlock(&listeners.mutex);
}
/*
* _service_connection - service the RPC
* IN/OUT arg - the slurm_msg_t for the request; freed upon completion
* unless the RPC was queued or its handler keeps the message
*/
static void _service_connection(conmgr_callback_args_t conmgr_args,
int input_fd, int output_fd, void *tls_conn,
void *arg)
{
int rc;
slurm_msg_t *msg = arg;
slurmctld_rpc_t *this_rpc = NULL;
if (conmgr_args.status == CONMGR_WORK_STATUS_CANCELLED) {
debug3("%s: [fd:%d] connection work cancelled",
__func__, input_fd);
goto invalid;
}
if ((input_fd < 0) || (output_fd < 0)) {
error("%s: Rejecting partially open connection input_fd=%d output_fd=%d",
__func__, input_fd, output_fd);
goto invalid;
}
/*
* The fd was extracted from conmgr, so the conmgr connection is
* invalid.
*/
conmgr_fd_free_ref(&msg->conmgr_con);
if (tls_conn) {
msg->tls_conn = tls_conn;
} else {
conn_args_t tls_args = {
.input_fd = input_fd,
.output_fd = output_fd,
};
msg->tls_conn = conn_g_create(&tls_args);
}
server_thread_incr();
if (!(rc = rpc_enqueue(msg))) {
server_thread_decr();
return;
}
if (rc == SLURMCTLD_COMMUNICATIONS_BACKOFF) {
slurm_send_rc_msg(msg, SLURMCTLD_COMMUNICATIONS_BACKOFF);
} else if (rc == SLURMCTLD_COMMUNICATIONS_HARD_DROP) {
slurm_send_rc_msg(msg, SLURMCTLD_COMMUNICATIONS_HARD_DROP);
} else if ((this_rpc = find_rpc(msg->msg_type))) {
/* directly process the request */
slurmctld_req(msg, this_rpc);
} else {
error("invalid RPC msg_type=%s", rpc_num2string(msg->msg_type));
slurm_send_rc_msg(msg, EINVAL);
}
if (!this_rpc || !this_rpc->keep_msg) {
conn_g_destroy(msg->tls_conn, true);
msg->tls_conn = NULL;
log_flag(TLS, "Destroyed server TLS connection for incoming RPC on fd %d->%d",
input_fd, output_fd);
slurm_free_msg(msg);
}
server_thread_decr();
return;
invalid:
/* Cleanup for invalid RPC */
if (!tls_conn) {
if (input_fd != output_fd)
fd_close(&output_fd);
fd_close(&input_fd);
}
slurm_free_msg(msg);
conn_g_destroy(tls_conn, true);
}
/* Decrement slurmctld thread count (as applies to thread limit) */
extern void server_thread_decr(void)
{
slurm_mutex_lock(&slurmctld_config.thread_count_lock);
if (slurmctld_config.server_thread_count > 0)
slurmctld_config.server_thread_count--;
else
error("slurmctld_config.server_thread_count underflow");
slurm_cond_broadcast(&slurmctld_config.thread_count_cond);
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
}
/* Increment slurmctld thread count (as applies to thread limit) */
extern void server_thread_incr(void)
{
slurm_mutex_lock(&slurmctld_config.thread_count_lock);
slurmctld_config.server_thread_count++;
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
}
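/* Notify accounting storage that the cluster and its TRES are ready */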
static int _accounting_cluster_ready(void)
{
return clusteracct_storage_g_cluster_tres(acct_db_conn,
NULL,
NULL,
0,
SLURM_PROTOCOL_VERSION);
}
static int _accounting_mark_all_nodes_down(char *reason)
{
char *state_file;
struct stat stat_buf;
node_record_t *node_ptr;
int i;
time_t event_time;
int rc = SLURM_ERROR;
state_file = xstrdup_printf("%s/node_state",
slurm_conf.state_save_location);
if (stat(state_file, &stat_buf)) {
debug("_accounting_mark_all_nodes_down: could not stat(%s) "
"to record node down time", state_file);
event_time = time(NULL);
} else {
event_time = stat_buf.st_mtime;
}
xfree(state_file);
if ((rc = acct_storage_g_flush_jobs_on_cluster(acct_db_conn,
event_time))
== SLURM_ERROR)
return rc;
for (i = 0; (node_ptr = next_node(&i)); i++) {
if (!node_ptr->name)
continue;
if ((rc = clusteracct_storage_g_node_down(
acct_db_conn, node_ptr, event_time,
reason, slurm_conf.slurm_user_id))
== SLURM_ERROR)
break;
}
return rc;
}
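/*
 * Handle removal of an association: reconfigure burst buffers and hold
 * any jobs still using it.
 */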
static void _remove_assoc(slurmdb_assoc_rec_t *rec)
{
int cnt = 0;
bb_g_reconfig();
cnt = job_hold_by_assoc_id(rec->id);
if (cnt) {
info("Removed association id:%u user:%s, held %u jobs",
rec->id, rec->user, cnt);
} else
debug("Removed association id:%u user:%s", rec->id, rec->user);
}
static int _foreach_part_remove_qos(void *x, void *arg)
{
part_record_t *part_ptr = x;
slurmdb_qos_rec_t *rec = arg;
if (part_ptr->qos_ptr == rec) {
info("Partition %s's QOS %s was just removed, you probably didn't mean for this to happen unless you are also removing the partition.",
part_ptr->name, rec->name);
part_ptr->qos_ptr = NULL;
}
return 0;
}
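/*
 * Handle removal of a QOS: detach it from any partitions, reconfigure
 * burst buffers, and hold any jobs still using it.
 */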
static void _remove_qos(slurmdb_qos_rec_t *rec)
{
int cnt = 0;
slurmctld_lock_t part_write_lock =
{ NO_LOCK, NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
lock_slurmctld(part_write_lock);
if (part_list)
(void) list_for_each(part_list, _foreach_part_remove_qos, rec);
unlock_slurmctld(part_write_lock);
bb_g_reconfig();
cnt = job_hold_by_qos_id(rec->id);
if (cnt) {
info("Removed QOS:%s held %u jobs", rec->name, cnt);
} else
debug("Removed QOS:%s", rec->name);
}
static int _update_assoc_for_each(void *x, void *arg) {
slurmdb_assoc_rec_t *rec = arg;
job_record_t *job_ptr = x;
if ((rec == job_ptr->assoc_ptr) && (IS_JOB_PENDING(job_ptr)))
acct_policy_update_pending_job(job_ptr);
return 0;
}
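/* Re-evaluate pending jobs after an association's limits change */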
static void _update_assoc(slurmdb_assoc_rec_t *rec)
{
/* Write lock on jobs */
slurmctld_lock_t job_write_lock =
{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
if (!job_list || !accounting_enforce
|| !(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
return;
lock_slurmctld(job_write_lock);
list_for_each(job_list, _update_assoc_for_each, rec);
unlock_slurmctld(job_write_lock);
}
static int _foreach_part_resize_qos(void *x, void *arg)
{
part_record_t *part_ptr = x;
if (part_ptr->allow_qos)
qos_list_build(part_ptr->allow_qos, false,
&part_ptr->allow_qos_bitstr);
if (part_ptr->deny_qos)
qos_list_build(part_ptr->deny_qos, false,
&part_ptr->deny_qos_bitstr);
return 0;
}
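/* Rebuild each partition's allow/deny QOS bitmaps after a QOS resize */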
static void _resize_qos(void)
{
slurmctld_lock_t part_write_lock =
{ NO_LOCK, NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
lock_slurmctld(part_write_lock);
if (part_list)
(void) list_for_each(part_list, _foreach_part_resize_qos, NULL);
unlock_slurmctld(part_write_lock);
}
static int _update_qos_for_each(void *x, void *arg) {
slurmdb_qos_rec_t *rec = arg;
job_record_t *job_ptr = x;
if ((rec == job_ptr->qos_ptr) && (IS_JOB_PENDING(job_ptr)))
acct_policy_update_pending_job(job_ptr);
return 0;
}
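/* Re-evaluate pending jobs after a QOS's limits change */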
static void _update_qos(slurmdb_qos_rec_t *rec)
{
/* Write lock on jobs */
slurmctld_lock_t job_write_lock =
{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
if (!job_list || !accounting_enforce
|| !(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
return;
lock_slurmctld(job_write_lock);
list_for_each(job_list, _update_qos_for_each, rec);
unlock_slurmctld(job_write_lock);
}
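/*
 * Build the TRES records from AccountingStorageTRES, adding any that
 * are missing from the database when slurmdbd is in use.
 */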
static int _init_tres(void)
{
char *temp_char;
list_t *char_list = NULL;
list_t *add_list = NULL;
slurmdb_tres_rec_t *tres_rec;
slurmdb_update_object_t update_object;
assoc_mgr_lock_t locks = { .tres = READ_LOCK };
if (!slurm_conf.accounting_storage_tres) {
error("No tres defined, this should never happen");
return SLURM_ERROR;
}
char_list = list_create(xfree_ptr);
slurm_addto_char_list(char_list, slurm_conf.accounting_storage_tres);
memset(&update_object, 0, sizeof(slurmdb_update_object_t));
if (!slurm_with_slurmdbd()) {
update_object.type = SLURMDB_ADD_TRES;
update_object.objects = list_create(slurmdb_destroy_tres_rec);
} else if (!g_tres_count)
fatal("You are running with a database but for some reason "
"we have no TRES from it. This should only happen if "
"the database is down and you don't have "
"any state files.");
else if ((g_tres_count < TRES_ARRAY_TOTAL_CNT) ||
(xstrcmp(assoc_mgr_tres_array[TRES_ARRAY_BILLING]->type,
"billing")))
fatal("You are running with a database but for some reason we have less TRES than should be here (%d < %d) and/or the \"billing\" TRES is missing. This should only happen if the database is down after an upgrade.",
g_tres_count, TRES_ARRAY_TOTAL_CNT);
while ((temp_char = list_pop(char_list))) {
tres_rec = xmalloc(sizeof(slurmdb_tres_rec_t));
tres_rec->type = temp_char;
if (!xstrcasecmp(temp_char, "cpu"))
tres_rec->id = TRES_CPU;
else if (!xstrcasecmp(temp_char, "mem"))
tres_rec->id = TRES_MEM;
else if (!xstrcasecmp(temp_char, "energy"))
tres_rec->id = TRES_ENERGY;
else if (!xstrcasecmp(temp_char, "node"))
tres_rec->id = TRES_NODE;
else if (!xstrcasecmp(temp_char, "billing"))
tres_rec->id = TRES_BILLING;
else if (!xstrcasecmp(temp_char, "vmem"))
tres_rec->id = TRES_VMEM;
else if (!xstrcasecmp(temp_char, "pages"))
tres_rec->id = TRES_PAGES;
else if (!xstrncasecmp(temp_char, "bb/", 3)) {
tres_rec->type[2] = '\0';
tres_rec->name = xstrdup(temp_char+3);
if (!tres_rec->name)
fatal("Burst Buffer type tres need to have a "
"name, (i.e. bb/datawarp). You gave %s",
temp_char);
} else if (!xstrncasecmp(temp_char, "gres/", 5)) {
tres_rec->type[4] = '\0';
tres_rec->name = xstrdup(temp_char+5);
if (!tres_rec->name)
fatal("Gres type tres need to have a name, "
"(i.e. Gres/GPU). You gave %s",
temp_char);
} else if (!xstrncasecmp(temp_char, "license/", 8)) {
tres_rec->type[7] = '\0';
tres_rec->name = xstrdup(temp_char+8);
if (!tres_rec->name)
fatal("License type tres need to "
"have a name, (i.e. License/Foo). "
"You gave %s",
temp_char);
} else if (!xstrncasecmp(temp_char, "fs/", 3)) {
tres_rec->type[2] = '\0';
tres_rec->name = xstrdup(temp_char+3);
if (!tres_rec->name)
fatal("Filesystem type tres need to have a name, (i.e. fs/disk). You gave %s",
temp_char);
if (!xstrncasecmp(tres_rec->name, "disk", 4))
tres_rec->id = TRES_FS_DISK;
} else if (!xstrncasecmp(temp_char, "ic/", 3)) {
tres_rec->type[2] = '\0';
tres_rec->name = xstrdup(temp_char+3);
if (!tres_rec->name)
fatal("Interconnect type tres need to have a name, (i.e. ic/ofed). You gave %s",
temp_char);
} else {
fatal("%s: Unknown tres type '%s', acceptable types are Billing,CPU,Energy,FS/,Gres/,IC/,License/,Mem,Node,Pages,VMem",
__func__, temp_char);
xfree(tres_rec->type);
xfree(tres_rec);
}
if (!slurm_with_slurmdbd()) {
if (!tres_rec->id)
fatal("slurmdbd is required to run with TRES %s%s%s. Either setup slurmdbd or remove this TRES from your configuration.",
tres_rec->type, tres_rec->name ? "/" : "",
tres_rec->name ? tres_rec->name : "");
list_append(update_object.objects, tres_rec);
} else if (!tres_rec->id &&
assoc_mgr_fill_in_tres(
acct_db_conn, tres_rec,
ACCOUNTING_ENFORCE_TRES, NULL, 0)
!= SLURM_SUCCESS) {
if (!add_list)
add_list = list_create(
slurmdb_destroy_tres_rec);
info("Couldn't find tres %s%s%s in the database, "
"creating.",
tres_rec->type, tres_rec->name ? "/" : "",
tres_rec->name ? tres_rec->name : "");
list_append(add_list, tres_rec);
} else
slurmdb_destroy_tres_rec(tres_rec);
}
FREE_NULL_LIST(char_list);
if (add_list) {
if (acct_storage_g_add_tres(acct_db_conn,
slurm_conf.slurm_user_id,
add_list) != SLURM_SUCCESS)
fatal("Problem adding tres to the database, "
"can't continue until database is able to "
"make new tres");
/* refresh list here since the updates are not
sent dynamically */
assoc_mgr_refresh_lists(acct_db_conn, ASSOC_MGR_CACHE_TRES);
FREE_NULL_LIST(add_list);
}
if (!slurm_with_slurmdbd()) {
assoc_mgr_update_tres(&update_object, false);
FREE_NULL_LIST(update_object.objects);
}
/* Set up the slurmctld_tres_cnt here (Current code is set to
* not have this ever change).
*/
assoc_mgr_lock(&locks);
slurmctld_tres_cnt = g_tres_count;
assoc_mgr_unlock(&locks);
return SLURM_SUCCESS;
}
/*
* NOTE: the job_write_lock as well as the assoc_mgr TRES Read lock should be
* locked before coming in here.
*/
static int _update_job_tres(void *x, void *arg)
{
job_record_t *job_ptr = x;
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
/* If this returns 1 it means the positions were
altered so just rebuild it.
*/
if (assoc_mgr_set_tres_cnt_array(&job_ptr->tres_req_cnt,
job_ptr->tres_req_str,
0, true, false, NULL))
job_set_req_tres(job_ptr, true);
if (assoc_mgr_set_tres_cnt_array(&job_ptr->tres_alloc_cnt,
job_ptr->tres_alloc_str,
0, true, false, NULL))
job_set_alloc_tres(job_ptr, true);
update_job_limit_set_tres(&job_ptr->limit_set.tres, slurmctld_tres_cnt);
return 0;
}
/* any association manager locks should be unlocked before hand */
static void _update_cluster_tres(void)
{
/* Write lock on jobs */
slurmctld_lock_t job_write_lock =
{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
assoc_mgr_lock_t locks = { .tres = READ_LOCK };
if (!job_list)
return;
lock_slurmctld(job_write_lock);
assoc_mgr_lock(&locks);
list_for_each(job_list, _update_job_tres, NULL);
assoc_mgr_unlock(&locks);
unlock_slurmctld(job_write_lock);
}
static void _update_parts_and_resvs(void)
{
update_assocs_in_resvs();
part_list_update_assoc_lists();
}
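/*
 * Scan all nodes with a pending reboot request and, for those in a safe
 * state (idle with no suspended jobs, down, or never-responded future
 * nodes), mark them DOWN with REBOOT_ISSUED and send a single
 * REQUEST_REBOOT_NODES agent message. Nodes not yet safe to reboot cause
 * want_nodes_reboot to be set again so the request is retried later.
 */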
static void _queue_reboot_msg(void)
{
agent_arg_t *reboot_agent_args = NULL;
node_record_t *node_ptr;
char *host_str;
time_t now = time(NULL);
int i;
bool want_reboot;
want_nodes_reboot = false;
for (i = 0; (node_ptr = next_node(&i)); i++) {
/* Allow nodes in maintenance reservations to reboot
* (they previously could not).
*/
if (!IS_NODE_REBOOT_REQUESTED(node_ptr))
continue; /* No reboot needed */
else if (IS_NODE_REBOOT_ISSUED(node_ptr)) {
debug2("%s: Still waiting for boot of node %s",
__func__, node_ptr->name);
continue;
}
if (IS_NODE_COMPLETING(node_ptr)) {
want_nodes_reboot = true;
continue;
}
/* only active idle nodes, don't reboot
* nodes that are idle but have suspended
* jobs on them
*/
if (IS_NODE_IDLE(node_ptr)
&& !IS_NODE_NO_RESPOND(node_ptr)
&& !IS_NODE_POWERING_UP(node_ptr)
&& node_ptr->sus_job_cnt == 0)
want_reboot = true;
else if (IS_NODE_FUTURE(node_ptr) &&
(node_ptr->last_response == (time_t) 0))
want_reboot = true; /* system just restarted */
else if (IS_NODE_DOWN(node_ptr))
want_reboot = true;
else
want_reboot = false;
if (!want_reboot) {
want_nodes_reboot = true; /* defer reboot */
continue;
}
if (reboot_agent_args == NULL) {
reboot_agent_args = xmalloc(sizeof(agent_arg_t));
reboot_agent_args->msg_type = REQUEST_REBOOT_NODES;
reboot_agent_args->retry = 0;
reboot_agent_args->hostlist = hostlist_create(NULL);
reboot_agent_args->protocol_version =
SLURM_PROTOCOL_VERSION;
}
if (reboot_agent_args->protocol_version
> node_ptr->protocol_version)
reboot_agent_args->protocol_version =
node_ptr->protocol_version;
hostlist_push_host(reboot_agent_args->hostlist, node_ptr->name);
reboot_agent_args->node_count++;
/*
* node_ptr->node_state &= ~NODE_STATE_MAINT;
* The NODE_STATE_MAINT bit will just get set again as long
* as the node remains in the maintenance reservation, so
* don't clear it here because it won't do anything.
*/
node_ptr->node_state &= NODE_STATE_FLAGS;
node_ptr->node_state |= NODE_STATE_DOWN;
node_ptr->node_state &= ~NODE_STATE_REBOOT_REQUESTED;
node_ptr->node_state |= NODE_STATE_REBOOT_ISSUED;
bit_clear(avail_node_bitmap, node_ptr->index);
bit_clear(idle_node_bitmap, node_ptr->index);
/* Unset this as this node is not in reboot ASAP anymore. */
bit_clear(asap_node_bitmap, node_ptr->index);
node_ptr->boot_req_time = now;
set_node_reason(node_ptr, "reboot issued", now);
clusteracct_storage_g_node_down(acct_db_conn, node_ptr, now,
NULL, slurm_conf.slurm_user_id);
}
if (reboot_agent_args != NULL) {
hostlist_uniq(reboot_agent_args->hostlist);
host_str = hostlist_ranged_string_xmalloc(
reboot_agent_args->hostlist);
debug("Issuing reboot request for nodes %s", host_str);
xfree(host_str);
set_agent_arg_r_uid(reboot_agent_args, SLURM_AUTH_UID_ANY);
agent_queue_request(reboot_agent_args);
last_node_update = now;
schedule_node_save();
}
}
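/*
 * Wait up to CONTROL_TIMEOUT seconds for outstanding RPC server threads to
 * finish (one thread is expected to remain when resuming backup mode).
 */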
static void _flush_rpcs(void)
{
struct timespec ts = {0, 0};
struct timeval now;
int exp_thread_cnt = slurmctld_config.resume_backup ? 1 : 0;
/* wait for RPCs to complete */
gettimeofday(&now, NULL);
ts.tv_sec = now.tv_sec + CONTROL_TIMEOUT;
ts.tv_nsec = now.tv_usec * 1000;
slurm_mutex_lock(&slurmctld_config.thread_count_lock);
while (slurmctld_config.server_thread_count > exp_thread_cnt) {
slurm_cond_timedwait(&slurmctld_config.thread_count_cond,
&slurmctld_config.thread_count_lock, &ts);
/* Give up (rather than spin on ETIMEDOUT) once the
* CONTROL_TIMEOUT deadline in ts has passed. */
if (time(NULL) >= ts.tv_sec)
break;
}
if (slurmctld_config.server_thread_count > exp_thread_cnt) {
info("shutdown server_thread_count=%d",
slurmctld_config.server_thread_count);
}
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
}
/*
* _slurmctld_background - process slurmctld background activities
* purge defunct job records, save state, schedule jobs, and
* ping other nodes
*/
static void *_slurmctld_background(void *no_data)
{
static time_t last_sched_time;
static time_t last_config_list_update_time;
static time_t last_full_sched_time;
static time_t last_checkpoint_time;
static time_t last_group_time;
static time_t last_health_check_time;
static time_t last_acct_gather_node_time;
static time_t last_no_resp_msg_time;
static time_t last_ping_node_time = (time_t) 0;
static time_t last_ping_srun_time;
static time_t last_purge_job_time;
static time_t last_resv_time;
static time_t last_timelimit_time;
static time_t last_assert_primary_time;
static time_t last_trigger;
static time_t last_node_acct;
static time_t last_ctld_bu_ping;
static time_t last_uid_update;
time_t now;
int no_resp_msg_interval, ping_interval, purge_job_interval;
DEF_TIMERS;
/* Locks: Read config */
slurmctld_lock_t config_read_lock = {
READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
/* Locks: Read config, read job */
slurmctld_lock_t job_read_lock = {
READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
/* Locks: Read config, write job, write node, read partition */
slurmctld_lock_t job_write_lock = {
READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
/* Locks: Write job */
slurmctld_lock_t job_write_lock2 = {
NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
/* Locks: Read config, write job, write node
* (Might kill jobs on nodes set DOWN) */
slurmctld_lock_t node_write_lock = {
READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
/* Locks: Write node */
slurmctld_lock_t node_write_lock2 = {
NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
/* Locks: Write partition */
slurmctld_lock_t part_write_lock = {
NO_LOCK, NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
/* Locks: Read job and node */
slurmctld_lock_t job_node_read_lock = {
NO_LOCK, READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
/*
* purge_old_job modifies jobs and reads conf info. It can also
* call re_kill_job(), which can modify nodes and reads fed info.
*/
slurmctld_lock_t purge_job_locks = {
.conf = READ_LOCK,
.job = WRITE_LOCK,
.node = WRITE_LOCK,
.fed = READ_LOCK,
};
/* Let the dust settle before doing work */
now = time(NULL);
last_sched_time = last_full_sched_time = now;
last_checkpoint_time = last_group_time = now;
last_purge_job_time = last_trigger = last_health_check_time = now;
last_timelimit_time = last_assert_primary_time = now;
last_no_resp_msg_time = last_resv_time = last_ctld_bu_ping = now;
last_uid_update = now;
last_acct_gather_node_time = now;
last_config_list_update_time = now;
last_ping_srun_time = now;
last_node_acct = now;
debug3("_slurmctld_background pid = %u", getpid());
while (1) {
bool call_schedule = false, full_queue = false;
slurm_mutex_lock(&shutdown_mutex);
if (!slurmctld_config.shutdown_time) {
struct timespec ts = {0, 0};
/* Listen to new incoming RPCs if not shutting down */
listeners_unquiesce();
ts.tv_sec = time(NULL) + 1;
slurm_cond_timedwait(&shutdown_cond, &shutdown_mutex,
&ts);
}
slurm_mutex_unlock(&shutdown_mutex);
now = time(NULL);
START_TIMER;
if (slurm_conf.slurmctld_debug <= 3)
no_resp_msg_interval = 300;
else if (slurm_conf.slurmctld_debug == 4)
no_resp_msg_interval = 60;
else
no_resp_msg_interval = 1;
if ((slurm_conf.min_job_age > 0) &&
(slurm_conf.min_job_age < PURGE_JOB_INTERVAL)) {
/* Purge jobs more quickly, especially for high job flow */
purge_job_interval = MAX(10, slurm_conf.min_job_age);
} else
purge_job_interval = PURGE_JOB_INTERVAL;
if (slurm_conf.slurmd_timeout) {
/* We ping nodes that haven't responded in SlurmdTimeout/3,
* but need to do the test at a higher frequency or we might
* DOWN nodes with times that fall in the gap. */
ping_interval = slurm_conf.slurmd_timeout / 3;
} else {
/* This will just ping non-responding nodes
* and restore them to service */
ping_interval = 100; /* 100 seconds */
}
if (!last_ping_node_time) {
last_ping_node_time = now + (time_t)MIN_CHECKIN_TIME -
ping_interval;
}
if (slurmctld_config.shutdown_time) {
/* Always stop listening when shutdown requested */
listeners_quiesce();
_flush_rpcs();
/*
* Wait for all already accepted connection work to
* finish before continuing on with control loop that
* will unload all the plugins which requires there be
* no active RPCs.
*/
conmgr_quiesce(__func__);
if (!report_locks_set()) {
info("Saving all slurm state");
save_all_state();
} else {
error("Semaphores still set after %d seconds, "
"can not save state", CONTROL_TIMEOUT);
}
/*
* Allow other connections to start processing again as
* the listeners are already quiesced
*/
conmgr_unquiesce(__func__);
break;
}
if (difftime(now, last_resv_time) >= 5) {
lock_slurmctld(node_write_lock);
now = time(NULL);
last_resv_time = now;
if (set_node_maint_mode() > 0)
queue_job_scheduler();
unlock_slurmctld(node_write_lock);
}
if (difftime(now, last_no_resp_msg_time) >=
no_resp_msg_interval) {
lock_slurmctld(node_write_lock2);
now = time(NULL);
last_no_resp_msg_time = now;
node_no_resp_msg();
unlock_slurmctld(node_write_lock2);
}
validate_all_reservations(true, true);
if (difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) {
lock_slurmctld(job_write_lock);
now = time(NULL);
last_timelimit_time = now;
debug2("Testing job time limits and checkpoints");
job_time_limit();
job_resv_check();
unlock_slurmctld(job_write_lock);
lock_slurmctld(node_write_lock);
check_node_timers();
unlock_slurmctld(node_write_lock);
}
if (!(slurm_conf.health_check_node_state &
HEALTH_CHECK_START_ONLY) &&
slurm_conf.health_check_interval &&
(difftime(now, last_health_check_time) >=
slurm_conf.health_check_interval) &&
is_ping_done()) {
lock_slurmctld(node_write_lock);
if (slurm_conf.health_check_node_state &
HEALTH_CHECK_CYCLE) {
/* Call run_health_check() on each cycle */
} else {
now = time(NULL);
last_health_check_time = now;
}
run_health_check();
unlock_slurmctld(node_write_lock);
}
if (slurm_conf.acct_gather_node_freq &&
(difftime(now, last_acct_gather_node_time) >=
slurm_conf.acct_gather_node_freq) &&
is_ping_done()) {
lock_slurmctld(node_write_lock);
now = time(NULL);
last_acct_gather_node_time = now;
update_nodes_acct_gather_data();
unlock_slurmctld(node_write_lock);
}
if (((difftime(now, last_ping_node_time) >= ping_interval) ||
ping_nodes_now) && is_ping_done()) {
lock_slurmctld(node_write_lock);
now = time(NULL);
last_ping_node_time = now;
ping_nodes_now = false;
ping_nodes();
unlock_slurmctld(node_write_lock);
}
if (slurm_conf.inactive_limit &&
((now - last_ping_srun_time) >=
(slurm_conf.inactive_limit / 3))) {
lock_slurmctld(job_read_lock);
now = time(NULL);
last_ping_srun_time = now;
debug2("Performing srun ping");
srun_ping();
unlock_slurmctld(job_read_lock);
}
if (want_nodes_reboot) {
lock_slurmctld(node_write_lock);
_queue_reboot_msg();
unlock_slurmctld(node_write_lock);
}
/* Process any pending agent work */
agent_trigger(RPC_RETRY_INTERVAL, true, true);
if (slurm_conf.group_time &&
(difftime(now, last_group_time)
>= slurm_conf.group_time)) {
lock_slurmctld(part_write_lock);
now = time(NULL);
last_group_time = now;
load_part_uid_allow_list(slurm_conf.group_force);
reservation_update_groups(slurm_conf.group_force);
unlock_slurmctld(part_write_lock);
group_cache_cleanup();
}
if (difftime(now, last_purge_job_time) >= purge_job_interval) {
/*
* If backfill is running, it will have a list of
* job_record pointers which could include jobs about to
* be purged. Skip the purge in that case to prevent
* _attempt_backfill() from potentially dereferencing an
* invalid pointer.
*/
slurm_mutex_lock(&check_bf_running_lock);
if (!slurmctld_diag_stats.bf_active) {
lock_slurmctld(purge_job_locks);
now = time(NULL);
last_purge_job_time = now;
debug2("Performing purge of old job records");
purge_old_job();
unlock_slurmctld(purge_job_locks);
}
slurm_mutex_unlock(&check_bf_running_lock);
free_old_jobs();
}
if (difftime(now, last_full_sched_time) >= sched_interval) {
slurm_mutex_lock(&sched_cnt_mutex);
call_schedule = true;
full_queue = true;
job_sched_cnt = 0;
slurm_mutex_unlock(&sched_cnt_mutex);
last_full_sched_time = now;
} else {
slurm_mutex_lock(&sched_cnt_mutex);
if (job_sched_cnt &&
(difftime(now, last_sched_time) >=
batch_sched_delay)) {
call_schedule = true;
job_sched_cnt = 0;
}
slurm_mutex_unlock(&sched_cnt_mutex);
}
if (call_schedule) {
lock_slurmctld(job_write_lock2);
now = time(NULL);
last_sched_time = now;
bb_g_load_state(false); /* May alter job nice/prio */
unlock_slurmctld(job_write_lock2);
schedule(full_queue);
set_job_elig_time();
}
if (difftime(now, last_config_list_update_time) >=
UPDATE_CONFIG_LIST_TIMEOUT) {
last_config_list_update_time = now;
consolidate_config_list(false, false);
}
if (slurm_conf.slurmctld_timeout &&
(difftime(now, last_ctld_bu_ping) >
slurm_conf.slurmctld_timeout)) {
ping_controllers(true);
last_ctld_bu_ping = now;
}
if (difftime(now, last_trigger) > TRIGGER_INTERVAL) {
lock_slurmctld(job_node_read_lock);
now = time(NULL);
last_trigger = now;
trigger_process();
unlock_slurmctld(job_node_read_lock);
}
if (difftime(now, last_checkpoint_time) >=
PERIODIC_CHECKPOINT) {
now = time(NULL);
last_checkpoint_time = now;
debug2("Performing full system state save");
save_all_state();
}
if (difftime(now, last_node_acct) >= PERIODIC_NODE_ACCT) {
/* Report current node state to account for added
* or reconfigured nodes. Locks are done
* inside _accounting_cluster_ready, don't
* lock here. */
now = time(NULL);
last_node_acct = now;
_accounting_cluster_ready();
}
if (difftime(now, slurmctld_diag_stats.job_states_ts) >=
JOB_COUNT_INTERVAL) {
lock_slurmctld(job_read_lock);
_update_diag_job_state_counts();
unlock_slurmctld(job_read_lock);
}
/* Stats will reset at approximately midnight UTC. */
if (last_proc_req_start == 0) {
last_proc_req_start = now;
next_stats_reset = now - (now % 86400) + 86400;
} else if (now >= next_stats_reset) {
next_stats_reset = now - (now % 86400) + 86400;
reset_stats(0);
}
/*
* Reassert this machine as the primary controller.
* A network or security problem could result in
* the backup controller assuming control even
* while the real primary controller is running.
*/
lock_slurmctld(config_read_lock);
if (slurmctld_primary && slurm_conf.slurmctld_timeout &&
(difftime(now, last_assert_primary_time) >=
slurm_conf.slurmctld_timeout)) {
now = time(NULL);
last_assert_primary_time = now;
(void) _shutdown_backup_controller();
}
unlock_slurmctld(config_read_lock);
if (difftime(now, last_uid_update) >= 3600) {
bool uid_set = false;
/*
* Make sure we update the uids in the
* assoc_mgr if there were any users
* with unknown uids at the time of startup.
*/
now = time(NULL);
last_uid_update = now;
assoc_mgr_set_missing_uids(&uid_set);
/*
* If a missing uid was set, schedule a full reservation
* validation to make sure that the reservations are up
* to date.
*/
if (uid_set)
validate_all_reservations(false, true);
}
END_TIMER2(__func__);
}
debug3("_slurmctld_background shutting down");
return NULL;
}
/* save_all_state - save entire slurmctld state for later recovery */
extern void save_all_state(void)
{
/* Each of these functions lock their own databases */
schedule_job_save();
schedule_node_save();
schedule_part_save();
schedule_resv_save();
schedule_trigger_save();
dump_assoc_mgr_state();
fed_mgr_state_save();
}
/* make sure the assoc_mgr is up and running with the most current state */
extern void ctld_assoc_mgr_init(void)
{
assoc_init_args_t assoc_init_arg;
int num_jobs = 0;
slurmctld_lock_t job_read_lock =
{ NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
memset(&assoc_init_arg, 0, sizeof(assoc_init_args_t));
assoc_init_arg.enforce = accounting_enforce;
assoc_init_arg.running_cache = &running_cache;
assoc_init_arg.add_license_notify = license_add_remote;
assoc_init_arg.resize_qos_notify = _resize_qos;
assoc_init_arg.remove_assoc_notify = _remove_assoc;
assoc_init_arg.remove_license_notify = license_remove_remote;
assoc_init_arg.remove_qos_notify = _remove_qos;
assoc_init_arg.sync_license_notify = license_sync_remote;
assoc_init_arg.update_assoc_notify = _update_assoc;
assoc_init_arg.update_license_notify = license_update_remote;
assoc_init_arg.update_qos_notify = _update_qos;
assoc_init_arg.update_cluster_tres = _update_cluster_tres;
assoc_init_arg.update_resvs = _update_parts_and_resvs;
assoc_init_arg.cache_level = ASSOC_MGR_CACHE_ASSOC |
ASSOC_MGR_CACHE_USER |
ASSOC_MGR_CACHE_QOS |
ASSOC_MGR_CACHE_RES |
ASSOC_MGR_CACHE_TRES |
ASSOC_MGR_CACHE_WCKEY;
/* Don't save state but blow away old lists if they exist. */
assoc_mgr_fini(0);
_init_db_conn();
if (assoc_mgr_init(acct_db_conn, &assoc_init_arg, errno)) {
trigger_primary_dbd_fail();
error("Association database appears down, reading from state files.");
if (!slurm_conf.cluster_id ||
(load_assoc_mgr_last_tres() != SLURM_SUCCESS) ||
(load_assoc_mgr_state() != SLURM_SUCCESS)) {
error("Unable to get any information from the state file");
_retry_init_db_conn(&assoc_init_arg);
}
}
if (!slurm_conf.cluster_id) {
slurm_conf.cluster_id = generate_cluster_id();
_create_clustername_file();
}
sluid_init(slurm_conf.cluster_id, 0);
/* Now load the usage from a flat file since it isn't kept in
the database
*/
load_assoc_usage();
load_qos_usage();
lock_slurmctld(job_read_lock);
if (job_list)
num_jobs = list_count(job_list);
unlock_slurmctld(job_read_lock);
_init_tres();
/* This thread is looking for when we get correct data from
the database so we can update the assoc_ptr's in the jobs
*/
if ((running_cache != RUNNING_CACHE_STATE_NOTRUNNING) || num_jobs) {
slurm_thread_create(&assoc_cache_thread,
_assoc_cache_mgr, NULL);
}
}
/* Make sure the assoc_mgr thread is terminated */
extern void ctld_assoc_mgr_fini(void)
{
if (running_cache == RUNNING_CACHE_STATE_NOTRUNNING)
return;
/* break out and end the association cache
* thread since we are shutting down, no reason
* to wait for current info from the database */
slurm_mutex_lock(&assoc_cache_mutex);
running_cache = RUNNING_CACHE_STATE_EXITING;
slurm_cond_signal(&assoc_cache_cond);
slurm_mutex_unlock(&assoc_cache_mutex);
slurm_thread_join(assoc_cache_thread);
}
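/*
 * list_for_each() helper for set_cluster_tres(): look up this node's count
 * for a "gres" TRES record, accumulate it into the cluster-wide total for
 * named GRES (type:name), and record it in the node's tres_cnt array.
 */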
static int _add_node_gres_tres(void *x, void *arg)
{
uint64_t gres_cnt;
int tres_pos;
slurmdb_tres_rec_t *tres_rec_in = x;
node_record_t *node_ptr = arg;
xassert(tres_rec_in);
if (xstrcmp(tres_rec_in->type, "gres"))
return 0;
gres_cnt = gres_node_config_cnt(node_ptr->gres_list, tres_rec_in->name);
/*
* Set the count here for named GRES as we don't store the count the
* same way we do for unnamed GRES.
*/
if (strchr(tres_rec_in->name, ':'))
tres_rec_in->count += gres_cnt;
if ((tres_pos = assoc_mgr_find_tres_pos(tres_rec_in, true)) != -1)
node_ptr->tres_cnt[tres_pos] = gres_cnt;
return 0;
}
/*
* Set the node's billing tres to the highest billing of all partitions that the
* node is a part of.
*/
static void _set_node_billing_tres(node_record_t *node_ptr, uint64_t cpu_count,
bool assoc_mgr_locked)
{
int i;
part_record_t *part_ptr = NULL;
double max_billing = 0;
xassert(node_ptr);
for (i = 0; i < node_ptr->part_cnt; i++) {
double tmp_billing;
part_ptr = node_ptr->part_pptr[i];
if (!part_ptr->billing_weights)
continue;
tmp_billing = assoc_mgr_tres_weighted(
node_ptr->tres_cnt, part_ptr->billing_weights,
slurm_conf.priority_flags, assoc_mgr_locked);
max_billing = MAX(max_billing, tmp_billing);
}
/* Fall back to the node's configured cpu_count if no partition has
* TRESBillingWeights set, since jobs are then billed by their
* allocated CPU count. */
if (!max_billing)
max_billing = cpu_count;
node_ptr->tres_cnt[TRES_ARRAY_BILLING] = max_billing;
}
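/*
 * Recompute cluster-wide TRES counts (CPU, memory, billing, GRES, burst
 * buffer, licenses, etc.) from the current node and partition data, and
 * refresh each node's TRES count array and strings.
 * Node and partition write locks must be held by the caller.
 */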
extern void set_cluster_tres(bool assoc_mgr_locked)
{
node_record_t *node_ptr;
slurmdb_tres_rec_t *tres_rec, *cpu_tres = NULL, *mem_tres = NULL;
int i;
uint64_t cluster_billing = 0;
char *unique_tres = NULL;
assoc_mgr_lock_t locks = {
.qos = WRITE_LOCK,
.tres = WRITE_LOCK };
int active_node_count = 0;
xassert(verify_lock(NODE_LOCK, WRITE_LOCK));
xassert(verify_lock(PART_LOCK, WRITE_LOCK));
if (!assoc_mgr_locked)
assoc_mgr_lock(&locks);
xassert(assoc_mgr_tres_array);
for (i = 0; i < g_tres_count; i++) {
tres_rec = assoc_mgr_tres_array[i];
if (!tres_rec->type) {
error("TRES %d doesn't have a type given, this should never happen",
tres_rec->id);
continue; /* this should never happen */
}
if (unique_tres)
xstrfmtcat(unique_tres, ",%s",
assoc_mgr_tres_name_array[i]);
else
unique_tres = xstrdup(assoc_mgr_tres_name_array[i]);
/* reset them now since we are about to add to them */
tres_rec->count = 0;
if (tres_rec->id == TRES_CPU) {
cpu_tres = tres_rec;
continue;
} else if (tres_rec->id == TRES_MEM) {
mem_tres = tres_rec;
continue;
} else if (!xstrcmp(tres_rec->type, "bb")) {
tres_rec->count = bb_g_get_system_size(tres_rec->name);
continue;
} else if (!xstrcmp(tres_rec->type, "gres")) {
/*
* Skip named GRES as we don't store
* the count the same way we do for unnamed GRES.
*/
if (strchr(tres_rec->name, ':'))
continue;
tres_rec->count =
gres_get_system_cnt(tres_rec->name, true);
if (tres_rec->count == NO_VAL64)
tres_rec->count = 0; /* GRES name not found */
continue;
} else if (!xstrcmp(tres_rec->type, "license")) {
tres_rec->count = get_total_license_cnt(
tres_rec->name);
continue;
}
/* FIXME: set up the other tres here that aren't specific */
}
xfree(slurm_conf.accounting_storage_tres);
slurm_conf.accounting_storage_tres = unique_tres;
cluster_cpus = 0;
for (i = 0; (node_ptr = next_node(&i)); i++) {
uint64_t cpu_count = 0, mem_count = 0;
if (!node_ptr->name)
continue;
active_node_count++;
cpu_count = node_ptr->cpus_efctv;
mem_count = node_ptr->config_ptr->real_memory;
cluster_cpus += cpu_count;
if (mem_tres)
mem_tres->count += mem_count;
if (!node_ptr->tres_cnt)
node_ptr->tres_cnt = xcalloc(slurmctld_tres_cnt,
sizeof(uint64_t));
node_ptr->tres_cnt[TRES_ARRAY_CPU] = cpu_count;
node_ptr->tres_cnt[TRES_ARRAY_MEM] = mem_count;
list_for_each(assoc_mgr_tres_list,
_add_node_gres_tres, node_ptr);
_set_node_billing_tres(node_ptr, cpu_count, true);
cluster_billing += node_ptr->tres_cnt[TRES_ARRAY_BILLING];
xfree(node_ptr->tres_str);
node_ptr->tres_str =
assoc_mgr_make_tres_str_from_array(node_ptr->tres_cnt,
TRES_STR_FLAG_SIMPLE,
true);
xfree(node_ptr->tres_fmt_str);
node_ptr->tres_fmt_str =
assoc_mgr_make_tres_str_from_array(
node_ptr->tres_cnt,
TRES_STR_CONVERT_UNITS,
true);
}
/* FIXME: cluster_cpus probably needs to be removed and handled
* differently in the spots this is used.
*/
if (cpu_tres)
cpu_tres->count = cluster_cpus;
assoc_mgr_tres_array[TRES_ARRAY_NODE]->count = active_node_count;
assoc_mgr_tres_array[TRES_ARRAY_BILLING]->count = cluster_billing;
set_partition_tres(true);
if (!assoc_mgr_locked)
assoc_mgr_unlock(&locks);
}
/*
* slurmctld_shutdown - wake up _slurm_rpc_mgr thread via signal
* RET 0 or error code
*/
int slurmctld_shutdown(void)
{
sched_debug("slurmctld terminating");
slurmctld_config.shutdown_time = time(NULL);
slurm_cond_signal(&shutdown_cond);
pthread_kill(pthread_self(), SIGUSR1);
return SLURM_SUCCESS;
}
/*
* _parse_commandline - parse and process any command line arguments
* IN argc - number of command line arguments
* IN argv - the command line arguments
*/
static void _parse_commandline(int argc, char **argv)
{
int c = 0;
char *tmp_char;
enum {
LONG_OPT_ENUM_START = 0x100,
LONG_OPT_SYSTEMD,
};
static struct option long_options[] = {
{"systemd", no_argument, 0, LONG_OPT_SYSTEMD},
{"version", no_argument, 0, 'V'},
{NULL, 0, 0, 0}
};
if (run_command_is_launcher(argc, argv)) {
char *ctx = getenv("SLURM_SCRIPT_CONTEXT");
if (!xstrcmp(ctx, "burst_buffer.lua")) {
unsetenv("SLURM_SCRIPT_CONTEXT");
slurmscriptd_handle_bb_lua_mode(argc, argv);
_exit(127);
}
run_command_launcher(argc, argv);
_exit(127); /* Should not get here */
}
opterr = 0;
while ((c = getopt_long(argc, argv, "cdDf:hiL:n:rRsvV",
long_options, NULL)) > 0) {
switch (c) {
case 'c':
recover = 0;
break;
case 'D':
daemonize = false;
break;
case 'f':
xfree(slurm_conf_filename);
slurm_conf_filename = xstrdup(optarg);
break;
case 'h':
_usage();
exit(0);
break;
case 'i':
ignore_state_errors = true;
break;
case 'L':
xfree(debug_logfile);
debug_logfile = xstrdup(optarg);
break;
case 'n':
new_nice = strtol(optarg, &tmp_char, 10);
if (tmp_char[0] != '\0') {
error("Invalid option for -n option (nice "
"value), ignored");
new_nice = 0;
}
break;
case 'r':
recover = 1;
break;
case 'R':
recover = 2;
break;
case 's':
setwd = true;
break;
case 'v':
debug_level++;
break;
case 'V':
print_slurm_version();
exit(0);
break;
case LONG_OPT_SYSTEMD:
under_systemd = true;
break;
default:
_usage();
exit(1);
}
}
if (under_systemd && !daemonize)
fatal("--systemd and -D options are mutually exclusive");
/*
* Reconfiguration has historically been equivalent to recover = 1.
* Force defaults in case the original process used '-c', '-i' or '-R'.
*/
if (!original) {
ignore_state_errors = false;
recover = 1;
}
if (under_systemd) {
if (!getenv("NOTIFY_SOCKET"))
fatal("Missing NOTIFY_SOCKET.");
daemonize = false;
setwd = true;
}
/*
* Using setwd() later means a relative path to ourselves may shift.
* Capture /proc/self/exe now and save this for reconfig later.
* Cannot wait to capture it later as Linux will append " (deleted)"
* to the filename if it's been replaced, which would break reconfig
* after an upgrade.
*/
if (argv[0][0] != '/') {
if (readlink("/proc/self/exe", binary, PATH_MAX) < 0)
fatal("%s: readlink failed: %m", __func__);
} else {
strlcpy(binary, argv[0], PATH_MAX);
}
}
static void _usage(void)
{
char *txt;
static_ref_to_cstring(txt, usage_txt);
fprintf(stderr, "%s", txt);
xfree(txt);
}
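/*
 * Thread spawned by _shutdown_backup_controller() for a single backup:
 * send either REQUEST_SHUTDOWN or REQUEST_CONTROL to that controller and
 * record any failure in bu_rc before signalling bu_cond.
 */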
static void *_shutdown_bu_thread(void *arg)
{
int bu_inx, rc = SLURM_SUCCESS, rc2 = SLURM_SUCCESS;
slurm_msg_t req;
bool do_shutdown = false;
shutdown_arg_t *shutdown_arg;
shutdown_msg_t shutdown_msg;
shutdown_arg = arg;
bu_inx = shutdown_arg->index;
do_shutdown = shutdown_arg->shutdown;
xfree(arg);
slurm_msg_t_init(&req);
slurm_msg_set_r_uid(&req, slurm_conf.slurm_user_id);
slurm_set_addr(&req.address, slurm_conf.slurmctld_port,
slurm_conf.control_addr[bu_inx]);
if (do_shutdown) {
req.msg_type = REQUEST_SHUTDOWN;
shutdown_msg.options = SLURMCTLD_SHUTDOWN_CTLD;
req.data = &shutdown_msg;
debug("Requesting shutdown of backup controller %s",
slurm_conf.control_machine[bu_inx]);
} else {
req.msg_type = REQUEST_CONTROL;
debug("Requesting control from backup controller %s",
slurm_conf.control_machine[bu_inx]);
}
if (slurm_send_recv_rc_msg_only_one(&req, &rc2,
(CONTROL_TIMEOUT * 1000)) < 0) {
error("%s:send/recv %s: %m",
__func__, slurm_conf.control_machine[bu_inx]);
rc = SLURM_ERROR;
} else if (rc2 == ESLURM_DISABLED) {
debug("backup controller %s responding",
slurm_conf.control_machine[bu_inx]);
} else if (rc2 == SLURM_SUCCESS) {
debug("backup controller %s has relinquished control",
slurm_conf.control_machine[bu_inx]);
} else {
error("%s (%s): %s", __func__,
slurm_conf.control_machine[bu_inx],
slurm_strerror(rc2));
rc = SLURM_ERROR;
}
slurm_mutex_lock(&bu_mutex);
if (rc != SLURM_SUCCESS)
bu_rc = rc;
bu_thread_cnt--;
slurm_cond_signal(&bu_cond);
slurm_mutex_unlock(&bu_mutex);
return NULL;
}
/*
* Tell the backup_controllers to relinquish control, primary control_machine
* has resumed operation. Messages sent to all controllers in parallel.
* RET 0 or an error code
* NOTE: READ lock_slurmctld config before entry (or be single-threaded)
*/
static int _shutdown_backup_controller(void)
{
int i;
shutdown_arg_t *shutdown_arg;
bu_rc = SLURM_SUCCESS;
/* If we don't have any backups configured just return */
if (slurm_conf.control_cnt == 1)
return bu_rc;
debug2("shutting down backup controllers (my index: %d)", backup_inx);
for (i = 1; i < slurm_conf.control_cnt; i++) {
if (i == backup_inx)
continue; /* No message to self */
if ((slurm_conf.control_addr[i] == NULL) ||
(slurm_conf.control_addr[i][0] == '\0'))
continue;
shutdown_arg = xmalloc(sizeof(*shutdown_arg));
shutdown_arg->index = i;
/*
* need to send actual REQUEST_SHUTDOWN to non-primary ctlds
* in order to have them properly shutdown and not contend
* for primary position, otherwise "takeover" results in
* contention among backups for primary position.
*/
if (i < backup_inx)
shutdown_arg->shutdown = true;
slurm_thread_create_detached(_shutdown_bu_thread,
shutdown_arg);
slurm_mutex_lock(&bu_mutex);
bu_thread_cnt++;
slurm_mutex_unlock(&bu_mutex);
}
slurm_mutex_lock(&bu_mutex);
while (bu_thread_cnt != 0) {
slurm_cond_wait(&bu_cond, &bu_mutex);
}
slurm_mutex_unlock(&bu_mutex);
return bu_rc;
}
/*
* Update log levels given requested levels
* NOTE: Will not turn on originally configured off (quiet) channels
*/
void update_log_levels(int req_slurmctld_debug, int req_syslog_debug)
{
static bool conf_init = false;
static int conf_slurmctld_debug, conf_syslog_debug;
log_options_t log_opts = LOG_OPTS_INITIALIZER;
int slurmctld_debug;
int syslog_debug;
/*
* Keep track of the original debug levels from slurm.conf so that
* `scontrol setdebug` does not turn on non-active logging channels.
* NOTE: It is known that `scontrol reconfigure` will cause an issue
* when reconfigured with a slurm.conf that changes SlurmctldDebug
* from level QUIET to a non-quiet value.
* NOTE: Planned changes to `reconfigure` behavior should make this a
* non-issue in a future release.
*/
if (!conf_init) {
conf_slurmctld_debug = slurm_conf.slurmctld_debug;
conf_syslog_debug = slurm_conf.slurmctld_syslog_debug;
conf_init = true;
}
/*
* NOTE: not offset by LOG_LEVEL_INFO, since it's inconvenient
* to provide negative values for scontrol
*/
slurmctld_debug = MIN(req_slurmctld_debug, (LOG_LEVEL_END - 1));
slurmctld_debug = MAX(slurmctld_debug, LOG_LEVEL_QUIET);
syslog_debug = MIN(req_syslog_debug, (LOG_LEVEL_END - 1));
syslog_debug = MAX(syslog_debug, LOG_LEVEL_QUIET);
if (daemonize)
log_opts.stderr_level = LOG_LEVEL_QUIET;
else
log_opts.stderr_level = slurmctld_debug;
if (slurm_conf.slurmctld_logfile &&
(conf_slurmctld_debug != LOG_LEVEL_QUIET))
log_opts.logfile_level = slurmctld_debug;
else
log_opts.logfile_level = LOG_LEVEL_QUIET;
if (conf_syslog_debug == LOG_LEVEL_QUIET)
log_opts.syslog_level = LOG_LEVEL_QUIET;
else if (slurm_conf.slurmctld_syslog_debug != LOG_LEVEL_END)
log_opts.syslog_level = syslog_debug;
else if (!daemonize)
log_opts.syslog_level = LOG_LEVEL_QUIET;
else if (!slurm_conf.slurmctld_logfile &&
(conf_slurmctld_debug > LOG_LEVEL_QUIET))
log_opts.syslog_level = slurmctld_debug;
else
log_opts.syslog_level = LOG_LEVEL_FATAL;
log_alter(log_opts, LOG_DAEMON, slurm_conf.slurmctld_logfile);
debug("slurmctld log levels: stderr=%s logfile=%s syslog=%s",
log_num2string(log_opts.stderr_level),
log_num2string(log_opts.logfile_level),
log_num2string(log_opts.syslog_level));
}
/*
* Reset slurmctld logging based upon configuration parameters uses common
* slurm_conf data structure
*/
void update_logging(void)
{
int rc;
uid_t slurm_user_id = slurm_conf.slurm_user_id;
gid_t slurm_user_gid = gid_from_uid(slurm_user_id);
xassert(verify_lock(CONF_LOCK, WRITE_LOCK));
/* Preserve execute line arguments (if any) */
if (debug_level) {
slurm_conf.slurmctld_debug = MIN(
(LOG_LEVEL_INFO + debug_level),
(LOG_LEVEL_END - 1));
}
if (slurm_conf.slurmctld_debug != NO_VAL16) {
log_opts.logfile_level = slurm_conf.slurmctld_debug;
}
if (debug_logfile) {
xfree(slurm_conf.slurmctld_logfile);
slurm_conf.slurmctld_logfile = xstrdup(debug_logfile);
}
log_set_timefmt(slurm_conf.log_fmt);
update_log_levels(slurm_conf.slurmctld_debug,
slurm_conf.slurmctld_syslog_debug);
debug("Log file re-opened");
/*
* SchedLogLevel restore
*/
if (slurm_conf.sched_log_level != NO_VAL16)
sched_log_opts.logfile_level = slurm_conf.sched_log_level;
sched_log_alter(sched_log_opts, LOG_DAEMON, slurm_conf.sched_logfile);
if (slurm_conf.slurmctld_logfile) {
rc = chown(slurm_conf.slurmctld_logfile,
slurm_user_id, slurm_user_gid);
if (rc && daemonize) {
error("chown(%s, %u, %u): %m",
slurm_conf.slurmctld_logfile,
slurm_user_id, slurm_user_gid);
}
}
if (slurm_conf.sched_logfile) {
rc = chown(slurm_conf.sched_logfile,
slurm_user_id, slurm_user_gid);
if (rc && daemonize) {
error("chown(%s, %u, %u): %m",
slurm_conf.sched_logfile,
slurm_user_id, slurm_user_gid);
}
}
}
/* Reset slurmctld nice value */
static void _update_nice(void)
{
int cur_nice;
id_t pid;
if (new_nice == 0) /* No change */
return;
pid = getpid();
cur_nice = getpriority(PRIO_PROCESS, pid);
if (cur_nice == new_nice)
return;
if (setpriority(PRIO_PROCESS, pid, new_nice))
error("Unable to reset nice value to %d: %m", new_nice);
}
/*
* Verify that ClusterName from slurm.conf matches the state directory.
* If mismatched, exit immediately to protect state files from corruption.
*/
static void _verify_clustername(void)
{
FILE *fp;
char *filename = NULL;
char name[512] = {0};
xstrfmtcat(filename, "%s/clustername", slurm_conf.state_save_location);
if ((fp = fopen(filename, "r"))) {
char *pipe;
/* read value and compare */
if (!fgets(name, sizeof(name), fp)) {
error("%s: reading cluster name from clustername file",
__func__);
}
fclose(fp);
pipe = xstrchr(name, '|');
if (pipe) {
pipe[0] = '\0';
slurm_conf.cluster_id = slurm_atoul(pipe+1);
}
if (xstrcmp(name, slurm_conf.cluster_name)) {
fatal("CLUSTER NAME MISMATCH.\n"
"slurmctld has been started with \"ClusterName=%s\", but read \"%s\" from the state files in StateSaveLocation.\n"
"Running multiple clusters from a shared StateSaveLocation WILL CAUSE CORRUPTION.\n"
"Remove %s to override this safety check if this is intentional (e.g., the ClusterName has changed).",
slurm_conf.cluster_name, name, filename);
exit(1);
}
}
xfree(filename);
}
static void _create_clustername_file(void)
{
FILE *fp;
char *filename = NULL;
char *tmp_str = xstrdup_printf("%s|%u",
slurm_conf.cluster_name,
slurm_conf.cluster_id);
filename = xstrdup_printf("%s/clustername",
slurm_conf.state_save_location);
info("creating clustername file: ClusterName=%s ClusterID=%u",
slurm_conf.cluster_name, slurm_conf.cluster_id);
clustername_existed = 0;
if (!(fp = fopen(filename, "w"))) {
fatal("%s: failed to create file %s", __func__, filename);
exit(1);
}
if (fputs(tmp_str, fp) < 0) {
fatal("%s: failed to write to file %s", __func__, filename);
exit(1);
}
fclose(fp);
xfree(tmp_str);
xfree(filename);
}
/* Kill the currently running slurmctld
* NOTE: No need to lock the config data since we are still single-threaded */
static void _kill_old_slurmctld(void)
{
int fd;
pid_t oldpid = read_pidfile(slurm_conf.slurmctld_pidfile, &fd);
if (oldpid != (pid_t) 0) {
if (!ignore_state_errors && xstrstr(slurm_conf.slurmctld_params, "no_quick_restart"))
fatal("SlurmctldParameters=no_quick_restart set. Please shutdown your previous slurmctld (pid %ld) before starting a new one. (-i to ignore this message)",
(long) oldpid);
info ("killing old slurmctld[%ld]", (long) oldpid);
kill(oldpid, SIGTERM);
/*
* Wait for previous daemon to terminate
*/
if (fd_get_readw_lock(fd) < 0)
fatal ("unable to wait for readw lock: %m");
(void) close(fd); /* Ignore errors */
}
}
/* NOTE: No need to lock the config data since we are still single-threaded */
static void _init_pidfile(void)
{
if (!xstrcmp(slurm_conf.slurmctld_pidfile, slurm_conf.slurmd_pidfile))
error("SlurmctldPid == SlurmdPid, use different names");
/* Don't close the fd returned here since we need to keep the
* fd open to maintain the write lock */
pidfd = create_pidfile(slurm_conf.slurmctld_pidfile,
slurm_conf.slurm_user_id);
}
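/*
 * When re-exec'd for reconfiguration, recover the pidfile file descriptor
 * passed via the SLURMCTLD_RECONF_PIDFD environment variable and update
 * the pidfile through it.
 */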
static void _update_pidfile(void)
{
char *env = getenv("SLURMCTLD_RECONF_PIDFD");
if (!env) {
debug("%s: missing SLURMCTLD_RECONF_PIDFD envvar", __func__);
return;
}
pidfd = atoi(env);
update_pidfile(pidfd);
}
/*
* set_slurmctld_state_loc - create state save directory as needed and verify access to it
* NOTE: config read lock must be set on entry
*/
extern void set_slurmctld_state_loc(void)
{
int rc;
struct stat st;
const char *path = slurm_conf.state_save_location;
/*
* If state save location does not exist, try to create it.
* Otherwise, ensure path is a directory as expected, and that
* we have permission to write to it.
*/
if (((rc = stat(path, &st)) < 0) && (errno == ENOENT)) {
if (mkdir(path, 0755) < 0)
fatal("mkdir(%s): %m", path);
} else if (rc < 0)
fatal("Unable to stat state save loc: %s: %m", path);
else if (!S_ISDIR(st.st_mode))
fatal("State save loc: %s: Not a directory!", path);
else if (access(path, R_OK|W_OK|X_OK) < 0)
fatal("Incorrect permissions on state save loc: %s", path);
}
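/*
 * list_for_each() helper for _assoc_cache_mgr(): once real data has been
 * loaded from the database, refresh a job's TRES counts and re-resolve its
 * association and QOS pointers from the assoc_mgr caches.
 */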
static int _foreach_cache_update_job(void *x, void *arg)
{
job_record_t *job_ptr = x;
(void) _update_job_tres(job_ptr, NULL);
if (job_ptr->assoc_id) {
slurmdb_assoc_rec_t assoc_rec = {
.id = job_ptr->assoc_id,
};
debug("assoc is %zx (%d) for %pJ",
(size_t)job_ptr->assoc_ptr, job_ptr->assoc_id,
job_ptr);
if (assoc_mgr_fill_in_assoc(
acct_db_conn, &assoc_rec,
accounting_enforce,
&job_ptr->assoc_ptr, true)) {
verbose("Invalid association id %u for %pJ",
job_ptr->assoc_id, job_ptr);
/* not a fatal error, association could have
* been removed */
}
debug("now assoc is %zx (%d) for %pJ",
(size_t)job_ptr->assoc_ptr, job_ptr->assoc_id,
job_ptr);
}
if (job_ptr->qos_list) {
list_flush(job_ptr->qos_list);
char *token, *last = NULL;
char *tmp_qos_req = xstrdup(job_ptr->details->qos_req);
slurmdb_qos_rec_t *qos_ptr = NULL;
token = strtok_r(tmp_qos_req, ",", &last);
while (token) {
slurmdb_qos_rec_t qos_rec = {
.name = token,
};
if ((assoc_mgr_fill_in_qos(
acct_db_conn, &qos_rec,
accounting_enforce,
&qos_ptr,
true)) != SLURM_SUCCESS) {
verbose("Invalid qos (%u) for %pJ",
job_ptr->qos_id, job_ptr);
/* not a fatal error, qos could have
* been removed */
} else
list_append(job_ptr->qos_list, qos_ptr);
token = strtok_r(NULL, ",", &last);
}
xfree(tmp_qos_req);
if (list_count(job_ptr->qos_list)) {
list_sort(job_ptr->qos_list, priority_sort_qos_desc);
/* If we are pending we want the highest prio */
if (IS_JOB_PENDING(job_ptr)) {
job_ptr->qos_ptr = list_peek(job_ptr->qos_list);
job_ptr->qos_id = job_ptr->qos_ptr->id;
} else {
job_ptr->qos_ptr = list_find_first(
job_ptr->qos_list,
slurmdb_find_qos_in_list,
&job_ptr->qos_id);
if (!job_ptr->qos_ptr) {
verbose("Invalid qos (%u) for %pJ from qos_req '%s'",
job_ptr->qos_id,
job_ptr,
job_ptr->details->qos_req);
goto use_qos_id;
}
}
} else
FREE_NULL_LIST(job_ptr->qos_list);
} else if (job_ptr->qos_id) {
use_qos_id: ; /* must be a blank ; for older compilers (el7) */
slurmdb_qos_rec_t qos_rec = {
.id = job_ptr->qos_id,
};
if ((assoc_mgr_fill_in_qos(
acct_db_conn, &qos_rec,
accounting_enforce,
&job_ptr->qos_ptr,
true)) != SLURM_SUCCESS) {
verbose("Invalid qos (%u) for %pJ",
job_ptr->qos_id, job_ptr);
/* not a fatal error, qos could have
* been removed */
}
}
return 0;
}
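/*
 * list_for_each() helper for _assoc_cache_mgr(): rebuild a partition's
 * QOS bitmaps, re-resolve its QOS pointer, and update its association
 * lists after fresh database data has been loaded.
 */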
static int _foreach_cache_update_part(void *x, void *arg)
{
part_record_t *part_ptr = x;
if (part_ptr->allow_qos)
qos_list_build(part_ptr->allow_qos, true,
&part_ptr->allow_qos_bitstr);
if (part_ptr->deny_qos)
qos_list_build(part_ptr->deny_qos, true,
&part_ptr->deny_qos_bitstr);
if (part_ptr->qos_char) {
slurmdb_qos_rec_t qos_rec = {
.name = part_ptr->qos_char,
};
part_ptr->qos_ptr = NULL;
if (assoc_mgr_fill_in_qos(acct_db_conn, &qos_rec,
accounting_enforce,
&part_ptr->qos_ptr,
true) != SLURM_SUCCESS) {
fatal("Partition %s has an invalid qos (%s), "
"please check your configuration",
part_ptr->name, qos_rec.name);
}
}
part_update_assoc_lists(part_ptr, NULL);
return 0;
}
/* _assoc_cache_mgr - hold out until we have real data from the
* database so we can reset the job ptr's assoc ptr's */
static void *_assoc_cache_mgr(void *no_data)
{
/* Write lock on jobs, nodes and partitions */
slurmctld_lock_t job_write_lock =
{ NO_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
assoc_mgr_lock_t locks =
{ .assoc = READ_LOCK, .qos = WRITE_LOCK, .tres = WRITE_LOCK,
.user = READ_LOCK };
if (running_cache != RUNNING_CACHE_STATE_RUNNING) {
slurm_mutex_lock(&assoc_cache_mutex);
lock_slurmctld(job_write_lock);
/*
* It is ok to have the job_write_lock here as long as
* running_cache != RUNNING_CACHE_STATE_NOTRUNNING. This short
* circuits the association manager to not call callbacks. If
* we come out of cache we need the job_write_lock locked until
* the end to prevent a race condition on the job_list (some
* running without new info and some running with the cached
* info).
*
* Make sure not to have the assoc_mgr or the
* slurmdbd_lock locked when refresh_lists is called or you may
* get deadlock.
*/
assoc_mgr_refresh_lists(acct_db_conn, 0);
if (g_tres_count != slurmctld_tres_cnt) {
info("TRES in database does not match cache (%u != %u). Updating...",
g_tres_count, slurmctld_tres_cnt);
_init_tres();
}
slurm_mutex_unlock(&assoc_cache_mutex);
}
while (running_cache == RUNNING_CACHE_STATE_RUNNING) {
slurm_mutex_lock(&assoc_cache_mutex);
slurm_cond_wait(&assoc_cache_cond, &assoc_cache_mutex);
/* This is here to see if we are exiting. If so then
just return since we are closing down.
*/
if (running_cache == RUNNING_CACHE_STATE_EXITING) {
slurm_mutex_unlock(&assoc_cache_mutex);
return NULL;
}
lock_slurmctld(job_write_lock);
/*
* It is ok to have the job_write_lock here as long as
* running_cache != RUNNING_CACHE_STATE_NOTRUNNING. This short
* circuits the association manager to not call callbacks. If
* we come out of cache we need the job_write_lock locked until
* the end to prevent a race condition on the job_list (some
* running without new info and some running with the cached
* info).
*
* Make sure not to have the assoc_mgr or the
* slurmdbd_lock locked when refresh_lists is called or you may
* get deadlock.
*/
assoc_mgr_refresh_lists(acct_db_conn, 0);
if (g_tres_count != slurmctld_tres_cnt) {
info("TRES in database does not match cache "
"(%u != %u). Updating...",
g_tres_count, slurmctld_tres_cnt);
_init_tres();
}
/*
* If running_cache == RUNNING_CACHE_STATE_LISTS_REFRESHED it
* means the assoc_mgr has deemed all is good but we can't
* actually enforce it until now since _init_tres() could call
* assoc_mgr_refresh_lists() again which makes it so you could
* get deadlock.
*/
if (running_cache == RUNNING_CACHE_STATE_LISTS_REFRESHED)
running_cache = RUNNING_CACHE_STATE_NOTRUNNING;
else if (running_cache == RUNNING_CACHE_STATE_RUNNING)
unlock_slurmctld(job_write_lock);
slurm_mutex_unlock(&assoc_cache_mutex);
}
assoc_mgr_lock(&locks);
if (job_list) {
debug2("got real data from the database refreshing the association ptr's for %d jobs",
list_count(job_list));
(void) list_for_each(job_list, _foreach_cache_update_job, NULL);
}
if (part_list) {
(void) list_for_each(part_list, _foreach_cache_update_part,
NULL);
}
set_cluster_tres(true);
assoc_mgr_unlock(&locks);
/* issuing a reconfig will reset the pointers on the burst
buffers */
bb_g_reconfig();
unlock_slurmctld(job_write_lock);
/* This needs to be after the lock and after we update the
jobs so if we need to send them we are set. */
_accounting_cluster_ready();
_get_fed_updates();
return NULL;
}
/*
* Find this host in the controller index, or return -1 on error.
*/
static int _controller_index(void)
{
int i;
/*
* Slurm internal HA mode (or no HA).
* Each controller is separately defined, and a single hostname is in
* each control_machine entry.
*/
for (i = 0; i < slurm_conf.control_cnt; i++) {
if (slurm_conf.control_machine[i] &&
slurm_conf.control_addr[i] &&
(!xstrcmp(slurmctld_config.node_name_short,
slurm_conf.control_machine[i]) ||
!xstrcmp(slurmctld_config.node_name_long,
slurm_conf.control_machine[i]))) {
return i;
}
}
/*
* External HA mode. Here a single control_addr has been defined,
* but multiple hostnames are in control_machine[0] with comma
* separation. If our hostname matches any of those, we are considered
* to be a valid controller, and which is active must be managed by
* an external HA solution.
*/
if (xstrchr(slurm_conf.control_machine[0], ',')) {
char *token, *last = NULL;
char *tmp_name = xstrdup(slurm_conf.control_machine[0]);
token = strtok_r(tmp_name, ",", &last);
while (token) {
if (!xstrcmp(slurmctld_config.node_name_short, token) ||
!xstrcmp(slurmctld_config.node_name_long, token)) {
xfree(tmp_name);
return 0;
}
token = strtok_r(NULL, ",", &last);
}
xfree(tmp_name);
}
return -1;
}
static void _test_thread_limit(void)
{
#ifdef RLIMIT_NOFILE
struct rlimit rlim[1];
if (getrlimit(RLIMIT_NOFILE, rlim) < 0)
error("Unable to get file count limit");
else if ((rlim->rlim_cur != RLIM_INFINITY) &&
(max_server_threads > rlim->rlim_cur)) {
max_server_threads = rlim->rlim_cur;
info("Reducing max_server_thread to %u due to file count limit "
"of %u", max_server_threads, max_server_threads);
}
#endif
}
static void _set_work_dir(void)
{
bool success = false;
if (slurm_conf.slurmctld_logfile &&
(slurm_conf.slurmctld_logfile[0] == '/')) {
char *slash_ptr, *work_dir;
work_dir = xstrdup(slurm_conf.slurmctld_logfile);
slash_ptr = strrchr(work_dir, '/');
if (slash_ptr == work_dir)
work_dir[1] = '\0';
else
slash_ptr[0] = '\0';
if ((access(work_dir, W_OK) != 0) || (chdir(work_dir) < 0))
error("chdir(%s): %m", work_dir);
else
success = true;
xfree(work_dir);
}
if (!success) {
if ((access(slurm_conf.state_save_location, W_OK) != 0) ||
(chdir(slurm_conf.state_save_location) < 0)) {
error("chdir(%s): %m",
slurm_conf.state_save_location);
} else
success = true;
}
if (!success) {
if ((access("/var/tmp", W_OK) != 0) ||
(chdir("/var/tmp") < 0)) {
error("chdir(/var/tmp): %m");
} else
info("chdir to /var/tmp");
}
}
/*
* _purge_files_thread - separate thread to remove job batch/environ files
* from the state directory. Runs async from purge_old_job() to avoid
* holding locks while the files are removed, which can cause performance
* problems under high throughput conditions.
*
* Uses the purge_cond to wakeup on demand, then works through the global
* purge_files_list of job_ids and removes their files.
*/
static void *_purge_files_thread(void *no_data)
{
int *job_id;
/*
* Use the purge_files_list as a queue. _delete_job_details()
* in job_mgr.c always enqueues (at the end), while
*_purge_files_thread consumes off the front.
*
* There is a potential race condition if the job numbers have
* wrapped between _purge_thread removing the state files and
* get_next_job_id trying to re-assign it. This is mitigated
* the call to _dup_job_file_test() in job_mgr.c ensuring
* there is no existing directory for an id before assigning it.
*/
/*
* pthread_cond_wait requires a lock to release and reclaim.
* the list structure is already handling locking for itself,
* so this lock isn't actually useful, and the thread calling
* pthread_cond_signal isn't required to have the lock. So
* lock it once and hold it until slurmctld shuts down.
*/
slurm_mutex_lock(&purge_thread_lock);
while (!slurmctld_config.shutdown_time) {
slurm_cond_wait(&purge_thread_cond, &purge_thread_lock);
debug2("%s: starting, %d jobs to purge", __func__,
list_count(purge_files_list));
/*
* Use list_dequeue here (instead of list_flush) as it will not
* hold up the list lock when we try to enqueue jobs that need
* to be freed.
*/
while ((job_id = list_dequeue(purge_files_list))) {
debug2("%s: purging files from JobId=%u",
__func__, *job_id);
delete_job_desc_files(*job_id);
xfree(job_id);
}
}
slurm_mutex_unlock(&purge_thread_lock);
return NULL;
}
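/*
 * Process one queued accounting update: federation updates are handed to
 * fed_mgr_update_feds(), everything else goes to the assoc_mgr. Returning
 * 1 tells list_delete_all() to remove the entry from the list.
 */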
static int _acct_update_list_for_each(void *x, void *arg)
{
slurmdb_update_object_t *object = x;
bool locked = false;
switch (object->type) {
case SLURMDB_UPDATE_FEDS:
#if HAVE_SYS_PRCTL_H
if (prctl(PR_SET_NAME, "fedmgr", NULL, NULL, NULL) < 0){
error("%s: cannot set my name to %s %m",
__func__, "fedmgr");
}
#endif
fed_mgr_update_feds(object);
break;
default:
(void) assoc_mgr_update_object(x, &locked);
}
/* Always delete it */
return 1;
}
static void *_acct_update_thread(void *no_data)
{
slurm_mutex_lock(&slurmctld_config.acct_update_lock);
while (!slurmctld_config.shutdown_time) {
slurm_cond_wait(&slurmctld_config.acct_update_cond,
&slurmctld_config.acct_update_lock);
(void) list_delete_all(slurmctld_config.acct_update_list,
_acct_update_list_for_each,
NULL);
}
slurm_mutex_unlock(&slurmctld_config.acct_update_lock);
return NULL;
}
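/*
 * Fetch this cluster's federation records from the accounting storage and
 * feed them to fed_mgr_update_feds() so federation state is current.
 */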
static void _get_fed_updates(void)
{
list_t *fed_list = NULL;
slurmdb_update_object_t update = {0};
slurmdb_federation_cond_t fed_cond;
slurmdb_init_federation_cond(&fed_cond, 0);
fed_cond.cluster_list = list_create(NULL);
list_append(fed_cond.cluster_list, slurm_conf.cluster_name);
fed_list = acct_storage_g_get_federations(acct_db_conn,
slurm_conf.slurm_user_id,
&fed_cond);
FREE_NULL_LIST(fed_cond.cluster_list);
if (fed_list) {
update.objects = fed_list;
fed_mgr_update_feds(&update);
}
FREE_NULL_LIST(fed_list);
}
static int _foreach_job_running(void *object, void *arg)
{
job_record_t *job_ptr = object;
if (IS_JOB_PENDING(job_ptr)) {
int job_cnt = (job_ptr->array_recs &&
job_ptr->array_recs->task_cnt) ?
job_ptr->array_recs->task_cnt : 1;
slurmctld_diag_stats.jobs_pending += job_cnt;
}
if (IS_JOB_RUNNING(job_ptr))
slurmctld_diag_stats.jobs_running++;
return SLURM_SUCCESS;
}
static void _update_diag_job_state_counts(void)
{
slurmctld_diag_stats.jobs_running = 0;
slurmctld_diag_stats.jobs_pending = 0;
slurmctld_diag_stats.job_states_ts = time(NULL);
list_for_each_ro(job_list, _foreach_job_running, NULL);
}
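/*
 * Run SlurmctldPrimaryOnProg or SlurmctldPrimaryOffProg (if configured)
 * when this controller gains or loses primary status, and log the
 * program's exit status.
 */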
static void _run_primary_prog(bool primary_on)
{
char *prog_name, *prog_type;
char *argv[2], *sep;
int status = 0;
pid_t cpid;
if (primary_on) {
prog_name = slurm_conf.slurmctld_primary_on_prog;
prog_type = "SlurmctldPrimaryOnProg";
} else {
prog_name = slurm_conf.slurmctld_primary_off_prog;
prog_type = "SlurmctldPrimaryOffProg";
}
if ((prog_name == NULL) || (prog_name[0] == '\0'))
return;
info("%s: Running %s", __func__, prog_type);
sep = strrchr(prog_name, '/');
if (sep)
argv[0] = sep + 1;
else
argv[0] = prog_name;
argv[1] = NULL;
if ((cpid = fork()) < 0) { /* Error */
error("%s fork error: %m", __func__);
return;
}
if (cpid == 0) { /* Child */
closeall(0);
setpgid(0, 0);
execv(prog_name, argv);
_exit(127);
}
waitpid(cpid, &status, 0);
if (status != 0)
error("%s: %s exit status %u:%u", __func__, prog_type,
WEXITSTATUS(status), WTERMSIG(status));
else
info("%s: %s completed successfully", __func__, prog_type);
}
static int _init_dep_job_ptr(void *object, void *arg)
{
depend_spec_t *dep_ptr = object;
dep_ptr->job_ptr = find_job_array_rec(dep_ptr->job_id,
dep_ptr->array_task_id);
return SLURM_SUCCESS;
}
static int _foreach_restore_job_dependencies(void *x, void *arg)
{
job_record_t *job_ptr = x;
if (job_ptr->details && job_ptr->details->depend_list)
list_for_each(job_ptr->details->depend_list,
_init_dep_job_ptr, NULL);
return 0;
}
/*
* Restore dependency job pointers.
*
* test_job_dependency() normally initializes dep_ptr->job_ptr, but a job's
* dependency may be updated before test_job_dependency() is called, so
* dep_ptr->job_ptr needs to be initialized for all jobs up front to allow
* proper circular-dependency testing. Otherwise, if slurmctld is restarted
* and a job dependency is updated immediately, before test_job_dependency()
* is called, it would be possible to create a circular dependency.
*/
static void _restore_job_dependencies(void)
{
slurmctld_lock_t job_fed_lock = {.job = WRITE_LOCK, .fed = READ_LOCK};
lock_slurmctld(job_fed_lock);
(void) list_for_each(job_list, _foreach_restore_job_dependencies, NULL);
unlock_slurmctld(job_fed_lock);
}
/*
* Respond to request for primary/backup slurmctld status
*/
extern void slurm_rpc_control_status(slurm_msg_t *msg)
{
control_status_msg_t status = {
.backup_inx = backup_inx,
.control_time = control_time,
};
(void) send_msg_response(msg, RESPONSE_CONTROL_STATUS, &status);
}
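/*
 * Initialize the scheduling plugin and the main scheduler, optionally
 * starting gang scheduling as well.
 */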
extern int controller_init_scheduling(bool init_gang)
{
int rc = sched_g_init();
if (rc != SLURM_SUCCESS) {
error("failed to initialize sched plugin");
return rc;
}
main_sched_init();
if (init_gang)
gs_init();
return rc;
}
extern void controller_fini_scheduling(void)
{
(void) sched_g_fini();
main_sched_fini();
if (slurm_conf.preempt_mode & PREEMPT_MODE_GANG)
gs_fini();
}
extern void controller_reconfig_scheduling(void)
{
gs_reconfig();
(void) sched_g_reconfig();
}