| /*****************************************************************************\ |
| * slurmctld.h - definitions of functions and structures for slurmctld use |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Copyright (C) SchedMD LLC. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> et. al. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #ifndef _HAVE_SLURMCTLD_H |
| #define _HAVE_SLURMCTLD_H |
| |
| #include "config.h" |
| |
| #include <inttypes.h> |
| #include <pthread.h> |
| #include <string.h> |
| #include <sys/types.h> |
| #include <time.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm.h" |
| |
| #include "src/common/bitstring.h" |
| #include "src/common/cron.h" |
| #include "src/common/extra_constraints.h" |
| #include "src/common/identity.h" |
| #include "src/common/job_record.h" |
| #include "src/common/job_resources.h" |
| #include "src/common/job_state_reason.h" |
| #include "src/common/list.h" |
| #include "src/common/log.h" |
| #include "src/common/macros.h" |
| #include "src/common/node_conf.h" |
| #include "src/common/pack.h" |
| #include "src/common/read_config.h" /* location of slurm_conf */ |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/slurm_protocol_defs.h" |
| #include "src/common/timers.h" |
| #include "src/common/xmalloc.h" |
| |
| #include "src/interfaces/cred.h" |
| |
| /*****************************************************************************\ |
| * GENERAL CONFIGURATION parameters and data structures |
| \*****************************************************************************/ |
| /* Maximum parallel threads to service incoming RPCs. |
| * Also maximum parallel threads to service outgoing RPCs (separate counter). |
| * Since some systems schedule pthreads on a First-In-Last-Out basis, |
| * increasing this value is strongly discouraged. */ |
| #ifndef MAX_SERVER_THREADS |
| #define MAX_SERVER_THREADS 256 |
| #endif |
| |
| /* Maximum number of threads to service emails (see MailProg) */ |
| #ifndef MAX_MAIL_THREADS |
| #define MAX_MAIL_THREADS 64 |
| #endif |
| |
| /* Perform a full save of slurmctld's state every PERIODIC_CHECKPOINT seconds */ |
| #ifndef PERIODIC_CHECKPOINT |
| #define PERIODIC_CHECKPOINT 300 |
| #endif |
| |
| /* Retry an incomplete RPC agent request every RPC_RETRY_INTERVAL seconds */ |
| #ifndef RPC_RETRY_INTERVAL |
| #define RPC_RETRY_INTERVAL 60 |
| #endif |
| |
| /* Check for jobs reaching their time limit every PERIODIC_TIMEOUT seconds */ |
| #ifndef PERIODIC_TIMEOUT |
| #define PERIODIC_TIMEOUT 30 |
| #endif |
| |
| /* Attempt to purge defunct job records and resend job kill requests |
| * every PURGE_JOB_INTERVAL seconds */ |
| #ifndef PURGE_JOB_INTERVAL |
| #define PURGE_JOB_INTERVAL 60 |
| #endif |
| |
| /* Process pending trigger events every TRIGGER_INTERVAL seconds */ |
| #ifndef TRIGGER_INTERVAL |
| #define TRIGGER_INTERVAL 15 |
| #endif |
| |
| #ifndef UPDATE_CONFIG_LIST_TIMEOUT |
| #define UPDATE_CONFIG_LIST_TIMEOUT 60 |
| #endif |
| |
| /* Report current node accounting state every PERIODIC_NODE_ACCT seconds */ |
| #ifndef PERIODIC_NODE_ACCT |
| #define PERIODIC_NODE_ACCT 300 |
| #endif |
| |
| /* Seconds to wait for backup controller response to REQUEST_CONTROL RPC */ |
| #ifndef CONTROL_TIMEOUT |
| #define CONTROL_TIMEOUT 30 /* seconds */ |
| #endif |
| |
| /*****************************************************************************\ |
| * General configuration parameters and data structures |
| \*****************************************************************************/ |
| |
| typedef struct slurmctld_config { |
| list_t *acct_update_list; |
| pthread_cond_t acct_update_cond; |
| pthread_mutex_t acct_update_lock; |
| pthread_cond_t backup_finish_cond; |
| pthread_mutex_t backup_finish_lock; |
| time_t boot_time; |
| char node_name_long[HOST_NAME_MAX]; |
| char node_name_short[HOST_NAME_MAX]; |
| bool resume_backup; |
| bool scheduling_disabled; |
| int server_thread_count; |
| time_t shutdown_time; |
| bool submissions_disabled; |
| |
| pthread_cond_t thread_count_cond; |
| pthread_mutex_t thread_count_lock; |
| pthread_t thread_id_acct_update; |
| pthread_t thread_id_main; |
| pthread_t thread_id_save; |
| pthread_t thread_id_purge_files; |
| } slurmctld_config_t; |
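| |
| /* |
| * Illustrative usage (a sketch, not an API requirement of this header): an |
| * RPC service thread would typically account for itself with the counter, |
| * mutex and condition variable above along these lines: |
| * |
| * slurm_mutex_lock(&slurmctld_config.thread_count_lock); |
| * slurmctld_config.server_thread_count++; |
| * slurm_mutex_unlock(&slurmctld_config.thread_count_lock); |
| * ... service the RPC ... |
| * slurm_mutex_lock(&slurmctld_config.thread_count_lock); |
| * slurmctld_config.server_thread_count--; |
| * slurm_cond_signal(&slurmctld_config.thread_count_cond); |
| * slurm_mutex_unlock(&slurmctld_config.thread_count_lock); |
| */ |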
| |
| typedef enum { |
| SCHEDULE_EXIT_END, |
| SCHEDULE_EXIT_MAX_DEPTH, |
| SCHEDULE_EXIT_MAX_JOB_START, |
| SCHEDULE_EXIT_LIC, |
| SCHEDULE_EXIT_RPC_CNT, |
| SCHEDULE_EXIT_TIMEOUT, |
| SCHEDULE_EXIT_COUNT |
| } schedule_exit_t; |
| |
| typedef enum { |
| BF_EXIT_END, |
| BF_EXIT_MAX_JOB_START, |
| BF_EXIT_MAX_JOB_TEST, |
| BF_EXIT_STATE_CHANGED, |
| BF_EXIT_TABLE_LIMIT, |
| BF_EXIT_TIMEOUT, |
| BF_EXIT_COUNT |
| } bf_exit_t; |
| |
| /* Job scheduling statistics */ |
| typedef struct diag_stats { |
| int proc_req_threads; |
| int proc_req_raw; |
| |
| uint32_t schedule_cycle_max; |
| uint32_t schedule_cycle_last; |
| uint32_t schedule_cycle_sum; |
| uint32_t schedule_cycle_counter; |
| uint32_t schedule_cycle_depth; |
| uint32_t schedule_exit[SCHEDULE_EXIT_COUNT]; |
| uint32_t schedule_queue_len; |
| |
| uint32_t jobs_submitted; |
| uint32_t jobs_started; |
| uint32_t jobs_completed; |
| uint32_t jobs_canceled; |
| uint32_t jobs_failed; |
| |
| time_t job_states_ts; |
| uint32_t jobs_pending; |
| uint32_t jobs_running; |
| |
| uint32_t backfilled_jobs; |
| uint32_t last_backfilled_jobs; |
| uint32_t backfilled_het_jobs; |
| uint32_t bf_active; |
| uint32_t bf_cycle_counter; |
| uint32_t bf_cycle_last; |
| uint32_t bf_cycle_max; |
| uint64_t bf_cycle_sum; |
| uint32_t bf_depth_sum; |
| uint32_t bf_depth_try_sum; |
| uint32_t bf_exit[BF_EXIT_COUNT]; |
| uint32_t bf_last_depth; |
| uint32_t bf_last_depth_try; |
| uint32_t bf_queue_len; |
| uint32_t bf_queue_len_sum; |
| uint32_t bf_table_size; |
| uint32_t bf_table_size_sum; |
| time_t bf_when_last_cycle; |
| |
| uint32_t latency; |
| } diag_stats_t; |
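| |
| /* |
| * Illustrative use (a sketch): a mean scheduling cycle time, as reported by |
| * sdiag, can be derived from the counters above, e.g.: |
| * |
| * uint32_t mean_cycle = 0; |
| * if (slurmctld_diag_stats.schedule_cycle_counter) |
| * mean_cycle = slurmctld_diag_stats.schedule_cycle_sum / |
| * slurmctld_diag_stats.schedule_cycle_counter; |
| */ |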
| |
| typedef struct { |
| int index; |
| bool shutdown; |
| } shutdown_arg_t; |
| |
| /* These constants index entries in the curr_tres_array in tres_info_t. |
| * They must be kept in the same order as the tres_types_t enum defined in |
| * src/common/slurmdb_defs.h |
| */ |
| enum { |
| TRES_ARRAY_CPU = 0, |
| TRES_ARRAY_MEM, |
| TRES_ARRAY_ENERGY, |
| TRES_ARRAY_NODE, |
| TRES_ARRAY_BILLING, |
| TRES_ARRAY_FS_DISK, |
| TRES_ARRAY_VMEM, |
| TRES_ARRAY_PAGES, |
| TRES_ARRAY_TOTAL_CNT |
| }; |
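| |
| /* |
| * Illustrative example (values are hypothetical): these constants index |
| * per-TRES count arrays such as a job's tres_alloc_cnt, e.g.: |
| * |
| * uint64_t *tres_cnt = job_ptr->tres_alloc_cnt; |
| * info("allocated %"PRIu64" CPUs on %"PRIu64" nodes", |
| * tres_cnt[TRES_ARRAY_CPU], tres_cnt[TRES_ARRAY_NODE]); |
| */ |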
| |
| extern bool preempt_send_user_signal; |
| extern time_t last_proc_req_start; |
| extern diag_stats_t slurmctld_diag_stats; |
| extern slurmctld_config_t slurmctld_config; |
| extern void *acct_db_conn; |
| extern uint16_t accounting_enforce; |
| extern int backup_inx; /* BackupController# index */ |
| extern int batch_sched_delay; |
| extern bool cloud_dns; |
| extern uint32_t cluster_cpus; |
| extern bool disable_remote_singleton; |
| extern int listen_nports; |
| extern int max_depend_depth; |
| extern uint32_t max_powered_nodes; |
| extern pthread_cond_t purge_thread_cond; |
| extern pthread_mutex_t purge_thread_lock; |
| extern pthread_mutex_t check_bf_running_lock; |
| extern int sched_interval; |
| extern bool slurmctld_init_db; |
| extern bool slurmctld_primary; |
| extern int slurmctld_tres_cnt; |
| extern slurmdb_cluster_rec_t *response_cluster_rec; |
| |
| /*****************************************************************************\ |
| * Configless data structures, defined in src/slurmctld/proc_req.c |
| \*****************************************************************************/ |
| extern bool running_configless; |
| |
| /*****************************************************************************\ |
| * NODE parameters and data structures, mostly in src/common/node_conf.h |
| \*****************************************************************************/ |
| extern bool ping_nodes_now; /* if set, ping nodes immediately */ |
| extern bool want_nodes_reboot; /* if set, check for idle nodes */ |
| extern bool ignore_state_errors; |
| |
| extern list_t *conf_includes_list; /* list of conf_includes_map_t */ |
| |
| #define PACK_FANOUT_ADDRS(_X) \ |
| (IS_NODE_DYNAMIC_FUTURE(_X) || \ |
| IS_NODE_DYNAMIC_NORM(_X) || \ |
| (!cloud_dns && IS_NODE_CLOUD(_X))) |
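| |
| /* |
| * Illustrative use (a sketch, not taken from the packing code): when packing |
| * node information for fanout communication, the node address would only be |
| * included for nodes whose name may not resolve through DNS, e.g.: |
| * |
| * if (PACK_FANOUT_ADDRS(node_ptr)) |
| * slurm_pack_addr(&node_ptr->slurm_addr, buffer); |
| */ |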
| |
| /*****************************************************************************\ |
| * NODE states and bitmaps |
| * asap_node_bitmap Set if the node is marked to be rebooted asap |
| * avail_node_bitmap Set if node's state is not DOWN, DRAINING/DRAINED, |
| * FAILING or NO_RESPOND (i.e. available to run a job) |
| * booting_node_bitmap Set if node in process of booting |
| * cg_node_bitmap Set if node in completing state |
| * cloud_node_bitmap Set if node in CLOUD state |
| * external_node_bitmap Set if node in EXTERNAL state |
| * future_node_bitmap Set if node in FUTURE state |
| * idle_node_bitmap Set if node has no jobs allocated to it |
| * power_down_node_bitmap Set for nodes which are powered down |
| * power_up_node_bitmap Set for nodes which are powered up or have been |
| * requested to power up |
| * share_node_bitmap Set if no jobs allocated exclusive access to |
| * resources on that node (cleared if --exclusive |
| * option specified by job or OverSubscribe=NO |
| * configured for the job's partition) |
| * up_node_bitmap Set if the node's state is not DOWN |
| \*****************************************************************************/ |
| extern bitstr_t *asap_node_bitmap; /* reboot asap nodes */ |
| extern bitstr_t *avail_node_bitmap; /* bitmap of available nodes, |
| * state not DOWN, DRAIN or FAILING */ |
| extern bitstr_t *bf_ignore_node_bitmap; /* bitmap of nodes made available during |
| * backfill cycle */ |
| extern bitstr_t *booting_node_bitmap; /* bitmap of booting nodes */ |
| extern bitstr_t *cg_node_bitmap; /* bitmap of completing nodes */ |
| extern bitstr_t *cloud_node_bitmap; /* bitmap of cloud nodes */ |
| extern bitstr_t *external_node_bitmap; /* bitmap of external nodes */ |
| extern bitstr_t *future_node_bitmap; /* bitmap of FUTURE nodes */ |
| extern bitstr_t *idle_node_bitmap; /* bitmap of idle nodes */ |
| extern bitstr_t *power_down_node_bitmap; /* Powered down nodes */ |
| extern bitstr_t *power_up_node_bitmap; /* Powered up and requested nodes */ |
| extern bitstr_t *share_node_bitmap; /* bitmap of sharable nodes */ |
| extern bitstr_t *up_node_bitmap; /* bitmap of up nodes, not DOWN */ |
| extern bitstr_t *rs_node_bitmap; /* next_state=resume nodes */ |
| |
| /*****************************************************************************\ |
| * PARTITION parameters and data structures |
| \*****************************************************************************/ |
| extern list_t *part_list; /* list of part_record entries */ |
| extern time_t last_part_update; /* time of last part_list update */ |
| extern part_record_t default_part; /* default configuration values */ |
| extern char *default_part_name; /* name of default partition */ |
| extern part_record_t *default_part_loc; /* default partition ptr */ |
| |
| #define DEF_PART_MAX_PRIORITY 1 |
| extern uint16_t part_max_priority; /* max priority_job_factor in all parts */ |
| |
| /*****************************************************************************\ |
| * RESERVATION parameters and data structures |
| \*****************************************************************************/ |
| |
| #define RESV_CTLD_ACCT_NOT 0x00000001 |
| #define RESV_CTLD_USER_NOT 0x00000002 |
| #define RESV_CTLD_FULL_NODE 0x00000004 |
| #define RESV_CTLD_NODE_FLAGS_SET 0x00000008 |
| #define RESV_CTLD_EPILOG 0x00000010 |
| #define RESV_CTLD_PROLOG 0x00000020 |
| |
| typedef struct slurmctld_resv { |
| uint16_t magic; /* magic cookie, RESV_MAGIC */ |
| /* DO NOT ALPHABETIZE */ |
| char *accounts; /* names of accounts permitted to use */ |
| int account_cnt; /* count of accounts permitted to use */ |
| char **account_list; /* list of accounts permitted to use */ |
| char *assoc_list; /* list of associations */ |
| uint32_t boot_time; /* time it would take to reboot a node */ |
| char *burst_buffer; /* burst buffer resources */ |
| char *comment; /* arbitrary comment */ |
| uint32_t ctld_flags; /* see RESV_CTLD_* above */ |
| bitstr_t *core_bitmap; /* bitmap of reserved cores */ |
| uint32_t core_cnt; /* number of reserved cores */ |
| job_resources_t *core_resrcs; /* details of allocated cores */ |
| uint32_t duration; /* time in seconds for this |
| * reservation to last */ |
| time_t end_time; /* end time of reservation */ |
| time_t idle_start_time; /* first time when reservation had no jobs |
| * running on it */ |
| char *features; /* required node features */ |
| uint64_t flags; /* see RESERVE_FLAG_* in slurm.h */ |
| list_t *gres_list_alloc;/* Allocated generic resource allocation |
| * detail */ |
| char *groups; /* names of linux groups permitted to use */ |
| uint32_t job_pend_cnt; /* number of pending jobs */ |
| uint32_t job_run_cnt; /* number of running jobs */ |
| list_t *license_list; /* structure with license info */ |
| char *licenses; /* required system licenses (including those |
| * from TRES requests) */ |
| uint32_t max_start_delay;/* Maximum delay in which jobs outside of the |
| * reservation will be permitted to overlap |
| * once any jobs are queued for the |
| * reservation */ |
| char *name; /* name of reservation */ |
| bitstr_t *node_bitmap; /* bitmap of reserved nodes */ |
| uint32_t node_cnt; /* count of nodes required */ |
| char *node_list; /* list of reserved nodes or ALL */ |
| char *partition; /* name of partition to be used */ |
| part_record_t *part_ptr;/* pointer to partition used */ |
| uint32_t purge_comp_time; /* If the PURGE_COMP flag is set, the number |
| * of minutes this reservation may sit idle |
| * before it is revoked. |
| */ |
| uint32_t resv_id; /* unique reservation ID, internal use */ |
| time_t start_time; /* start time of reservation */ |
| time_t start_time_first;/* when the reservation first started */ |
| time_t start_time_prev; /* If start time was changed this is |
| * the previous start time. Needed |
| * for accounting */ |
| time_t time_force; /* The actual start time of the reservation if the |
| * FORCE_START flag was used */ |
| char *tres_fmt_str; /* formatted string of tres to deal with */ |
| char *tres_str; /* simple string of tres to deal with */ |
| char *users; /* names of users permitted to use */ |
| int user_cnt; /* count of users permitted to use */ |
| uid_t *user_list; /* array of users permitted to use */ |
| } slurmctld_resv_t; |
| |
| typedef struct { |
| bitstr_t *core_bitmap; |
| void *gres_js_exc; |
| void *gres_js_inc; |
| list_t *gres_list_exc; |
| list_t *gres_list_inc; |
| bitstr_t **exc_cores; |
| } resv_exc_t; |
| |
| extern list_t *resv_list; /* list of slurmctld_resv_t entries */ |
| extern time_t last_resv_update; /* time of last resv_list update */ |
| |
| /*****************************************************************************\ |
| * Job lists |
| \*****************************************************************************/ |
| extern list_t *job_list; /* list of job_record entries */ |
| extern list_t *purge_jobs_list; /* list of job_record_t to free */ |
| |
| /*****************************************************************************\ |
| * Global assoc_cache variables |
| \*****************************************************************************/ |
| |
| /* flag to let us know if we are running on cache or from the actual |
| * database */ |
| extern uint16_t running_cache; |
| /* mutex and signal to let us know if associations have been reset so we need to |
| * redo all the pointers to the associations */ |
| extern pthread_mutex_t assoc_cache_mutex; /* assoc cache mutex */ |
| extern pthread_cond_t assoc_cache_cond; /* assoc cache condition */ |
| |
| /*****************************************************************************\ |
| * Global slurmctld functions |
| \*****************************************************************************/ |
| |
| /* |
| * abort_job_on_node - Kill the specific job_id on a specific node, |
| * the request is not processed immediately, but queued. |
| * This is to prevent a flood of pthreads if slurmctld restarts |
| * without saved state and slurmd daemons register with a |
| * multitude of running jobs. Slurmctld will not recognize |
| * these jobs and use this function to kill them - one |
| * agent request per node as they register. |
| * IN job_id - id of the job to be killed |
| * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned) |
| * IN node_name - name of the node on which the job resides |
| */ |
| extern void abort_job_on_node(uint32_t job_id, job_record_t *job_ptr, |
| char *node_name); |
| |
| /* |
| * abort_job_on_nodes - Kill the specific job on the specific nodes, |
| * the request is not processed immediately, but queued. |
| * This is to prevent a flood of pthreads if slurmctld restarts |
| * without saved state and slurmd daemons register with a |
| * multitude of running jobs. Slurmctld will not recognize |
| * these jobs and use this function to kill them - one |
| * agent request per node as they register. |
| * IN job_ptr - pointer to terminating job |
| * IN node_bitmap - bitmap of the nodes on which the job resides |
| */ |
| extern void abort_job_on_nodes(job_record_t *job_ptr, bitstr_t *node_bitmap); |
| |
| /* |
| * If a job has a FAIL_ACCOUNT or FAIL_QOS start_reason, check and set the |
| * pointers if they are now valid. |
| */ |
| extern void set_job_failed_assoc_qos_ptr(job_record_t *job_ptr); |
| |
| /* Set the tres_req_str and tres_req_fmt_str for the job. assoc_mgr_locked |
| * should be true if the assoc_mgr read lock is already held. |
| */ |
| extern void set_job_tres_req_str(job_record_t *job_ptr, bool assoc_mgr_locked); |
| |
| /* Note that the backup slurmctld has assumed primary control. |
| * This function can be called multiple times. */ |
| extern void backup_slurmctld_restart(void); |
| |
| /* Handle SIGHUP while in backup mode */ |
| extern void backup_on_sighup(void); |
| |
| /* Complete a batch job requeue logic after all steps complete so that |
| * subsequent jobs appear in a separate accounting record. */ |
| extern void batch_requeue_fini(job_record_t *job_ptr); |
| |
| /* Build a bitmap of nodes completing this job */ |
| extern void build_cg_bitmap(job_record_t *job_ptr); |
| |
| /* Build structure with job allocation details */ |
| extern resource_allocation_response_msg_t *build_job_info_resp( |
| job_record_t *job_ptr); |
| |
| /* |
| * create_ctld_part_record - create a partition record |
| * IN name - name will be xstrdup()'d into the part_record |
| * RET a pointer to the record or NULL if error |
| * global: default_part - default partition parameters |
| * part_list - global partition list |
| * NOTE: the record's values are initialized to those of default_part |
| * NOTE: allocates memory that should be xfreed with delete_part_record |
| */ |
| extern part_record_t *create_ctld_part_record(const char *name); |
| |
| /* |
| * build_part_bitmap - update the total_cpus, total_nodes, and node_bitmap |
| * for the specified partition, also reset the partition pointers in |
| * the node back to this partition. |
| * IN part_ptr - pointer to the partition |
| * RET 0 if no error, errno otherwise |
| * global: node_record_table_ptr - pointer to global node table |
| * NOTE: this does not report nodes defined in more than one partition. This |
| * is checked only upon reading the configuration file, not on an update |
| */ |
| extern int build_part_bitmap(part_record_t *part_ptr); |
| |
| /* |
| * job_limits_check - check the limits specified for the job. |
| * IN job_pptr - pointer to a pointer to the job table entry. |
| * IN check_min_time - if true test job's minimum time limit, |
| * otherwise test maximum time limit |
| * RET WAIT_NO_REASON on success, fail status otherwise. |
| */ |
| extern int job_limits_check(job_record_t **job_pptr, bool check_min_time); |
| |
| /* |
| * delete_partition - delete the specified partition |
| * IN delete_part_msg_t - partition specification from RPC |
| * RET 0 on success, errno otherwise |
| */ |
| extern int delete_partition(delete_part_msg_t *part_desc_ptr); |
| |
| /* |
| * delete_step_record - delete record for job step for specified job_ptr |
| * and step_id |
| * IN job_ptr - pointer to job table entry to have step record removed |
| * IN step_ptr - pointer to step table entry of the desired job step |
| */ |
| extern void delete_step_record(job_record_t *job_ptr, step_record_t *step_ptr); |
| |
| /* |
| * Copy a job's dependency list |
| * IN depend_list_src - a job's depend_lst |
| * RET copy of depend_list_src, must be freed by caller |
| */ |
| extern list_t *depended_list_copy(list_t *depend_list_src); |
| |
| /* |
| * drain_nodes - drain one or more nodes, |
| * no-op for nodes already drained or draining |
| * IN nodes - nodes to drain |
| * IN reason - reason to drain the nodes |
| * IN reason_uid - who set the reason |
| * RET SLURM_SUCCESS or error code |
| * global: node_record_table_ptr - pointer to global node table |
| */ |
| extern int drain_nodes ( char *nodes, char *reason, uint32_t reason_uid ); |
| |
| /* |
| * Set job state |
| * IN job_ptr - Job to update |
| * IN state - state from enum job_states |
| */ |
| extern void job_state_set(job_record_t *job_ptr, uint32_t state); |
| |
| /* |
| * Set job state flag |
| * IN job_ptr - Job to update |
| * IN flag - flag to set (from JOB_* macro) |
| */ |
| extern void job_state_set_flag(job_record_t *job_ptr, uint32_t flag); |
| |
| /* |
| * Unset job state flag |
| * IN job_ptr - Job to update |
| * IN flag - flag to unset (from JOB_* macro) |
| */ |
| extern void job_state_unset_flag(job_record_t *job_ptr, uint32_t flag); |
| |
| /* dump_all_job_state - save the state of all jobs to file |
| * RET 0 or error code */ |
| extern int dump_all_job_state ( void ); |
| |
| /* |
| * Notify/update job state hash table that job state has changed |
| * IN job_ptr - Job about to be updated |
| * IN new_state - New value that will be assigned to job_ptr->job_state. |
| * If NO_VAL, then delete the cache entry. |
| */ |
| extern void on_job_state_change(job_record_t *job_ptr, uint32_t new_state); |
| |
| /* dump_all_node_state - save the state of all nodes to file */ |
| extern int dump_all_node_state ( void ); |
| |
| /* dump_all_part_state - save the state of all partitions to file */ |
| extern int dump_all_part_state ( void ); |
| |
| /* |
| * dump_job_desc - dump the incoming job submit request message |
| * IN job_desc - job specification from RPC |
| */ |
| extern void dump_job_desc(job_desc_msg_t *job_desc); |
| |
| /* |
| * dump_step_desc - dump the incoming step initiate request message |
| * IN step_spec - job step request specification from RPC |
| */ |
| extern void dump_step_desc(job_step_create_request_msg_t *step_spec); |
| |
| /* Remove one node from a job's allocation */ |
| extern void excise_node_from_job(job_record_t *job_ptr, |
| node_record_t *node_ptr); |
| |
| /* make_node_avail - flag specified node as available */ |
| extern void make_node_avail(node_record_t *node_ptr); |
| |
| /* |
| * Reset load & power statistics for node. |
| * |
| * When node is powered down or downed unexpectedly, the load/power stats |
| * are effectively '0'. |
| * |
| * IN node_ptr - node to reset statistics for. |
| */ |
| extern void node_mgr_reset_node_stats(node_record_t *node_ptr); |
| |
| /* |
| * Copy a job's feature list |
| * IN feature_list_src - a job's feature_list |
| * RET copy of feature_list_src, must be freed by caller |
| */ |
| extern list_t *feature_list_copy(list_t *feature_list_src); |
| |
| typedef enum { |
| FOR_EACH_JOB_BY_ID_EACH_INVALID = 0, |
| FOR_EACH_JOB_BY_ID_EACH_CONT, /* continue for each processing */ |
| FOR_EACH_JOB_BY_ID_EACH_STOP, /* stop for each processing */ |
| FOR_EACH_JOB_BY_ID_EACH_FAIL, /* stop for each processing due to failure */ |
| FOR_EACH_JOB_BY_ID_EACH_INVALID_MAX /* assertion only value on max value */ |
| } foreach_job_by_id_control_t; |
| |
| /* |
| * Function prototype for operating on each job that matches |
| * Returns control requested for processing |
| */ |
| typedef foreach_job_by_id_control_t (*JobForEachFunc)( |
| job_record_t *job_ptr, |
| const slurm_selected_step_t *id, |
| void *arg); |
| /* |
| * Function prototype for operating on each read only job that matches |
| * Returns control requested for processing |
| */ |
| typedef foreach_job_by_id_control_t (*JobROForEachFunc)( |
| const job_record_t *job_ptr, |
| const slurm_selected_step_t *id, |
| void *arg); |
| /* |
| * Function prototype for operating on a job id that is not found |
| * This is called just once for an array expression with a bitmap of array |
| * tasks that were not found. |
| * Returns control requested for processing |
| */ |
| typedef foreach_job_by_id_control_t |
| (*JobNullForEachFunc)(const slurm_selected_step_t *id, void *arg); |
| |
| /* |
| * Walk all matching job_record_t's that match filter |
| * If a job id is a het job leader or array job leader, then all components of |
| * the het job or all jobs in the array will be walked. |
| * Warning: Caller must hold job write lock |
| * |
| * IN filter - Filter to select jobs |
| * IN callback - Function to call on each matching job record pointer |
| * NOTE: If array_task_id was given and the task has not been |
| * split from the meta job record, the meta job record will be |
| * passed to the callback function. |
| * IN null_callback - (optional) Function to call on each non-matching job id |
| * IN arg - Arbitrary pointer to pass to callback |
| * RET number of jobs matched. |
| * negative if callback returns FOR_EACH_JOB_BY_ID_EACH_FAIL. |
| * may be zero if no jobs matched. |
| */ |
| extern int foreach_job_by_id(const slurm_selected_step_t *filter, |
| JobForEachFunc callback, |
| JobNullForEachFunc null_callback, void *arg); |
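| |
| /* |
| * Illustrative callback (a sketch only, assuming a slurm_selected_step_t |
| * filter already populated by the caller): count pending jobs matched by |
| * foreach_job_by_id(): |
| * |
| * static foreach_job_by_id_control_t _count_pending( |
| * job_record_t *job_ptr, const slurm_selected_step_t *id, void *arg) |
| * { |
| * int *cnt = arg; |
| * if (IS_JOB_PENDING(job_ptr)) |
| * (*cnt)++; |
| * return FOR_EACH_JOB_BY_ID_EACH_CONT; |
| * } |
| * |
| * int pending = 0; |
| * (void) foreach_job_by_id(&filter, _count_pending, NULL, &pending); |
| */ |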
| |
| /* |
| * Walk all matching read only job_record_t's that match filter |
| * If a job id is a het job leader or array job leader, then all components of |
| * the het job or all jobs in the array will be walked. |
| * Warning: Caller must hold job read lock |
| * |
| * IN filter - Filter to select jobs |
| * IN callback - Function to call on each matching job record pointer |
| * NOTE: If array_task_id was given and the task has not been |
| * split from the meta job record, the meta job record will be |
| * passed to the callback function. |
| * IN null_callback - (optional) Function to call on each non-matching job id |
| * IN arg - Arbitrary pointer to pass to callback |
| * RET number of jobs matched. |
| * negative if callback returns FOR_EACH_JOB_BY_ID_EACH_FAIL. |
| * may be zero if no jobs matched. |
| */ |
| extern int foreach_job_by_id_ro(const slurm_selected_step_t *filter, |
| JobROForEachFunc callback, |
| JobNullForEachFunc null_callback, void *arg); |
| |
| /* |
| * find_job_array_rec - return a pointer to the job record with the given |
| * array_job_id/array_task_id |
| * IN job_id - requested job's id |
| * IN array_task_id - requested job's task id, |
| * NO_VAL if none specified (i.e. not a job array) |
| * INFINITE return any task for specified job id |
| * RET pointer to the job's record, NULL on error |
| */ |
| extern job_record_t *find_job_array_rec(uint32_t array_job_id, |
| uint32_t array_task_id); |
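| |
| /* |
| * Illustrative calls (job ID 1234 is hypothetical): |
| * |
| * job_ptr = find_job_array_rec(1234, NO_VAL); job 1234, not a job array |
| * job_ptr = find_job_array_rec(1234, 7); array task 1234_7 |
| * job_ptr = find_job_array_rec(1234, INFINITE); any task of job 1234 |
| */ |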
| |
| /* |
| * find_het_job_record - return a pointer to the job record with the given ID |
| * IN job_id - requested job's ID |
| * IN het_job_id - hetjob component ID |
| * RET pointer to the job's record, NULL on error |
| */ |
| extern job_record_t *find_het_job_record(uint32_t job_id, uint32_t het_job_id); |
| |
| /* |
| * find_job_record - return a pointer to the job record with the given job_id |
| * IN job_id - requested job's id |
| * RET pointer to the job's record, NULL on error |
| */ |
| extern job_record_t *find_job_record(uint32_t job_id); |
| |
| /* |
| * find_part_record - find a record for partition with specified name |
| * IN name - name of the desired partition |
| * RET pointer to partition or NULL if not found |
| */ |
| extern part_record_t *find_part_record(char *name); |
| |
| /* |
| * get_job_env - return the environment variables and their count for a |
| * given job |
| * IN job_ptr - pointer to job for which data is required |
| * OUT env_size - number of elements to read |
| * RET pointer to array of string pointers containing environment variables |
| */ |
| extern char **get_job_env(job_record_t *job_ptr, uint32_t *env_size); |
| |
| /* |
| * get_job_script - return the script for a given job |
| * IN job_ptr - pointer to job for which data is required |
| * RET buf_t containing job script |
| */ |
| extern buf_t *get_job_script(const job_record_t *job_ptr); |
| |
| /* |
| * job_get_sockets_per_node |
| * IN job_ptr - pointer to the job |
| * RET number of requested sockets per node if set, otherwise 1 |
| */ |
| extern uint16_t job_get_sockets_per_node(job_record_t *job_ptr); |
| |
| /* |
| * Return the next available job_id to be used. |
| * IN test_only - if true, doesn't advance the job_id sequence, just returns |
| * what the next job id will be. |
| * RET a valid job_id or SLURM_ERROR if all job_ids are exhausted. |
| */ |
| extern uint32_t get_next_job_id(bool test_only); |
| |
| /* |
| * get_part_list - find record for named partition(s) |
| * IN name - partition name(s) in a comma separated list |
| * OUT part_ptr_list - sorted list of pointers to the partitions or NULL |
| * OUT prim_part_ptr - pointer to the primary partition |
| * OUT err_part - The first invalid partition name. |
| * NOTE: Caller must free the returned list |
| * NOTE: Caller must free err_part |
| */ |
| extern void get_part_list(char *name, list_t **part_ptr_list, |
| part_record_t **prim_part_ptr, char **err_part); |
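| |
| /* |
| * Illustrative use (a sketch; the partition names are hypothetical): |
| * |
| * char names[] = "debug,batch"; |
| * list_t *parts = NULL; |
| * part_record_t *prim_ptr = NULL; |
| * char *bad_name = NULL; |
| * get_part_list(names, &parts, &prim_ptr, &bad_name); |
| * if (bad_name) |
| * error("invalid partition name: %s", bad_name); |
| * FREE_NULL_LIST(parts); |
| * xfree(bad_name); |
| */ |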
| |
| /* |
| * init_depend_policy() |
| * Initialize variables from DependencyParameters |
| */ |
| extern void init_depend_policy(void); |
| |
| /* |
| * init_job_conf - initialize the job configuration tables and values. |
| * this should be called after creating node information, but |
| * before creating any job entries. |
| * global: last_job_update - time of last job table update |
| * job_list - pointer to global job list |
| */ |
| extern void init_job_conf(void); |
| |
| /* |
| * init_node_conf - initialize the node configuration tables and values. |
| * this should be called before creating any node or configuration |
| * entries. |
| * global: node_record_table_ptr - pointer to global node table |
| * default_node_record - default values for node records |
| * default_config_record - default values for configuration records |
| * hash_table - table of hash indexes |
| * last_node_update - time of last node table update |
| */ |
| extern void init_node_conf(void); |
| |
| /* |
| * consolidate_config_list |
| * Try to combine duplicate config records. |
| * |
| * IN is_locked - whether NODE_WRITE_LOCK is set or not. |
| * IN force - whether to consolidate regardless of a queued event. |
| */ |
| extern void consolidate_config_list(bool is_locked, bool force); |
| |
| /* |
| * init_part_conf - initialize the default partition configuration values |
| * and create a (global) partition list. |
| * this should be called before creating any partition entries. |
| * global: default_part - default partition values |
| * part_list - global partition list |
| */ |
| extern void init_part_conf(void); |
| |
| /* init_requeue_policy() |
| * Initialize the requeue exit/hold bitmaps. |
| */ |
| extern void init_requeue_policy(void); |
| |
| /* |
| * is_node_down - determine if the specified node's state is DOWN |
| * IN name - name of the node |
| * RET true if node exists and is down, otherwise false |
| */ |
| extern bool is_node_down (char *name); |
| |
| /* |
| * is_node_resp - determine if the specified node's state is responding |
| * IN name - name of the node |
| * RET true if node exists and is responding, otherwise false |
| */ |
| extern bool is_node_resp (char *name); |
| |
| /* Fail a job because the qos is no longer valid */ |
| extern int job_fail_qos(job_record_t *job_ptr, const char *func_name, |
| bool assoc_locked); |
| |
| /* |
| * delete_job_desc_files - remove the state files and directory |
| * for a given job_id from SlurmStateSaveLocation |
| */ |
| extern void delete_job_desc_files(uint32_t job_id); |
| |
| /* |
| * job_alloc_info - get details about an existing job allocation |
| * IN uid - job issuing the code |
| * IN job_id - ID of job for which info is requested |
| * OUT job_pptr - set to pointer to job record |
| * NOTE: See job_alloc_info_ptr() if job pointer is known |
| */ |
| extern int job_alloc_info(uint32_t uid, uint32_t job_id, |
| job_record_t **job_pptr); |
| |
| /* |
| * job_alloc_info_ptr - get details about an existing job allocation |
| * IN uid - job issuing the code |
| * IN job_ptr - pointer to job record |
| * NOTE: See job_alloc_info() if job pointer not known |
| */ |
| extern int job_alloc_info_ptr(uint32_t uid, job_record_t *job_ptr); |
| |
| /* |
| * job_allocate - create job_records for the supplied job specification and |
| * allocate nodes for it. |
| * IN job_desc - job specifications |
| * IN immediate - if set then either initiate the job immediately or fail |
| * IN will_run - don't initiate the job if set, just test if it could run |
| * now or later |
| * OUT resp - will run response (includes start location, time, etc.) |
| * IN allocate - resource allocation request only if set, batch job if zero |
| * IN submit_uid - uid of user issuing the request |
| * IN cron - true if cron |
| * OUT job_pptr - set to pointer to job record |
| * OUT err_msg - Custom error message to the user, caller to xfree results |
| * IN protocol_version - version of the code the caller is using |
| * RET 0 or an error code. If the job would only be able to execute with |
| * some change in partition configuration then |
| * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned |
| * NOTE: If allocating nodes lx[0-7] to a job and those nodes have cpu counts |
| * of 4, 4, 4, 4, 8, 8, 4, 4 then num_cpu_groups=3, cpus_per_node={4,8,4} |
| * and cpu_count_reps={4,2,2} |
| * globals: job_list - pointer to global job list |
| * list_part - global list of partition info |
| * default_part_loc - pointer to default partition |
| * NOTE: lock_slurmctld on entry: Read config Write job, Write node, Read part |
| */ |
| extern int job_allocate(job_desc_msg_t *job_desc, int immediate, |
| int will_run, will_run_response_msg_t **resp, |
| int allocate, uid_t submit_uid, bool cron, |
| job_record_t **job_pptr, |
| char **err_msg, uint16_t protocol_version); |
| |
| /* If this is a job array meta-job, prepare it for being scheduled */ |
| extern void job_array_pre_sched(job_record_t *job_ptr); |
| |
| /* If this is a job array meta-job, clean up after scheduling attempt */ |
| extern job_record_t *job_array_post_sched(job_record_t *job_ptr, bool list_add); |
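| |
| /* |
| * Illustrative scheduling pattern (a sketch; try_to_start_job() is a |
| * hypothetical helper, not part of this API): a scheduler brackets the start |
| * attempt of an array meta-job with these calls so the record actually |
| * started is the split-out task: |
| * |
| * job_array_pre_sched(job_ptr); |
| * int rc = try_to_start_job(job_ptr); |
| * job_ptr = job_array_post_sched(job_ptr, true); |
| */ |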
| |
| /* Create an exact copy of an existing job record for a job array. |
| * IN job_ptr - META job record for a job array, which is to become an |
| * individual task of the job array. |
| * Set the job's array_task_id to the task to be split out. |
| * IN list_add - add to the job_list or not. |
| * RET - The new job record, which is the new META job record. */ |
| extern job_record_t *job_array_split(job_record_t *job_ptr, bool list_add); |
| |
| /* Record the start of one job array task */ |
| extern void job_array_start(job_record_t *job_ptr); |
| |
| /* Return true if a job array task can be started */ |
| extern bool job_array_start_test(job_record_t *job_ptr); |
| |
| /* Clear job's CONFIGURING flag and advance end time as needed */ |
| extern void job_config_fini(job_record_t *job_ptr); |
| |
| /* Reset a job's end_time based upon its start_time and time_limit. |
| * NOTE: Do not reset the end_time if already being preempted */ |
| extern void job_end_time_reset(job_record_t *job_ptr); |
| /* |
| * job_hold_by_assoc_id - Hold all pending jobs with a given |
| * association ID. This happens when an association is deleted (e.g. when |
| * a user is removed from the association database). |
| * RET count of held jobs |
| */ |
| extern int job_hold_by_assoc_id(uint32_t assoc_id); |
| |
| /* |
| * job_hold_by_qos_id - Hold all pending jobs with a given |
| * QOS ID. This happens when a QOS is deleted (e.g. when |
| * a QOS is removed from the association database). |
| * RET count of held jobs |
| */ |
| extern int job_hold_by_qos_id(uint32_t qos_id); |
| |
| /* log the completion of the specified job */ |
| extern void job_completion_logger(job_record_t *job_ptr, bool requeue); |
| |
| /* |
| * Return total amount of memory allocated to a job. This can be based upon |
| * a GRES specification with various GRES/memory allocations on each node. |
| * If current allocation information is not available, estimate memory based |
| * upon pn_min_memory and either CPU or node count, or upon mem_per_gres |
| * and estimated gres count (both values gotten from gres list). |
| */ |
| extern uint64_t job_get_tres_mem(struct job_resources *job_res, |
| uint64_t pn_min_memory, uint32_t cpu_cnt, |
| uint32_t node_cnt, part_record_t *part_ptr, |
| list_t *gres_list, bool user_set_mem, |
| uint16_t min_sockets_per_node, |
| uint32_t num_tasks); |
| |
| /* |
| * job_epilog_complete - Note the completion of the epilog script for a |
| * given job |
| * IN job_id - id of the job for which the epilog was executed |
| * IN node_name - name of the node on which the epilog was executed |
| * IN return_code - return code from epilog script |
| * RET true if job is COMPLETED, otherwise false |
| */ |
| extern bool job_epilog_complete(uint32_t job_id, char *node_name, |
| uint32_t return_code); |
| |
| /* |
| * job_end_time - Process JOB_END_TIME |
| * IN time_req_msg - job end time request |
| * OUT timeout_msg - job timeout response to be sent |
| * RET SLURM_SUCCESS or an error code |
| */ |
| extern int job_end_time(job_alloc_info_msg_t *time_req_msg, |
| srun_timeout_msg_t *timeout_msg); |
| |
| /* job_fini - free all memory associated with job records */ |
| extern void job_fini (void); |
| |
| /* job_hold_requeue() |
| * |
| * Requeue the job based upon its current state. |
| * If JOB_SPECIAL_EXIT then requeue and hold with JOB_SPECIAL_EXIT state. |
| * If JOB_REQUEUE_HOLD then requeue and hold. |
| * If JOB_REQUEUE then requeue and let it run again. |
| * The requeue can happen directly from job_requeue() or from |
| * job_epilog_complete() after the last component has finished. |
| */ |
| extern bool job_hold_requeue(job_record_t *job_ptr); |
| |
| /* |
| * determine if job is ready to execute per the node select plugin |
| * IN job_id - job to test |
| * OUT ready - 1 if job is ready to execute, 0 otherwise |
| * RET Slurm error code |
| */ |
| extern int job_node_ready(uint32_t job_id, int *ready); |
| |
| /* Record accounting information for a job immediately before changing size */ |
| extern void job_pre_resize_acctg(job_record_t *job_ptr); |
| |
| /* Record accounting information for a job immediately after changing size */ |
| extern void job_post_resize_acctg(job_record_t *job_ptr); |
| |
| /* |
| * job_signal - signal the specified job, access checks already done |
| * IN job_ptr - job to be signaled |
| * IN signal - signal to send, SIGKILL == cancel the job |
| * IN flags - see KILL_JOB_* flags in slurm.h |
| * IN uid - uid of requesting user |
| * IN preempt - true if job being preempted |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_signal(job_record_t *job_ptr, uint16_t signal, |
| uint16_t flags, uid_t uid, bool preempt); |
| |
| /* |
| * het_job_signal - signal all components of a hetjob |
| * IN het_job_leader - job record of job hetjob leader |
| * IN signal - signal to send, SIGKILL == cancel the job |
| * IN flags - see KILL_JOB_* flags in slurm.h |
| * IN uid - uid of requesting user |
| * IN preempt - true if job being preempted |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int het_job_signal(job_record_t *het_job_leader, uint16_t signal, |
| uint16_t flags, uid_t uid, bool preempt); |
| |
| /* |
| * job_str_signal - signal the specified job |
| * IN job_id_str - id of the job to be signaled, valid formats include "#" |
| * "#_#" and "#_[expr]" |
| * IN signal - signal to send, SIGKILL == cancel the job |
| * IN flags - see KILL_JOB_* flags in slurm.h |
| * IN uid - uid of requesting user |
| * IN preempt - true if job being preempted |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_str_signal(char *job_id_str, uint16_t signal, uint16_t flags, |
| uid_t uid, bool preempt); |
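| |
| /* |
| * Example job_id_str formats accepted above (job ID 1234 is illustrative): |
| * "1234" the job (or entire job array) 1234 |
| * "1234_7" array task 7 of job 1234 |
| * "1234_[1-5]" array tasks 1 through 5 of job 1234 |
| */ |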
| |
| /* |
| * Signal the jobs matching the specified filters and build a response message |
| * detailing the results of the request. |
| * |
| * IN kill_msg - the specification for which jobs to signal |
| * IN auth_uid - the authenticated UID of the requesting user |
| * OUT resp_msg_p - a response message to send back to the requesting user |
| * RET - SLURM_SUCCESS if successful, an error code otherwise |
| */ |
| extern int job_mgr_signal_jobs(kill_jobs_msg_t *kill_msg, uid_t auth_uid, |
| kill_jobs_resp_msg_t **resp_msg_p); |
| |
| /* |
| * job_suspend/job_suspend2 - perform some suspend/resume operation |
| * NB job_suspend - Uses the job_id field and ignores job_id_str |
| * NB job_suspend2 - Ignores the job_id field and uses job_id_str |
| * |
| * IN sus_ptr - suspend/resume request message |
| * IN uid - user id of the user issuing the RPC |
| * indf_susp IN - set if job is being suspended indefinitely by user or admin |
| * and we should clear its priority, otherwise suspended |
| * temporarily for gang scheduling |
| * IN protocol_version - slurm protocol version of client |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_suspend(slurm_msg_t *msg, suspend_msg_t *sus_ptr, uid_t uid, |
| bool indf_susp, uint16_t protocol_version); |
| extern int job_suspend2(slurm_msg_t *msg, suspend_msg_t *sus_ptr, uid_t uid, |
| bool indf_susp, uint16_t protocol_version); |
| |
| /* |
| * job_complete - note the normal termination the specified job |
| * IN job_id - id of the job which completed |
| * IN uid - user id of user issuing the RPC |
| * IN requeue - job should be run again if possible |
| * IN node_fail - true if job terminated due to node failure |
| * IN job_return_code - job's return code, if set then set state to JOB_FAILED |
| * RET - 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer global job list |
| * last_job_update - time of last job table update |
| */ |
| extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, |
| bool node_fail, uint32_t job_return_code); |
| |
| /* |
| * job_independent - determine whether this job is free of pending job |
| * dependencies and its scheduled begin time has been reached |
| * IN job_ptr - pointer to job being tested |
| * RET - true if job no longer must be deferred for another job |
| */ |
| extern bool job_independent(job_record_t *job_ptr); |
| |
| /* |
| * job_req_node_filter - job request node filter. |
| * clear from a bitmap the nodes which can not be used for a job |
| * test memory size, required features, processor count, etc. |
| * NOTE: Does not support exclusive OR of features. |
| * It just matches first element of MOR and ignores count. |
| * IN job_ptr - pointer to job to be scheduled |
| * IN/OUT avail_bitmap - set of nodes being considered for use |
| * RET SLURM_SUCCESS or EINVAL if can't filter (exclusive OR of features) |
| */ |
| extern int job_req_node_filter(job_record_t *job_ptr, bitstr_t *avail_bitmap, |
| bool test_only); |
| |
| /* |
| * job_requeue - Requeue a running or pending batch job |
| * IN uid - user id of user issuing the RPC |
| * IN job_id - id of the job to be requeued |
| * IN msg - slurm_msg to send response back on |
| * IN preempt - true if job being preempted |
| * IN flags - JobExitRequeue | Hold | JobFailed | etc. |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_requeue(uid_t uid, uint32_t job_id, slurm_msg_t *msg, |
| bool preempt, uint32_t flags); |
| |
| /* |
| * job_requeue2 - Requeue a running or pending batch job |
| * IN uid - user id of user issuing the RPC |
| * IN req_ptr - request including ID of the job to be requeued |
| * IN msg - slurm_msg to send response back on |
| * IN preempt - true if job being preempted |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_requeue2(uid_t uid, requeue_msg_t *req_ptr, slurm_msg_t *msg, |
| bool preempt); |
| |
| /* |
| * job_set_top - Move the specified job to the top of the queue (at least |
| * for that user ID, partition, account, and QOS). |
| * |
| * IN msg - original request msg |
| * IN top_ptr - user request |
| * IN uid - user id of the user issuing the RPC |
| * IN protocol_version - slurm protocol version of client |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_set_top(slurm_msg_t *msg, top_job_msg_t *top_ptr, uid_t uid, |
| uint16_t protocol_version); |
| |
| /* |
| * job_time_limit - terminate jobs which have exceeded their time limit |
| * global: job_list - pointer global job list |
| * last_job_update - time of last job table update |
| */ |
| extern void job_time_limit (void); |
| |
| /* Builds the tres_req_cnt and tres_req_str of a job. |
| * Only set when job is pending. |
| * NOTE: job write lock must be locked before calling this */ |
| extern void job_set_req_tres(job_record_t *job_ptr, bool assoc_mgr_locked); |
| |
| /* |
| * job_set_tres - set the tres up when allocating the job. |
| * Only set when job is running. |
| * NOTE: job write lock must be locked before calling this */ |
| extern void job_set_alloc_tres(job_record_t *job_ptr, bool assoc_mgr_locked); |
| |
| /* |
| * job_update_tres_cnt - when job is completing remove allocated tres |
| * from count. |
| * IN/OUT job_ptr - job structure to be updated |
| * IN node_inx - node bit that is finished with job. |
| * RET SLURM_SUCCESS on success, SLURM_ERROR on cpu_cnt underflow |
| */ |
| extern int job_update_tres_cnt(job_record_t *job_ptr, int node_inx); |
| |
| /* |
| * check_job_step_time_limit - terminate jobsteps which have exceeded |
| * their time limit |
| */ |
| extern int check_job_step_time_limit(void *x, void *arg); |
| |
| /* |
| * Kill job or job step |
| * |
| * IN job_step_kill_msg - msg with specs on which job/step to cancel. |
| * IN uid - uid of user requesting job/step cancel. |
| */ |
| extern int kill_job_step(job_step_kill_msg_t *job_step_kill_msg, uint32_t uid); |
| |
| /* |
| * kill_job_by_part_name - Given a partition name, deallocate resource for |
| * its jobs and kill them |
| * IN part_name - name of a partition |
| * RET number of killed jobs |
| */ |
| extern int kill_job_by_part_name(char *part_name); |
| |
| /* |
| * kill_job_on_node - Kill the specific job on a specific node. |
| * IN job_ptr - pointer to terminating job |
| * IN node_ptr - pointer to the node on which the job resides |
| */ |
| extern void kill_job_on_node(job_record_t *job_ptr, node_record_t *node_ptr); |
| |
| /* |
| * kill_running_job_by_node_ptr - Given a node_ptr, deallocate RUNNING |
| * or COMPLETING jobs from the node or kill them |
| * IN node_name - name of a node |
| * RET number of killed jobs |
| */ |
| extern int kill_running_job_by_node_ptr(node_record_t *node_ptr); |
| |
| /* list_compare_config - compare two entries from the config list based upon |
| * weight, see common/list.h for documentation */ |
| int list_compare_config (void *config_entry1, void *config_entry2); |
| |
| /* |
| * list_find_part - find an entry in the partition list, see common/list.h |
| * for documentation |
| * IN key - partition name or "universal_key" for all partitions |
| * RET 1 if matches key, 0 otherwise |
| * global- part_list - the global partition list |
| */ |
| extern int list_find_part (void *part_entry, void *key); |
| |
| /* |
| * load_all_job_state - load the job state from file, recover from last |
| * checkpoint. Execute this after loading the configuration file data. |
| * RET 0 or error code |
| */ |
| extern int load_all_job_state ( void ); |
| |
| /* |
| * load_all_node_state - Load the node state from file, recover on slurmctld |
| * restart. Execute this after loading the configuration file data. |
| * Data goes into common storage. |
| * IN state_only - if true over-write only node state, features, gres and reason |
| * RET 0 or error code |
| */ |
| extern int load_all_node_state ( bool state_only ); |
| |
| /* |
| * load_last_job_id - load only the last job ID from state save file. |
| * RET 0 or error code |
| */ |
| extern int load_last_job_id( void ); |
| |
| /* |
| * load_part_uid_allow_list - reload the allow_uid list of partitions |
| * if required (updated group file or force set) |
| * IN force - if set then always reload the allow_uid list |
| */ |
| extern void load_part_uid_allow_list(bool force); |
| |
| /* |
| * load_all_part_state - load the partition state from file, recover from |
| * slurmctld restart. execute this after loading the configuration |
| * file data. |
| */ |
| extern int load_all_part_state(uint16_t reconfig_flags); |
| |
| /* make_node_alloc - flag specified node as allocated to a job |
| * IN node_ptr - pointer to node being allocated |
| * IN job_ptr - pointer to job that is starting |
| */ |
| extern void make_node_alloc(node_record_t *node_ptr, job_record_t *job_ptr); |
| |
| extern void node_mgr_make_node_blocked(job_record_t *job_ptr, bool set); |
| |
| /* make_node_comp - flag specified node as completing a job |
| * IN node_ptr - pointer to node marked for completion of job |
| * IN job_ptr - pointer to job that is completing |
| * IN suspended - true if job was previously suspended |
| */ |
| extern void make_node_comp(node_record_t *node_ptr, job_record_t *job_ptr, |
| bool suspended); |
| |
| /* |
| * make_node_idle - flag specified node as having finished with a job |
| * IN node_ptr - pointer to node reporting job completion |
| * IN job_ptr - pointer to job that just completed or NULL if not applicable |
| */ |
| extern void make_node_idle(node_record_t *node_ptr, job_record_t *job_ptr); |
| |
| /* msg_to_slurmd - send given msg_type to every slurmd, no args */ |
| extern void msg_to_slurmd (slurm_msg_type_t msg_type); |
| |
| /* request a "configless" RPC be send to all slurmd nodes */ |
| void push_reconfig_to_slurmd(void); |
| |
| /* node_fini - free all memory associated with node records */ |
| extern void node_fini (void); |
| |
| /* node_did_resp - record that the specified node is responding |
| * IN name - name of the node */ |
| extern void node_did_resp (char *name); |
| |
| /* |
| * node_not_resp - record that the specified node is not responding |
| * IN name - name of the node |
| * IN msg_time - time message was sent |
| * IN resp_type - what kind of response came back from the node |
| */ |
| extern void node_not_resp (char *name, time_t msg_time, |
| slurm_msg_type_t resp_type); |
| |
| /* For every node with the "not_responding" flag set, clear the flag |
| * and log that the node is not responding using a hostlist expression */ |
| extern void node_no_resp_msg(void); |
| |
| /* For a given job ID return the number of PENDING tasks which have their |
| * own separate job_record (do not count tasks in pending META job record) */ |
| extern int num_pending_job_array_tasks(uint32_t array_job_id); |
| |
| /* |
| * Dump state of jobs given list of jobs (or none for all jobs) |
| * IN filter_jobs_count - number of entries in filter_jobs_ptr array |
| * IN filter_jobs_ptr - array of jobs to filter |
| * IN/OUT jobs_count_ptr - pointer to number of jobs dumped |
| * IN/OUT jobs_pptr - pointer to dumped jobs array |
| * RET SLURM_SUCCESS or error |
| */ |
| extern int dump_job_state(const uint32_t filter_jobs_count, |
| const slurm_selected_step_t *filter_jobs_ptr, |
| uint32_t *jobs_count_ptr, |
| job_state_response_job_t **jobs_pptr); |
| |
| /* |
| * pack_all_jobs - dump all job information for all jobs in |
| * machine independent form (for network transmission) |
| * IN show_flags - job filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * IN filter_uid - pack only jobs belonging to this user if not NO_VAL |
| * IN protocol_version - slurm protocol version of client |
| * OUT buffer |
| * global: job_list - global list of job records |
| * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| extern buf_t *pack_all_jobs(uint16_t show_flags, uid_t uid, uint32_t filter_uid, |
| uint16_t protocol_version); |
| |
| /* |
| * pack_spec_jobs - dump job information for specified jobs in |
| * machine independent form (for network transmission) |
| * IN show_flags - job filtering options |
| * IN job_ids - list of job_ids to pack |
| * IN uid - uid of user making request (for partition filtering) |
| * IN filter_uid - pack only jobs belonging to this user if not NO_VAL |
| * OUT buffer |
| * global: job_list - global list of job records |
| * NOTE: the returned buffer must be freed by the caller |
| * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| extern buf_t *pack_spec_jobs(list_t *job_ids, uint16_t show_flags, uid_t uid, |
| uint32_t filter_uid, uint16_t protocol_version); |
| |
| /* |
| * pack_all_nodes - dump all configuration and node information for all nodes |
| * in machine independent form (for network transmission) |
| * IN show_flags - node filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * IN protocol_version - slurm protocol version of client |
| * OUT buffer |
| * global: node_record_table_ptr - pointer to global node table |
| * NOTE: change slurm_load_node() in api/node_info.c when data format changes |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| extern buf_t *pack_all_nodes(uint16_t show_flags, uid_t uid, |
| uint16_t protocol_version); |
| |
| /* Pack all scheduling statistics */ |
| extern buf_t *pack_all_stat(uint16_t protocol_version); |
| |
| /* |
| * pack_ctld_job_step_info_response_msg - packs job step info |
| * IN step_id - specific id or NO_VAL/NO_VAL for all |
| * IN uid - user issuing request |
| * IN show_flags - job step filtering options |
| * OUT buffer - location to store data, pointers automatically advanced |
| * IN protocol_version - slurm protocol version of client |
| * RET - 0 or error code |
| * NOTE: MUST free_buf buffer |
| */ |
| extern int pack_ctld_job_step_info_response_msg( |
| slurm_step_id_t *step_id, uid_t uid, uint16_t show_flags, |
| buf_t *buffer, uint16_t protocol_version); |
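| |
| /* |
| * Example (illustrative sketch only): request information on all steps of |
| * all jobs by setting every field of the step id to NO_VAL. request_uid is |
| * assumed to be the requesting user's uid. |
| * |
| *	slurm_step_id_t step_id = { |
| *		.job_id = NO_VAL, |
| *		.step_id = NO_VAL, |
| *		.step_het_comp = NO_VAL, |
| *	}; |
| *	buf_t *buffer = init_buf(BUF_SIZE); |
| *	(void) pack_ctld_job_step_info_response_msg(&step_id, request_uid, |
| *						    SHOW_ALL, buffer, |
| *						    SLURM_PROTOCOL_VERSION); |
| *	... |
| *	FREE_NULL_BUFFER(buffer); |
| */ |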
| |
| /* |
| * pack_all_part - dump all partition information for all partitions in |
| * machine independent form (for network transmission) |
| * IN show_flags - partition filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * IN protocol_version - slurm protocol version of client |
| * OUT buffer |
| * global: part_list - global list of partition records |
| * NOTE: change slurm_load_part() in api/part_info.c if data format changes |
| */ |
| extern buf_t *pack_all_part(uint16_t show_flags, uid_t uid, |
| uint16_t protocol_version); |
| |
| /* |
| * pack_job - dump all configuration information about a specific job in |
| * machine independent form (for network transmission) |
| * IN dump_job_ptr - pointer to job for which information is requested |
| * IN show_flags - job filtering options |
| * IN/OUT buffer - buffer in which data is placed, pointers automatically |
| * updated |
| * IN uid - user requesting the data |
| * IN has_qos_lock - true if assoc_lock .qos=READ_LOCK already acquired |
| * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| extern void pack_job(job_record_t *dump_job_ptr, uint16_t show_flags, |
| buf_t *buffer, uint16_t protocol_version, uid_t uid, |
| bool has_qos_lock); |
| |
| /* |
| * pack_part - dump all configuration information about a specific partition |
| * in machine independent form (for network transmission) |
| * IN part_ptr - pointer to partition for which information is requested |
| * IN/OUT buffer - buffer in which data is placed, pointers automatically |
| * updated |
| * global: default_part_loc - pointer to the default partition |
| * NOTE: if you make any changes here be sure to make the corresponding |
| * changes to load_part_config in api/partition_info.c |
| */ |
| extern void pack_part(part_record_t *part_ptr, buf_t *buffer, |
| uint16_t protocol_version); |
| |
| /* |
| * pack_one_job - dump information for one job in |
| * machine independent form (for network transmission) |
| * IN job_id - ID of job that we want info for |
| * IN show_flags - job filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * OUT buffer |
| * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| extern buf_t *pack_one_job(uint32_t job_id, uint16_t show_flags, uid_t uid, |
| uint16_t protocol_version); |
| |
| /* |
| * pack_one_node - dump all configuration and node information for one node |
| * in machine independent form (for network transmission) |
| * IN show_flags - node filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * IN node_name - name of node for which information is desired, |
| * use first node if name is NULL |
| * IN protocol_version - slurm protocol version of client |
| * OUT buffer |
| * global: node_record_table_ptr - pointer to global node table |
| * NOTE: change slurm_load_node() in api/node_info.c when data format changes |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| extern buf_t *pack_one_node(uint16_t show_flags, uid_t uid, char *node_name, |
| uint16_t protocol_version); |
| |
| /* part_not_on_list - helper function to check if array parts contains x */ |
| extern int part_not_on_list(part_record_t **parts, part_record_t *x); |
| |
| /* |
| * build_visible_parts - returns an array with pointers to partitions visible |
| * to the user based on partition Hidden and AllowGroups properties. |
| */ |
| extern part_record_t **build_visible_parts(uid_t uid, bool privileged); |
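| |
| /* |
| * Example (illustrative sketch; assumes the returned array is NULL |
| * terminated and must be xfree()'d by the caller): |
| * |
| *	part_record_t **visible = build_visible_parts(uid, false); |
| *	for (int i = 0; visible && visible[i]; i++) |
| *		debug2("uid %u may see partition %s", |
| *		       (unsigned) uid, visible[i]->name); |
| *	xfree(visible); |
| */ |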
| |
| /* part_fini - free all memory associated with partition records */ |
| extern void part_fini (void); |
| |
| /* |
| * Create a copy of a job's part_list (partition list) |
| * IN part_list_src - a job's part_list |
| * RET copy of part_list_src, must be freed by caller |
| */ |
| extern list_t *part_list_copy(list_t *part_list_src); |
| |
| /* |
| * Validate a job's account against the partition's AllowAccounts or |
| * DenyAccounts parameters. |
| * IN part_ptr - Partition pointer |
| * IN acct - account name |
| * IN job_ptr - Job pointer or NULL. If set and the job cannot run, set the |
| * job's state_desc and state_reason fields |
| * RET SLURM_SUCCESS or error code |
| */ |
| extern int part_policy_valid_acct(part_record_t *part_ptr, char *acct, |
| job_record_t *job_ptr); |
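| |
| /* |
| * Example (illustrative sketch only): reject a submission whose account is |
| * not permitted by the partition's AllowAccounts/DenyAccounts settings. |
| * |
| *	if (part_policy_valid_acct(part_ptr, job_desc->account, NULL) != |
| *	    SLURM_SUCCESS) |
| *		return ESLURM_INVALID_ACCOUNT; |
| */ |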
| |
| /* |
| * Validate a job's QOS against the partition's AllowQOS or DenyQOS parameters. |
| * IN part_ptr - Partition pointer |
| * IN qos_ptr - QOS pointer |
| * IN submit_uid - uid of user issuing the request |
| * IN job_ptr - Job pointer or NULL. If set and the job cannot run, set the |
| * job's state_desc and state_reason fields |
| * RET SLURM_SUCCESS or error code |
| */ |
| extern int part_policy_valid_qos(part_record_t *part_ptr, |
| slurmdb_qos_rec_t *qos_ptr, |
| uid_t submit_uid, |
| job_record_t *job_ptr); |
| |
| /* |
| * part_list_update_assoc_lists - Update assoc_mgr pointers from |
| * [allow|deny]_accts_lists. |
| */ |
| extern void part_list_update_assoc_lists(void); |
| |
| |
| /* |
| * part_update_assoc_lists - Update assoc_mgr pointers from |
| * [allow|deny]_accts_lists. |
| * IN x - part_record_t pointer |
| */ |
| extern int part_update_assoc_lists(void *x, void *arg); |
| |
| /* |
| * partition_in_use - determine whether a partition is in use by a RUNNING, |
| * PENDING, or SUSPENDED job |
| * IN part_name - name of a partition |
| * RET true if the partition is in use, else false |
| */ |
| extern bool partition_in_use(char *part_name); |
| |
| /* |
| * Set "batch_host" for this job based upon it's "batch_features" and |
| * "node_bitmap". The selection is deferred in case a node's "active_features" |
| * is changed by a reboot. |
| * Return SLURM_SUCCESS or error code |
| */ |
| extern int pick_batch_host(job_record_t *job_ptr); |
| |
| /* |
| * prolog_complete - note the normal termination of the prolog |
| * IN job_id - id of the job which completed |
| * IN prolog_return_code - prolog's return code; |
| *	if non-zero, the job state is set to FAILED |
| * IN node_name - name of the node which ran the prolog |
| * RET - 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer global job list |
| * last_job_update - time of last job table update |
| */ |
| extern int prolog_complete(uint32_t job_id, uint32_t prolog_return_code, |
| char *node_name); |
| |
| /* |
| * If the job or slurm.conf requests to not kill on invalid dependency, |
| * then set the job state reason to WAIT_DEP_INVALID. Otherwise, kill the |
| * job. |
| */ |
| extern void handle_invalid_dependency(job_record_t *job_ptr); |
| |
| /* |
| * purge_old_job - purge old job records. |
| * The jobs must have completed at least MIN_JOB_AGE minutes ago. |
| * Test job dependencies, handle after_ok, after_not_ok before |
| * purging any jobs. |
| * NOTE: READ lock slurmctld config and WRITE lock jobs before entry |
| */ |
| void purge_old_job(void); |
| |
| /* |
| * Free memory from purged job records. This is a distinct phase from |
| * purge_old_job() so this can run outside of the job write lock. |
| */ |
| extern void free_old_jobs(void); |
| |
| /* Convert a comma delimited list of QOS names into a bitmap */ |
| extern void qos_list_build(char *qos, bool locked, bitstr_t **qos_bits); |
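| |
| /* |
| * Example (illustrative sketch; the QOS names are hypothetical and the |
| * assoc_mgr lock is assumed not to be held, hence locked=false): |
| * |
| *	bitstr_t *qos_bits = NULL; |
| *	qos_list_build("normal,high", false, &qos_bits); |
| *	if (qos_bits) { |
| *		char *hex = bit_fmt_hexmask(qos_bits); |
| *		debug2("allowed QOS bitmap: %s", hex); |
| *		xfree(hex); |
| *	} |
| *	FREE_NULL_BITMAP(qos_bits); |
| */ |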
| |
| /* Request that the job scheduler execute soon (typically within seconds) */ |
| extern void queue_job_scheduler(void); |
| |
| /* |
| * rehash_jobs - Create or rebuild the job hash table. |
| * NOTE: run lock_slurmctld before entry: Read config, write job |
| */ |
| extern void rehash_jobs(void); |
| |
| /* |
| * Set up and prepare the job state cache (if configured) |
| * IN new_hash_table_size - number of entries in hash table |
| */ |
| extern void setup_job_state_hash(int new_hash_table_size); |
| |
| /* update first assigned job id as needed on reconfigure */ |
| extern void reset_first_job_id(void); |
| |
| /* Reset a node's CPU load value */ |
| extern void reset_node_load(char *node_name, uint32_t cpu_load); |
| |
| /* Reset a node's free memory value */ |
| extern void reset_node_free_mem(char *node_name, uint64_t free_mem); |
| |
| /* Reset all scheduling statistics |
| * level IN - clear backfilled_jobs count if set */ |
| extern void reset_stats(int level); |
| |
| /* |
| * restore_node_features - Make node and config (from slurm.conf) fields |
| * consistent for Features, Gres and Weight |
| * IN recover - |
| * 0, 1 = use data from config record, built using slurm.conf |
| * 2 = use data from node record, built from saved state |
| */ |
| extern void restore_node_features(int recover); |
| |
| /* run_backup - run as the backup controller in standby mode, assuming |
| * control when the primary controller stops responding */ |
| extern void run_backup(void); |
| |
| /* conmgr RPC connection callbacks */ |
| extern void *on_backup_connection(conmgr_fd_t *con, void *arg); |
| extern void on_backup_finish(conmgr_fd_t *con, void *arg); |
| extern int on_backup_msg(conmgr_fd_t *con, slurm_msg_t *msg, void *arg); |
| |
| /* |
| * ping_controllers - ping other controllers in HA configuration. |
| * IN active_controller - true if active controller, false if backup |
| */ |
| extern int ping_controllers(bool active_controller); |
| |
| /* Spawn health check function for every node that is not DOWN */ |
| extern void run_health_check(void); |
| |
| /* save_all_state - save entire slurmctld state for later recovery */ |
| extern void save_all_state(void); |
| |
| /* make sure the assoc_mgr lists are up and running and state is |
| * restored */ |
| extern void ctld_assoc_mgr_init(void); |
| |
| /* Make sure the assoc_mgr thread is terminated */ |
| extern void ctld_assoc_mgr_fini(void); |
| |
| /* A slurmctld lock needs to at least have a node read lock set before |
| * this is called */ |
| extern void set_cluster_tres(bool assoc_mgr_locked); |
| |
| /* sends all jobs in eligible state to accounting. Only needed at |
| * first registration |
| */ |
| extern int send_jobs_to_accounting(void); |
| |
| /* send all nodes in a down-like state to accounting. Only needed at |
| * first registration |
| */ |
| extern int send_nodes_to_accounting(time_t event_time); |
| |
| /* Decrement slurmctld thread count (as applies to thread limit) */ |
| extern void server_thread_decr(void); |
| |
| /* Increment slurmctld thread count (as applies to thread limit) */ |
| extern void server_thread_incr(void); |
| |
| /* |
| * Set a job's node_addrs |
| * |
| * IN job_ptr - job to set node_addrs on |
| * IN origin_cluster - cluster creating/requesting addrs. |
| */ |
| extern void set_job_node_addrs(job_record_t *job_ptr, |
| const char *origin_cluster); |
| |
| /* |
| * Set a job's initial alias_list/node_addrs. |
| * |
| * If the job's node list has powering-up nodes, then set alias_list to "TBD". |
| */ |
| extern void set_initial_job_alias_list(job_record_t *job_ptr); |
| |
| /* Set a job's alias_list string */ |
| extern void set_job_alias_list(job_record_t *job_ptr); |
| |
| /* Set a job's features_use and feature_list_use pointers */ |
| extern void set_job_features_use(job_details_t *details_ptr); |
| |
| /* |
| * set_job_prio - set a default job priority |
| * IN job_ptr - pointer to the job_record |
| */ |
| extern void set_job_prio(job_record_t *job_ptr); |
| |
| /* |
| * set_node_down - make the specified node's state DOWN if possible |
| * (not in a DRAIN state), kill jobs as needed |
| * IN name - name of the node |
| * IN reason - why the node is DOWN |
| */ |
| extern void set_node_down (char *name, char *reason); |
| |
| /* |
| * set_node_down_ptr - make the specified compute node's state DOWN and |
| * kill jobs as needed |
| * IN node_ptr - node_ptr to the node |
| * IN reason - why the node is DOWN |
| */ |
| void set_node_down_ptr(node_record_t *node_ptr, char *reason); |
| |
| /* |
| * set_slurmctld_state_loc - create state directory as needed and "cd" to it |
| */ |
| extern void set_slurmctld_state_loc(void); |
| |
| /* |
| * signal_step_tasks - send specific signal to specific job step |
| * IN step_ptr - step record pointer |
| * IN signal - signal to send |
| * IN msg_type - message type to send |
| */ |
| void signal_step_tasks(step_record_t *step_ptr, uint16_t signal, |
| slurm_msg_type_t msg_type); |
| |
| /* |
| * signal_step_tasks_on_node - send specific signal to specific job step |
| * on a specific node. |
| * IN node_name - name of node on which to signal tasks |
| * IN step_ptr - step record pointer |
| * IN signal - signal to send |
| * IN msg_type - message type to send |
| */ |
| void signal_step_tasks_on_node(char* node_name, step_record_t *step_ptr, |
| uint16_t signal, slurm_msg_type_t msg_type); |
| |
| /* |
| * slurmctld_shutdown - wake up slurm_rpc_mgr thread via signal |
| * RET 0 or error code |
| */ |
| extern int slurmctld_shutdown(void); |
| |
| /* |
| * job_mgr_dump_job_state - dump the state of a specific job, its details, and |
| * steps to a buffer |
| * IN object - pointer to the job_record_t for which information is requested |
| * IN/OUT arg - buffer in which data is stored, pointers automatically advanced |
| */ |
| extern int job_mgr_dump_job_state(void *object, void *arg); |
| |
| /* |
| * job_mgr_load_job_state - Unpack a job's state information from a buffer |
| * |
| * If job_ptr_out is not NULL it will be filled in outside of the job_list. |
| * |
| * NOTE: assoc_mgr qos, tres and assoc read lock must be unlocked before |
| * calling |
| */ |
| extern int job_mgr_load_job_state(buf_t *buffer, |
| uint16_t protocol_version); |
| |
| |
| /* For the job array data structure, build the string representation of the |
| * bitmap. |
| * NOTE: bit_fmt_hexmask() is far more scalable than bit_fmt(). */ |
| extern void build_array_str(job_record_t *job_ptr); |
| |
| /* |
| * Return the number of logical processors usable by a given job on |
| * a specified node. Returns INFINITE16 if no limit. |
| */ |
| extern uint16_t job_mgr_determine_cpus_per_core( |
| job_details_t *details, int node_inx); |
| |
| /* Return true if ALL tasks of specific array job ID are complete */ |
| extern bool test_job_array_complete(uint32_t array_job_id); |
| |
| /* Return true if ALL tasks of specific array job ID are completed */ |
| extern bool test_job_array_completed(uint32_t array_job_id); |
| |
| /* Return true if ALL tasks of specific array job ID are finished */ |
| extern bool test_job_array_finished(uint32_t array_job_id); |
| |
| /* Return true if ANY tasks of specific array job ID are pending */ |
| extern bool test_job_array_pending(uint32_t array_job_id); |
| |
| /* Determine if the nodes are ready to run a job |
| * RET true if ready */ |
| extern bool test_job_nodes_ready(job_record_t *job_ptr); |
| |
| /* |
| * Synchronize the batch jobs in the system with their files. |
| * All pending batch jobs must have script and environment files; |
| * no other jobs should have such files. |
| */ |
| extern int sync_job_files(void); |
| |
| /* After recovering job state, if using priority/basic then we increment the |
| * priorities of all jobs to avoid decrementing the base down to zero */ |
| extern void sync_job_priorities(void); |
| |
| /* True if running jobs are allowed to expand, false otherwise. */ |
| extern bool permit_job_expansion(void); |
| |
| /* True if running jobs are allowed to shrink, false otherwise. */ |
| extern bool permit_job_shrink(void); |
| |
| /* |
| * update_job - update a job's parameters per the supplied specifications |
| * IN msg - RPC to update job, including change specification |
| * IN uid - uid of user issuing RPC |
| * IN send_msg - whether to send msg back or not |
| * RET returns an error code from slurm_errno.h |
| * global: job_list - global list of job entries |
| * last_job_update - time of last job table update |
| */ |
| extern int update_job(slurm_msg_t *msg, uid_t uid, bool send_msg); |
| |
| /* |
| * IN msg - RPC to update job, including change specification |
| * IN uid - uid of user issuing RPC |
| * RET returns an error code from slurm_errno.h |
| * global: job_list - global list of job entries |
| * last_job_update - time of last job table update |
| */ |
| extern int update_job_str(slurm_msg_t *msg, uid_t uid); |
| |
| /* |
| * Allocate a kill_job_msg_t and populate most fields. |
| */ |
| extern kill_job_msg_t *create_kill_job_msg(job_record_t *job_ptr, |
| uint16_t protocol_version); |
| |
| /* |
| * Modify the wckey associated with a pending job |
| * IN module - where this is called from |
| * IN job_ptr - pointer to job which should be modified |
| * IN new_wckey - desired wckey name |
| * RET SLURM_SUCCESS or error code |
| */ |
| extern int update_job_wckey(char *module, job_record_t *job_ptr, |
| char *new_wckey); |
| |
| /* |
| * Update log levels given requested levels |
| * IN req_slurmctld_debug - requested debug level |
| * IN req_syslog_debug - requested syslog level |
| * NOTE: Will not turn on channels that were originally configured off (quiet) |
| */ |
| extern void update_log_levels(int req_slurmctld_debug, int req_syslog_debug); |
| |
| /* Reset slurmctld logging based upon configuration parameters |
| * uses common slurm_conf data structure */ |
| extern void update_logging(void); |
| |
| /* |
| * update_node - update the configuration data for one or more nodes |
| * IN update_node_msg - update node request |
| * IN auth_uid - UID that issued the update |
| * RET 0 or error code |
| * global: node_record_table_ptr - pointer to global node table |
| */ |
| extern int update_node(update_node_msg_t *update_node_msg, uid_t auth_uid); |
| |
| /* Update nodes accounting usage data */ |
| extern void update_nodes_acct_gather_data(void); |
| |
| /* |
| * update_node_record_acct_gather_data - update the energy data in the |
| * node_record |
| * IN msg - node energy data message |
| * RET 0 if no error, ENOENT if no such node |
| */ |
| extern int update_node_record_acct_gather_data( |
| acct_gather_node_resp_msg_t *msg); |
| |
| /* |
| * Create nodes from scontrol using slurm.conf nodeline syntax. |
| * |
| * IN msg - creation message including nodeline |
| * OUT err_msg - pass error messages out. |
| * RET SLURM_SUCCESS on success, SLURM_ERROR otherwise. |
| */ |
| extern int create_nodes(update_node_msg_t *msg, char **err_msg); |
| |
| /* |
| * Create and add dynamic node to system from registration. |
| * |
| * IN msg - slurm_msg_t containing slurm_node_registration_status_msg_t. |
| * RET SLURM_SUCCESS on success, SLURM_ERROR otherwise. |
| */ |
| extern int create_dynamic_reg_node(slurm_msg_t *msg); |
| |
| /* |
| * Delete node names from system from a slurmctld perspective. |
| * |
| * e.g. remove node from partitions, reconfig cons_tres, etc. |
| * |
| * IN names - node names to delete. |
| * OUT err_msg - pass error messages out. |
| * RET SLURM_SUCCESS on success, error code otherwise. |
| */ |
| extern int delete_nodes(char *names, char **err_msg); |
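| |
| /* |
| * Example (illustrative sketch; the node names are hypothetical): |
| * |
| *	char *err_msg = NULL; |
| *	if (delete_nodes("node[05-08]", &err_msg) != SLURM_SUCCESS) |
| *		error("node deletion failed: %s", err_msg); |
| *	xfree(err_msg); |
| */ |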
| |
| /* |
| * Process string and set partition fields to appropriate values if valid |
| * |
| * IN billing_weights_str - suggested billing weights |
| * IN part_ptr - pointer to partition |
| * IN fail - whether to call fatal() if the string is invalid. |
| * RET SLURM_ERROR on error, SLURM_SUCCESS otherwise. |
| */ |
| extern int set_partition_billing_weights(char *billing_weights_str, |
| part_record_t *part_ptr, bool fail); |
| |
| /* |
| * update_part - create or update a partition's configuration data |
| * IN part_desc - description of partition changes |
| * IN create_flag - create a new partition |
| * RET 0 or an error code |
| * global: part_list - list of partition entries |
| * last_part_update - update time of partition records |
| */ |
| extern int update_part (update_part_msg_t * part_desc, bool create_flag); |
| |
| /* |
| * Sort all jobs' part_ptr_list to be in descending order according to |
| * partition priority tier. This should be called any time a partition's |
| * priority tier is modified. |
| */ |
| extern void sort_all_jobs_partition_lists(void); |
| |
| /* |
| * Common code to handle a job when a cred can't be created. |
| */ |
| extern void job_mgr_handle_cred_failure(job_record_t *job_ptr); |
| |
| /* |
| * validate_alloc_node - validate that the allocating node |
| * is allowed to use this partition |
| * IN part_ptr - pointer to a partition |
| * IN alloc_node - allocating node of the request |
| * RET 1 if permitted to run, 0 otherwise |
| */ |
| extern int validate_alloc_node(part_record_t *part_ptr, char *alloc_node); |
| |
| /* |
| * validate_group - validate that the uid is authorized to access the partition |
| * IN part_ptr - pointer to a partition |
| * IN run_uid - user to run the job as |
| * RET 1 if permitted to run, 0 otherwise |
| */ |
| extern int validate_group(part_record_t *part_ptr, uid_t run_uid); |
| |
| /* Perform some size checks on strings we store to prevent |
| * a malicious user from filling slurmctld's memory |
| * IN job_desc - user job submit request |
| * IN submit_uid - UID making job submit request |
| * OUT err_msg - custom error message to return |
| * RET 0 or error code */ |
| extern int validate_job_create_req(job_desc_msg_t *job_desc, uid_t submit_uid, |
| char **err_msg); |
| |
| /* |
| * validate_jobs_on_node - validate that any jobs that should be on the node |
| * are actually running; if not, clean up the job records and/or node |
| * records. |
| * |
| * IN slurm_msg - contains the node registration message |
| */ |
| extern void validate_jobs_on_node(slurm_msg_t *slurm_msg); |
| |
| /* |
| * validate_node_specs - validate the node's specifications as valid, |
| * if not, set its state to DOWN; in any case update last_response |
| * IN slurm_msg - contains the node registration message |
| * OUT newly_up - set if node newly brought into service |
| * RET 0 if no error, ENOENT if no such node, EINVAL if values too low |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| extern int validate_node_specs(slurm_msg_t *slurm_msg, bool *newly_up); |
| |
| /* |
| * validate_super_user - validate that the uid is authorized at the |
| * root, SlurmUser, or SLURMDB_ADMIN_SUPER_USER level |
| * IN uid - user to validate |
| * RET true if permitted to run, false otherwise |
| */ |
| extern bool validate_super_user(uid_t uid); |
| |
| /* |
| * validate_operator - validate that the uid is authorized at the |
| * root, SlurmUser, or SLURMDB_ADMIN_OPERATOR level |
| * IN uid - user to validate |
| * RET true if permitted to run, false otherwise |
| */ |
| extern bool validate_operator(uid_t uid); |
| extern bool validate_operator_locked(uid_t uid); |
| |
| /* |
| * validate_operator_user_rec - validate that the user is authorized at the |
| * root, SlurmUser, or SLURMDB_ADMIN_OPERATOR level |
| * IN user - slurmdb_user_rec_t of user to check |
| * RET true if permitted to run, false otherwise |
| */ |
| extern bool validate_operator_user_rec(slurmdb_user_rec_t *user); |
| |
| /* cleanup_completing() |
| * |
| * Clean up the JOB_COMPLETING flag and eventually |
| * requeue the job if there is a pending request |
| * for it. This function assumes the caller has the |
| * appropriate locks on the job_record. |
| * This function is called when a job completes, |
| * after either the slurmd epilog or the slurmctld |
| * epilog finishes, whichever comes last. |
| */ |
| extern void cleanup_completing(job_record_t *job_ptr, bool requeue); |
| |
| /* |
| * Determine if slurmctld will respond to "configless" RPCs. If so, |
| * load the internal cached config values to avoid regenerating on each |
| * RPC. |
| */ |
| extern void configless_setup(void); |
| /* Reload the internal cached config values. */ |
| extern void configless_update(void); |
| /* Free cached values to avoid memory leak. */ |
| extern void configless_clear(void); |
| |
| /* |
| * Calculate and populate the TRES counts for all partitions. |
| */ |
| extern void set_partition_tres(bool assoc_mgr_locked); |
| |
| /* |
| * Update job's federated siblings strings. |
| * |
| * IN job_ptr - job_ptr to update |
| */ |
| extern void update_job_fed_details(job_record_t *job_ptr); |
| |
| /* |
| * purge_job_record - purge a specific job record. No testing is performed to |
| * ensure the job record has no active references. Use only for job |
| * records that were never fully operational (e.g. WILL_RUN test, failed |
| * job load, failed job create, etc.). |
| * IN job_id - job_id of job record to be purged |
| * RET int - count of jobs purged |
| * global: job_list - global job table |
| */ |
| extern int purge_job_record(uint32_t job_id); |
| |
| /* |
| * Remove job from job hashes so that it can't be found, but leave job in |
| * job_table so that it can be deleted by _list_delete_job(). |
| * |
| * IN job_ptr - job_ptr to be unlinked |
| */ |
| extern void unlink_job_record(job_record_t *job_ptr); |
| |
| /* |
| * copy_job_record_to_job_desc - construct a job_desc_msg_t for a job. |
| * IN job_ptr - the job record |
| * RET the job_desc_msg_t, NULL on error |
| */ |
| extern job_desc_msg_t *copy_job_record_to_job_desc(job_record_t *job_ptr); |
| |
| |
| /* |
| * Set the allocation response with the current cluster's information and the |
| * job's allocated nodes' addresses if the allocation is being filled by a |
| * cluster other than the one that submitted the job. |
| * |
| * Note: make sure that the resp's working_cluster_rec is NULLed out before |
| * the resp is freed, since it points to global memory. |
| * |
| * IN resp - allocation response being sent back to client. |
| * IN job_ptr - allocated job |
| * IN req_cluster - the cluster requesting the allocation info. |
| */ |
| extern void |
| set_remote_working_response(resource_allocation_response_msg_t *resp, |
| job_record_t *job_ptr, |
| const char *req_cluster); |
| |
| /* |
| * Calculate billable TRES based on partition's defined BillingWeights. If none |
| * is defined, return total_cpus. This is cached on job_ptr->billable_tres and |
| * is updated if the job was resized since the last iteration. |
| * |
| * IN job_ptr - job to calc billable tres on |
| * IN start_time - time the job started or was resized |
| * IN assoc_mgr_locked - whether the tres assoc lock is set or not |
| */ |
| extern double calc_job_billable_tres(job_record_t *job_ptr, time_t start_time, |
| bool assoc_mgr_locked); |
| |
| /* |
| * Check for node timed events |
| * |
| * Such as: |
| * reboots - If the node hasn't booted by ResumeTimeout, mark the node as down. |
| * resume_after - Resume a down|drain node after resume_after time. |
| */ |
| extern void check_node_timers(void); |
| |
| /* |
| * Send warning signal to job before end time. |
| * |
| * IN job_ptr - job to send warn signal to. |
| * IN ignore_time - If set, ignore the warn time and just send it. |
| */ |
| extern void send_job_warn_signal(job_record_t *job_ptr, bool ignore_time); |
| |
| /* |
| * Check if still waiting for the node to boot. |
| * |
| * IN node_ptr - node to check if still waiting for boot. |
| * |
| * RET return true if still expecting the node to boot, false otherwise. |
| */ |
| extern bool waiting_for_node_boot(node_record_t *node_ptr); |
| |
| /* |
| * Check if still waiting for the node to power down. |
| * |
| * IN node_ptr - node to check if still waiting for power down. |
| * |
| * RET return true if still expecting the node to power down, false otherwise. |
| */ |
| extern bool waiting_for_node_power_down(node_record_t *node_ptr); |
| |
| /* |
| * Check if any part of job_ptr is overlapping node_map. |
| * IN node_map - bitstr of nodes set. |
| * IN license_list - list of licenses to check for use by the job. |
| * IN job_ptr - job (hetjob or not) to check. |
| * |
| * RET true if we overlap, false otherwise |
| */ |
| extern bool job_overlap_and_running(bitstr_t *node_map, list_t *license_list, |
| job_record_t *job_ptr); |
| |
| /* |
| * Respond to request for backup slurmctld status |
| */ |
| extern void slurm_rpc_control_status(slurm_msg_t *msg); |
| |
| /* |
| * Callbacks to let the PrEp plugins signal completion if running async. |
| */ |
| extern void prep_prolog_slurmctld_callback(int rc, uint32_t job_id, |
| bool timed_out); |
| extern void prep_epilog_slurmctld_callback(int rc, uint32_t job_id, |
| bool timed_out); |
| |
| /* |
| * Set node's comm_name and hostname. |
| * |
| * If comm_name is NULL, hostname will be used for both fields. |
| */ |
| extern void set_node_comm_name(node_record_t *node_ptr, char *comm_name, |
| char *hostname); |
| |
| /* |
| * Create a new file (file_name) and write (data) into it. |
| * The file will have a trailing '\0' written into it; this makes it |
| * easier to work with when loaded with create_mmap_buf, as the string |
| * representation of the file will already be NUL terminated for us. |
| */ |
| extern int write_data_to_file(char *file_name, char *data); |
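| |
| /* |
| * Example (illustrative sketch; the path is hypothetical and a SLURM_SUCCESS |
| * return on success is an assumption): write a string to disk, then map it |
| * back with create_mmap_buf() and use the contents as a NUL terminated |
| * string. |
| * |
| *	if (write_data_to_file("/path/to/state/example", data) == |
| *	    SLURM_SUCCESS) { |
| *		buf_t *buf = create_mmap_buf("/path/to/state/example"); |
| *		if (buf) |
| *			debug2("loaded: %s", get_buf_data(buf)); |
| *		FREE_NULL_BUFFER(buf); |
| *	} |
| */ |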
| |
| /* |
| * Update a user's crontab entry, and submit new jobs as required. |
| * Will mark existing crontab-submitted jobs as complete. |
| */ |
| extern void crontab_submit(crontab_update_request_msg_t *req_msg, |
| crontab_update_response_msg_t *response, |
| char *alloc_node, identity_t *id, |
| uint16_t protocol_version); |
| |
| extern void crontab_add_disabled_lines(uid_t uid, int line_start, int line_end); |
| |
| /* |
| * Return an env** array with common job output env variables. |
| * |
| * Used for <Pro|Epi>logSlurmctld and MailProg. |
| */ |
| extern char **job_common_env_vars(job_record_t *job_ptr, bool is_complete); |
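| |
| /* |
| * Example (illustrative sketch; env_array_free() is provided by |
| * src/common/env.h): |
| * |
| *	char **env = job_common_env_vars(job_ptr, false); |
| *	for (int i = 0; env && env[i]; i++) |
| *		debug2("job env: %s", env[i]); |
| *	env_array_free(env); |
| */ |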
| |
| /* |
| * update_node_active_features - Update active features associated with nodes |
| * IN node_names - list of nodes to update |
| * IN active_features - New active features value |
| * IN mode - FEATURE_MODE_IND : Print each node change individually |
| * FEATURE_MODE_COMB: Try to combine like changes (SEE NOTE BELOW) |
| * FEATURE_MODE_PEND: Print any pending change message |
| * RET: SLURM_SUCCESS or error code |
| * NOTE: Use mode=FEATURE_MODE_IND in a loop with node write lock set, |
| * then call with mode=FEATURE_MODE_PEND at the end of the loop |
| */ |
| extern int update_node_active_features(char *node_names, char *active_features, |
| int mode); |
| |
| /* |
| * update_node_avail_features - Update available features associated with |
| * nodes, build new config list records as needed |
| * IN node_names - list of nodes to update |
| * IN avail_features - New available features value |
| * IN mode - FEATURE_MODE_IND : Print each node change individually |
| * FEATURE_MODE_COMB: Try to combine like changes (SEE NOTE BELOW) |
| * FEATURE_MODE_PEND: Print any pending change message |
| * RET: SLURM_SUCCESS or error code |
| * NOTE: Use mode=FEATURE_MODE_IND in a loop with node write lock set, |
| * then call with mode=FEATURE_MODE_PEND at the end of the loop |
| */ |
| extern int update_node_avail_features(char *node_names, char *avail_features, |
| int mode); |
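| |
| /* |
| * Example (illustrative sketch of the NOTE above; node and feature names |
| * are hypothetical, the node write lock is assumed to be held, and the |
| * arguments to the final FEATURE_MODE_PEND call are an assumption): |
| * |
| *	static const struct { char *nodes; char *feat; } updates[] = { |
| *		{ "node[01-04]", "gpu,nvme" }, |
| *		{ "node[05-08]", "gpu" }, |
| *	}; |
| *	for (int i = 0; i < 2; i++) |
| *		(void) update_node_avail_features(updates[i].nodes, |
| *						  updates[i].feat, |
| *						  FEATURE_MODE_IND); |
| *	(void) update_node_avail_features(updates[1].nodes, updates[1].feat, |
| *					  FEATURE_MODE_PEND); |
| */ |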
| |
| /* |
| * Filter out changeable features, keeping only conf-defined features |
| * |
| * IN features - features string to remove changeable features from |
| * |
| * RET: xmalloc'ed string that doesn't contain changeable features. |
| */ |
| extern char *filter_out_changeable_features(const char *features); |
| |
| /* |
| * Reset a node's active features to only the non-changeable available features. |
| */ |
| extern void reset_node_active_features(node_record_t *node_ptr); |
| |
| /* |
| * Reset a node's instance variables |
| */ |
| extern void reset_node_instance(node_record_t *node_ptr); |
| |
| /* |
| * Return a hostlist with expanded node specification. |
| * |
| * Handles node range expressions, nodesets and ALL keyword. |
| * |
| * IN nodes - nodelist that can have nodesets or ALL in it. |
| * IN uniq - call hostlist_uniq() before returning the hostlist |
| * OUT nodesets (optional) - list of nodesets found in nodes string |
| * |
| * RET NULL on error, hostlist_t otherwise. |
| * |
| * NOTE: Caller must FREE_NULL_HOSTLIST() returned hostlist_t. |
| * NOTE: Caller should handle a non-NULL but empty hostlist appropriately. |
| */ |
| extern hostlist_t *nodespec_to_hostlist(const char *nodes, bool uniq, |
| char **nodesets); |
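| |
| /* |
| * Example (illustrative sketch; the nodeset name is hypothetical): |
| * |
| *	hostlist_t *hl = nodespec_to_hostlist("ns_gpu,node[10-20]", true, |
| *					      NULL); |
| *	if (hl) { |
| *		char *expanded = hostlist_ranged_string_xmalloc(hl); |
| *		debug2("expanded nodespec: %s", expanded); |
| *		xfree(expanded); |
| *	} |
| *	FREE_NULL_HOSTLIST(hl); |
| */ |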
| |
| /* |
| * set_node_reason - appropriately set node reason with message |
| * IN node_ptr - node_ptr to the node |
| * IN message - message to be set/appended |
| * IN time - timestamp of message |
| */ |
| extern void set_node_reason(node_record_t *node_ptr, |
| char *message, |
| time_t time); |
| |
| extern int set_part_topology_idx(void *x, void *arg); |
| |
| extern void reconfigure_slurm(slurm_msg_t *msg); |
| |
| extern void notify_parent_of_success(void); |
| |
| /* |
| * free_job_record - delete a job record and its corresponding |
| * job_details, |
| * see common/list.h for documentation |
| * IN job_entry - pointer to job_record to delete |
| */ |
| extern void free_job_record(void *job_entry); |
| |
| /* |
| * Build a job rec from an advanced reservation request. |
| */ |
| extern job_record_t *job_mgr_copy_resv_desc_to_job_record( |
| resv_desc_msg_t *resv_desc_ptr); |
| |
| /* |
| * Initialize the various schedulers. |
| */ |
| extern int controller_init_scheduling(bool init_gang); |
| |
| /* |
| * Finalize the various schedulers. |
| */ |
| extern void controller_fini_scheduling(void); |
| |
| /* |
| * Reconfigure the various schedulers. |
| */ |
| extern void controller_reconfig_scheduling(void); |
| |
| /* |
| * Return a comma-separated xstr of partition names from a list of |
| * part_record_t's. |
| */ |
| extern char *part_list_to_xstr(list_t *list); |
| |
| /* Allow listener sockets to accept() new incoming requests */ |
| extern void listeners_unquiesce(void); |
| |
| /* Stop listener sockets from accept()ing new incoming requests */ |
| extern void listeners_quiesce(void); |
| |
| #endif /* !_HAVE_SLURMCTLD_H */ |