| /*****************************************************************************\ |
| * slurmctld.h - definitions of functions and structures for slurmctld use |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Portions Copyright (C) 2010 SchedMD <http://www.schedmd.com>. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> et al. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <http://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #ifndef _HAVE_SLURMCTLD_H |
| #define _HAVE_SLURMCTLD_H |
| |
| |
| #if HAVE_CONFIG_H |
| # include "config.h" |
| # if HAVE_INTTYPES_H |
| # include <inttypes.h> |
| # else |
| # if HAVE_STDINT_H |
| # include <stdint.h> |
| # endif |
| # endif /* HAVE_INTTYPES_H */ |
| #endif |
| |
| #include <pthread.h> |
| /* #include <stdlib.h> */ |
| #include <time.h> |
| #include <strings.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| |
| #ifdef WITH_PTHREADS |
| # include <pthread.h> |
| #endif /* WITH_PTHREADS */ |
| |
| #include "slurm/slurm.h" |
| |
| #include "src/common/bitstring.h" |
| #include "src/common/checkpoint.h" |
| #include "src/common/list.h" |
| #include "src/common/log.h" |
| #include "src/common/macros.h" |
| #include "src/common/node_conf.h" |
| #include "src/common/pack.h" |
| #include "src/common/read_config.h" /* location of slurmctld_conf */ |
| #include "src/common/job_resources.h" |
| #include "src/common/slurm_cred.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/slurm_protocol_defs.h" |
| #include "src/common/switch.h" |
| #include "src/common/timers.h" |
| #include "src/common/xmalloc.h" |
| |
| /*****************************************************************************\ |
| * GENERAL CONFIGURATION parameters and data structures |
| \*****************************************************************************/ |
| /* Maximum index for a job array. The minimum index will always be 0. */ |
| #ifndef MAX_JOB_ARRAY_VALUE |
| #define MAX_JOB_ARRAY_VALUE 1000 |
| #endif |
| |
| /* Maximum number of parallel threads used to service incoming RPCs. |
| * Since some systems schedule pthreads on a first-in, last-out basis, |
| * increasing this value is strongly discouraged. */ |
| #ifndef MAX_SERVER_THREADS |
| #define MAX_SERVER_THREADS 256 |
| #endif |
| |
| /* Checkpoint the full slurmctld state every PERIODIC_CHECKPOINT seconds */ |
| #ifndef PERIODIC_CHECKPOINT |
| #define PERIODIC_CHECKPOINT 300 |
| #endif |
| |
| /* Retry an incomplete RPC agent request every RPC_RETRY_INTERVAL seconds */ |
| #ifndef RPC_RETRY_INTERVAL |
| #define RPC_RETRY_INTERVAL 60 |
| #endif |
| |
| /* Check for jobs reaching their time limit every PERIODIC_TIMEOUT seconds */ |
| #ifndef PERIODIC_TIMEOUT |
| #define PERIODIC_TIMEOUT 30 |
| #endif |
| |
| /* Attempt to purge defunct job records and resend job kill requests |
| * every PURGE_JOB_INTERVAL seconds */ |
| #ifndef PURGE_JOB_INTERVAL |
| #define PURGE_JOB_INTERVAL 60 |
| #endif |
| |
| /* Process pending trigger events every TRIGGER_INTERVAL seconds */ |
| #ifndef TRIGGER_INTERVAL |
| #define TRIGGER_INTERVAL 15 |
| #endif |
| |
| /* Report current node accounting state every PERIODIC_NODE_ACCT seconds */ |
| #ifndef PERIODIC_NODE_ACCT |
| #define PERIODIC_NODE_ACCT 300 |
| #endif |
| |
| /* Pathname of group file record for checking update times */ |
| #ifndef GROUP_FILE |
| #define GROUP_FILE "/etc/group" |
| #endif |
| |
| /* Seconds to wait for backup controller response to REQUEST_CONTROL RPC */ |
| #ifndef CONTROL_TIMEOUT |
| #define CONTROL_TIMEOUT 10 /* seconds */ |
| #endif |
| |
| /*****************************************************************************\ |
| * General configuration parameters and data structures |
| \*****************************************************************************/ |
| |
| typedef struct slurmctld_config { |
| int daemonize; |
| bool resume_backup; |
| time_t boot_time; |
| time_t shutdown_time; |
| int server_thread_count; |
| |
| slurm_cred_ctx_t cred_ctx; |
| #ifdef WITH_PTHREADS |
| pthread_mutex_t thread_count_lock; |
| pthread_t thread_id_main; |
| pthread_t thread_id_save; |
| pthread_t thread_id_sig; |
| pthread_t thread_id_power; |
| pthread_t thread_id_rpc; |
| #else |
| int thread_count_lock; |
| int thread_id_main; |
| int thread_id_save; |
| int thread_id_sig; |
| int thread_id_power; |
| int thread_id_rpc; |
| #endif |
| } slurmctld_config_t; |
| |
| /* Job scheduling statistics */ |
| typedef struct diag_stats { |
| int proc_req_threads; |
| int proc_req_raw; |
| |
| uint32_t schedule_cycle_max; |
| uint32_t schedule_cycle_last; |
| uint32_t schedule_cycle_sum; |
| uint32_t schedule_cycle_counter; |
| uint32_t schedule_cycle_depth; |
| uint32_t schedule_queue_len; |
| |
| uint32_t jobs_submitted; |
| uint32_t jobs_started; |
| uint32_t jobs_completed; |
| uint32_t jobs_canceled; |
| uint32_t jobs_failed; |
| |
| uint32_t backfilled_jobs; |
| uint32_t last_backfilled_jobs; |
| uint32_t bf_cycle_counter; |
| uint32_t bf_cycle_last; |
| uint32_t bf_cycle_max; |
| uint32_t bf_cycle_sum; |
| uint32_t bf_last_depth; |
| uint32_t bf_last_depth_try; |
| uint32_t bf_depth_sum; |
| uint32_t bf_depth_try_sum; |
| uint32_t bf_queue_len; |
| uint32_t bf_queue_len_sum; |
| time_t bf_when_last_cycle; |
| uint32_t bf_active; |
| } diag_stats_t; |
| |
| extern diag_stats_t slurmctld_diag_stats; |
| extern slurmctld_config_t slurmctld_config; |
| extern int bg_recover; /* state recovery mode */ |
| extern char *slurmctld_cluster_name; /* name of cluster */ |
| extern void *acct_db_conn; |
| extern int accounting_enforce; |
| extern int association_based_accounting; |
| extern uint32_t cluster_cpus; |
| extern int with_slurmdbd; |
| extern bool load_2_4_state; |
| extern int batch_sched_delay; |
| extern int sched_interval; |
| extern bool slurmctld_init_db; |
| extern int slurmctld_primary; |
| |
| /*****************************************************************************\ |
| * NODE parameters and data structures, mostly in src/common/node_conf.h |
| \*****************************************************************************/ |
| extern uint32_t total_cpus; /* count of CPUs in the entire cluster */ |
| extern bool ping_nodes_now; /* if set, ping nodes immediately */ |
| extern bool want_nodes_reboot; /* if set, check for idle nodes */ |
| |
| /*****************************************************************************\ |
| * NODE states and bitmaps |
| * |
| * avail_node_bitmap Set if node's state is not DOWN, DRAINING/DRAINED, |
| * FAILING or NO_RESPOND (i.e. available to run a job) |
| * cg_node_bitmap Set if node in completing state |
| * idle_node_bitmap Set if node has no jobs allocated to it |
| * power_node_bitmap Set for nodes which are powered down |
| * share_node_bitmap Set if no job has been allocated exclusive access |
| * to resources on that node (cleared if the --exclusive |
| * option is specified by a job or Shared=NO is configured |
| * for the job's partition) |
| * up_node_bitmap Set if the node's state is not DOWN |
| \*****************************************************************************/ |
| extern bitstr_t *avail_node_bitmap; /* bitmap of available nodes, |
| * state not DOWN, DRAIN or FAILING */ |
| extern bitstr_t *cg_node_bitmap; /* bitmap of completing nodes */ |
| extern bitstr_t *idle_node_bitmap; /* bitmap of idle nodes */ |
| extern bitstr_t *power_node_bitmap; /* Powered down nodes */ |
| extern bitstr_t *share_node_bitmap; /* bitmap of sharable nodes */ |
| extern bitstr_t *up_node_bitmap; /* bitmap of up nodes, not DOWN */ |
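| |
| /* Illustrative sketch (not part of this API): scheduling code typically |
| * combines these bitmaps using the primitives from src/common/bitstring.h, |
| * e.g. to check whether the node with index node_inx (its offset in |
| * node_record_table_ptr) is both available and idle: |
| * |
| * if (bit_test(avail_node_bitmap, node_inx) && |
| * bit_test(idle_node_bitmap, node_inx)) { |
| * // node is up, responding and has no jobs allocated |
| * } |
| */ |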
| |
| /*****************************************************************************\ |
| * FRONT_END parameters and data structures |
| \*****************************************************************************/ |
| #define FRONT_END_MAGIC 0xfe9b82fe |
| |
| typedef struct front_end_record { |
| gid_t *allow_gids; /* zero terminated list of allowed groups */ |
| char *allow_groups; /* allowed group string */ |
| uid_t *allow_uids; /* zero terminated list of allowed users */ |
| char *allow_users; /* allowed user string */ |
| time_t boot_time; /* Time of node boot, |
| * computed from up_time */ |
| char *comm_name; /* communications path name to node */ |
| gid_t *deny_gids; /* zero terminated list of denied groups */ |
| char *deny_groups; /* denied group string */ |
| uid_t *deny_uids; /* zero terminated list of denied users */ |
| char *deny_users; /* denied user string */ |
| uint32_t job_cnt_comp; /* count of completing jobs on node */ |
| uint16_t job_cnt_run; /* count of running jobs on node */ |
| time_t last_response; /* Time of last communication */ |
| uint32_t magic; /* magic cookie to test data integrity */ |
| char *name; /* frontend node name */ |
| uint16_t node_state; /* enum node_states, ORed with |
| * NODE_STATE_NO_RESPOND if not |
| * responding */ |
| bool not_responding; /* set if fails to respond, |
| * clear after logging this */ |
| slurm_addr_t slurm_addr; /* network address */ |
| uint16_t port; /* frontend specific port */ |
| uint16_t protocol_version; /* Slurm version number */ |
| char *reason; /* reason for down frontend node */ |
| time_t reason_time; /* Time stamp when reason was set, |
| * ignore if no reason is set. */ |
| uint32_t reason_uid; /* User that set the reason, ignore if |
| * no reason is set. */ |
| time_t slurmd_start_time; /* Time of slurmd startup */ |
| char *version; /* Slurm version */ |
| } front_end_record_t; |
| |
| extern front_end_record_t *front_end_nodes; |
| extern uint16_t front_end_node_cnt; |
| extern time_t last_front_end_update; /* time of last front_end update */ |
| |
| /*****************************************************************************\ |
| * PARTITION parameters and data structures |
| \*****************************************************************************/ |
| #define PART_MAGIC 0xaefe8495 |
| |
| struct part_record { |
| char *allow_accounts; /* comma delimited list of accounts, |
| * NULL indicates all */ |
| char **allow_account_array; /* NULL terminated list of allowed |
| * accounts */ |
| char *allow_alloc_nodes;/* comma delimited list of allowed |
| * allocating nodes |
| * NULL indicates all */ |
| char *allow_groups; /* comma delimited list of groups, |
| * NULL indicates all */ |
| uid_t *allow_uids; /* zero terminated list of allowed user IDs */ |
| char *allow_qos; /* comma delimited list of qos, |
| * NULL indicates all */ |
| bitstr_t *allow_qos_bitstr; /* (DON'T PACK) associated with |
| * char *allow_qos but used internally */ |
| char *alternate; /* name of alternate partition */ |
| uint32_t def_mem_per_cpu; /* default MB memory per allocated CPU */ |
| uint32_t default_time; /* minutes, NO_VAL or INFINITE */ |
| char *deny_accounts; /* comma delimited list of denied accounts */ |
| char **deny_account_array; /* NULL terminated list of denied accounts */ |
| char *deny_qos; /* comma delimited list of denied qos */ |
| bitstr_t *deny_qos_bitstr; /* (DON'T PACK) associated with |
| * char *deny_qos but used internally */ |
| uint16_t flags; /* see PART_FLAG_* in slurm.h */ |
| uint32_t grace_time; /* default preempt grace time in seconds */ |
| uint32_t magic; /* magic cookie to test data integrity */ |
| uint32_t max_cpus_per_node; /* maximum allocated CPUs per node */ |
| uint32_t max_mem_per_cpu; /* maximum MB memory per allocated CPU */ |
| uint32_t max_nodes; /* per job or INFINITE */ |
| uint32_t max_nodes_orig;/* unscaled value (c-nodes on BlueGene) */ |
| uint32_t max_offset; /* select plugin max offset */ |
| uint16_t max_share; /* number of jobs to gang schedule */ |
| uint32_t max_time; /* minutes or INFINITE */ |
| uint32_t min_nodes; /* per job */ |
| uint32_t min_offset; /* select plugin min offset */ |
| uint32_t min_nodes_orig;/* unscaled value (c-nodes on BlueGene) */ |
| char *name; /* name of the partition */ |
| bitstr_t *node_bitmap; /* bitmap of nodes in partition */ |
| char *nodes; /* comma delimited list names of nodes */ |
| double norm_priority; /* normalized scheduling priority for |
| * jobs (DON'T PACK) */ |
| uint16_t preempt_mode; /* See PREEMPT_MODE_* in slurm/slurm.h */ |
| uint16_t priority; /* scheduling priority for jobs */ |
| uint16_t state_up; /* See PARTITION_* states in slurm.h */ |
| uint32_t total_nodes; /* total number of nodes in the partition */ |
| uint32_t total_cpus; /* total number of cpus in the partition */ |
| uint16_t cr_type; /* Custom CR values for partition (if supported by select plugin) */ |
| }; |
| |
| extern List part_list; /* list of part_record entries */ |
| extern time_t last_part_update; /* time of last part_list update */ |
| extern struct part_record default_part; /* default configuration values */ |
| extern char *default_part_name; /* name of default partition */ |
| extern struct part_record *default_part_loc; /* default partition ptr */ |
| extern uint16_t part_max_priority; /* max priority in all partitions */ |
| |
| /*****************************************************************************\ |
| * RESERVATION parameters and data structures |
| \*****************************************************************************/ |
| |
| typedef struct slurmctld_resv { |
| char *accounts; /* names of accounts permitted to use */ |
| int account_cnt; /* count of accounts permitted to use */ |
| char **account_list; /* list of accounts permitted to use */ |
| bool account_not; /* accounts in account_list NOT permitted to use */ |
| char *assoc_list; /* list of associations */ |
| uint32_t cpu_cnt; /* number of reserved CPUs */ |
| bitstr_t *core_bitmap; /* bitmap of reserved cores */ |
| uint32_t duration; /* time in seconds for this |
| * reservation to last */ |
| time_t end_time; /* end time of reservation */ |
| char *features; /* required node features */ |
| uint32_t flags; /* see RESERVE_FLAG_* in slurm.h */ |
| bool full_nodes; /* set if reservation uses full nodes */ |
| uint32_t job_pend_cnt; /* number of pending jobs */ |
| uint32_t job_run_cnt; /* number of running jobs */ |
| List license_list; /* structure with license info */ |
| char *licenses; /* required system licenses */ |
| uint16_t magic; /* magic cookie, RESV_MAGIC */ |
| bool flags_set_node; /* flags (i.e. NODE_STATE_MAINT | |
| * NODE_STATE_RES) set for nodes */ |
| char *name; /* name of reservation */ |
| bitstr_t *node_bitmap; /* bitmap of reserved nodes */ |
| uint32_t node_cnt; /* count of nodes required */ |
| char *node_list; /* list of reserved nodes or ALL */ |
| char *partition; /* name of partition to be used */ |
| struct part_record *part_ptr; /* pointer to partition used */ |
| uint32_t resv_id; /* unique reservation ID, internal use */ |
| bool run_epilog; /* set if epilog has been executed */ |
| bool run_prolog; /* set if prolog has been executed */ |
| time_t start_time; /* start time of reservation */ |
| time_t start_time_first;/* when the reservation first started */ |
| time_t start_time_prev; /* If start time was changed this is |
| * the previous start time. Needed |
| * for accounting */ |
| char *users; /* names of users permitted to use */ |
| int user_cnt; /* count of users permitted to use */ |
| uid_t *user_list; /* array of users permitted to use */ |
| bool user_not; /* user_list users NOT permitted to use */ |
| } slurmctld_resv_t; |
| |
| /*****************************************************************************\ |
| * JOB parameters and data structures |
| \*****************************************************************************/ |
| extern time_t last_job_update; /* time of last update to job records */ |
| |
| #define DETAILS_MAGIC 0xdea84e7 |
| #define JOB_MAGIC 0xf0b7392c |
| #define STEP_MAGIC 0xce593bc1 |
| |
| #define FEATURE_OP_OR 0 |
| #define FEATURE_OP_AND 1 |
| #define FEATURE_OP_XOR 2 |
| #define FEATURE_OP_XAND 3 |
| #define FEATURE_OP_END 4 /* last entry lacks separator */ |
| struct feature_record { |
| char *name; /* name of feature */ |
| uint16_t count; /* count of nodes with this feature */ |
| uint8_t op_code; /* separator, see FEATURE_OP_ above */ |
| }; |
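| |
| /* Illustrative sketch: a job constraint string such as "rack1&gpu" (a |
| * hypothetical example) would typically be parsed into a feature_list |
| * holding two feature_record entries along the lines of |
| * |
| * { .name = "rack1", .count = 0, .op_code = FEATURE_OP_AND } |
| * { .name = "gpu", .count = 0, .op_code = FEATURE_OP_END } |
| * |
| * Each record's op_code is the separator that follows it in the original |
| * expression; the final record carries FEATURE_OP_END. */ |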
| |
| /* job_details - specification of a job's constraints, |
| * can be purged after initiation */ |
| struct job_details { |
| char *acctg_freq; /* accounting polling interval */ |
| uint32_t argc; /* count of argv elements */ |
| char **argv; /* arguments for a batch job script */ |
| time_t begin_time; /* start at this time (srun --begin), |
| * resets to time first eligible |
| * (all dependencies satisfied) */ |
| char *ckpt_dir; /* directory to store checkpoint |
| * images */ |
| uint16_t contiguous; /* set if requires contiguous nodes */ |
| uint16_t core_spec; /* specialized core count */ |
| char *cpu_bind; /* binding map for map/mask_cpu */ |
| uint16_t cpu_bind_type; /* see cpu_bind_type_t */ |
| uint16_t cpus_per_task; /* number of processors required for |
| * each task */ |
| List depend_list; /* list of job_ptr:state pairs */ |
| char *dependency; /* wait for other jobs */ |
| char *orig_dependency; /* original value (for archiving) */ |
| uint16_t env_cnt; /* size of env_sup (see below) */ |
| char **env_sup; /* supplemental environment variables |
| * as set by Moab */ |
| bitstr_t *exc_node_bitmap; /* bitmap of excluded nodes */ |
| char *exc_nodes; /* excluded nodes */ |
| uint32_t expanding_jobid; /* ID of job to be expanded */ |
| List feature_list; /* required features with |
| * node counts */ |
| char *features; /* required features */ |
| uint32_t magic; /* magic cookie for data integrity */ |
| uint32_t max_cpus; /* maximum number of cpus */ |
| uint32_t max_nodes; /* maximum number of nodes */ |
| multi_core_data_t *mc_ptr; /* multi-core specific data */ |
| char *mem_bind; /* binding map for map/mask_cpu */ |
| uint16_t mem_bind_type; /* see mem_bind_type_t */ |
| uint32_t min_cpus; /* minimum number of cpus */ |
| uint32_t min_nodes; /* minimum number of nodes */ |
| uint16_t nice; /* requested priority change, |
| * NICE_OFFSET == no change */ |
| uint16_t ntasks_per_node; /* number of tasks on each node */ |
| uint32_t num_tasks; /* number of tasks to start */ |
| uint8_t open_mode; /* stdout/err append or truncate */ |
| uint8_t overcommit; /* processors being over subscribed */ |
| uint16_t plane_size; /* plane size when task_dist = |
| * SLURM_DIST_PLANE */ |
| /* job constraints: */ |
| uint32_t pn_min_cpus; /* minimum processors per node */ |
| uint32_t pn_min_memory; /* minimum memory per node (MB) OR |
| * memory per allocated |
| * CPU | MEM_PER_CPU */ |
| uint32_t pn_min_tmp_disk; /* minimum tempdisk per node, MB */ |
| uint8_t prolog_running; /* set while prolog_slurmctld is |
| * running */ |
| uint32_t reserved_resources; /* CPU minutes of resources reserved |
| * for this job while it was pending */ |
| bitstr_t *req_node_bitmap; /* bitmap of required nodes */ |
| uint16_t *req_node_layout; /* task layout for required nodes */ |
| time_t preempt_start_time; /* time that preemption began to start |
| * this job */ |
| char *req_nodes; /* required nodes */ |
| uint16_t requeue; /* controls ability to requeue job */ |
| char *restart_dir; /* restart execution from ckpt images |
| * in this dir */ |
| uint8_t share_res; /* set if job can share resources with |
| * other jobs */ |
| char *std_err; /* pathname of job's stderr file */ |
| char *std_in; /* pathname of job's stdin file */ |
| char *std_out; /* pathname of job's stdout file */ |
| time_t submit_time; /* time of submission */ |
| uint16_t task_dist; /* task layout for this job. Only |
| * useful when Consumable Resources |
| * is enabled */ |
| uint32_t usable_nodes; /* node count needed by preemption */ |
| uint8_t whole_node; /* job requested exclusive node use */ |
| char *work_dir; /* pathname of working directory */ |
| }; |
| |
| struct job_record { |
| char *account; /* account number to charge */ |
| char *alias_list; /* node name to address aliases */ |
| char *alloc_node; /* local node making resource alloc */ |
| uint16_t alloc_resp_port; /* RESPONSE_RESOURCE_ALLOCATION port */ |
| uint32_t alloc_sid; /* local sid making resource alloc */ |
| uint32_t array_job_id; /* job_id of a job array or 0 if N/A */ |
| uint32_t array_task_id; /* task_id of a job array */ |
| uint32_t assoc_id; /* used for accounting plugins */ |
| void *assoc_ptr; /* job's association record ptr, it is |
| * void* because of interdependencies |
| * in the header files, confirm the |
| * value before use */ |
| uint16_t batch_flag; /* 1 or 2 if batch job (with script), |
| * 2 indicates retry mode (one retry) */ |
| char *batch_host; /* host executing batch script */ |
| check_jobinfo_t check_job; /* checkpoint context, opaque */ |
| uint16_t ckpt_interval; /* checkpoint interval in minutes */ |
| time_t ckpt_time; /* last time job was periodically |
| * checkpointed */ |
| char *comment; /* arbitrary comment */ |
| uint32_t cpu_cnt; /* current count of CPUs held |
| * by the job, decremented while job is |
| * completing (N/A for bluegene |
| * systems) */ |
| uint16_t cr_enabled; /* specify if Consumable Resources |
| * is enabled. Needed since CR deals |
| * with a finer granularity in its |
| * node/cpu scheduling (available cpus |
| * instead of available nodes) than the |
| * bluegene and the linear plugins |
| * 0 if cr is NOT enabled, |
| * 1 if cr is enabled */ |
| uint32_t db_index; /* used only for database |
| * plugins */ |
| uint32_t derived_ec; /* highest exit code of all job steps */ |
| struct job_details *details; /* job details */ |
| uint16_t direct_set_prio; /* Priority was set directly; if set, |
| * the system will not change the |
| * priority any further. */ |
| time_t end_time; /* time of termination, |
| * actual or expected */ |
| bool epilog_running; /* true if EpilogSlurmctld is running */ |
| uint32_t exit_code; /* exit code for job (status from |
| * wait call) */ |
| front_end_record_t *front_end_ptr; /* Pointer to front-end node running |
| * this job */ |
| char *gres; /* generic resources requested by job */ |
| List gres_list; /* generic resource allocation detail */ |
| char *gres_alloc; /* Allocated GRES added over all nodes |
| * to be passed to slurmdbd */ |
| char *gres_req; /* Requested GRES added over all nodes |
| * to be passed to slurmdbd */ |
| char *gres_used; /* Actual GRES use added over all nodes |
| * to be passed to slurmdbd */ |
| uint32_t group_id; /* group submitted under */ |
| uint32_t job_id; /* job ID */ |
| struct job_record *job_next; /* next entry with same hash index */ |
| struct job_record *job_array_next_j; /* job array linked list by job_id */ |
| struct job_record *job_array_next_t; /* job array linked list by task_id */ |
| job_resources_t *job_resrcs; /* details of allocated cores */ |
| uint16_t job_state; /* state of the job */ |
| uint16_t kill_on_node_fail; /* 1 if job should be killed on |
| * node failure */ |
| char *licenses; /* licenses required by the job */ |
| List license_list; /* structure with license info */ |
| uint16_t limit_set_max_cpus; /* true if max_cpus was set from |
| * a limit, false if set by user */ |
| uint16_t limit_set_max_nodes; /* true if max_nodes was set from |
| * a limit, false if set by user */ |
| uint16_t limit_set_min_cpus; /* true if min_cpus was set from |
| * a limit, false if set by user */ |
| uint16_t limit_set_min_nodes; /* true if min_nodes was set from |
| * a limit, false if set by user */ |
| uint16_t limit_set_pn_min_memory; /* true if pn_min_memory was set from |
| * a limit, false if set by user */ |
| uint16_t limit_set_time; /* true if time_limit was set from |
| * a limit, false if set by user */ |
| uint16_t limit_set_qos; /* true if the qos was set from |
| * a limit, false if set by user */ |
| uint16_t mail_type; /* see MAIL_JOB_* in slurm.h */ |
| char *mail_user; /* user to get e-mail notification */ |
| uint32_t magic; /* magic cookie for data integrity */ |
| char *name; /* name of the job */ |
| char *network; /* network/switch requirement spec */ |
| uint32_t next_step_id; /* next step id to be used */ |
| char *nodes; /* list of nodes allocated to job */ |
| slurm_addr_t *node_addr; /* addresses of the nodes allocated to |
| * job */ |
| bitstr_t *node_bitmap; /* bitmap of nodes allocated to job */ |
| bitstr_t *node_bitmap_cg; /* bitmap of nodes completing job */ |
| uint32_t node_cnt; /* count of nodes currently |
| * allocated to job */ |
| uint32_t node_cnt_wag; /* count of nodes Slurm thinks |
| * will be allocated when the |
| * job is pending and node_cnt |
| * wasn't given by the user. |
| * This is packed in total_nodes |
| * when dumping state. When |
| * state is read in check for |
| * pending state and set this |
| * instead of total_nodes */ |
| char *nodes_completing; /* nodes still in completing state |
| * for this job, used to ensure |
| * epilog is not re-run for job */ |
| uint16_t other_port; /* port for client communications */ |
| char *partition; /* name of job partition(s) */ |
| List part_ptr_list; /* list of pointers to partition recs */ |
| bool part_nodes_missing; /* set if job's nodes removed from this |
| * partition */ |
| struct part_record *part_ptr; /* pointer to the partition record */ |
| time_t pre_sus_time; /* time job ran prior to last suspend */ |
| time_t preempt_time; /* job preemption signal time */ |
| bool preempt_in_progress; /* Preemption of other jobs in progress |
| * in order to start this job, |
| * (Internal use only, don't save) */ |
| uint32_t priority; /* relative priority of the job, |
| * zero == held (don't initiate) */ |
| uint32_t *priority_array; /* partition based priority */ |
| priority_factors_object_t *prio_factors; /* cached value used |
| * by sprio command */ |
| uint32_t profile; /* Acct_gather_profile option */ |
| uint32_t qos_id; /* quality of service id */ |
| void *qos_ptr; /* pointer to the quality of |
| * service record used for |
| * this job, it is |
| * void* because of interdependencies |
| * in the header files, confirm the |
| * value before use */ |
| uint16_t restart_cnt; /* count of restarts */ |
| time_t resize_time; /* time of latest size change */ |
| uint32_t resv_id; /* reservation ID */ |
| char *resv_name; /* reservation name */ |
| struct slurmctld_resv *resv_ptr;/* reservation structure pointer */ |
| uint32_t requid; /* requester user ID */ |
| char *resp_host; /* host for srun communications */ |
| dynamic_plugin_data_t *select_jobinfo;/* opaque data, BlueGene */ |
| char **spank_job_env; /* environment variables for job prolog |
| * and epilog scripts as set by SPANK |
| * plugins */ |
| uint32_t spank_job_env_size; /* element count in spank_job_env */ |
| time_t start_time; /* time execution begins, |
| * actual or expected */ |
| char *state_desc; /* optional details for state_reason */ |
| uint16_t state_reason; /* reason job still pending or failed |
| * see slurm.h:enum job_wait_reason */ |
| List step_list; /* list of job's steps */ |
| time_t suspend_time; /* time job last suspended or resumed */ |
| time_t time_last_active; /* time of last job activity */ |
| uint32_t time_limit; /* time_limit minutes or INFINITE, |
| * NO_VAL implies partition max_time */ |
| uint32_t time_min; /* minimum time_limit minutes or |
| * INFINITE, |
| * zero implies same as time_limit */ |
| time_t tot_sus_time; /* total time in suspend state */ |
| uint32_t total_cpus; /* number of allocated cpus, |
| * for accounting */ |
| uint32_t total_nodes; /* number of allocated nodes |
| * for accounting */ |
| uint32_t user_id; /* user the job runs as */ |
| uint16_t wait_all_nodes; /* if set, wait for all nodes to boot |
| * before starting the job */ |
| uint16_t warn_flags; /* flags for signal to send */ |
| uint16_t warn_signal; /* signal to send before end_time */ |
| uint16_t warn_time; /* when to send signal before |
| * end_time (secs) */ |
| char *wckey; /* optional wckey */ |
| |
| /* Request number of switches support */ |
| uint32_t req_switch; /* Minimum number of switches */ |
| uint32_t wait4switch; /* Maximum time to wait for minimum switches */ |
| bool best_switch; /* true=min number of switches met */ |
| time_t wait4switch_start; /* Time started waiting for switch */ |
| }; |
| |
| /* Job dependency specification, used in "depend_list" within job_record */ |
| #define SLURM_DEPEND_AFTER 1 /* After job begins */ |
| #define SLURM_DEPEND_AFTER_ANY 2 /* After job completes */ |
| #define SLURM_DEPEND_AFTER_NOT_OK 3 /* After job fails */ |
| #define SLURM_DEPEND_AFTER_OK 4 /* After job completes |
| * successfully */ |
| #define SLURM_DEPEND_SINGLETON 5 /* Only one job for this |
| * user/name at a time */ |
| #define SLURM_DEPEND_EXPAND 6 /* Expand running job */ |
| struct depend_spec { |
| uint32_t array_task_id; /* INFINITE for all array tasks */ |
| uint16_t depend_type; /* SLURM_DEPEND_* type */ |
| uint32_t job_id; /* SLURM job_id */ |
| struct job_record *job_ptr; /* pointer to this job */ |
| }; |
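| |
| /* Illustrative sketch: a job submitted with "--dependency=afterok:1234" |
| * (hypothetical job ID) would typically carry a depend_list containing a |
| * single depend_spec along the lines of |
| * |
| * { .depend_type = SLURM_DEPEND_AFTER_OK, |
| * .job_id = 1234, |
| * .job_ptr = find_job_record(1234) } |
| * |
| * The real list is built and validated by the job dependency logic, which |
| * may also fill in array_task_id for job array dependencies. */ |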
| |
| struct step_record { |
| uint16_t batch_step; /* 1 if batch job step, 0 otherwise */ |
| uint16_t ckpt_interval; /* checkpoint interval in minutes */ |
| check_jobinfo_t check_job; /* checkpoint context, opaque */ |
| char *ckpt_dir; /* path to checkpoint image files */ |
| time_t ckpt_time; /* time of last checkpoint */ |
| bitstr_t *core_bitmap_job; /* bitmap of cores allocated to this |
| * step relative to job's nodes, |
| * see src/common/job_resources.h */ |
| uint32_t cpu_count; /* count of step's CPUs */ |
| uint32_t cpu_freq; /* requested cpu frequency */ |
| uint16_t cpus_per_task; /* cpus per task initiated */ |
| uint16_t cyclic_alloc; /* set for cyclic task allocation |
| * across nodes */ |
| uint16_t exclusive; /* dedicated resources for the step */ |
| uint32_t exit_code; /* highest exit code from any task */ |
| bitstr_t *exit_node_bitmap; /* bitmap of exited nodes */ |
| ext_sensors_data_t *ext_sensors; /* external sensors plugin data */ |
| char *gres; /* generic resources required */ |
| List gres_list; /* generic resource allocation detail */ |
| char *host; /* host for srun communications */ |
| struct job_record* job_ptr; /* ptr to the job that owns the step */ |
| jobacctinfo_t *jobacct; /* keep track of process info in the |
| * step */ |
| uint32_t pn_min_memory; /* minimum real memory per node OR |
| * real memory per CPU | MEM_PER_CPU, |
| * default=0 (use job limit) */ |
| char *name; /* name of job step */ |
| char *network; /* step's network specification */ |
| uint8_t no_kill; /* 1 if no kill on node failure */ |
| uint16_t port; /* port for srun communications */ |
| time_t pre_sus_time; /* time step ran prior to last suspend */ |
| int *resv_port_array; /* reserved port indexes */ |
| uint16_t resv_port_cnt; /* count of ports reserved per node */ |
| char *resv_ports; /* ports reserved for job */ |
| uint32_t requid; /* requester user ID */ |
| time_t start_time; /* step allocation start time */ |
| uint32_t time_limit; /* step allocation time limit */ |
| dynamic_plugin_data_t *select_jobinfo;/* opaque data, BlueGene */ |
| uint16_t state; /* state of the step. See job_states */ |
| uint32_t step_id; /* step number */ |
| slurm_step_layout_t *step_layout;/* info about how tasks are laid out |
| * in the step */ |
| bitstr_t *step_node_bitmap; /* bitmap of nodes allocated to job |
| * step */ |
| /* time_t suspend_time; * time step last suspended or resumed |
| * implicitly the same as suspend_time |
| * in the job record */ |
| switch_jobinfo_t *switch_job; /* switch context, opaque */ |
| time_t time_last_active; /* time step was last found on node */ |
| time_t tot_sus_time; /* total time in suspended state */ |
| }; |
| |
| extern List job_list; /* list of job_record entries */ |
| |
| /*****************************************************************************\ |
| * Consumable Resources parameters and data structures |
| \*****************************************************************************/ |
| |
| /* |
| * Define the type of update and of data retrieval that can happen |
| * from the "select/cons_res" plugin. This information is needed to |
| * support processors as consumable resources. This structure will also |
| * be useful when updating other types of consumable resources. |
| */ |
| enum select_plugindata_info { |
| SELECT_CR_PLUGIN, /* data-> uint32 1 if CR plugin */ |
| SELECT_BITMAP, /* Unused since version 2.0 */ |
| SELECT_ALLOC_CPUS, /* data-> uint16 alloc cpus (CR support) */ |
| SELECT_ALLOC_LPS, /* data-> uint32 alloc lps (CR support) */ |
| SELECT_AVAIL_MEMORY, /* data-> uint32 avail mem (CR support) */ |
| SELECT_STATIC_PART, /* data-> uint16, 1 if static partitioning |
| * BlueGene support */ |
| SELECT_CONFIG_INFO /* data-> List get .conf info from select |
| * plugin */ |
| } ; |
| |
| /*****************************************************************************\ |
| * Global slurmctld functions |
| \*****************************************************************************/ |
| |
| /* |
| * abort_job_on_node - Kill the specific job_id on a specific node, |
| * the request is not processed immediately, but queued. |
| * This is to prevent a flood of pthreads if slurmctld restarts |
| * without saved state and slurmd daemons register with a |
| * multitude of running jobs. Slurmctld will not recognize |
| * these jobs and use this function to kill them - one |
| * agent request per node as they register. |
| * IN job_id - id of the job to be killed |
| * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned) |
| * IN node_name - name of the node on which the job resides |
| */ |
| extern void abort_job_on_node(uint32_t job_id, struct job_record *job_ptr, |
| char *node_name); |
| |
| /* Note that the backup slurmctld has assumed primary control. |
| * This function can be called multiple times. */ |
| extern void backup_slurmctld_restart(void); |
| |
| /* Complete a batch job requeue logic after all steps complete so that |
| * subsequent jobs appear in a separate accounting record. */ |
| void batch_requeue_fini(struct job_record *job_ptr); |
| |
| /* Build a bitmap of nodes completing this job */ |
| extern void build_cg_bitmap(struct job_record *job_ptr); |
| |
| /* Given a config_record with its bitmap already set, update feature_list */ |
| extern void build_config_feature_list(struct config_record *config_ptr); |
| |
| /* |
| * create_job_record - create an empty job_record, including job_details, |
| * and load its values with defaults (zeros, nulls, and magic cookie) |
| * IN/OUT error_code - set to zero if no error, errno otherwise |
| * RET pointer to the record or NULL if error |
| * global: job_list - global job list |
| * job_count - number of jobs in the system |
| * last_job_update - time of last job table update |
| * NOTE: allocates memory that should be xfreed with _list_delete_job |
| */ |
| extern struct job_record * create_job_record (int *error_code); |
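| |
| /* Minimal usage sketch for create_job_record() (illustrative only): |
| * |
| * int error_code = SLURM_SUCCESS; |
| * struct job_record *job_ptr = create_job_record(&error_code); |
| * if ((job_ptr == NULL) || (error_code != SLURM_SUCCESS)) { |
| * error("unable to create job record"); |
| * return error_code; |
| * } |
| * |
| * The new record is linked into the global job_list and is later released |
| * through the normal job purge path (see NOTE above). */ |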
| |
| /* |
| * create_part_record - create a partition record |
| * RET a pointer to the record or NULL if error |
| * global: default_part - default partition parameters |
| * part_list - global partition list |
| * NOTE: the record's values are initialized to those of default_part |
| * NOTE: allocates memory that should be xfreed with delete_part_record |
| */ |
| extern struct part_record *create_part_record (void); |
| |
| /* |
| * job_limits_check - check the limits specified for the job. |
| * IN job_pptr - pointer to a pointer to the job table entry. |
| * IN check_min_time - if true test job's minimum time limit, |
| * otherwise test maximum time limit |
| * RET WAIT_NO_REASON on success, fail status otherwise. |
| */ |
| extern int job_limits_check(struct job_record **job_pptr, bool check_min_time); |
| |
| /* |
| * delete_job_details - delete a job's detail record and clear its pointer |
| * this information can be deleted as soon as the job is allocated |
| * resources and running (could need to restart batch job) |
| * IN job_entry - pointer to job_record to clear the record of |
| */ |
| extern void delete_job_details (struct job_record *job_entry); |
| |
| /* |
| * delete_partition - delete the specified partition (actually leave |
| * the entry, just flag it as defunct) |
| * IN part_desc_ptr - partition deletion specification from RPC |
| * RET 0 on success, errno otherwise |
| */ |
| extern int delete_partition(delete_part_msg_t *part_desc_ptr); |
| |
| /* |
| * delete_step_record - delete record for job step for specified job_ptr |
| * and step_id |
| * IN job_ptr - pointer to job table entry to have step record removed |
| * IN step_id - id of the desired job step |
| * RET 0 on success, errno otherwise |
| */ |
| extern int delete_step_record (struct job_record *job_ptr, uint32_t step_id); |
| |
| /* |
| * delete_step_records - delete step record for specified job_ptr |
| * IN job_ptr - pointer to job table entry to have step records removed |
| */ |
| extern void delete_step_records (struct job_record *job_ptr); |
| |
| /* |
| * Copy a job's dependency list |
| * IN depend_list_src - a job's depend_list |
| * RET copy of depend_list_src, must be freed by caller |
| */ |
| extern List depended_list_copy(List depend_list_src); |
| |
| /* |
| * drain_nodes - drain one or more nodes, |
| * no-op for nodes already drained or draining |
| * IN nodes - nodes to drain |
| * IN reason - reason to drain the nodes |
| * IN reason_uid - who set the reason |
| * RET SLURM_SUCCESS or error code |
| * global: node_record_table_ptr - pointer to global node table |
| */ |
| extern int drain_nodes ( char *nodes, char *reason, uint32_t reason_uid ); |
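| |
| /* Minimal usage sketch for drain_nodes() (hypothetical host list and |
| * reason string): |
| * |
| * if (drain_nodes("tux[001-004]", "memory errors", getuid()) != |
| * SLURM_SUCCESS) |
| * error("failed to drain nodes"); |
| */ |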
| |
| /* dump_all_job_state - save the state of all jobs to file |
| * RET 0 or error code */ |
| extern int dump_all_job_state ( void ); |
| |
| /* dump_all_node_state - save the state of all nodes to file */ |
| extern int dump_all_node_state ( void ); |
| |
| /* dump_all_part_state - save the state of all partitions to file */ |
| extern int dump_all_part_state ( void ); |
| |
| /* |
| * dump_job_desc - dump the incoming job submit request message |
| * IN job_specs - job specification from RPC |
| */ |
| extern void dump_job_desc(job_desc_msg_t * job_specs); |
| |
| /* |
| * dump_job_step_state - dump the state of a specific job step to a buffer, |
| * load with load_step_state |
| * IN job_ptr - pointer to job for which information is to be dumped |
| * IN step_ptr - pointer to job step for which information is to be dumped |
| * IN/OUT buffer - location to store data, pointers automatically advanced |
| */ |
| extern void dump_job_step_state(struct job_record *job_ptr, |
| struct step_record *step_ptr, Buf buffer); |
| |
| /* |
| * dump_step_desc - dump the incoming step initiate request message |
| * IN step_spec - job step request specification from RPC |
| */ |
| extern void dump_step_desc(job_step_create_request_msg_t *step_spec); |
| |
| /* Remove one node from a job's allocation */ |
| extern void excise_node_from_job(struct job_record *job_ptr, |
| struct node_record *node_ptr); |
| |
| /* |
| * Copy a job's feature list |
| * IN feature_list_src - a job's feature_list |
| * RET copy of feature_list_src, must be freed by caller |
| */ |
| extern List feature_list_copy(List feature_list_src); |
| |
| /* |
| * find_job_array_rec - return a pointer to the job record with the given |
| * array_job_id/array_task_id |
| * IN job_id - requested job's id |
| * IN array_task_id - requested job's task id (NO_VAL if none specified) |
| * RET pointer to the job's record, NULL on error |
| */ |
| extern struct job_record *find_job_array_rec(uint32_t array_job_id, |
| uint32_t array_task_id); |
| |
| /* |
| * find_job_record - return a pointer to the job record with the given job_id |
| * IN job_id - requested job's id |
| * RET pointer to the job's record, NULL on error |
| */ |
| struct job_record *find_job_record(uint32_t job_id); |
| |
| /* |
| * find_first_node_record - find a record for first node in the bitmap |
| * IN node_bitmap |
| */ |
| extern struct node_record *find_first_node_record (bitstr_t *node_bitmap); |
| |
| /* |
| * find_part_record - find a record for partition with specified name |
| * IN name - name of the desired partition |
| * RET pointer to partition or NULL if not found |
| */ |
| extern struct part_record *find_part_record(char *name); |
| |
| /* |
| * find_step_record - return a pointer to the step record with the given |
| * job_id and step_id |
| * IN job_ptr - pointer to the job table entry containing the desired step |
| * IN step_id - id of the desired job step |
| * RET pointer to the job step's record, NULL on error |
| */ |
| extern struct step_record * find_step_record(struct job_record *job_ptr, |
| uint32_t step_id); |
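| |
| /* Illustrative lookup sketch: a (job_id, step_id) pair is resolved through |
| * the job record first, e.g. |
| * |
| * struct job_record *job_ptr = find_job_record(job_id); |
| * struct step_record *step_ptr = job_ptr ? |
| * find_step_record(job_ptr, step_id) : NULL; |
| * if (step_ptr == NULL) |
| * error("step %u.%u not found", job_id, step_id); |
| */ |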
| |
| /* |
| * get_job_env - return the environment variables and their count for a |
| * given job |
| * IN job_ptr - pointer to job for which data is required |
| * OUT env_size - number of elements to read |
| * RET pointer to array of string pointers containing environment variables |
| */ |
| extern char **get_job_env (struct job_record *job_ptr, uint32_t *env_size); |
| |
| /* |
| * get_job_script - return the script for a given job |
| * IN job_ptr - pointer to job for which data is required |
| * RET pointer to string containing job script |
| */ |
| extern char *get_job_script (struct job_record *job_ptr); |
| |
| /* |
| * get_next_job_id - return the job_id to be used by default for |
| * the next job |
| */ |
| extern uint32_t get_next_job_id(void); |
| |
| /* |
| * get_part_list - find record for named partition(s) |
| * IN name - partition name(s) in a comma separated list |
| * RET List of pointers to the partitions or NULL if not found |
| * NOTE: Caller must free the returned list |
| */ |
| extern List get_part_list(char *name); |
| |
| /* |
| * init_job_conf - initialize the job configuration tables and values. |
| * this should be called after creating node information, but |
| * before creating any job entries. |
| * RET 0 if no error, otherwise an error code |
| * global: last_job_update - time of last job table update |
| * job_list - pointer to global job list |
| */ |
| extern int init_job_conf (void); |
| |
| /* |
| * init_node_conf - initialize the node configuration tables and values. |
| * this should be called before creating any node or configuration |
| * entries. |
| * RET 0 if no error, otherwise an error code |
| * global: node_record_table_ptr - pointer to global node table |
| * default_node_record - default values for node records |
| * default_config_record - default values for configuration records |
| * hash_table - table of hash indexes |
| * last_node_update - time of last node table update |
| */ |
| extern int init_node_conf (); |
| |
| /* |
| * init_part_conf - initialize the default partition configuration values |
| * and create a (global) partition list. |
| * this should be called before creating any partition entries. |
| * RET 0 if no error, otherwise an error code |
| * global: default_part - default partition values |
| * part_list - global partition list |
| */ |
| extern int init_part_conf (void); |
| |
| /* |
| * is_node_down - determine if the specified node's state is DOWN |
| * IN name - name of the node |
| * RET true if node exists and is down, otherwise false |
| */ |
| extern bool is_node_down (char *name); |
| |
| /* |
| * is_node_resp - determine if the specified node's state is responding |
| * IN name - name of the node |
| * RET true if node exists and is responding, otherwise false |
| */ |
| extern bool is_node_resp (char *name); |
| |
| /* |
| * allocated_session_in_use - check if an interactive session is already running |
| * IN new_alloc - allocation (alloc_node:alloc_sid) to test for |
| * Returns true if an interactive session of the same node:sid already exists. |
| */ |
| extern bool allocated_session_in_use(job_desc_msg_t *new_alloc); |
| |
| /* |
| * job_alloc_info - get details about an existing job allocation |
| * IN uid - UID of the user requesting the job information |
| * IN job_id - ID of job for which info is requested |
| * OUT job_pptr - set to pointer to job record |
| */ |
| extern int job_alloc_info(uint32_t uid, uint32_t job_id, |
| struct job_record **job_pptr); |
| /* |
| * job_allocate - create job_records for the supplied job specification and |
| * allocate nodes for it. |
| * IN job_specs - job specifications |
| * IN immediate - if set then either initiate the job immediately or fail |
| * IN will_run - don't initiate the job if set, just test if it could run |
| * now or later |
| * OUT resp - will run response (includes start location, time, etc.) |
| * IN allocate - resource allocation request only if set, batch job if zero |
| * IN submit_uid - uid of user issuing the request |
| * OUT job_pptr - set to pointer to job record |
| * OUT err_msg - Custom error message to the user, caller to xfree results |
| * RET 0 or an error code. If the job would only be able to execute with |
| * some change in partition configuration then |
| * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned |
| * NOTE: If allocating nodes lx[0-7] to a job and those nodes have cpu counts |
| * of 4, 4, 4, 4, 8, 8, 4, 4 then num_cpu_groups=3, cpus_per_node={4,8,4} |
| * and cpu_count_reps={4,2,2} |
| * globals: job_list - pointer to global job list |
| * list_part - global list of partition info |
| * default_part_loc - pointer to default partition |
| * NOTE: lock_slurmctld on entry: Read config Write job, Write node, Read part |
| */ |
| extern int job_allocate(job_desc_msg_t * job_specs, int immediate, |
| int will_run, will_run_response_msg_t **resp, |
| int allocate, uid_t submit_uid, struct job_record **job_pptr, |
| char **err_msg); |
| |
| /* Reset a job's end_time based upon its start_time and time_limit. |
| * NOTE: Do not reset the end_time if already being preempted */ |
| extern void job_end_time_reset(struct job_record *job_ptr); |
| /* |
| * job_hold_by_assoc_id - Hold all pending jobs with a given |
| * association ID. This happens when an association is deleted (e.g. when |
| * a user is removed from the association database). |
| * RET count of held jobs |
| */ |
| extern int job_hold_by_assoc_id(uint32_t assoc_id); |
| |
| /* |
| * job_hold_by_qos_id - Hold all pending jobs with a given |
| * QOS ID. This happens when a QOS is deleted (e.g. when |
| * a QOS is removed from the association database). |
| * RET count of held jobs |
| */ |
| extern int job_hold_by_qos_id(uint32_t qos_id); |
| |
| /* Perform checkpoint operation on a job */ |
| extern int job_checkpoint(checkpoint_msg_t *ckpt_ptr, uid_t uid, |
| slurm_fd_t conn_fd, uint16_t protocol_version); |
| |
| /* log the completion of the specified job */ |
| extern void job_completion_logger(struct job_record *job_ptr, bool requeue); |
| |
| /* |
| * job_epilog_complete - Note the completion of the epilog script for a |
| * given job |
| * IN job_id - id of the job for which the epilog was executed |
| * IN node_name - name of the node on which the epilog was executed |
| * IN return_code - return code from epilog script |
| * RET true if job is COMPLETED, otherwise false |
| */ |
| extern bool job_epilog_complete(uint32_t job_id, char *node_name, |
| uint32_t return_code); |
| |
| /* |
| * job_end_time - Process JOB_END_TIME |
| * IN time_req_msg - job end time request |
| * OUT timeout_msg - job timeout response to be sent |
| * RET SLURM_SUCCESS or an error code |
| */ |
| extern int job_end_time(job_alloc_info_msg_t *time_req_msg, |
| srun_timeout_msg_t *timeout_msg); |
| |
| /* job_fini - free all memory associated with job records */ |
| extern void job_fini (void); |
| |
| /* |
| * job_fail - terminate a job due to initiation failure |
| * IN job_id - id of the job to be killed |
| * IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.) |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_fail(uint32_t job_id, uint16_t job_state); |
| |
| |
| /* job_hold_requeue() |
| * |
| * Requeue the job based upon its current state. |
| * If JOB_SPECIAL_EXIT then requeue and hold with JOB_SPECIAL_EXIT state. |
| * If JOB_REQUEUE_HOLD then requeue and hold. |
| * If JOB_REQUEUE then requeue and let it run again. |
| * The requeue can happen directly from job_requeue() or from |
| * job_epilog_complete() after the last component has finished. |
| */ |
| extern void job_hold_requeue(struct job_record *job_ptr); |
| |
| /* |
| * determine if job is ready to execute per the node select plugin |
| * IN job_id - job to test |
| * OUT ready - 1 if job is ready to execute, 0 otherwise |
| * RET SLURM error code |
| */ |
| extern int job_node_ready(uint32_t job_id, int *ready); |
| |
| /* Record accounting information for a job immediately before changing size */ |
| extern void job_pre_resize_acctg(struct job_record *job_ptr); |
| |
| /* Record accounting information for a job immediately after changing size */ |
| extern void job_post_resize_acctg(struct job_record *job_ptr); |
| |
| /* |
| * job_restart - Restart a batch job from checkpointed state |
| * |
| * Restarting a job is similar to submitting a new job, except that |
| * the job requirements are loaded from the checkpoint file and |
| * the job id is restored. |
| * |
| * IN ckpt_ptr - checkpoint request message |
| * IN uid - user id of the user issuing the RPC |
| * IN conn_fd - file descriptor on which to send reply |
| * IN protocol_version - slurm protocol version of client |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_restart(checkpoint_msg_t *ckpt_ptr, uid_t uid, |
| slurm_fd_t conn_fd, uint16_t protocol_version); |
| |
| /* |
| * job_signal - signal the specified job |
| * IN job_id - id of the job to be signaled |
| * IN signal - signal to send, SIGKILL == cancel the job |
| * IN flags - see KILL_JOB_* flags in slurm.h |
| * IN uid - uid of requesting user |
| * IN preempt - true if job being preempted |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_signal(uint32_t job_id, uint16_t signal, uint16_t flags, |
| uid_t uid, bool preempt); |
| |
| /* |
| * job_step_checkpoint - perform some checkpoint operation |
| * IN ckpt_ptr - checkpoint request message |
| * IN uid - user id of the user issuing the RPC |
| * IN conn_fd - file descriptor on which to send reply |
| * IN protocol_version - slurm protocol version of client |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_step_checkpoint(checkpoint_msg_t *ckpt_ptr, |
| uid_t uid, slurm_fd_t conn_fd, uint16_t protocol_version); |
| |
| /* |
| * job_step_checkpoint_comp - note job step checkpoint completion |
| * IN ckpt_ptr - checkpoint complete status message |
| * IN uid - user id of the user issuing the RPC |
| * IN conn_fd - file descriptor on which to send reply |
| * IN protocol_version - slurm protocol version of client |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_step_checkpoint_comp(checkpoint_comp_msg_t *ckpt_ptr, |
| uid_t uid, slurm_fd_t conn_fd, uint16_t protocol_version); |
| /* |
| * job_step_checkpoint_task_comp - note task checkpoint completion |
| * IN ckpt_ptr - checkpoint task complete status message |
| * IN uid - user id of the user issuing the RPC |
| * IN conn_fd - file descriptor on which to send reply |
| * IN protocol_version - slurm protocol version of client |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_step_checkpoint_task_comp(checkpoint_task_comp_msg_t *ckpt_ptr, |
| uid_t uid, slurm_fd_t conn_fd, uint16_t protocol_version); |
| |
| /* |
| * job_suspend - perform some suspend/resume operation |
| * IN sus_ptr - suspend/resume request message |
| * IN uid - user id of the user issuing the RPC |
| * IN conn_fd - file descriptor on which to send reply, |
| * -1 if none |
| * indf_susp IN - set if job is being suspended indefinitely by user or admin |
| * and we should clear its priority, otherwise suspended |
| * temporarily for gang scheduling |
| * IN protocol_version - slurm protocol version of client |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, |
| slurm_fd_t conn_fd, bool indf_susp, |
| uint16_t protocol_version); |
| |
| /* |
| * job_complete - note the normal termination the specified job |
| * IN job_id - id of the job which completed |
| * IN uid - user id of user issuing the RPC |
| * IN requeue - job should be run again if possible |
| * IN node_fail - true if job terminated due to node failure |
| * IN job_return_code - job's return code, if set then set state to JOB_FAILED |
| * RET - 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer global job list |
| * last_job_update - time of last job table update |
| */ |
| extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, |
| bool node_fail, uint32_t job_return_code); |
| |
| /* |
| * job_independent - determine if this job has a dependent job pending |
| * or if the job's scheduled begin time is in the future |
| * IN job_ptr - pointer to job being tested |
| * IN will_run - is this a test for will_run or not |
| * RET - true if job no longer must be deferred for another job |
| */ |
| extern bool job_independent(struct job_record *job_ptr, int will_run); |
| |
| /* |
| * job_req_node_filter - job request node filter. |
| * clear from a bitmap the nodes which cannot be used for a job |
| * test memory size, required features, processor count, etc. |
| * IN job_ptr - pointer to the job to be scheduled |
| * IN/OUT avail_bitmap - set of nodes being considered for use |
| * RET SLURM_SUCCESS or EINVAL if can't filter (exclusive OR of features) |
| */ |
| extern int job_req_node_filter(struct job_record *job_ptr, |
| bitstr_t *avail_bitmap); |
| |
| /* |
| * job_requeue - Requeue a running or pending batch job |
| * IN uid - user id of user issuing the RPC |
| * IN job_id - id of the job to be requeued |
| * IN conn_fd - file descriptor on which to send reply |
| * IN protocol_version - slurm protocol version of client |
| * IN preempt - true if job being preempted |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_requeue(uid_t uid, |
| uint32_t job_id, |
| slurm_fd_t conn_fd, |
| uint16_t protocol_version, |
| bool preempt); |
| /* |
| * job_step_complete - note normal completion the specified job step |
| * IN job_id - id of the job to be completed |
| * IN step_id - id of the job step to be completed |
| * IN uid - user id of user issuing the RPC |
| * IN requeue - job should be run again if possible |
| * IN job_return_code - job's return code, if set then set state to JOB_FAILED |
| * RET 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer global job list |
| * last_job_update - time of last job table update |
| */ |
| extern int job_step_complete (uint32_t job_id, uint32_t job_step_id, |
| uid_t uid, bool requeue, uint32_t job_return_code); |
| |
| /* |
| * job_step_signal - signal the specified job step |
| * IN job_id - id of the job containing the step to be signaled |
| * IN step_id - id of the job step to be signaled |
| * IN signal - signal number to be sent |
| * IN uid - user id of user issuing the RPC |
| * RET 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer to global job list |
| * last_job_update - time of last job table update |
| */ |
| extern int job_step_signal(uint32_t job_id, uint32_t step_id, |
| uint16_t signal, uid_t uid); |
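| |
| /* |
| * Example usage (hypothetical sketch; the ids and "rpc_uid" are purely |
| * illustrative). Send SIGKILL to step 0 of job 1234: |
| * |
| *    int rc = job_step_signal(1234, 0, SIGKILL, rpc_uid); |
| *    if (rc != SLURM_SUCCESS) |
| *        error("job_step_signal: %s", slurm_strerror(rc)); |
| */ |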
| |
| /* |
| * job_time_limit - terminate jobs which have exceeded their time limit |
| * global: job_list - pointer to global job list |
| * last_job_update - time of last job table update |
| */ |
| extern void job_time_limit (void); |
| |
| /* |
| * job_update_cpu_cnt - when job is completing remove allocated cpus |
| * from count. |
| * IN/OUT job_ptr - job structure to be updated |
| * IN node_inx - node bit that is finished with job. |
| * RET SLURM_SUCCESS on success, SLURM_ERROR on cpu_cnt underflow |
| */ |
| extern int job_update_cpu_cnt(struct job_record *job_ptr, int node_inx); |
| |
| /* |
| * check_job_step_time_limit - terminate job steps which have exceeded |
| * their time limit |
| * IN job_ptr - pointer to job containing steps to check |
| * IN now - current time to use for the limit check |
| */ |
| extern void check_job_step_time_limit (struct job_record *job_ptr, time_t now); |
| |
| /* |
| * kill_job_by_part_name - Given a partition name, deallocate resources for |
| * its jobs and kill them |
| * IN part_name - name of a partition |
| * RET number of killed jobs |
| */ |
| extern int kill_job_by_part_name(char *part_name); |
| |
| /* |
| * kill_job_on_node - Kill the specified job_id on a specific node; |
| * one agent request is sent per node as it registers. |
| * IN job_id - id of the job to be killed |
| * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned) |
| * IN node_ptr - pointer to the node on which the job resides |
| */ |
| extern void kill_job_on_node(uint32_t job_id, struct job_record *job_ptr, |
| struct node_record *node_ptr); |
| |
| /* |
| * kill_job_by_front_end_name - Given a front end node name, deallocate |
| * resources for its jobs and kill them. |
| * IN node_name - name of a front end node |
| * RET number of jobs associated with this front end node |
| */ |
| extern int kill_job_by_front_end_name(char *node_name); |
| |
| /* |
| * kill_running_job_by_node_name - Given a node name, deallocate RUNNING |
| * or COMPLETING jobs from the node or kill them |
| * IN node_name - name of a node |
| * RET number of killed jobs |
| */ |
| extern int kill_running_job_by_node_name(char *node_name); |
| |
| /* |
| * kill_step_on_node - determine if the specified job has any job steps |
| * allocated to the specified node and kill them unless no_kill flag |
| * is set on the step |
| * IN job_ptr - pointer to an active job record |
| * IN node_ptr - pointer to a node record |
| * IN node_fail - true if removed node has failed |
| * RET count of killed job steps |
| */ |
| extern int kill_step_on_node(struct job_record *job_ptr, |
| struct node_record *node_ptr, bool node_fail); |
| |
| /* list_compare_config - compare two entries from the config list based upon |
| * weight, see common/list.h for documentation */ |
| extern int list_compare_config (void *config_entry1, void *config_entry2); |
| |
| /* |
| * list_find_feature - find an entry in the feature list, see list.h for |
| * documentation |
| * IN key - is feature name or NULL for all features |
| * RET 1 if found, 0 otherwise |
| */ |
| extern int list_find_feature(void *feature_entry, void *key); |
| |
| /* |
| * list_find_part - find an entry in the partition list, see common/list.h |
| * for documentation |
| * IN key - partition name or "universal_key" for all partitions |
| * RET 1 if matches key, 0 otherwise |
| * global- part_list - the global partition list |
| */ |
| extern int list_find_part (void *part_entry, void *key); |
| |
| /* |
| * load_all_job_state - load the job state from file, recover from last |
| * checkpoint. Execute this after loading the configuration file data. |
| * RET 0 or error code |
| */ |
| extern int load_all_job_state ( void ); |
| |
| /* |
| * load_all_node_state - Load the node state from file, recover on slurmctld |
| * restart. Execute this after loading the configuration file data. |
| * Data goes into common storage. |
| * IN state_only - if true, overwrite only node state, features, gres and reason |
| * RET 0 or error code |
| */ |
| extern int load_all_node_state ( bool state_only ); |
| |
| /* |
| * load_last_job_id - load only the last job ID from state save file. |
| * RET 0 or error code |
| */ |
| extern int load_last_job_id( void ); |
| |
| /* |
| * load_part_uid_allow_list - reload the allow_uid list of partitions |
| * if required (updated group file or force set) |
| * IN force - if set then always reload the allow_uid list |
| */ |
| extern void load_part_uid_allow_list ( int force ); |
| |
| /* |
| * load_all_part_state - load the partition state from file, recover from |
| * slurmctld restart. Execute this after loading the configuration |
| * file data. |
| * RET 0 or error code |
| */ |
| extern int load_all_part_state ( void ); |
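| |
| /* |
| * Example usage (hypothetical sketch of a recovery sequence; the exact |
| * ordering used by slurmctld may differ). Per the notes above, each of |
| * these runs after the configuration file data has been loaded: |
| * |
| *    (void) load_all_node_state(false); |
| *    (void) load_all_part_state(); |
| *    (void) load_all_job_state(); |
| */ |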
| |
| /* |
| * Create a new job step from data in a buffer (as created by |
| * dump_job_stepstate) |
| * IN/OUT job_ptr - pointer to the job for which the step is to be loaded. |
| * IN/OUT buffer - location from which to get data, pointers |
| * automatically advanced |
| */ |
| extern int load_step_state(struct job_record *job_ptr, Buf buffer, |
| uint16_t protocol_version); |
| |
| /* make_node_alloc - flag specified node as allocated to a job |
| * IN node_ptr - pointer to node being allocated |
| * IN job_ptr - pointer to job that is starting |
| */ |
| extern void make_node_alloc(struct node_record *node_ptr, |
| struct job_record *job_ptr); |
| |
| /* make_node_comp - flag specified node as completing a job |
| * IN node_ptr - pointer to node marked for completion of job |
| * IN job_ptr - pointer to job that is completing |
| * IN suspended - true if job was previously suspended |
| */ |
| extern void make_node_comp(struct node_record *node_ptr, |
| struct job_record *job_ptr, bool suspended); |
| |
| /* |
| * make_node_idle - flag specified node as having finished with a job |
| * IN node_ptr - pointer to node reporting job completion |
| * IN job_ptr - pointer to job that just completed |
| */ |
| extern void make_node_idle(struct node_record *node_ptr, |
| struct job_record *job_ptr); |
| |
| /* |
| * Determine if the specified job can execute right now or is currently |
| * blocked by a partition state or limit. These job states should match the |
| * reason values returned by job_limits_check(). |
| */ |
| extern bool misc_policy_job_runnable_state(struct job_record *job_ptr); |
| |
| /* msg_to_slurmd - send the given msg_type to every slurmd, with no arguments */ |
| extern void msg_to_slurmd (slurm_msg_type_t msg_type); |
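| |
| /* |
| * Example usage (hypothetical sketch; REQUEST_RECONFIGURE is one of the |
| * message types defined in slurm_protocol_defs.h). Ask every slurmd to |
| * re-read its configuration: |
| * |
| *    msg_to_slurmd(REQUEST_RECONFIGURE); |
| */ |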
| |
| /* node_fini - free all memory associated with node records */ |
| extern void node_fini (void); |
| |
| /* node_did_resp - record that the specified node is responding |
| * IN name - name of the node */ |
| extern void node_did_resp (char *name); |
| |
| /* |
| * node_not_resp - record that the specified node is not responding |
| * IN name - name of the node |
| * IN msg_time - time message was sent |
| * IN resp_type - what kind of response came back from the node |
| */ |
| extern void node_not_resp (char *name, time_t msg_time, |
| slurm_msg_type_t resp_type); |
| |
| /* For every node with the "not_responding" flag set, clear the flag |
| * and log that the node is not responding using a hostlist expression */ |
| extern void node_no_resp_msg(void); |
| |
| /* |
| * pack_all_jobs - dump all job information for all jobs in |
| * machine independent form (for network transmission) |
| * OUT buffer_ptr - the pointer is set to the allocated buffer. |
| * OUT buffer_size - set to size of the buffer in bytes |
| * IN show_flags - job filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * IN filter_uid - pack only jobs belonging to this user if not NO_VAL |
| * IN protocol_version - slurm protocol version of client |
| * global: job_list - global list of job records |
| * NOTE: the buffer at *buffer_ptr must be xfreed by the caller |
| * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| extern void pack_all_jobs(char **buffer_ptr, int *buffer_size, |
| uint16_t show_flags, uid_t uid, uint32_t filter_uid, |
| uint16_t protocol_version); |
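| |
| /* |
| * Example usage (hypothetical sketch; "rpc_uid" is an illustrative local). |
| * Pack every job visible to the requesting user, without filtering on a |
| * specific job owner, then release the buffer as noted above: |
| * |
| *    char *buffer = NULL; |
| *    int buffer_size = 0; |
| *    pack_all_jobs(&buffer, &buffer_size, SHOW_ALL, rpc_uid, NO_VAL, |
| *                  SLURM_PROTOCOL_VERSION); |
| *    ... transmit buffer_size bytes from buffer ... |
| *    xfree(buffer); |
| */ |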
| |
| /* |
| * pack_all_node - dump all configuration and node information for all nodes |
| * in machine independent form (for network transmission) |
| * OUT buffer_ptr - pointer to the stored data |
| * OUT buffer_size - set to size of the buffer in bytes |
| * IN show_flags - node filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * IN protocol_version - slurm protocol version of client |
| * global: node_record_table_ptr - pointer to global node table |
| * NOTE: the caller must xfree the buffer at *buffer_ptr |
| * NOTE: change slurm_load_node() in api/node_info.c when data format changes |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| extern void pack_all_node (char **buffer_ptr, int *buffer_size, |
| uint16_t show_flags, uid_t uid, |
| uint16_t protocol_version); |
| |
| /* Pack all scheduling statistics */ |
| extern void pack_all_stat(int resp, char **buffer_ptr, int *buffer_size, |
| uint16_t protocol_version); |
| |
| /* |
| * pack_ctld_job_step_info_response_msg - packs job step info |
| * IN job_id - specific id or NO_VAL for all |
| * IN step_id - specific id or NO_VAL for all |
| * IN uid - user issuing request |
| * IN show_flags - job step filtering options |
| * OUT buffer - location to store data, pointers automatically advanced |
| * IN protocol_version - slurm protocol version of client |
| * RET - 0 or error code |
| * NOTE: the caller MUST release the buffer using free_buf() |
| */ |
| extern int pack_ctld_job_step_info_response_msg( |
| uint32_t job_id, uint32_t step_id, uid_t uid, |
| uint16_t show_flags, Buf buffer, uint16_t protocol_version); |
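| |
| /* |
| * Example usage (hypothetical sketch; BUF_SIZE, init_buf() and free_buf() |
| * come from src/common/pack.h and "rpc_uid" is illustrative). Pack info |
| * for all steps of all jobs, then release the buffer: |
| * |
| *    Buf buffer = init_buf(BUF_SIZE); |
| *    int rc = pack_ctld_job_step_info_response_msg(NO_VAL, NO_VAL, rpc_uid, |
| *                 SHOW_ALL, buffer, SLURM_PROTOCOL_VERSION); |
| *    if (rc != SLURM_SUCCESS) |
| *        error("step pack failed: %s", slurm_strerror(rc)); |
| *    free_buf(buffer); |
| */ |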
| |
| /* |
| * pack_all_part - dump all partition information for all partitions in |
| * machine independent form (for network transmission) |
| * OUT buffer_ptr - the pointer is set to the allocated buffer. |
| * OUT buffer_size - set to size of the buffer in bytes |
| * IN show_flags - partition filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * IN protocol_version - slurm protocol version of client |
| * global: part_list - global list of partition records |
| * NOTE: the buffer at *buffer_ptr must be xfreed by the caller |
| * NOTE: change slurm_load_part() in api/part_info.c if data format changes |
| */ |
| extern void pack_all_part(char **buffer_ptr, int *buffer_size, |
| uint16_t show_flags, uid_t uid, |
| uint16_t protocol_version); |
| |
| /* |
| * pack_job - dump all configuration information about a specific job in |
| * machine independent form (for network transmission) |
| * IN dump_job_ptr - pointer to job for which information is requested |
| * IN show_flags - job filtering options |
| * IN/OUT buffer - buffer in which data is placed, pointers automatically |
| * updated |
| * IN uid - user requesting the data |
| * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| extern void pack_job (struct job_record *dump_job_ptr, uint16_t show_flags, |
| Buf buffer, uint16_t protocol_version, uid_t uid); |
| |
| /* |
| * pack_part - dump all configuration information about a specific partition |
| * in machine independent form (for network transmission) |
| * IN part_ptr - pointer to partition for which information is requested |
| * IN/OUT buffer - buffer in which data is placed, pointers automatically |
| * updated |
| * global: default_part_loc - pointer to the default partition |
| * NOTE: if you make any changes here be sure to make the corresponding |
| * changes to load_part_config in api/partition_info.c |
| */ |
| extern void pack_part (struct part_record *part_ptr, Buf buffer, |
| uint16_t protocol_version); |
| |
| /* |
| * pack_one_job - dump information for one job in |
| * machine independent form (for network transmission) |
| * OUT buffer_ptr - the pointer is set to the allocated buffer. |
| * OUT buffer_size - set to size of the buffer in bytes |
| * IN job_id - ID of job that we want info for |
| * IN show_flags - job filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * NOTE: the buffer at *buffer_ptr must be xfreed by the caller |
| * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| extern int pack_one_job(char **buffer_ptr, int *buffer_size, |
| uint32_t job_id, uint16_t show_flags, uid_t uid, |
| uint16_t protocol_version); |
| |
| /* |
| * pack_one_node - dump all configuration and node information for one node |
| * in machine independent form (for network transmission) |
| * OUT buffer_ptr - pointer to the stored data |
| * OUT buffer_size - set to size of the buffer in bytes |
| * IN show_flags - node filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * IN node_name - name of node for which information is desired, |
| * use first node if name is NULL |
| * IN protocol_version - slurm protocol version of client |
| * global: node_record_table_ptr - pointer to global node table |
| * NOTE: the caller must xfree the buffer at *buffer_ptr |
| * NOTE: change slurm_load_node() in api/node_info.c when data format changes |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| extern void pack_one_node (char **buffer_ptr, int *buffer_size, |
| uint16_t show_flags, uid_t uid, char *node_name, |
| uint16_t protocol_version); |
| |
| /* part_filter_clear - Clear the partition's hidden flag based upon a user's |
| * group access. This must follow a call to part_filter_set() */ |
| extern void part_filter_clear(void); |
| |
| /* part_filter_set - Set the partition's hidden flag based upon a user's |
| * group access. This must be followed by a call to part_filter_clear() */ |
| extern void part_filter_set(uid_t uid); |
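| |
| /* |
| * Example usage (hypothetical sketch; "rpc_uid" is an illustrative local). |
| * Hide partitions the requesting user may not access while packing |
| * partition data, then restore the flags: |
| * |
| *    char *buffer = NULL; |
| *    int buffer_size = 0; |
| *    part_filter_set(rpc_uid); |
| *    pack_all_part(&buffer, &buffer_size, SHOW_ALL, rpc_uid, |
| *                  SLURM_PROTOCOL_VERSION); |
| *    part_filter_clear(); |
| *    ... transmit and xfree(buffer) ... |
| */ |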
| |
| /* part_fini - free all memory associated with partition records */ |
| extern void part_fini (void); |
| |
| /* |
| * Create a copy of a job's part_list (partition list) |
| * IN part_list_src - a job's part_list |
| * RET copy of part_list_src, must be freed by caller |
| */ |
| extern List part_list_copy(List part_list_src); |
| |
| /* |
| * Determine if the specified job can execute right now or is currently |
| * blocked by a partition state or limit. Execute job_limits_check() to |
| * re-validate job state. |
| */ |
| extern bool part_policy_job_runnable_state(struct job_record *job_ptr); |
| |
| /* Validate a job's account against the partition's AllowAccounts or |
| * DenyAccounts parameters. */ |
| extern int part_policy_valid_acct(struct part_record *part_ptr, char *acct); |
| |
| /* Validate a job's QOS against the partition's AllowQOS or |
| * DenyQOS parameters. */ |
| extern int part_policy_valid_qos( |
| struct part_record *part_ptr, slurmdb_qos_rec_t *qos_ptr); |
| |
| /* |
| * partition_in_use - determine whether a partition is in use by a RUNNING, |
| * PENDING, or SUSPENDED job |
| * IN part_name - name of a partition |
| * RET true if the partition is in use, else false |
| */ |
| extern bool partition_in_use(char *part_name); |
| |
| /* |
| * purge_old_job - purge old job records. |
| * The jobs must have completed at least MIN_JOB_AGE minutes ago. |
| * Test job dependencies, handle after_ok, after_not_ok before |
| * purging any jobs. |
| * NOTE: READ lock slurmctld config and WRITE lock jobs before entry |
| */ |
| void purge_old_job(void); |
| |
| /* Convert a comma delimited list of QOS names into a bitmap */ |
| extern void qos_list_build(char *qos, bitstr_t **qos_bits); |
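| |
| /* |
| * Example usage (hypothetical sketch; the QOS names are illustrative and |
| * FREE_NULL_BITMAP() comes from src/common/macros.h): |
| * |
| *    char qos_names[] = "normal,high"; |
| *    bitstr_t *qos_bits = NULL; |
| *    qos_list_build(qos_names, &qos_bits); |
| *    ... test bits as needed ... |
| *    FREE_NULL_BITMAP(qos_bits); |
| */ |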
| |
| /* Request that the job scheduler execute soon (typically within seconds) */ |
| extern void queue_job_scheduler(void); |
| |
| /* |
| * rehash_jobs - Create or rebuild the job hash table. |
| * NOTE: run lock_slurmctld before entry: Read config, write job |
| */ |
| extern void rehash_jobs(void); |
| |
| /* |
| * Rebuild a job step's core_bitmap_job after a job has just changed size |
| * IN job_ptr - job that was just re-sized |
| * IN orig_job_node_bitmap - the job's original node bitmap |
| */ |
| extern void rebuild_step_bitmaps(struct job_record *job_ptr, |
| bitstr_t *orig_job_node_bitmap); |
| |
| /* |
| * After a job step has fully completed, run this to release its resources |
| * and remove the step from the system. |
| */ |
| extern int post_job_step(struct step_record *step_ptr); |
| |
| /* update first assigned job id as needed on reconfigure */ |
| extern void reset_first_job_id(void); |
| |
| /* |
| * reset_job_bitmaps - reestablish bitmaps for existing jobs. |
| * this should be called after rebuilding node information, |
| * but before using any job entries. |
| * global: last_job_update - time of last job table update |
| * job_list - pointer to global job list |
| */ |
| extern void reset_job_bitmaps (void); |
| |
| /* Reset a node's CPU load value */ |
| extern void reset_node_load(char *node_name, uint32_t cpu_load); |
| |
| /* Reset all scheduling statistics |
| * IN level - clear backfilled_jobs count if set */ |
| extern void reset_stats(int level); |
| |
| /* |
| * restore_node_features - Make node and config (from slurm.conf) fields |
| * consistent for Features, Gres and Weight |
| * IN recover - |
| * 0, 1 = use data from config record, built using slurm.conf |
| * 2 = use data from node record, built from saved state |
| */ |
| extern void restore_node_features(int recover); |
| |
| /* Update time stamps for job step resume */ |
| extern void resume_job_step(struct job_record *job_ptr); |
| |
| /* run_backup - this is the backup controller; it runs in standby |
| * mode, assuming control when the primary controller stops responding */ |
| extern void run_backup(slurm_trigger_callbacks_t *callbacks); |
| |
| /* Spawn health check function for every node that is not DOWN */ |
| extern void run_health_check(void); |
| |
| /* save_all_state - save entire slurmctld state for later recovery */ |
| extern void save_all_state(void); |
| |
| /* make sure the assoc_mgr lists are up and running and state is |
| * restored */ |
| extern void ctld_assoc_mgr_init(slurm_trigger_callbacks_t *callbacks); |
| |
| /* send all info for the controller to accounting */ |
| extern void send_all_to_accounting(time_t event_time); |
| |
| /* The caller must hold a slurmctld lock that includes at least a node |
| * read lock before this is called */ |
| extern void set_cluster_cpus(void); |
| |
| /* sends all jobs in eligible state to accounting. Only needed at |
| * first registration |
| */ |
| extern int send_jobs_to_accounting(void); |
| |
| /* send all nodes in a DOWN-like state to accounting. Only needed at |
| * first registration |
| */ |
| extern int send_nodes_to_accounting(time_t event_time); |
| |
| /* Set a job's alias_list string */ |
| extern void set_job_alias_list(struct job_record *job_ptr); |
| |
| /* |
| * set_job_prio - set a default job priority |
| * IN job_ptr - pointer to the job_record |
| */ |
| extern void set_job_prio(struct job_record *job_ptr); |
| |
| /* |
| * set_node_down - make the specified node's state DOWN if possible |
| * (not in a DRAIN state), kill jobs as needed |
| * IN name - name of the node |
| * IN reason - why the node is DOWN |
| */ |
| extern void set_node_down (char *name, char *reason); |
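| |
| /* |
| * Example usage (hypothetical sketch; the node name and reason string are |
| * purely illustrative): |
| * |
| *    set_node_down("tux123", "Not responding"); |
| */ |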
| |
| /* |
| * set_node_down_ptr - make the specified compute node's state DOWN and |
| * kill jobs as needed |
| * IN node_ptr - node_ptr to the node |
| * IN reason - why the node is DOWN |
| */ |
| void set_node_down_ptr (struct node_record *node_ptr, char *reason); |
| |
| /* |
| * set_slurmctld_state_loc - create state directory as needed and "cd" to it |
| */ |
| extern void set_slurmctld_state_loc(void); |
| |
| /* set_slurmd_addr - establish the slurm_addr_t for the slurmd on each node |
| * Uses common data structures. */ |
| extern void set_slurmd_addr (void); |
| |
| /* |
| * signal_step_tasks - send specific signal to specific job step |
| * IN step_ptr - step record pointer |
| * IN signal - signal to send |
| * IN msg_type - message type to send |
| */ |
| void signal_step_tasks(struct step_record *step_ptr, uint16_t signal, |
| slurm_msg_type_t msg_type); |
| |
| /* |
| * signal_step_tasks_on_node - send specific signal to specific job step |
| * on a specific node. |
| * IN node_name - name of node on which to signal tasks |
| * IN step_ptr - step record pointer |
| * IN signal - signal to send |
| * IN msg_type - message type to send |
| */ |
| void signal_step_tasks_on_node(char* node_name, struct step_record *step_ptr, |
| uint16_t signal, slurm_msg_type_t msg_type); |
| |
| /* |
| * slurmctld_shutdown - wake up slurm_rpc_mgr thread via signal |
| * RET 0 or error code |
| */ |
| extern int slurmctld_shutdown(void); |
| |
| /* Perform periodic job step checkpoints (per user request) */ |
| extern void step_checkpoint(void); |
| |
| /* Update a job's record of allocated CPUs when a job step gets scheduled */ |
| extern void step_alloc_lps(struct step_record *step_ptr); |
| |
| /* |
| * step_create - creates a step_record for the job given by |
| * step_specs->job_id and sets it up according to step_specs. |
| * IN step_specs - job step specifications |
| * OUT new_step_record - pointer to the new step_record (NULL on error) |
| * IN batch_step - set if step is a batch script |
| * RET - 0 or error code |
| * NOTE: don't free the returned step_record because that is managed through |
| * the job. |
| */ |
| extern int step_create(job_step_create_request_msg_t *step_specs, |
| struct step_record** new_step_record, bool batch_step); |
| |
| /* |
| * step_layout_create - creates a step_layout according to the inputs. |
| * IN step_ptr - step having tasks laid out |
| * IN step_node_list - node list of hosts in step |
| * IN node_count - count of nodes in step allocation |
| * IN num_tasks - number of tasks in step |
| * IN cpus_per_task - number of cpus per task |
| * IN task_dist - type of task distribution |
| * IN plane_size - size of plane (only needed for the plane distribution) |
| * RET - NULL or slurm_step_layout_t * |
| * NOTE: the returned step_layout must be freed by the caller, usually |
| * when the step is freed. |
| */ |
| extern slurm_step_layout_t *step_layout_create(struct step_record *step_ptr, |
| char *step_node_list, |
| uint32_t node_count, |
| uint32_t num_tasks, |
| uint16_t cpus_per_task, |
| uint16_t task_dist, |
| uint16_t plane_size); |
| |
| /* start_power_mgr - Start power management thread as needed. The thread |
| * terminates automatically at slurmctld shutdown time. |
| * IN thread_id - pointer to thread ID of the started pthread. |
| */ |
| extern void start_power_mgr(pthread_t *thread_id); |
| |
| /* |
| * step_epilog_complete - note completion of epilog on some node and |
| * release its switch windows if appropriate. Can perform partition |
| * switch window releases. |
| * IN job_ptr - pointer to job which has completed epilog |
| * IN node_name - name of node which has completed epilog |
| */ |
| extern int step_epilog_complete(struct job_record *job_ptr, |
| char *node_name); |
| |
| /* |
| * step_partial_comp - Note the completion of a job step on at least |
| * some of its nodes |
| * IN req - step_completion_msg RPC from slurmstepd |
| * IN uid - UID issuing the request |
| * OUT rem - count of nodes for which responses are still pending |
| * OUT max_rc - highest return code for any step thus far |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int step_partial_comp(step_complete_msg_t *req, uid_t uid, |
| int *rem, uint32_t *max_rc); |
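| |
| /* |
| * Example usage (hypothetical sketch; "comp_msg" and "rpc_uid" are |
| * illustrative locals in an RPC handler). Record a partial completion |
| * and detect when every node has reported: |
| * |
| *    int rem = 0; |
| *    uint32_t max_rc = 0; |
| *    int rc = step_partial_comp(comp_msg, rpc_uid, &rem, &max_rc); |
| *    if ((rc == SLURM_SUCCESS) && (rem == 0)) |
| *        debug("all nodes reported, highest step rc=%u", max_rc); |
| */ |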
| |
| /* Update time stamps for job step suspend */ |
| extern void suspend_job_step(struct job_record *job_ptr); |
| |
| /* |
| * Synchronize the batch jobs in the system with their files. |
| * All pending batch jobs must have script and environment files; |
| * no other jobs should have such files. |
| */ |
| extern int sync_job_files(void); |
| |
| /* After recovering job state, if using priority/basic then we increment the |
| * priorities of all jobs to avoid decrementing the base down to zero */ |
| extern void sync_job_priorities(void); |
| |
| /* |
| * update_job - update a job's parameters per the supplied specifications |
| * IN job_specs - a job's specification |
| * IN uid - uid of user issuing RPC |
| * RET 0 on success, otherwise an error code from slurm_errno.h |
| * global: job_list - global list of job entries |
| * last_job_update - time of last job table update |
| */ |
| extern int update_job (job_desc_msg_t * job_specs, uid_t uid); |
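| |
| /* |
| * Example usage (hypothetical sketch; the field names come from |
| * job_desc_msg_t in slurm.h and the values are illustrative). Raise a |
| * job's time limit to 120 minutes: |
| * |
| *    job_desc_msg_t job_specs; |
| *    slurm_init_job_desc_msg(&job_specs); |
| *    job_specs.job_id = 1234; |
| *    job_specs.time_limit = 120; |
| *    int rc = update_job(&job_specs, rpc_uid); |
| */ |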
| |
| /* |
| * Modify the account associated with a pending job |
| * IN module - where this is called from |
| * IN job_ptr - pointer to job which should be modified |
| * IN new_account - desired account name |
| * RET SLURM_SUCCESS or error code |
| */ |
| extern int update_job_account(char *module, struct job_record *job_ptr, |
| char *new_account); |
| |
| /* |
| * Modify the wckey associated with a pending job |
| * IN module - where this is called from |
| * IN job_ptr - pointer to job which should be modified |
| * IN new_wckey - desired wckey name |
| * RET SLURM_SUCCESS or error code |
| */ |
| extern int update_job_wckey(char *module, struct job_record *job_ptr, |
| char *new_wckey); |
| |
| /* Reset nodes_completing field for all jobs */ |
| extern void update_job_nodes_completing(void); |
| |
| /* Reset slurmctld logging based upon configuration parameters |
| * uses common slurmctld_conf data structure */ |
| extern void update_logging(void); |
| |
| /* |
| * update_node - update the configuration data for one or more nodes |
| * IN update_node_msg - update node request |
| * RET 0 or error code |
| * global: node_record_table_ptr - pointer to global node table |
| */ |
| extern int update_node ( update_node_msg_t * update_node_msg ) ; |
| |
| /* Update nodes accounting usage data */ |
| extern void update_nodes_acct_gather_data(void); |
| |
| /* |
| * update_node_record_acct_gather_data - update the energy data in the |
| * node_record |
| * IN msg - node energy data message |
| * RET 0 if no error, ENOENT if no such node |
| */ |
| extern int update_node_record_acct_gather_data( |
| acct_gather_node_resp_msg_t *msg); |
| |
| /* |
| * update_part - create or update a partition's configuration data |
| * IN part_desc - description of partition changes |
| * IN create_flag - create a new partition |
| * RET 0 or an error code |
| * global: part_list - list of partition entries |
| * last_part_update - update time of partition records |
| */ |
| extern int update_part (update_part_msg_t * part_desc, bool create_flag); |
| |
| /* Process job step update request from specified user, |
| * RET - 0 or error code */ |
| extern int update_step(step_update_request_msg_t *req, uid_t uid); |
| |
| /* |
| * validate_alloc_node - validate that the allocating node |
| * is allowed to use this partition |
| * IN part_ptr - pointer to a partition |
| * IN alloc_node - allocating node of the request |
| * RET 1 if permitted to run, 0 otherwise |
| */ |
| extern int validate_alloc_node(struct part_record *part_ptr, char* alloc_node); |
| |
| /* |
| * validate_group - validate that the submit uid is authorized to run in |
| * this partition |
| * IN part_ptr - pointer to a partition |
| * IN run_uid - user to run the job as |
| * RET 1 if permitted to run, 0 otherwise |
| */ |
| extern int validate_group (struct part_record *part_ptr, uid_t run_uid); |
| |
| /* Perform some size checks on strings we store to prevent |
| * a malicious user from filling slurmctld's memory |
| * RET 0 or error code */ |
| extern int validate_job_create_req(job_desc_msg_t * job_desc); |
| |
| /* |
| * validate_jobs_on_node - validate that any jobs that should be on the node |
| * are actually running; if not, clean up the job and/or node records. |
| * Call this function after validate_node_specs() sets the node state |
| * properly |
| * IN reg_msg - node registration message |
| */ |
| extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg); |
| |
| /* |
| * validate_node_specs - validate the node's specifications as valid, |
| * if not set state to down, in any case update last_response |
| * IN reg_msg - node registration message |
| * IN protocol_version - Version of Slurm on this node |
| * OUT newly_up - set if node newly brought into service |
| * RET 0 if no error, ENOENT if no such node, EINVAL if values too low |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg, |
| uint16_t protocol_version, bool *newly_up); |
| |
| /* |
| * validate_nodes_via_front_end - validate all nodes on a cluster as having |
| * a valid configuration as soon as the front-end registers. Individual |
| * nodes will not register with this configuration |
| * IN reg_msg - node registration message |
| * IN protocol_version - Version of Slurm on this node |
| * OUT newly_up - set if node newly brought into service |
| * RET 0 if no error, SLURM error code otherwise |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| extern int validate_nodes_via_front_end( |
| slurm_node_registration_status_msg_t *reg_msg, |
| uint16_t protocol_version, bool *newly_up); |
| |
| /* |
| * validate_slurm_user - validate that the uid is authorized to see |
| * privileged data (either user root or SlurmUser) |
| * IN uid - user to validate |
| * RET true if permitted to run, false otherwise |
| */ |
| extern bool validate_slurm_user(uid_t uid); |
| |
| /* |
| * validate_super_user - validate that the uid is authorized at the |
| * root, SlurmUser, or SLURMDB_ADMIN_SUPER_USER level |
| * IN uid - user to validate |
| * RET true if permitted to run, false otherwise |
| */ |
| extern bool validate_super_user(uid_t uid); |
| |
| /* |
| * validate_operator - validate that the uid is authorized at the |
| * root, SlurmUser, or SLURMDB_ADMIN_OPERATOR level |
| * IN uid - user to validate |
| * RET true if permitted to run, false otherwise |
| */ |
| extern bool validate_operator(uid_t uid); |
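| |
| /* |
| * Example usage (hypothetical sketch of a privilege check in an RPC |
| * handler; the choice of error code is illustrative): |
| * |
| *    if (!validate_operator(rpc_uid)) { |
| *        error("Security violation, uid=%u", (unsigned int) rpc_uid); |
| *        return ESLURM_ACCESS_DENIED; |
| *    } |
| */ |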
| |
| /* cleanup_completing() |
| * |
| * Clean up the JOB_COMPLETING flag and eventually |
| * requeue the job if there is a pending request |
| * for it. This function assumes the caller has the |
| * appropriate locks on the job_record. |
| * This function is called when a job |
| * completes, after either the slurmd epilog |
| * or the slurmctld epilog finishes, |
| * whichever comes last. |
| */ |
| extern void cleanup_completing(struct job_record *); |
| |
| |
| #endif /* !_HAVE_SLURMCTLD_H */ |