blob: 2f8d0fce1a3d1474a81b47cc6c7ebfc0556f5a9f [file] [log] [blame]
/*****************************************************************************\
* node_conf.h - definitions for reading the node part of slurm configuration
* file and work with the corresponding structures
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Copyright (C) SchedMD LLC.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov> et. al.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _HAVE_NODE_CONF_H
#define _HAVE_NODE_CONF_H
#include "config.h"
#include <inttypes.h>
#include <time.h>
#include "src/common/bitstring.h"
#include "src/common/extra_constraints.h"
#include "src/common/hostlist.h"
#include "src/common/list.h"
#include "src/common/read_config.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/xhash.h"
#define CONFIG_MAGIC 0xc065eded
#define NODE_MAGIC 0x0de575ed
typedef struct {
uint16_t boards; /* count of boards configured */
uint16_t core_spec_cnt; /* number of specialized cores */
uint16_t cores; /* number of cores per socket */
uint32_t cpu_bind; /* default CPU binding type */
char *cpu_spec_list; /* arbitrary list of specialized cpus */
uint16_t cpus; /* count of processors running on the node */
char *feature; /* arbitrary list of node's features */
char *gres; /* arbitrary list of node's generic resources */
uint32_t magic; /* magic cookie to test data integrity */
uint64_t mem_spec_limit; /* MB real memory for memory specialization */
bitstr_t *node_bitmap; /* bitmap of nodes with this configuration */
char *nodes; /* name of nodes with this configuration */
uint64_t real_memory; /* MB real memory on the node */
uint16_t res_cores_per_gpu; /* number of cores per GPU to allow
* to only GPU jobs */
uint16_t threads; /* number of threads per core */
uint32_t tmp_disk; /* MB total storage in TMP_FS file system */
uint16_t tot_sockets; /* number of sockets per node */
char *topology_str; /* topology address string */
double *tres_weights; /* array of TRES weights */
char *tres_weights_str; /* per TRES billing weight string */
uint32_t weight; /* arbitrary priority of node for
* scheduling work on */
} config_record_t;
extern list_t *config_list; /* list of config_record entries */
typedef struct node_record node_record_t;
struct node_record {
/*
* alloc fields are managed by the select plugin
* call select_g_select_nodeinfo_set_all() to update
*/
uint16_t alloc_cpus; /* allocated cpus */
uint64_t alloc_memory; /* allocated memory */
char *alloc_tres_fmt_str; /* allocated TRES */
char *arch; /* computer architecture */
char *bcast_address; /* BcastAddr */
uint16_t boards; /* count of boards configured */
time_t boot_req_time; /* Time of node boot request */
time_t boot_time; /* Time of node boot,
* computed from up_time */
time_t cert_last_renewal; /* Time of last TLS cert renewal */
char *cert_token; /* unique token for certmgr validation */
char *comm_name; /* communications path name to node */
char *comment; /* arbitrary comment */
uint16_t comp_job_cnt; /* count of jobs completing on node */
config_record_t *config_ptr; /* configuration spec ptr */
uint16_t core_spec_cnt; /* number of specialized cores on node*/
uint16_t cores; /* number of cores per socket */
uint32_t cpu_bind; /* default CPU binding type */
uint32_t cpu_load; /* CPU load * 100 */
time_t cpu_load_time; /* Time when cpu_load last set */
char *cpu_spec_list; /* node's specialized cpus */
uint16_t cpus; /* count of processors on the node */
uint16_t cpus_efctv; /* count of effective cpus on the node.
i.e. cpus minus specialized cpus*/
acct_gather_energy_t *energy; /* power consumption data */
char *extra; /* arbitrary string */
data_t *extra_data; /* Data serialized from extra */
char *features; /* node's available features, used only
* for state save/restore, DO NOT
* use for scheduling purposes */
char *features_act; /* node's active features, used only
* for state save/restore, DO NOT
* use for scheduling purposes */
uint64_t free_mem; /* Free memory in MiB */
time_t free_mem_time; /* Time when free_mem last set */
char *gpu_spec; /* node's cores reserved for GPU jobs */
bitstr_t *gpu_spec_bitmap; /* node gpu core specialization
* bitmap */
char *gres; /* node's generic resources, used only
* for state save/restore, DO NOT
* use for scheduling purposes */
list_t *gres_list; /* list of gres state info managed by
* plugins */
uint32_t index; /* Index into node_record_table_ptr */
char *instance_id; /* cloud instance id */
char *instance_type; /* cloud instance type */
time_t last_busy; /* time node was last busy (no jobs) */
time_t last_response; /* last response from the node */
uint32_t magic; /* magic cookie for data integrity */
char *mcs_label; /* mcs_label if mcs plugin in use */
uint64_t mem_spec_limit; /* MB memory limit for specialization */
char *name; /* name of the node. NULL==defunct */
uint32_t next_state; /* state after reboot */
uint16_t no_share_job_cnt; /* count of jobs running that will
* not share nodes */
char *node_hostname; /* hostname of the node */
node_record_t *node_next; /* next entry with same hash index */
uint32_t node_rank; /* Hilbert number based on node name,
* or other sequence number used to
* order nodes by location,
* no need to save/restore */
bitstr_t *node_spec_bitmap; /* node cpu specialization bitmap */
uint32_t node_state; /* enum node_states, ORed with
* NODE_STATE_NO_RESPOND if not
* responding */
bool not_responding; /* set if fails to respond,
* clear after logging this */
char *os; /* operating system now running */
uint32_t owner; /* User allowed to use node or NO_VAL */
uint16_t owner_job_cnt; /* Count of exclusive jobs by "owner" */
uint16_t part_cnt; /* number of associated partitions */
void **part_pptr; /* array of pointers to partitions
* associated with this node*/
uint16_t port; /* TCP port number of the slurmd */
time_t power_save_req_time; /* Time of power_save request */
uint16_t protocol_version; /* Slurm version number */
uint64_t real_memory; /* MB real memory on the node */
char *reason; /* why a node is DOWN or DRAINING */
time_t reason_time; /* Time stamp when reason was
* set, ignore if no reason is set. */
uint32_t reason_uid; /* User that set the reason, ignore if
* no reason is set. */
uint16_t res_cores_per_gpu; /* number of cores per GPU to allow to
* only GPU jobs */
time_t resume_after; /* automatically resume DOWN or DRAINED
* node at this point in time */
uint16_t resume_timeout; /* time required in order to perform a
* node resume operation */
char *resv_name; /* If node is in a reservation this is
* the name of the reservation */
uint16_t run_job_cnt; /* count of jobs running on node */
uint64_t sched_weight; /* Node's weight for scheduling
* purposes. For cons_tres use */
time_t slurmd_start_time; /* Time of slurmd startup */
uint16_t sus_job_cnt; /* count of jobs suspended on node */
uint32_t suspend_time; /* node idle for this long before
* power save mode */
uint16_t suspend_timeout; /* time required in order to perform a
* node suspend operation */
char *topology_str; /* topology address string
* (used by topology_plugin) */
uint64_t *tres_cnt; /* tres this node has. NO_PACK*/
char *tres_fmt_str; /* tres this node has */
char *tres_str; /* tres this node has */
uint16_t threads; /* number of threads per core */
uint32_t tmp_disk; /* MB total disk in TMP_FS */
uint16_t tot_cores; /* number of cores per node */
uint16_t tot_sockets; /* number of sockets per node */
uint32_t up_time; /* seconds since node boot */
char *version; /* Slurm version */
uint16_t tpc; /* number of threads we are using per
* core */
uint32_t weight; /* original weight, used only for state
* save/restore, DO NOT use for
* scheduling purposes. */
};
extern node_record_t **node_record_table_ptr; /* ptr to node records */
extern int node_record_count; /* number of node slots
* node_record_table_ptr */
extern int active_node_record_count; /* non-null node count in
* node_record_table_ptr */
extern xhash_t* node_hash_table; /* hash table for node records */
extern time_t last_node_update; /* time of last node record update */
extern uint16_t *cr_node_num_cores;
extern uint32_t *cr_node_cores_offset;
extern time_t slurmd_start_time;
extern list_t *active_feature_list; /* list of currently active features_records */
extern list_t *avail_feature_list; /* list of available features_records */
/*
* bitmap2node_name_sortable - given a bitmap, build a list of comma
* separated node names. names may include regular expressions
* (e.g. "lx[01-10]")
* IN bitmap - bitmap pointer
* IN sort - returned ordered list or not
* RET pointer to node list or NULL on error
* globals: node_record_table_ptr - pointer to node table
* NOTE: the caller must xfree the memory at node_list when no longer required
*/
char * bitmap2node_name_sortable (bitstr_t *bitmap, bool sort);
/*
* bitmap2node_name - given a bitmap, build a list of comma separated node
* names. names may include regular expressions (e.g. "lx[01-10]")
* IN bitmap - bitmap pointer
* RET pointer to node list or NULL on error
* globals: node_record_table_ptr - pointer to node table
* NOTE: the caller must xfree the memory at node_list when no longer required
*/
char * bitmap2node_name (bitstr_t *bitmap);
/*
* bitmap2hostlist - given a bitmap, build a hostlist
* IN bitmap - bitmap pointer
* RET pointer to hostlist or NULL on error
* globals: node_record_table_ptr - pointer to node table
* NOTE: the caller must xfree the memory at node_list when no longer required
*/
hostlist_t *bitmap2hostlist(bitstr_t *bitmap);
/*
* build_all_nodeline_info - get a array of slurm_conf_node_t structures
* from the slurm.conf reader, build table, and set values
* IN set_bitmap - if true then set node_bitmap in config record (used by
* slurmd), false is used by slurmctld, clients, and testsuite
* IN tres_cnt - number of TRES configured on system (used on controller side)
*/
extern int build_all_nodeline_info(bool set_bitmap, int tres_cnt);
/*
* Build a node's node_spec_bitmap and core_spec_cnt from it's cpu_spec_list.
*/
extern int build_node_spec_bitmap(node_record_t *node_ptr);
/*
* Expand a nodeline's node names, host names, addrs, ports into separate nodes.
*/
extern int expand_nodeline_info(slurm_conf_node_t *node_ptr,
config_record_t *config_ptr,
char **err_msg,
int (*_callback) (
char *alias, char *hostname,
char *address, char *bcast_addr,
uint16_t port, int state_val,
slurm_conf_node_t *node_ptr,
config_record_t *config_ptr));
/*
* create_config_record - create a config_record entry and set is values to
* the defaults. each config record corresponds to a line in the
* slurm.conf file and typically describes the configuration of a
* large number of nodes
* RET pointer to the config_record
* NOTE: memory allocated will remain in existence until
* _delete_config_record() is called to delete all configuration records
*/
extern config_record_t *create_config_record(void);
/*
* Create a config_record and initialize it with the given conf_node.
*
* IN conf_node - conf_node from slurm.conf to initialize config_record with.
* IN tres_cnt - number of system tres to initialize tres arrays.
* RET return config_record_t* on success, NULL otherwise.
*/
extern config_record_t *config_record_from_conf_node(
slurm_conf_node_t *conf_node, int tres_cnt);
/*
* Grow the node_record_table_ptr.
*/
extern void grow_node_record_table_ptr(void);
/*
* create_node_record - create a node record and set its values to defaults
* IN config_ptr - pointer to node's configuration information
* IN node_name - name of the node
* OUT node_ptr - node_record_t** with created node on SUCCESS, NULL otherwise.
* RET SUCCESS, or error code
* NOTE: grows node_record_table_ptr if needed and appends a new node_record_t *
* to node_record_table_ptr and increases node_record_count.
*/
extern int create_node_record(config_record_t *config_ptr, char *node_name,
node_record_t **node_ptr);
/*
* Create a new node_record_t * at the specified index.
*
* IN index - index in node_record_table_ptr where to create new
* node_record_t *. node_record_table_ptr[index] should be null and
* less than node_record_count.
* IN node_name - name of node to create
* IN config_ptr - pointer to node's configuration information
* RET new node_record_t * on success, NULL otherwise.
* NOTE: node_record_count isn't changed.
*/
extern node_record_t *create_node_record_at(int index, char *node_name,
config_record_t *config_ptr);
/*
* Add a node to node_record_table_ptr without growing the table and increasing
* node_reocrd_count. The node in an empty slot in the node_record_table_ptr.
*
* IN alias - name of node.
* IN config_ptr - config_record_t* to initialize node with.
* OUT node_ptr - node_record_t** with added node on SUCCESS, NULL otherwise.
* RET SUCCESS, or error code
*/
extern int add_node_record(char *alias, config_record_t *config_ptr,
node_record_t **node_ptr);
/*
* Add existing record to node_record_table_ptr at specific index
*
* Node must fit in currently allocated node_record_count/MaxNodeCount.
* node_ptr->config_ptr is added to the the global config_list.
*/
extern void insert_node_record_at(node_record_t *node_ptr, int index);
/*
* Delete node from node_record_table_ptr.
*
* IN node_ptr - node_ptr to delete
*/
extern void delete_node_record(node_record_t *node_ptr);
/*
* find_node_record - find a record for node with specified name
* IN: name - name of the desired node
* RET: pointer to node record or NULL if not found
* NOTE: Logs an error if the node name is NOT found
*/
extern node_record_t *find_node_record(char *name);
/*
* find_node_record2 - find a record for node with specified name
* IN: name - name of the desired node
* RET: pointer to node record or NULL if not found
* NOTE: Does not log an error if the node name is NOT found
*/
extern node_record_t *find_node_record2(char *name);
/*
* find_node_record_no_alias - find a record for node with specified name
* without looking at the node's alias (NodeHostName).
* IN: name - name of the desired node
* RET: pointer to node record or NULL if not found
* NOTE: Does not log an error if the node name is NOT found
*/
extern node_record_t *find_node_record_no_alias(char *name);
/*
* hostlist2bitmap - given a hostlist, build a bitmap representation
* IN hl - hostlist
* IN best_effort - if set don't return an error on invalid node name entries
* OUT bitmap - set to bitmap, may not have all bits set on error
* RET 0 if no error, otherwise EINVAL
*/
extern int hostlist2bitmap(hostlist_t *hl, bool best_effort, bitstr_t **bitmap);
/*
* init_node_conf - initialize the node configuration tables and values.
* this should be called before creating any node or configuration
* entries.
*/
extern void init_node_conf(void);
/* node_fini2 - free memory associated with node records (except bitmaps) */
extern void node_fini2 (void);
/*
* given a node name return inx in node_record_table_ptr
*/
extern int node_name_get_inx(char *node_name);
extern void add_nodes_with_feature_to_bitmap(bitstr_t *bitmap, char *feature);
extern int parse_hostlist_functions(hostlist_t **hostlist);
/*
* node_name2bitmap - given a node name regular expression, build a bitmap
* representation
* IN node_names - list of nodes
* IN best_effort - if set don't return an error on invalid node name entries
* OUT bitmap - set to bitmap, may not have all bits set on error
* IN/OUT invalid_hostlist - hostlist of invalid host names, initialize to NULL
* RET 0 if no error, otherwise EINVAL
* NOTE: the caller must bit_free() memory at bitmap when no longer required
*/
extern int node_name2bitmap (char *node_names, bool best_effort,
bitstr_t **bitmap, hostlist_t **invalid_hostlist);
/* Purge the contents of a node record */
extern void purge_node_rec(void *in);
/*
* rehash_node - build a hash table of the node_record entries.
* NOTE: manages memory for node_hash_table
*/
extern void rehash_node (void);
/* Convert a node state string to it's equivalent enum value */
extern int state_str2int(const char *state_str, char *node_name);
/* (re)set cr_node_num_cores arrays */
extern void cr_init_global_core_data(node_record_t **node_ptr, int node_cnt);
extern void cr_fini_global_core_data(void);
/*return the coremap index to the first core of the given node */
extern uint32_t cr_get_coremap_offset(uint32_t node_index);
/*
* Determine maximum number of CPUs on this node usable by a job
* ntasks_per_core IN - tasks-per-core to be launched by this job
* cpus_per_task IN - number of required CPUs per task for this job
* total_cores IN - total number of cores on this node
* total_cpus IN - total number of CPUs on this node
* RET count of usable CPUs on this node usable by this job
*/
extern int adjust_cpus_nppcu(uint16_t ntasks_per_core, int cpus_per_task,
int total_cores, int total_cpus);
/*
* find_hostname - Given a position and a string of hosts, return the hostname
* from that position.
* IN pos - position in hosts you want returned.
* IN hosts - string representing a hostlist of hosts.
* RET - hostname or NULL on error.
* NOTE: caller must xfree result.
*/
extern char *find_hostname(uint32_t pos, char *hosts);
/*
* Return the next non-null node_record_t * in the node_record_table_ptr.
*
* IN/OUT index - index to start iterating node_record_table_ptr from.
* Should be used in the following form so that i will increment
* to the next slot and i == node_ptr->index.
* e.g.
* for (int i = 0; (node_ptr = next_node(&i); i++)
* RET - next non-null node_record_t * or NULL if finished iterating.
*/
extern node_record_t *next_node(int *index);
/*
* Return the next non-null node_record_t * in the node_record_table_ptr that
* is set in the given bitmap.
*
* IN bitmap - bitmap of available nodes in node_record_table_ptr. The bitmap
* must match size of node_record_table_ptr.
* IN/OUT index - index to start iterating node_record_table_ptr from.
* RET - next non-null node_record_t * or NULL if finished iterating.
*/
extern node_record_t *next_node_bitmap(bitstr_t *bitmap, int *index);
/*
* Return bitmap with all active nodes set.
*
* node_record_table_ptr may have NULL slots in it, so return a bitmap with only
* non-null node bits set.
*
* NOTE: caller must free returned bitmap.
*/
extern bitstr_t *node_conf_get_active_bitmap(void);
/*
* Set bitmap with all active node bits.
*
* node_record_table_ptr may have NULL slots in it, so only set non-null node
* bits.
*/
extern void node_conf_set_all_active_bits(bitstr_t *b);
/*
* Tokenize node string on comma not followed by a digit. Return pointer to next
* token or NULL if there are no more.
*
* Commas followed by a digit are assumed to be within brackets.
* ex. string h[1,5],h[8,9] tokens are h[1,5] and h[8,9]
*
* IN s - pointer to string
* IN save_ptr - maintains context between successive calls that parse the same
* string (similar to strtok_r())
*
* NOTE: Like strtok_r() characters in s may be modified.
*/
extern char *node_conf_nodestr_tokenize(char *s, char **save_ptr);
/*
* Make a bitmap the size of the full cluster.
* IN/OUT core_bitmap - If *core_bitmap noop, otherwise create a bitstr_t the
* size of the cluster.
*/
extern void node_conf_create_cluster_core_bitmap(bitstr_t **core_bitmap);
/* used for stepmgr and omits sensitive fields */
extern void node_record_pack(void *in,
uint16_t protocol_version,
buf_t *buffer);
/* used for state files and may contain sensitive fields */
extern void node_record_pack_state(void *in,
uint16_t protocol_version,
buf_t *buffer);
extern int node_record_unpack(void **out,
uint16_t protocol_version,
buf_t *buffer);
/*
* Convert CPU list to reserve whole cores
* OUT:
* node_ptr->cpu_spec_list
*/
extern void node_conf_convert_cpu_spec_list(node_record_t *node_ptr);
/*
* Select cores and CPUs to be reserved for core specialization.
*/
extern void node_conf_select_spec_cores(node_record_t *node_ptr);
/* Create config_record_t from a packed node_record_t */
extern config_record_t *config_record_from_node_record(node_record_t *node_ptr);
extern int list_find_feature(void *feature_entry, void *key);
#endif /* !_HAVE_NODE_CONF_H */