| /*****************************************************************************\ |
| * node_conf.c - partially manage the node records of slurm |
| * (see src/slurmctld/node_mgr.c for the set of functionalities |
| * related to slurmctld usage of nodes) |
| * Note: there is a global node table (node_record_table_ptr), its |
| * hash table (node_hash_table), time stamp (last_node_update) and |
| * configuration list (config_list) |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Copyright (C) 2010-2017 SchedMD LLC. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> et. al. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include <ctype.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <inttypes.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <time.h> |
| |
| #include "src/common/assoc_mgr.h" |
| #include "src/common/gres.h" |
| #include "src/common/hostlist.h" |
| #include "src/common/macros.h" |
| #include "src/common/node_select.h" |
| #include "src/common/pack.h" |
| #include "src/common/parse_time.h" |
| #include "src/common/read_config.h" |
| #include "src/common/slurm_accounting_storage.h" |
| #include "src/common/slurm_acct_gather_energy.h" |
| #include "src/common/slurm_ext_sensors.h" |
| #include "src/common/slurm_topology.h" |
| #include "src/common/xassert.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| |
| #define _DEBUG 0 |
| |
| strong_alias(init_node_conf, slurm_init_node_conf); |
| strong_alias(build_all_nodeline_info, slurm_build_all_nodeline_info); |
| strong_alias(rehash_node, slurm_rehash_node); |
| strong_alias(hostlist2bitmap, slurm_hostlist2bitmap); |
| |
| /* Global variables */ |
| List config_list = NULL; /* list of config_record entries */ |
| List front_end_list = NULL; /* list of slurm_conf_frontend_t entries */ |
| time_t last_node_update = (time_t) 0; /* time of last update */ |
| node_record_t *node_record_table_ptr = NULL; /* node records */ |
| xhash_t* node_hash_table = NULL; |
| int node_record_count = 0; /* count in node_record_table_ptr */ |
| uint16_t *cr_node_num_cores = NULL; |
| uint32_t *cr_node_cores_offset = NULL; |
| |
| /* Local function definitions */ |
| static int _delete_config_record (void); |
| #if _DEBUG |
| static void _dump_hash (void); |
| #endif |
| static node_record_t *_find_node_record(char *name, bool test_alias, |
| bool log_missing); |
| static void _list_delete_config (void *config_entry); |
| static void _node_record_hash_identity (void* item, const char** key, |
| uint32_t* key_len); |
| |
| /* |
| * _delete_config_record - delete all configuration records |
| * RET 0 if no error, errno otherwise |
| * global: config_list - list of all configuration records |
| */ |
| static int _delete_config_record (void) |
| { |
| last_node_update = time (NULL); |
| list_flush(config_list); |
| list_flush(front_end_list); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| |
#if _DEBUG
/*
 * xhash_walk_helper_cbk - xhash_walk() callback used by _dump_hash() to log
 *	one element of the node hash table
 * IN item - node record stored in the hash table
 * IN/OUT arg - pointer to an int holding the position of this element in
 *	the current walk; incremented on every call
 */
static void xhash_walk_helper_cbk(void *item, void *arg)
{
	int *walk_pos = (int *) arg;
	node_record_t *node_ptr = (node_record_t *) item;
	int inx = (int) (node_ptr - node_record_table_ptr);

	debug3("node_hash[%d]:%d(%s)", (*walk_pos)++, inx, node_ptr->name);
}

/*
 * _dump_hash - print the node_hash_table contents, used for debugging
 *	or analysis of hash technique
 * global: node_record_table_ptr - pointer to global node table
 *         node_hash_table - table of hash indexes
 */
static void _dump_hash(void)
{
	/*
	 * The walk position lives here rather than in a static inside the
	 * callback so that every dump starts counting from zero again.
	 */
	int walk_pos = 0;

	if (node_hash_table == NULL)
		return;

	/* cast so the argument always matches the %ld conversion */
	debug2("node_hash: indexing %ld elements",
	       (long) xhash_count(node_hash_table));
	xhash_walk(node_hash_table, xhash_walk_helper_cbk, &walk_pos);
}
#endif
| |
| /* _list_delete_config - delete an entry from the config list, |
| * see list.h for documentation */ |
| static void _list_delete_config (void *config_entry) |
| { |
| config_record_t *config_ptr = (config_record_t *) config_entry; |
| |
| xassert(config_ptr); |
| xassert(config_ptr->magic == CONFIG_MAGIC); |
| xfree(config_ptr->cpu_spec_list); |
| xfree(config_ptr->feature); |
| xfree(config_ptr->gres); |
| xfree (config_ptr->nodes); |
| FREE_NULL_BITMAP (config_ptr->node_bitmap); |
| xfree(config_ptr->tres_weights); |
| xfree(config_ptr->tres_weights_str); |
| xfree (config_ptr); |
| } |
| |
| /* |
| * xhash helper function to index node_record per name field |
| * in node_hash_table |
| */ |
| static void _node_record_hash_identity (void* item, const char** key, |
| uint32_t* key_len) |
| { |
| node_record_t *node_ptr = (node_record_t *) item; |
| *key = node_ptr->name; |
| *key_len = strlen(node_ptr->name); |
| } |
| |
| /* |
| * bitmap2hostlist - given a bitmap, build a hostlist |
| * IN bitmap - bitmap pointer |
| * RET pointer to hostlist or NULL on error |
| * globals: node_record_table_ptr - pointer to node table |
| * NOTE: the caller must xfree the memory at node_list when no longer required |
| */ |
| hostlist_t bitmap2hostlist (bitstr_t *bitmap) |
| { |
| int i, first, last; |
| hostlist_t hl; |
| |
| if (bitmap == NULL) |
| return NULL; |
| |
| first = bit_ffs(bitmap); |
| if (first == -1) |
| return NULL; |
| |
| last = bit_fls(bitmap); |
| hl = hostlist_create(NULL); |
| for (i = first; i <= last; i++) { |
| if (bit_test(bitmap, i) == 0) |
| continue; |
| hostlist_push_host(hl, node_record_table_ptr[i].name); |
| } |
| return hl; |
| |
| } |
| |
| /* |
| * bitmap2node_name_sortable - given a bitmap, build a list of comma |
| * separated node names. names may include regular expressions |
| * (e.g. "lx[01-10]") |
| * IN bitmap - bitmap pointer |
| * IN sort - returned sorted list or not |
| * RET pointer to node list or NULL on error |
| * globals: node_record_table_ptr - pointer to node table |
| * NOTE: the caller must xfree the memory at node_list when no longer required |
| */ |
| char * bitmap2node_name_sortable (bitstr_t *bitmap, bool sort) |
| { |
| hostlist_t hl; |
| char *buf; |
| |
| hl = bitmap2hostlist (bitmap); |
| if (hl == NULL) |
| return xstrdup(""); |
| if (sort) |
| hostlist_sort(hl); |
| buf = hostlist_ranged_string_xmalloc(hl); |
| hostlist_destroy(hl); |
| return buf; |
| } |
| |
| /* |
| * bitmap2node_name - given a bitmap, build a list of sorted, comma |
| * separated node names. names may include regular expressions |
| * (e.g. "lx[01-10]") |
| * IN bitmap - bitmap pointer |
| * RET pointer to node list or NULL on error |
| * globals: node_record_table_ptr - pointer to node table |
| * NOTE: the caller must xfree the memory at node_list when no longer required |
| */ |
| char * bitmap2node_name (bitstr_t *bitmap) |
| { |
| return bitmap2node_name_sortable(bitmap, 1); |
| } |
| |
#ifdef HAVE_FRONT_END
/*
 * _dump_front_end - write every field of one front end record to the log
 *	at info level
 * IN fe_ptr - front end record to log
 */
static void _dump_front_end(slurm_conf_frontend_t *fe_ptr)
{
	info("fe name:%s addr:%s port:%u state:%u reason:%s "
	     "allow_groups:%s allow_users:%s "
	     "deny_groups:%s deny_users:%s",
	     fe_ptr->frontends,
	     fe_ptr->addresses,
	     fe_ptr->port,
	     fe_ptr->node_state,
	     fe_ptr->reason,
	     fe_ptr->allow_groups,
	     fe_ptr->allow_users,
	     fe_ptr->deny_groups,
	     fe_ptr->deny_users);
}
#endif
| |
| /* |
| * build_all_frontend_info - get a array of slurm_conf_frontend_t structures |
| * from the slurm.conf reader, build table, and set values |
| * is_slurmd_context: set to true if run from slurmd |
| * RET 0 if no error, error code otherwise |
| */ |
| extern int build_all_frontend_info (bool is_slurmd_context) |
| { |
| slurm_conf_frontend_t **ptr_array; |
| #ifdef HAVE_FRONT_END |
| slurm_conf_frontend_t *fe_single, *fe_line; |
| int i, count, max_rc = SLURM_SUCCESS; |
| bool front_end_debug; |
| |
| if (slurm_get_debug_flags() & DEBUG_FLAG_FRONT_END) |
| front_end_debug = true; |
| else |
| front_end_debug = false; |
| count = slurm_conf_frontend_array(&ptr_array); |
| if (count == 0) |
| fatal("No FrontendName information available!"); |
| |
| for (i = 0; i < count; i++) { |
| hostlist_t hl_name, hl_addr; |
| char *fe_name, *fe_addr; |
| |
| fe_line = ptr_array[i]; |
| hl_name = hostlist_create(fe_line->frontends); |
| if (hl_name == NULL) |
| fatal("Invalid FrontendName:%s", fe_line->frontends); |
| hl_addr = hostlist_create(fe_line->addresses); |
| if (hl_addr == NULL) |
| fatal("Invalid FrontendAddr:%s", fe_line->addresses); |
| if (hostlist_count(hl_name) != hostlist_count(hl_addr)) { |
| fatal("Inconsistent node count between " |
| "FrontendName(%s) and FrontendAddr(%s)", |
| fe_line->frontends, fe_line->addresses); |
| } |
| while ((fe_name = hostlist_shift(hl_name))) { |
| fe_addr = hostlist_shift(hl_addr); |
| fe_single = xmalloc(sizeof(slurm_conf_frontend_t)); |
| list_append(front_end_list, fe_single); |
| fe_single->frontends = xstrdup(fe_name); |
| fe_single->addresses = xstrdup(fe_addr); |
| free(fe_name); |
| free(fe_addr); |
| if (fe_line->allow_groups && fe_line->allow_groups[0]) { |
| fe_single->allow_groups = |
| xstrdup(fe_line->allow_groups); |
| } |
| if (fe_line->allow_users && fe_line->allow_users[0]) { |
| fe_single->allow_users = |
| xstrdup(fe_line->allow_users); |
| } |
| if (fe_line->deny_groups && fe_line->deny_groups[0]) { |
| fe_single->deny_groups = |
| xstrdup(fe_line->deny_groups); |
| } |
| if (fe_line->deny_users && fe_line->deny_users[0]) { |
| fe_single->deny_users = |
| xstrdup(fe_line->deny_users); |
| } |
| fe_single->port = fe_line->port; |
| if (fe_line->reason && fe_line->reason[0]) |
| fe_single->reason = xstrdup(fe_line->reason); |
| fe_single->node_state = fe_line->node_state; |
| if (front_end_debug && !is_slurmd_context) |
| _dump_front_end(fe_single); |
| } |
| hostlist_destroy(hl_addr); |
| hostlist_destroy(hl_name); |
| } |
| return max_rc; |
| #else |
| if (slurm_conf_frontend_array(&ptr_array) != 0) |
| fatal("FrontendName information configured!"); |
| return SLURM_SUCCESS; |
| #endif |
| } |
| |
| static void _check_callback(char *alias, char *hostname, |
| char *address, char *bcast_address, |
| uint16_t port, int state_val, |
| slurm_conf_node_t *node_ptr, |
| config_record_t *config_ptr) |
| { |
| node_record_t *node_rec; |
| |
| if ((node_rec = find_node_record2(alias))) |
| fatal("Duplicated NodeHostName %s in config file", alias); |
| |
| node_rec = create_node_record(config_ptr, alias); |
| if ((state_val != NO_VAL) && |
| (state_val != NODE_STATE_UNKNOWN)) |
| node_rec->node_state = state_val; |
| node_rec->last_response = (time_t) 0; |
| node_rec->comm_name = xstrdup(address); |
| node_rec->cpu_bind = node_ptr->cpu_bind; |
| node_rec->node_hostname = xstrdup(hostname); |
| node_rec->bcast_address = xstrdup(bcast_address); |
| node_rec->port = port; |
| node_rec->weight = node_ptr->weight; |
| node_rec->features = xstrdup(node_ptr->feature); |
| node_rec->reason = xstrdup(node_ptr->reason); |
| } |
| |
| /* |
| * build_all_nodeline_info - get a array of slurm_conf_node_t structures |
| * from the slurm.conf reader, build table, and set values |
| * IN set_bitmap - if true then set node_bitmap in config record (used by |
| * slurmd), false is used by slurmctld and testsuite |
| * IN tres_cnt - number of TRES configured on system (used on controller side) |
| * RET 0 if no error, error code otherwise |
| */ |
| extern int build_all_nodeline_info(bool set_bitmap, int tres_cnt) |
| { |
| slurm_conf_node_t *node, **ptr_array; |
| config_record_t *config_ptr = NULL; |
| int count; |
| int i, rc, max_rc = SLURM_SUCCESS; |
| bool in_daemon; |
| static bool daemon_run = false, daemon_set = false; |
| |
| in_daemon = run_in_daemon(&daemon_run, &daemon_set, "slurmctld,slurmd"); |
| |
| count = slurm_conf_nodename_array(&ptr_array); |
| if (count == 0) |
| fatal("No NodeName information available!"); |
| |
| for (i = 0; i < count; i++) { |
| node = ptr_array[i]; |
| |
| config_ptr = create_config_record(); |
| config_ptr->nodes = xstrdup(node->nodenames); |
| config_ptr->cpu_bind = node->cpu_bind; |
| config_ptr->cpus = node->cpus; |
| config_ptr->boards = node->boards; |
| config_ptr->sockets = node->sockets; |
| config_ptr->cores = node->cores; |
| config_ptr->core_spec_cnt = node->core_spec_cnt; |
| config_ptr->cpu_spec_list = xstrdup(node->cpu_spec_list); |
| config_ptr->threads = node->threads; |
| config_ptr->real_memory = node->real_memory; |
| config_ptr->mem_spec_limit = node->mem_spec_limit; |
| config_ptr->tmp_disk = node->tmp_disk; |
| |
| if (tres_cnt) { |
| config_ptr->tres_weights_str = |
| xstrdup(node->tres_weights_str); |
| config_ptr->tres_weights = |
| slurm_get_tres_weight_array( |
| node->tres_weights_str, |
| tres_cnt, true); |
| } |
| |
| config_ptr->weight = node->weight; |
| if (node->feature && node->feature[0]) |
| config_ptr->feature = xstrdup(node->feature); |
| if (in_daemon) { |
| config_ptr->gres = gres_plugin_name_filter(node->gres, |
| node->nodenames); |
| } |
| |
| rc = check_nodeline_info(node, config_ptr, LOG_LEVEL_FATAL, |
| _check_callback); |
| max_rc = MAX(max_rc, rc); |
| } |
| |
| if (set_bitmap) { |
| ListIterator config_iterator; |
| config_iterator = list_iterator_create(config_list); |
| while ((config_ptr = list_next(config_iterator))) { |
| node_name2bitmap(config_ptr->nodes, true, |
| &config_ptr->node_bitmap); |
| } |
| list_iterator_destroy(config_iterator); |
| } |
| |
| return max_rc; |
| } |
| |
| /* |
| * check_nodeline_info - From the slurm.conf reader, build table, |
| * and set values |
| * RET 0 if no error, error code otherwise |
| * Note: Operates on common variables |
| * default_node_record - default node configuration values |
| */ |
| extern int check_nodeline_info(slurm_conf_node_t *node_ptr, |
| config_record_t *config_ptr, |
| log_level_t lvl, |
| void (*_callback) ( |
| char *alias, char *hostname, |
| char *address, char *bcast_address, |
| uint16_t port, int state_val, |
| slurm_conf_node_t *node_ptr, |
| config_record_t *config_ptr)) |
| { |
| int error_code = SLURM_SUCCESS; |
| hostlist_t address_list = NULL; |
| hostlist_t alias_list = NULL; |
| hostlist_t bcast_list = NULL; |
| hostlist_t hostname_list = NULL; |
| hostlist_t port_list = NULL; |
| char *address = NULL; |
| char *alias = NULL; |
| char *bcast_address = NULL; |
| char *hostname = NULL; |
| char *port_str = NULL; |
| int state_val = NODE_STATE_UNKNOWN; |
| int address_count, alias_count, bcast_count, hostname_count, port_count; |
| uint16_t port = 0; |
| |
| if ((node_ptr->nodenames == NULL) || (node_ptr->nodenames[0] == '\0')) |
| return -1; |
| |
| if (node_ptr->state != NULL) { |
| state_val = state_str2int(node_ptr->state, node_ptr->nodenames); |
| if (state_val == NO_VAL) |
| fatal("Invalid state %s from %s", |
| node_ptr->state, node_ptr->nodenames); |
| } |
| |
| if (!(address_list = hostlist_create(node_ptr->addresses))) |
| fatal("Unable to create NodeAddr list from %s", |
| node_ptr->addresses); |
| |
| if (!(alias_list = hostlist_create(node_ptr->nodenames))) |
| fatal("Unable to create NodeName list from %s", |
| node_ptr->nodenames); |
| |
| if (!(bcast_list = hostlist_create(node_ptr->bcast_addresses))) |
| fatal("Unable to create BcastAddr list from %s", |
| node_ptr->bcast_addresses); |
| |
| if (!(hostname_list = hostlist_create(node_ptr->hostnames))) |
| fatal("Unable to create NodeHostname list from %s", |
| node_ptr->hostnames); |
| |
| if (node_ptr->port_str && node_ptr->port_str[0] && |
| (node_ptr->port_str[0] != '[') && |
| (strchr(node_ptr->port_str, '-') || |
| strchr(node_ptr->port_str, ','))) { |
| xstrfmtcat(port_str, "[%s]", node_ptr->port_str); |
| port_list = hostlist_create(port_str); |
| xfree(port_str); |
| } else |
| port_list = hostlist_create(node_ptr->port_str); |
| |
| if (!port_list) |
| fatal("Unable to create Port list from %s", |
| node_ptr->port_str); |
| |
| /* some sanity checks */ |
| address_count = hostlist_count(address_list); |
| bcast_count = hostlist_count(bcast_list); |
| alias_count = hostlist_count(alias_list); |
| hostname_count = hostlist_count(hostname_list); |
| port_count = hostlist_count(port_list); |
| #ifdef HAVE_FRONT_END |
| if ((hostname_count != alias_count) && (hostname_count != 1)) |
| fatal("NodeHostname count must equal that of NodeName records of there must be no more than one"); |
| |
| if ((address_count != alias_count) && (address_count != 1)) |
| fatal("NodeAddr count must equal that of NodeName records of there must be no more than one"); |
| #else |
| #ifdef MULTIPLE_SLURMD |
| if ((address_count != alias_count) && (address_count != 1)) |
| fatal("NodeAddr count must equal that of NodeName records of there must be no more than one"); |
| if (bcast_count && (bcast_count != alias_count) && (bcast_count != 1)) |
| fatal("BcastAddr count must equal that of NodeName records, or there must be no more than one"); |
| #else |
| if (address_count < alias_count) |
| fatal("At least as many NodeAddr are required as NodeName"); |
| |
| if (bcast_count && (bcast_count < alias_count)) |
| fatal("At least as many BcastAddr are required as NodeName"); |
| |
| if (hostname_count < alias_count) |
| fatal("At least as many NodeHostname are required as NodeName"); |
| #endif /* MULTIPLE_SLURMD */ |
| #endif /* HAVE_FRONT_END */ |
| if ((port_count != alias_count) && (port_count > 1)) |
| fatal("Port count must equal that of NodeName records or there must be no more than one (%u != %u)", |
| port_count, alias_count); |
| |
| /* now build the individual node structures */ |
| while ((alias = hostlist_shift(alias_list))) { |
| if (address_count > 0) { |
| address_count--; |
| if (address) |
| free(address); |
| address = hostlist_shift(address_list); |
| } |
| if (bcast_count > 0) { |
| bcast_count--; |
| if (bcast_address) |
| free(bcast_address); |
| bcast_address = hostlist_shift(bcast_list); |
| } |
| if (hostname_count > 0) { |
| hostname_count--; |
| if (hostname) |
| free(hostname); |
| hostname = hostlist_shift(hostname_list); |
| } |
| if (port_count > 0) { |
| int port_int; |
| port_count--; |
| if (port_str) |
| free(port_str); |
| port_str = hostlist_shift(port_list); |
| port_int = atoi(port_str); |
| if ((port_int <= 0) || (port_int > 0xffff)) { |
| log_var(lvl, "Invalid Port %s", |
| node_ptr->port_str); |
| } |
| port = port_int; |
| } |
| |
| (*_callback)(alias, hostname, address, bcast_address, |
| port, state_val, node_ptr, config_ptr); |
| |
| free(alias); |
| } |
| /* free allocated storage */ |
| if (address) |
| free(address); |
| if (bcast_address) |
| free(bcast_address); |
| if (hostname) |
| free(hostname); |
| if (port_str) |
| free(port_str); |
| if (address_list) |
| hostlist_destroy(address_list); |
| if (alias_list) |
| hostlist_destroy(alias_list); |
| if (bcast_list) |
| hostlist_destroy(bcast_list); |
| if (hostname_list) |
| hostlist_destroy(hostname_list); |
| if (port_list) |
| hostlist_destroy(port_list); |
| return error_code; |
| } |
| |
| /* |
| * create_config_record - create a config_record entry and set is values to |
| * the defaults. each config record corresponds to a line in the |
| * slurm.conf file and typically describes the configuration of a |
| * large number of nodes |
| * RET pointer to the config_record |
| * NOTE: memory allocated will remain in existence until |
| * _delete_config_record() is called to delete all configuration records |
| */ |
| extern config_record_t *create_config_record(void) |
| { |
| config_record_t *config_ptr = xmalloc(sizeof(*config_ptr)); |
| |
| last_node_update = time (NULL); |
| |
| config_ptr->nodes = NULL; |
| config_ptr->node_bitmap = NULL; |
| xassert (config_ptr->magic = CONFIG_MAGIC); /* set value */ |
| |
| list_append(config_list, config_ptr); |
| |
| return config_ptr; |
| } |
| |
| /* |
| * create_node_record - create a node record and set its values to defaults |
| * IN config_ptr - pointer to node's configuration information |
| * IN node_name - name of the node |
| * RET pointer to the record or NULL if error |
| * NOTE: allocates memory at node_record_table_ptr that must be xfreed when |
| * the global node table is no longer required |
| */ |
| extern node_record_t *create_node_record(config_record_t *config_ptr, |
| char *node_name) |
| { |
| node_record_t *node_ptr; |
| int old_buffer_size, new_buffer_size; |
| |
| last_node_update = time (NULL); |
| xassert(config_ptr); |
| xassert(node_name); |
| |
| /* round up the buffer size to reduce overhead of xrealloc */ |
| old_buffer_size = (node_record_count) * sizeof(node_record_t); |
| old_buffer_size = |
| ((int) ((old_buffer_size / BUF_SIZE) + 1)) * BUF_SIZE; |
| new_buffer_size = |
| (node_record_count + 1) * sizeof(node_record_t); |
| new_buffer_size = |
| ((int) ((new_buffer_size / BUF_SIZE) + 1)) * BUF_SIZE; |
| if (!node_record_table_ptr) { |
| node_record_table_ptr = xmalloc(new_buffer_size); |
| } else if (old_buffer_size != new_buffer_size) { |
| xrealloc (node_record_table_ptr, new_buffer_size); |
| /* |
| * You need to rehash the hash after we realloc or we will have |
| * only bad memory references in the hash. |
| */ |
| rehash_node(); |
| } |
| node_ptr = node_record_table_ptr + (node_record_count++); |
| node_ptr->name = xstrdup(node_name); |
| if (!node_hash_table) |
| node_hash_table = xhash_init(_node_record_hash_identity, NULL); |
| xhash_add(node_hash_table, node_ptr); |
| |
| node_ptr->config_ptr = config_ptr; |
| /* these values will be overwritten when the node actually registers */ |
| node_ptr->cpus = config_ptr->cpus; |
| node_ptr->cpu_load = NO_VAL; |
| node_ptr->free_mem = NO_VAL64; |
| node_ptr->cpu_spec_list = xstrdup(config_ptr->cpu_spec_list); |
| node_ptr->boards = config_ptr->boards; |
| node_ptr->sockets = config_ptr->sockets; |
| node_ptr->cores = config_ptr->cores; |
| node_ptr->core_spec_cnt = config_ptr->core_spec_cnt; |
| node_ptr->threads = config_ptr->threads; |
| node_ptr->mem_spec_limit = config_ptr->mem_spec_limit; |
| node_ptr->real_memory = config_ptr->real_memory; |
| node_ptr->node_spec_bitmap = NULL; |
| node_ptr->tmp_disk = config_ptr->tmp_disk; |
| node_ptr->select_nodeinfo = select_g_select_nodeinfo_alloc(); |
| node_ptr->energy = acct_gather_energy_alloc(1); |
| node_ptr->ext_sensors = ext_sensors_alloc(); |
| node_ptr->owner = NO_VAL; |
| node_ptr->mcs_label = NULL; |
| node_ptr->next_state = NO_VAL; |
| node_ptr->protocol_version = SLURM_MIN_PROTOCOL_VERSION; |
| xassert (node_ptr->magic = NODE_MAGIC) /* set value */; |
| return node_ptr; |
| } |
| |
| /* |
| * find_node_record - find a record for node with specified name |
| * IN: name - name of the desired node |
| * RET: pointer to node record or NULL if not found |
| * NOTE: Logs an error if the node name is NOT found |
| */ |
| extern node_record_t *find_node_record(char *name) |
| { |
| return _find_node_record(name, true, true); |
| } |
| |
| /* |
| * find_node_record2 - find a record for node with specified name |
| * IN: name - name of the desired node |
| * RET: pointer to node record or NULL if not found |
| * NOTE: Does not log an error if the node name is NOT found |
| */ |
| extern node_record_t *find_node_record2(char *name) |
| { |
| return _find_node_record(name, true, false); |
| } |
| |
| /* |
| * find_node_record_no_alias - find a record for node with specified name |
| * without looking at the node's alias (NodeHostName). |
| * IN: name - name of the desired node |
| * RET: pointer to node record or NULL if not found |
| * NOTE: Logs an error if the node name is NOT found |
| */ |
| extern node_record_t *find_node_record_no_alias(char *name) |
| { |
| return _find_node_record(name, false, true); |
| } |
| |
| /* |
| * _find_node_record - find a record for node with specified name |
| * IN: name - name of the desired node |
| * IN: test_alias - if set, also test NodeHostName value |
| * IN: log_missing - if set, then print an error message if the node is not found |
| * RET: pointer to node record or NULL if not found |
| */ |
| static node_record_t *_find_node_record(char *name, bool test_alias, |
| bool log_missing) |
| { |
| node_record_t *node_ptr; |
| |
| if ((name == NULL) || (name[0] == '\0')) { |
| info("%s: passed NULL node name", __func__); |
| return NULL; |
| } |
| |
| /* nothing added yet */ |
| if (!node_hash_table) |
| return NULL; |
| |
| /* try to find via hash table, if it exists */ |
| if ((node_ptr = xhash_get_str(node_hash_table, name))) { |
| xassert(node_ptr->magic == NODE_MAGIC); |
| return node_ptr; |
| } |
| |
| if ((node_record_count == 1) && |
| (xstrcmp(node_record_table_ptr[0].name, "localhost") == 0)) |
| return (&node_record_table_ptr[0]); |
| |
| if (log_missing) |
| error("%s(%d): lookup failure for %s", |
| __func__, __LINE__, name); |
| |
| if (test_alias) { |
| char *alias = slurm_conf_get_nodename(name); |
| /* look for the alias node record if the user put this in |
| * instead of what slurm sees the node name as */ |
| if (!alias) |
| return NULL; |
| |
| node_ptr = xhash_get_str(node_hash_table, alias); |
| if (log_missing) |
| error("%s(%d): lookup failure for %s alias %s", |
| __func__, __LINE__, name, alias); |
| xfree(alias); |
| return node_ptr; |
| } |
| |
| return NULL; |
| } |
| |
| /* |
| * init_node_conf - initialize the node configuration tables and values. |
| * this should be called before creating any node or configuration |
| * entries. |
| * RET 0 if no error, otherwise an error code |
| */ |
| extern int init_node_conf (void) |
| { |
| last_node_update = time (NULL); |
| int i; |
| node_record_t *node_ptr; |
| |
| node_ptr = node_record_table_ptr; |
| for (i = 0; i < node_record_count; i++, node_ptr++) |
| purge_node_rec(node_ptr); |
| |
| node_record_count = 0; |
| xfree(node_record_table_ptr); |
| xhash_free(node_hash_table); |
| |
| if (config_list) /* delete defunct configuration entries */ |
| (void) _delete_config_record (); |
| else { |
| config_list = list_create (_list_delete_config); |
| front_end_list = list_create (destroy_frontend); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| |
| /* node_fini2 - free memory associated with node records (except bitmaps) */ |
| extern void node_fini2 (void) |
| { |
| int i; |
| node_record_t *node_ptr; |
| |
| if (config_list) { |
| FREE_NULL_LIST(config_list); |
| FREE_NULL_LIST(front_end_list); |
| } |
| |
| xhash_free(node_hash_table); |
| node_ptr = node_record_table_ptr; |
| for (i = 0; i < node_record_count; i++, node_ptr++) |
| purge_node_rec(node_ptr); |
| |
| xfree(node_record_table_ptr); |
| node_record_count = 0; |
| } |
| |
| |
| /* |
| * node_name2bitmap - given a node name regular expression, build a bitmap |
| * representation |
| * IN node_names - list of nodes |
| * IN best_effort - if set don't return an error on invalid node name entries |
| * OUT bitmap - set to bitmap, may not have all bits set on error |
| * RET 0 if no error, otherwise EINVAL |
| * NOTE: call FREE_NULL_BITMAP() to free bitmap memory when no longer required |
| */ |
| extern int node_name2bitmap (char *node_names, bool best_effort, |
| bitstr_t **bitmap) |
| { |
| int rc = SLURM_SUCCESS; |
| char *this_node_name; |
| bitstr_t *my_bitmap; |
| hostlist_t host_list; |
| |
| my_bitmap = (bitstr_t *) bit_alloc (node_record_count); |
| *bitmap = my_bitmap; |
| |
| if (node_names == NULL) { |
| info("node_name2bitmap: node_names is NULL"); |
| return rc; |
| } |
| |
| if ( (host_list = hostlist_create (node_names)) == NULL) { |
| /* likely a badly formatted hostlist */ |
| error ("hostlist_create on %s error:", node_names); |
| if (!best_effort) |
| rc = EINVAL; |
| return rc; |
| } |
| |
| while ( (this_node_name = hostlist_shift (host_list)) ) { |
| node_record_t *node_ptr; |
| node_ptr = _find_node_record(this_node_name, best_effort, true); |
| if (node_ptr) { |
| bit_set (my_bitmap, (bitoff_t) (node_ptr - |
| node_record_table_ptr)); |
| } else { |
| error ("node_name2bitmap: invalid node specified %s", |
| this_node_name); |
| if (!best_effort) |
| rc = EINVAL; |
| } |
| free (this_node_name); |
| } |
| hostlist_destroy (host_list); |
| |
| return rc; |
| } |
| |
| /* |
| * hostlist2bitmap - given a hostlist, build a bitmap representation |
| * IN hl - hostlist |
| * IN best_effort - if set don't return an error on invalid node name entries |
| * OUT bitmap - set to bitmap, may not have all bits set on error |
| * RET 0 if no error, otherwise EINVAL |
| */ |
| extern int hostlist2bitmap (hostlist_t hl, bool best_effort, bitstr_t **bitmap) |
| { |
| int rc = SLURM_SUCCESS; |
| bitstr_t *my_bitmap; |
| char *name; |
| hostlist_iterator_t hi; |
| |
| FREE_NULL_BITMAP(*bitmap); |
| my_bitmap = (bitstr_t *) bit_alloc (node_record_count); |
| *bitmap = my_bitmap; |
| |
| hi = hostlist_iterator_create(hl); |
| while ((name = hostlist_next(hi))) { |
| node_record_t *node_ptr; |
| node_ptr = _find_node_record(name, best_effort, true); |
| if (node_ptr) { |
| bit_set (my_bitmap, (bitoff_t) (node_ptr - |
| node_record_table_ptr)); |
| } else { |
| error ("hostlist2bitmap: invalid node specified %s", |
| name); |
| if (!best_effort) |
| rc = EINVAL; |
| } |
| free (name); |
| } |
| |
| hostlist_iterator_destroy(hi); |
| return rc; |
| |
| } |
| |
| /* Purge the contents of a node record */ |
| extern void purge_node_rec(node_record_t *node_ptr) |
| { |
| xfree(node_ptr->arch); |
| xfree(node_ptr->comm_name); |
| xfree(node_ptr->cpu_spec_list); |
| xfree(node_ptr->features); |
| xfree(node_ptr->features_act); |
| xfree(node_ptr->gres); |
| FREE_NULL_LIST(node_ptr->gres_list); |
| xfree(node_ptr->name); |
| xfree(node_ptr->node_hostname); |
| FREE_NULL_BITMAP(node_ptr->node_spec_bitmap); |
| xfree(node_ptr->os); |
| xfree(node_ptr->part_pptr); |
| xfree(node_ptr->power); |
| xfree(node_ptr->reason); |
| xfree(node_ptr->version); |
| acct_gather_energy_destroy(node_ptr->energy); |
| ext_sensors_destroy(node_ptr->ext_sensors); |
| select_g_select_nodeinfo_free(node_ptr->select_nodeinfo); |
| xfree(node_ptr->tres_str); |
| xfree(node_ptr->tres_fmt_str); |
| xfree(node_ptr->tres_cnt); |
| } |
| |
| /* |
| * rehash_node - build a hash table of the node_record entries. |
| * NOTE: using xhash implementation |
| */ |
| extern void rehash_node (void) |
| { |
| int i; |
| node_record_t *node_ptr = node_record_table_ptr; |
| |
| xhash_free (node_hash_table); |
| node_hash_table = xhash_init(_node_record_hash_identity, NULL); |
| for (i = 0; i < node_record_count; i++, node_ptr++) { |
| if ((node_ptr->name == NULL) || |
| (node_ptr->name[0] == '\0')) |
| continue; /* vestigial record */ |
| xhash_add(node_hash_table, node_ptr); |
| } |
| |
| #if _DEBUG |
| _dump_hash(); |
| #endif |
| return; |
| } |
| |
| /* Convert a node state string to it's equivalent enum value */ |
| extern int state_str2int(const char *state_str, char *node_name) |
| { |
| int state_val = NO_VAL; |
| int i; |
| |
| for (i = 0; i <= NODE_STATE_END; i++) { |
| if (xstrcasecmp(node_state_string(i), "END") == 0) |
| break; |
| if (xstrcasecmp(node_state_string(i), state_str) == 0) { |
| state_val = i; |
| break; |
| } |
| } |
| if (i >= NODE_STATE_END) { |
| if (xstrncasecmp("CLOUD", state_str, 5) == 0) |
| state_val = NODE_STATE_IDLE | NODE_STATE_CLOUD | |
| NODE_STATE_POWER_SAVE; |
| else if (xstrncasecmp("DRAIN", state_str, 5) == 0) |
| state_val = NODE_STATE_UNKNOWN | NODE_STATE_DRAIN; |
| else if (xstrncasecmp("FAIL", state_str, 4) == 0) |
| state_val = NODE_STATE_IDLE | NODE_STATE_FAIL; |
| } |
| if (state_val == NO_VAL) { |
| error("node %s has invalid state %s", node_name, state_str); |
| errno = EINVAL; |
| } |
| return state_val; |
| } |
| |
| /* (re)set cr_node_num_cores arrays */ |
| extern void cr_init_global_core_data(node_record_t *node_ptr, int node_cnt) |
| { |
| uint32_t n; |
| |
| cr_fini_global_core_data(); |
| |
| cr_node_num_cores = xmalloc(node_cnt * sizeof(uint16_t)); |
| cr_node_cores_offset = xmalloc((node_cnt+1) * sizeof(uint32_t)); |
| |
| for (n = 0; n < node_cnt; n++) { |
| uint16_t cores = node_ptr[n].config_ptr->cores; |
| cores *= node_ptr[n].config_ptr->sockets; |
| |
| cr_node_num_cores[n] = cores; |
| if (n > 0) { |
| cr_node_cores_offset[n] = cr_node_cores_offset[n-1] + |
| cr_node_num_cores[n-1] ; |
| } else |
| cr_node_cores_offset[0] = 0; |
| } |
| |
| /* an extra value is added to get the total number of cores */ |
| /* as cr_get_coremap_offset is sometimes used to get the total */ |
| /* number of cores in the cluster */ |
| cr_node_cores_offset[node_cnt] = cr_node_cores_offset[node_cnt-1] + |
| cr_node_num_cores[node_cnt-1] ; |
| |
| } |
| |
| extern void cr_fini_global_core_data(void) |
| { |
| xfree(cr_node_num_cores); |
| xfree(cr_node_cores_offset); |
| } |
| |
| /* return the coremap index to the first core of the given node */ |
| |
| extern uint32_t cr_get_coremap_offset(uint32_t node_index) |
| { |
| xassert(cr_node_cores_offset); |
| return cr_node_cores_offset[node_index]; |
| } |
| |
| /* Return a bitmap the size of the machine in cores. On a Bluegene |
| * system it will return a bitmap in cnodes. */ |
| extern bitstr_t *cr_create_cluster_core_bitmap(int core_mult) |
| { |
| /* DEF_TIMERS; */ |
| /* START_TIMER; */ |
| bitstr_t *core_bitmap; |
| static int cnt = 0; |
| |
| if (!cnt) { |
| cnt = cr_get_coremap_offset(node_record_count); |
| if (core_mult) |
| cnt *= core_mult; |
| } |
| core_bitmap = bit_alloc(cnt); |
| /* END_TIMER; */ |
| /* info("creating of core bitmap of %d took %s", cnt, TIME_STR); */ |
| return core_bitmap; |
| } |
| |
/*
 * Determine maximum number of CPUs on this node usable by a job
 * ntasks_per_core IN - tasks-per-core to be launched by this job
 * cpus_per_task   IN - number of required CPUs per task for this job
 * total_cores     IN - total number of cores on this node
 * total_cpus      IN - total number of CPUs on this node
 * RET count of usable CPUs on this node usable by this job; never more
 *	than total_cpus
 */
extern int adjust_cpus_nppcu(uint16_t ntasks_per_core, int cpus_per_task,
			     int total_cores, int total_cpus)
{
	int cpus = total_cpus;

	/*
	 * FIXME: This function ignores tasks-per-socket and tasks-per-node
	 * checks. Those parameters are tested later
	 */

	/*
	 * A tasks-per-core of 0 or 0xffff (presumably the "not set" values)
	 * or a cpus-per-task of 0 places no limit on the usable CPUs.
	 */
	if ((ntasks_per_core != 0) && (ntasks_per_core != 0xffff) &&
	    (cpus_per_task != 0)) {
		int task_cpus = total_cores * ntasks_per_core * cpus_per_task;

		/* the task layout can only lower the count, never raise it */
		if (task_cpus < cpus)
			cpus = task_cpus;
	}

	return cpus;
}
| |
| extern char *find_hostname(uint32_t pos, char *hosts) |
| { |
| hostlist_t hostlist = NULL; |
| char *temp = NULL, *host = NULL; |
| |
| if (!hosts || (pos == NO_VAL) || (pos == INFINITE)) |
| return NULL; |
| |
| hostlist = hostlist_create(hosts); |
| temp = hostlist_nth(hostlist, pos); |
| if (temp) { |
| host = xstrdup(temp); |
| free(temp); |
| } |
| hostlist_destroy(hostlist); |
| return host; |
| } |