| /*****************************************************************************\ |
| * node_mgr.c - manage the node records of slurm |
| * Note: there is a global node table (node_record_table_ptr), its |
| * hash table (node_hash_table), time stamp (last_node_update) and |
| * configuration list (config_list) |
| * |
| * $Id$ |
| ***************************************************************************** |
| * Copyright (C) 2002-2006 The Regents of the University of California. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov>, et. al. |
| * UCRL-CODE-217948. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <http://www.llnl.gov/linux/slurm/>. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #ifdef HAVE_CONFIG_H |
| # include "config.h" |
| #endif |
| |
| #include <ctype.h> |
| #include <errno.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <time.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <fcntl.h> |
| |
| #include "src/common/hostlist.h" |
| #include "src/common/macros.h" |
| #include "src/common/pack.h" |
| #include "src/common/parse_time.h" |
| #include "src/common/xassert.h" |
| #include "src/common/xstring.h" |
| #include "src/common/node_select.h" |
| #include "src/common/read_config.h" |
| #include "src/slurmctld/agent.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/ping_nodes.h" |
| #include "src/slurmctld/proc_req.h" |
| #include "src/slurmctld/sched_plugin.h" |
| #include "src/slurmctld/slurmctld.h" |
| |
| #define _DEBUG 0 |
| #define MAX_RETRIES 10 |
| |
| /* Global variables */ |
| List config_list = NULL; /* list of config_record entries */ |
| struct node_record *node_record_table_ptr = NULL; /* node records */ |
| struct node_record **node_hash_table = NULL; /* node_record hash table */ |
| time_t last_bitmap_update = (time_t) NULL; /* time of last node creation |
| * or deletion */ |
| time_t last_node_update = (time_t) NULL; /* time of last update to |
| * node records */ |
| bitstr_t *avail_node_bitmap = NULL; /* bitmap of available nodes */ |
| bitstr_t *idle_node_bitmap = NULL; /* bitmap of idle nodes */ |
| bitstr_t *share_node_bitmap = NULL; /* bitmap of sharable nodes */ |
| bitstr_t *up_node_bitmap = NULL; /* bitmap of non-down nodes */ |
| |
| static int _delete_config_record (void); |
| static void _dump_node_state (struct node_record *dump_node_ptr, |
| Buf buffer); |
| static struct node_record * _find_alias_node_record (char *name); |
| static int _hash_index (char *name); |
| static void _list_delete_config (void *config_entry); |
| static int _list_find_config (void *config_entry, void *key); |
| static void _make_node_down(struct node_record *node_ptr); |
| static void _node_did_resp(struct node_record *node_ptr); |
| static bool _node_is_hidden(struct node_record *node_ptr); |
| static void _node_not_resp (struct node_record *node_ptr, time_t msg_time); |
| static void _pack_node (struct node_record *dump_node_ptr, Buf buffer); |
| static void _sync_bitmaps(struct node_record *node_ptr, int job_count); |
| static bool _valid_node_state_change(uint16_t old, uint16_t new); |
| #if _DEBUG |
| static void _dump_hash (void); |
| #endif |
| |
| /* |
| * bitmap2node_name - given a bitmap, build a list of comma separated node |
| * names. names may include regular expressions (e.g. "lx[01-10]") |
| * IN bitmap - bitmap pointer |
| * RET pointer to node list or NULL on error |
| * globals: node_record_table_ptr - pointer to node table |
| * NOTE: the caller must xfree the memory at node_list when no longer required |
| */ |
| char * bitmap2node_name (bitstr_t *bitmap) |
| { |
| int i; |
| hostlist_t hl; |
| char buf[8192]; |
| |
| if (bitmap == NULL) |
| return xstrdup(""); |
| |
| hl = hostlist_create(""); |
| for (i = 0; i < node_record_count; i++) { |
| if (bit_test (bitmap, i) == 0) |
| continue; |
| hostlist_push(hl, node_record_table_ptr[i].name); |
| } |
| hostlist_uniq(hl); |
| hostlist_ranged_string(hl, sizeof(buf), buf); |
| hostlist_destroy(hl); |
| return xstrdup(buf); |
| } |
| |
| |
| /* |
| * create_config_record - create a config_record entry and set is values to |
| * the defaults. each config record corresponds to a line in the |
| * slurm.conf file and typically describes the configuration of a |
| * large number of nodes |
| * RET pointer to the config_record |
| * NOTE: memory allocated will remain in existence until |
| * _delete_config_record() is called to delete all configuration records |
| */ |
| struct config_record * create_config_record (void) |
| { |
| struct config_record *config_ptr; |
| |
| last_node_update = time (NULL); |
| config_ptr = (struct config_record *) |
| xmalloc (sizeof (struct config_record)); |
| |
| config_ptr->nodes = NULL; |
| config_ptr->node_bitmap = NULL; |
| xassert (config_ptr->magic = CONFIG_MAGIC); /* set value */ |
| |
| if (list_append(config_list, config_ptr) == NULL) |
| fatal ("create_config_record: unable to allocate memory"); |
| |
| return config_ptr; |
| } |
| |
| |
| /* |
| * create_node_record - create a node record and set its values to defaults |
| * IN config_ptr - pointer to node's configuration information |
| * IN node_name - name of the node |
| * RET pointer to the record or NULL if error |
| * NOTE: allocates memory at node_record_table_ptr that must be xfreed when |
| * the global node table is no longer required |
| */ |
| struct node_record * |
| create_node_record (struct config_record *config_ptr, char *node_name) |
| { |
| struct node_record *node_ptr; |
| int old_buffer_size, new_buffer_size; |
| |
| last_node_update = time (NULL); |
| xassert(config_ptr); |
| xassert(node_name); |
| xassert(strlen (node_name) < MAX_SLURM_NAME); |
| |
| /* round up the buffer size to reduce overhead of xrealloc */ |
| old_buffer_size = (node_record_count) * sizeof (struct node_record); |
| old_buffer_size = |
| ((int) ((old_buffer_size / BUF_SIZE) + 1)) * BUF_SIZE; |
| new_buffer_size = |
| (node_record_count + 1) * sizeof (struct node_record); |
| new_buffer_size = |
| ((int) ((new_buffer_size / BUF_SIZE) + 1)) * BUF_SIZE; |
| if (node_record_count == 0) |
| node_record_table_ptr = |
| (struct node_record *) xmalloc (new_buffer_size); |
| else if (old_buffer_size != new_buffer_size) |
| xrealloc (node_record_table_ptr, new_buffer_size); |
| node_ptr = node_record_table_ptr + (node_record_count++); |
| strcpy (node_ptr->name, node_name); |
| node_ptr->last_response = (time_t)0; |
| node_ptr->config_ptr = config_ptr; |
| node_ptr->part_cnt = 0; |
| node_ptr->part_pptr = NULL; |
| /* these values will be overwritten when the node actually registers */ |
| node_ptr->cpus = config_ptr->cpus; |
| node_ptr->real_memory = config_ptr->real_memory; |
| node_ptr->tmp_disk = config_ptr->tmp_disk; |
| xassert (node_ptr->magic = NODE_MAGIC) /* set value */; |
| last_bitmap_update = time (NULL); |
| return node_ptr; |
| } |
| |
| |
/* 
 * _delete_config_record - delete all configuration records
 * RET 0 if no error, errno otherwise
 * global: config_list - list of all configuration records
 *	last_node_update - time of last node table update
 * NOTE: _list_find_config() matches every entry when passed the literal
 *	"universal_key", so this call empties the entire list
 */
static int _delete_config_record (void) 
{
	last_node_update = time (NULL);
	(void) list_delete_all (config_list, &_list_find_config, 
				"universal_key");
	return SLURM_SUCCESS;
}
| |
| |
| /* dump_all_node_state - save the state of all nodes to file */ |
| int dump_all_node_state ( void ) |
| { |
| int error_code = 0, inx, log_fd; |
| char *old_file, *new_file, *reg_file; |
| /* Locks: Read config and node */ |
| slurmctld_lock_t node_read_lock = { READ_LOCK, NO_LOCK, READ_LOCK, |
| NO_LOCK }; |
| Buf buffer = init_buf(BUF_SIZE*16); |
| DEF_TIMERS; |
| |
| START_TIMER; |
| /* write header: time */ |
| pack_time (time (NULL), buffer); |
| |
| /* write node records to buffer */ |
| lock_slurmctld (node_read_lock); |
| for (inx = 0; inx < node_record_count; inx++) { |
| xassert (node_record_table_ptr[inx].magic == NODE_MAGIC); |
| xassert (node_record_table_ptr[inx].config_ptr->magic == |
| CONFIG_MAGIC); |
| |
| _dump_node_state (&node_record_table_ptr[inx], buffer); |
| } |
| unlock_slurmctld (node_read_lock); |
| |
| /* write the buffer to file */ |
| old_file = xstrdup (slurmctld_conf.state_save_location); |
| xstrcat (old_file, "/node_state.old"); |
| reg_file = xstrdup (slurmctld_conf.state_save_location); |
| xstrcat (reg_file, "/node_state"); |
| new_file = xstrdup (slurmctld_conf.state_save_location); |
| xstrcat (new_file, "/node_state.new"); |
| lock_state_files (); |
| log_fd = creat (new_file, 0600); |
| if (log_fd == 0) { |
| error ("Can't save state, error creating file %s %m", |
| new_file); |
| error_code = errno; |
| } else { |
| int pos = 0, nwrite = get_buf_offset(buffer), amount; |
| char *data = (char *)get_buf_data(buffer); |
| |
| while (nwrite > 0) { |
| amount = write(log_fd, &data[pos], nwrite); |
| if ((amount < 0) && (errno != EINTR)) { |
| error("Error writing file %s, %m", new_file); |
| error_code = errno; |
| break; |
| } |
| nwrite -= amount; |
| pos += amount; |
| } |
| fsync(log_fd); |
| close(log_fd); |
| } |
| if (error_code) |
| (void) unlink (new_file); |
| else { /* file shuffle */ |
| (void) unlink (old_file); |
| (void) link (reg_file, old_file); |
| (void) unlink (reg_file); |
| (void) link (new_file, reg_file); |
| (void) unlink (new_file); |
| } |
| xfree (old_file); |
| xfree (reg_file); |
| xfree (new_file); |
| unlock_state_files (); |
| |
| free_buf (buffer); |
| END_TIMER; |
| debug3("dump_all_node_state %s", TIME_STR); |
| return error_code; |
| } |
| |
/*
 * _dump_node_state - dump the state of a specific node to a buffer
 * IN dump_node_ptr - pointer to node for which information is requested
 * IN/OUT buffer - location to store data, pointers automatically advanced
 * NOTE: the pack order here defines the node state file format and must
 *	match the unpack order in load_all_node_state()
 */
static void 
_dump_node_state (struct node_record *dump_node_ptr, Buf buffer) 
{
	packstr (dump_node_ptr->name, buffer);
	packstr (dump_node_ptr->reason, buffer);
	pack16 (dump_node_ptr->node_state, buffer);
	pack32 (dump_node_ptr->cpus, buffer);
	pack32 (dump_node_ptr->real_memory, buffer);
	pack32 (dump_node_ptr->tmp_disk, buffer);
}
| |
| /* |
| * _find_alias_node_record - find a record for node with the alias of |
| * the specified name supplied |
| * input: name - name to be aliased of the desired node |
| * output: return pointer to node record or NULL if not found |
| * global: node_record_table_ptr - pointer to global node table |
| * node_hash_table - table of hash indecies |
| */ |
| static struct node_record * |
| _find_alias_node_record (char *name) |
| { |
| int i; |
| char *alias = NULL; |
| |
| if ((name == NULL) |
| || (name[0] == '\0')) { |
| info("_find_alias_node_record: passed NULL name"); |
| return NULL; |
| } |
| /* Get the alias we have just to make sure the user isn't |
| * trying to use the real hostname to run on something that has |
| * been aliased. |
| */ |
| alias = slurm_conf_get_nodename(name); |
| |
| if(!alias) |
| return NULL; |
| |
| /* try to find via hash table, if it exists */ |
| if (node_hash_table) { |
| struct node_record *node_ptr; |
| |
| i = _hash_index (alias); |
| node_ptr = node_hash_table[i]; |
| while (node_ptr) { |
| xassert(node_ptr->magic == NODE_MAGIC); |
| if (!strcmp(node_ptr->name, alias)) { |
| xfree(alias); |
| return node_ptr; |
| } |
| node_ptr = node_ptr->node_next; |
| } |
| error ("_find_alias_node_record: lookup failure for %s", name); |
| } |
| |
| /* revert to sequential search */ |
| else { |
| for (i = 0; i < node_record_count; i++) { |
| if (!strcmp (alias, node_record_table_ptr[i].name)) { |
| xfree(alias); |
| return (&node_record_table_ptr[i]); |
| } |
| } |
| } |
| |
| xfree(alias); |
| return (struct node_record *) NULL; |
| } |
| |
| /* |
| * load_all_node_state - Load the node state from file, recover on slurmctld |
| * restart. Execute this after loading the configuration file data. |
| * Data goes into common storage. |
| * IN state_only - if true over-write only node state and reason fields |
| * RET 0 or error code |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| extern int load_all_node_state ( bool state_only ) |
| { |
| char *node_name, *reason = NULL, *data = NULL, *state_file; |
| int data_allocated, data_read = 0, error_code = 0, node_cnt = 0; |
| uint16_t node_state, name_len; |
| uint32_t cpus, real_memory, tmp_disk, data_size = 0; |
| struct node_record *node_ptr; |
| int state_fd; |
| time_t time_stamp; |
| Buf buffer; |
| |
| /* read the file */ |
| state_file = xstrdup (slurmctld_conf.state_save_location); |
| xstrcat (state_file, "/node_state"); |
| lock_state_files (); |
| state_fd = open (state_file, O_RDONLY); |
| if (state_fd < 0) { |
| info ("No node state file (%s) to recover", state_file); |
| error_code = ENOENT; |
| } |
| else { |
| data_allocated = BUF_SIZE; |
| data = xmalloc(data_allocated); |
| while (1) { |
| data_read = read(state_fd, &data[data_size], BUF_SIZE); |
| if (data_read < 0) { |
| if (errno == EINTR) |
| continue; |
| else { |
| error ("Read error on %s: %m", |
| state_file); |
| break; |
| } |
| } else if (data_read == 0) /* eof */ |
| break; |
| data_size += data_read; |
| data_allocated += data_read; |
| xrealloc(data, data_allocated); |
| } |
| close (state_fd); |
| } |
| xfree (state_file); |
| unlock_state_files (); |
| |
| buffer = create_buf (data, data_size); |
| safe_unpack_time (&time_stamp, buffer); |
| |
| while (remaining_buf (buffer) > 0) { |
| uint16_t base_state; |
| safe_unpackstr_xmalloc (&node_name, &name_len, buffer); |
| safe_unpackstr_xmalloc (&reason, &name_len, buffer); |
| safe_unpack16 (&node_state, buffer); |
| safe_unpack32 (&cpus, buffer); |
| safe_unpack32 (&real_memory, buffer); |
| safe_unpack32 (&tmp_disk, buffer); |
| base_state = node_state & NODE_STATE_BASE; |
| |
| /* validity test as possible */ |
| if ((cpus == 0) || (base_state >= NODE_STATE_END)) { |
| error ("Invalid data for node %s: cpus=%u, state=%u", |
| node_name, cpus, node_state); |
| error ("No more node data will be processed from the " |
| "checkpoint file"); |
| xfree (node_name); |
| error_code = EINVAL; |
| break; |
| |
| } |
| |
| /* find record and perform update */ |
| node_ptr = find_node_record (node_name); |
| if (node_ptr == NULL) { |
| error ("Node %s has vanished from configuration", |
| node_name); |
| xfree(reason); |
| } else if (state_only) { |
| node_cnt++; |
| if (node_ptr->node_state == NODE_STATE_UNKNOWN) { |
| if (node_state & NODE_STATE_DRAIN) |
| node_ptr->node_state = |
| NODE_STATE_DRAIN; |
| else if (base_state == NODE_STATE_DOWN) |
| node_ptr->node_state = NODE_STATE_DOWN; |
| } |
| if (node_ptr->reason == NULL) |
| node_ptr->reason = reason; |
| else |
| xfree(reason); |
| } else { |
| node_cnt++; |
| node_ptr->node_state = node_state; |
| xfree(node_ptr->reason); |
| node_ptr->reason = reason; |
| node_ptr->part_cnt = 0; |
| xfree(node_ptr->part_pptr); |
| node_ptr->cpus = cpus; |
| node_ptr->real_memory = real_memory; |
| node_ptr->tmp_disk = tmp_disk; |
| node_ptr->last_response = (time_t) 0; |
| } |
| xfree (node_name); |
| } |
| |
| info ("Recovered state of %d nodes", node_cnt); |
| free_buf (buffer); |
| return error_code; |
| |
| unpack_error: |
| error ("Incomplete node data checkpoint file"); |
| info("Recovered state of %d nodes", node_cnt); |
| free_buf (buffer); |
| return EFAULT; |
| } |
| |
| /* |
| * find_node_record - find a record for node with specified name |
| * input: name - name of the desired node |
| * output: return pointer to node record or NULL if not found |
| * global: node_record_table_ptr - pointer to global node table |
| * node_hash_table - table of hash indecies |
| */ |
| struct node_record * |
| find_node_record (char *name) |
| { |
| int i; |
| |
| if ((name == NULL) |
| || (name[0] == '\0')) { |
| info("find_node_record passed NULL name"); |
| return NULL; |
| } |
| |
| /* try to find via hash table, if it exists */ |
| if (node_hash_table) { |
| struct node_record *node_ptr; |
| |
| i = _hash_index (name); |
| node_ptr = node_hash_table[i]; |
| while (node_ptr) { |
| xassert(node_ptr->magic == NODE_MAGIC); |
| if (!strcmp(node_ptr->name, name)) { |
| return node_ptr; |
| } |
| node_ptr = node_ptr->node_next; |
| } |
| error ("find_node_record: lookup failure for %s", name); |
| } |
| |
| /* revert to sequential search */ |
| else { |
| for (i = 0; i < node_record_count; i++) { |
| if (!strcmp (name, node_record_table_ptr[i].name)) { |
| return (&node_record_table_ptr[i]); |
| } |
| } |
| } |
| |
| /* look for the alias node record if the user put this in |
| instead of what slurm sees the node name as */ |
| return _find_alias_node_record (name); |
| } |
| |
| |
| /* |
| * _hash_index - return a hash table index for the given node name |
| * IN name = the node's name |
| * RET the hash table index |
| */ |
| static int _hash_index (char *name) |
| { |
| int index = 0; |
| int j; |
| |
| if ((node_record_count == 0) |
| || (name == NULL)) |
| return 0; /* degenerate case */ |
| |
| /* Multiply each character by its numerical position in the |
| * name string to add a bit of entropy, because host names such |
| * as cluster[0001-1000] can cause excessive index collisions. |
| */ |
| for (j = 1; *name; name++, j++) |
| index += (int)*name * j; |
| index %= node_record_count; |
| |
| return index; |
| } |
| |
| |
| /* |
| * init_node_conf - initialize the node configuration tables and values. |
| * this should be called before creating any node or configuration |
| * entries. |
| * RET 0 if no error, otherwise an error code |
| * global: node_record_table_ptr - pointer to global node table |
| * node_hash_table - table of hash indecies |
| * last_node_update - time of last node table update |
| */ |
| int init_node_conf (void) |
| { |
| last_node_update = time (NULL); |
| |
| node_record_count = 0; |
| xfree(node_record_table_ptr); |
| xfree(node_hash_table); |
| |
| if (config_list) /* delete defunct configuration entries */ |
| (void) _delete_config_record (); |
| else { |
| config_list = list_create (_list_delete_config); |
| if (config_list == NULL) |
| fatal("list_create malloc failure"); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| |
| /* list_compare_config - compare two entry from the config list based upon |
| * weight, see common/list.h for documentation */ |
| int list_compare_config (void *config_entry1, void *config_entry2) |
| { |
| int weight1, weight2; |
| weight1 = ((struct config_record *) config_entry1)->weight; |
| weight2 = ((struct config_record *) config_entry2)->weight; |
| return (weight1 - weight2); |
| } |
| |
| |
| /* _list_delete_config - delete an entry from the config list, |
| * see list.h for documentation */ |
| static void _list_delete_config (void *config_entry) |
| { |
| struct config_record *config_ptr = (struct config_record *) |
| config_entry; |
| |
| xassert(config_ptr); |
| xassert(config_ptr->magic == CONFIG_MAGIC); |
| xfree (config_ptr->feature); |
| xfree (config_ptr->nodes); |
| FREE_NULL_BITMAP (config_ptr->node_bitmap); |
| xfree (config_ptr); |
| } |
| |
| |
/* 
 * _list_find_config - find an entry in the config list, see list.h for   
 *	documentation 
 * IN key - is "universal_key" for all config
 * RET 1 if key == "universal_key", 0 otherwise
 */
static int _list_find_config (void *config_entry, void *key) 
{
	return (strcmp ((char *) key, "universal_key") == 0) ? 1 : 0;
}
| |
| |
| /* |
| * node_name2bitmap - given a node name regular expression, build a bitmap |
| * representation |
| * IN node_names - list of nodes |
| * IN best_effort - if set don't return an error on invalid node name entries |
| * OUT bitmap - set to bitmap or NULL on error |
| * RET 0 if no error, otherwise EINVAL |
| * global: node_record_table_ptr - pointer to global node table |
| * NOTE: the caller must bit_free() memory at bitmap when no longer required |
| */ |
| extern int node_name2bitmap (char *node_names, bool best_effort, |
| bitstr_t **bitmap) |
| { |
| int rc = SLURM_SUCCESS; |
| char *this_node_name; |
| bitstr_t *my_bitmap; |
| hostlist_t host_list; |
| |
| my_bitmap = (bitstr_t *) bit_alloc (node_record_count); |
| if (my_bitmap == NULL) |
| fatal("bit_alloc malloc failure"); |
| *bitmap = my_bitmap; |
| |
| if (node_names == NULL) { |
| error ("node_name2bitmap: node_names is NULL"); |
| return rc; |
| } |
| |
| if ( (host_list = hostlist_create (node_names)) == NULL) { |
| /* likely a badly formatted hostlist */ |
| error ("hostlist_create on %s error:", node_names); |
| if (!best_effort) |
| rc = EINVAL; |
| return rc; |
| } |
| |
| while ( (this_node_name = hostlist_shift (host_list)) ) { |
| struct node_record *node_ptr; |
| node_ptr = find_node_record (this_node_name); |
| if (node_ptr) { |
| bit_set (my_bitmap, (bitoff_t) (node_ptr - |
| node_record_table_ptr)); |
| } else { |
| error ("node_name2bitmap: invalid node specified %s", |
| this_node_name); |
| if (!best_effort) { |
| free (this_node_name); |
| rc = EINVAL; |
| break; |
| } |
| } |
| free (this_node_name); |
| } |
| hostlist_destroy (host_list); |
| |
| return rc; |
| } |
| |
| static bool _node_is_hidden(struct node_record *node_ptr) |
| { |
| int i; |
| bool shown = false; |
| |
| for (i=0; i<node_ptr->part_cnt; i++) { |
| if (node_ptr->part_pptr[i]->hidden == 0) { |
| shown = true; |
| break; |
| } |
| } |
| |
| if (shown || (node_ptr->part_cnt == 0)) |
| return false; |
| return true; |
| } |
| |
/*
 * pack_all_node - dump all configuration and node information for all nodes 
 *	in machine independent form (for network transmission)
 * OUT buffer_ptr - pointer to the stored data
 * OUT buffer_size - set to size of the buffer in bytes
 * IN show_flags - node filtering options (SHOW_ALL includes hidden nodes)
 * IN uid - uid of user making request (for partition filtering)
 * global: node_record_table_ptr - pointer to global node table
 * NOTE: the caller must xfree the buffer at *buffer_ptr
 * NOTE: change slurm_load_node() in api/node_info.c when data format changes
 * NOTE: READ lock_slurmctld config before entry
 */
extern void pack_all_node (char **buffer_ptr, int *buffer_size, 
		uint16_t show_flags, uid_t uid)
{
	int inx;
	uint32_t nodes_packed, tmp_offset;
	Buf buffer;
	time_t now = time(NULL);
	struct node_record *node_ptr = node_record_table_ptr;

	buffer_ptr[0] = NULL;
	*buffer_size = 0;

	buffer = init_buf (BUF_SIZE*16);

	/* write header: version and time */
	/* nodes_packed is a placeholder here; the true count is written
	 * over it after the loop below */
	nodes_packed = 0 ;
	pack32 (nodes_packed, buffer);
	pack_time (now, buffer);

	/* write node records */
	part_filter_set(uid);
	for (inx = 0; inx < node_record_count; inx++, node_ptr++) {
		xassert (node_ptr->magic == NODE_MAGIC);
		xassert (node_ptr->config_ptr->magic ==  
			 CONFIG_MAGIC);

		/* skip nodes visible only through hidden partitions unless
		 * SHOW_ALL was requested */
		if (((show_flags & SHOW_ALL) == 0) 
		    && (_node_is_hidden(node_ptr)))
			continue;

		_pack_node(node_ptr, buffer);
		nodes_packed ++ ;
	}
	part_filter_clear();

	/* rewind to the header and store the actual record count */
	tmp_offset = get_buf_offset (buffer);
	set_buf_offset (buffer, 0);
	pack32 ((uint32_t) nodes_packed, buffer);
	set_buf_offset (buffer, tmp_offset);

	*buffer_size = get_buf_offset (buffer);
	buffer_ptr[0] = xfer_buf_data (buffer);
}
| |
| |
/* 
 * _pack_node - dump all configuration information about a specific node in 
 *	machine independent form (for network transmission)
 * IN dump_node_ptr - pointer to node for which information is requested
 * IN/OUT buffer - buffer where data is placed, pointers automatically updated
 * NOTE: if you make any changes here be sure to make the corresponding 
 *	changes to load_node_config in api/node_info.c
 * NOTE: READ lock_slurmctld config before entry
 */
static void _pack_node (struct node_record *dump_node_ptr, Buf buffer) 
{
	packstr (dump_node_ptr->name, buffer);
	pack16  (dump_node_ptr->node_state, buffer);
	if (slurmctld_conf.fast_schedule) {	
		/* Only data from config_record used for scheduling */
		pack32  (dump_node_ptr->config_ptr->cpus, buffer);
		pack32  (dump_node_ptr->config_ptr->real_memory, buffer);
		pack32  (dump_node_ptr->config_ptr->tmp_disk, buffer);
	} else {	
		/* Individual node data used for scheduling */
		pack32  (dump_node_ptr->cpus, buffer);
		pack32  (dump_node_ptr->real_memory, buffer);
		pack32  (dump_node_ptr->tmp_disk, buffer);
	}
	pack32  (dump_node_ptr->config_ptr->weight, buffer);
	packstr (dump_node_ptr->config_ptr->feature, buffer);
	packstr (dump_node_ptr->reason, buffer);
}
| |
| |
| /* |
| * rehash_node - build a hash table of the node_record entries. |
| * global: node_record_table_ptr - pointer to global node table |
| * node_hash_table - table of hash indecies |
| * NOTE: manages memory for node_hash_table |
| */ |
| void rehash_node (void) |
| { |
| int i, inx; |
| |
| xfree (node_hash_table); |
| node_hash_table = xmalloc (sizeof (struct node_record *) * |
| node_record_count); |
| |
| for (i = 0; i < node_record_count; i++) { |
| if (strlen (node_record_table_ptr[i].name) == 0) |
| continue; /* vestigial record */ |
| inx = _hash_index (node_record_table_ptr[i].name); |
| node_record_table_ptr[i].node_next = node_hash_table[inx]; |
| node_hash_table[inx] = &node_record_table_ptr[i]; |
| } |
| |
| #if _DEBUG |
| _dump_hash(); |
| #endif |
| return; |
| } |
| |
| |
| /* |
| * set_slurmd_addr - establish the slurm_addr for the slurmd on each node |
| * Uses common data structures. |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| void set_slurmd_addr (void) |
| { |
| int i; |
| struct node_record *node_ptr = node_record_table_ptr; |
| DEF_TIMERS; |
| |
| START_TIMER; |
| for (i = 0; i < node_record_count; i++, node_ptr++) { |
| if (node_ptr->name[0] == '\0') |
| continue; |
| if (node_ptr->port == 0) |
| node_ptr->port = slurmctld_conf.slurmd_port; |
| slurm_set_addr (&node_ptr->slurm_addr, |
| node_ptr->port, |
| node_ptr->comm_name); |
| if (node_ptr->slurm_addr.sin_port) |
| continue; |
| fatal ("slurm_set_addr failure on %s", |
| node_ptr->comm_name); |
| } |
| |
| END_TIMER; |
| debug("set_slurmd_addr: got IP addresses for all nodes %s", |
| TIME_STR); |
| return; |
| } |
| |
| /* |
| * update_node - update the configuration data for one or more nodes |
| * IN update_node_msg - update node request |
| * RET SLURM_SUCCESS or error code |
| * global: node_record_table_ptr - pointer to global node table |
| */ |
| int update_node ( update_node_msg_t * update_node_msg ) |
| { |
| int error_code = 0, base_state = 0, node_inx; |
| struct node_record *node_ptr = NULL; |
| char *this_node_name = NULL; |
| hostlist_t host_list; |
| uint16_t node_flags = 0, state_val; |
| |
| if (update_node_msg -> node_names == NULL ) { |
| error ("update_node: invalid node name %s", |
| update_node_msg -> node_names ); |
| return ESLURM_INVALID_NODE_NAME; |
| } |
| |
| if ( (host_list = hostlist_create (update_node_msg -> node_names)) |
| == NULL) { |
| error ("hostlist_create error on %s: %m", |
| update_node_msg -> node_names); |
| return ESLURM_INVALID_NODE_NAME; |
| } |
| |
| last_node_update = time (NULL); |
| while ( (this_node_name = hostlist_shift (host_list)) ) { |
| int err_code = 0; |
| state_val = update_node_msg->node_state; |
| node_ptr = find_node_record (this_node_name); |
| node_inx = node_ptr - node_record_table_ptr; |
| if (node_ptr == NULL) { |
| error ("update_node: node %s does not exist", |
| this_node_name); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| free (this_node_name); |
| break; |
| } |
| |
| if (state_val != (uint16_t) NO_VAL) { |
| base_state = node_ptr->node_state; |
| if (!_valid_node_state_change(base_state, state_val)) { |
| info("Invalid node state transition requested " |
| "for node %s from=%s to=%s", |
| this_node_name, |
| node_state_string(base_state), |
| node_state_string(state_val)); |
| state_val = (uint16_t) NO_VAL; |
| error_code = ESLURM_INVALID_NODE_STATE; |
| } |
| } |
| if (state_val != (uint16_t) NO_VAL) { |
| if (state_val == NODE_RESUME) { |
| node_ptr->node_state &= (~NODE_STATE_DRAIN); |
| base_state &= NODE_STATE_BASE; |
| if (base_state == NODE_STATE_DOWN) |
| state_val = NODE_STATE_IDLE; |
| else |
| state_val = base_state; |
| } |
| if (state_val == NODE_STATE_DOWN) { |
| /* We must set node DOWN before killing |
| * its jobs */ |
| _make_node_down(node_ptr); |
| kill_running_job_by_node_name (this_node_name, |
| false); |
| } |
| else if (state_val == NODE_STATE_IDLE) { |
| /* assume they want to clear DRAIN flag too */ |
| node_ptr->node_state &= (~NODE_STATE_DRAIN); |
| bit_set (avail_node_bitmap, node_inx); |
| bit_set (idle_node_bitmap, node_inx); |
| bit_set (up_node_bitmap, node_inx); |
| reset_job_priority(); |
| } |
| else if (state_val == NODE_STATE_ALLOCATED) { |
| if (!(node_ptr->node_state & NODE_STATE_DRAIN)) |
| bit_set (up_node_bitmap, node_inx); |
| bit_set (avail_node_bitmap, node_inx); |
| bit_clear (idle_node_bitmap, node_inx); |
| } |
| else if (state_val == NODE_STATE_DRAIN) { |
| bit_clear (avail_node_bitmap, node_inx); |
| state_val = node_ptr->node_state | |
| NODE_STATE_DRAIN; |
| } |
| else { |
| info ("Invalid node state specified %d", |
| state_val); |
| err_code = 1; |
| error_code = ESLURM_INVALID_NODE_STATE; |
| } |
| |
| if (err_code == 0) { |
| node_flags = node_ptr->node_state & |
| NODE_STATE_FLAGS; |
| node_ptr->node_state = state_val | node_flags; |
| |
| select_g_update_node_state(node_inx, |
| state_val); |
| |
| info ("update_node: node %s state set to %s", |
| this_node_name, |
| node_state_string(state_val)); |
| } |
| } |
| |
| if ((update_node_msg -> reason) && |
| (update_node_msg -> reason[0])) { |
| xfree(node_ptr->reason); |
| node_ptr->reason = xstrdup(update_node_msg->reason); |
| info ("update_node: node %s reason set to: %s", |
| this_node_name, node_ptr->reason); |
| } |
| |
| base_state = node_ptr->node_state & NODE_STATE_BASE; |
| if ((base_state != NODE_STATE_DOWN) |
| && ((node_ptr->node_state & NODE_STATE_DRAIN) == 0)) |
| xfree(node_ptr->reason); |
| |
| free (this_node_name); |
| } |
| |
| hostlist_destroy (host_list); |
| return error_code; |
| } |
| |
| /* |
| * drain_nodes - drain one or more nodes, |
| * no-op for nodes already drained or draining |
| * IN nodes - nodes to drain |
| * IN reason - reason to drain the nodes |
| * RET SLURM_SUCCESS or error code |
| * global: node_record_table_ptr - pointer to global node table |
| */ |
| extern int drain_nodes ( char *nodes, char *reason ) |
| { |
| int error_code = 0, node_inx; |
| struct node_record *node_ptr; |
| char *this_node_name ; |
| hostlist_t host_list; |
| |
| if ((nodes == NULL) || (nodes[0] == '\0')) { |
| error ("drain_nodes: invalid node name %s", nodes); |
| return ESLURM_INVALID_NODE_NAME; |
| } |
| |
| if ( (host_list = hostlist_create (nodes)) == NULL) { |
| error ("hostlist_create error on %s: %m", nodes); |
| return ESLURM_INVALID_NODE_NAME; |
| } |
| |
| last_node_update = time (NULL); |
| while ( (this_node_name = hostlist_shift (host_list)) ) { |
| node_ptr = find_node_record (this_node_name); |
| node_inx = node_ptr - node_record_table_ptr; |
| if (node_ptr == NULL) { |
| error ("drain_nodes: node %s does not exist", |
| this_node_name); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| free (this_node_name); |
| break; |
| } |
| |
| if (node_ptr->node_state & NODE_STATE_DRAIN) { |
| /* state already changed, nothing to do */ |
| free (this_node_name); |
| continue; |
| } |
| |
| node_ptr->node_state |= NODE_STATE_DRAIN; |
| bit_clear (avail_node_bitmap, node_inx); |
| info ("drain_nodes: node %s state set to DRAIN", |
| this_node_name); |
| |
| xfree(node_ptr->reason); |
| node_ptr->reason = xstrdup(reason); |
| |
| select_g_update_node_state(node_inx, node_ptr->node_state); |
| |
| free (this_node_name); |
| } |
| |
| hostlist_destroy (host_list); |
| return error_code; |
| } |
| /* Return true if admin request to change node state from old to new is valid */ |
| static bool _valid_node_state_change(uint16_t old, uint16_t new) |
| { |
| uint16_t base_state, node_flags; |
| if (old == new) |
| return true; |
| |
| base_state = (old) & NODE_STATE_BASE; |
| node_flags = (old) & NODE_STATE_FLAGS; |
| switch (new) { |
| case NODE_STATE_DOWN: |
| case NODE_STATE_DRAIN: |
| return true; |
| break; |
| |
| case NODE_RESUME: |
| if (base_state == NODE_STATE_UNKNOWN) |
| return false; |
| if ((base_state == NODE_STATE_DOWN) |
| || (node_flags & NODE_STATE_DRAIN)) |
| return true; |
| break; |
| |
| case NODE_STATE_IDLE: |
| if ((base_state == NODE_STATE_DOWN) |
| || (base_state == NODE_STATE_IDLE)) |
| return true; |
| break; |
| |
| case NODE_STATE_ALLOCATED: |
| if (base_state == NODE_STATE_ALLOCATED) |
| return true; |
| break; |
| |
| default: /* All others invalid */ |
| break; |
| } |
| |
| return false; |
| } |
| |
| /* |
| * validate_node_specs - validate the node's specifications as valid, |
| * if not set state to down, in any case update last_response |
| * IN node_name - name of the node |
| * IN cpus - number of cpus measured |
| * IN real_memory - mega_bytes of real_memory measured |
| * IN tmp_disk - mega_bytes of tmp_disk measured |
| * IN job_count - number of jobs allocated to this node |
| * IN status - node status code |
| * RET 0 if no error, ENOENT if no such node, EINVAL if values too low |
| * global: node_record_table_ptr - pointer to global node table |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
extern int
validate_node_specs (char *node_name, uint32_t cpus,
		     uint32_t real_memory, uint32_t tmp_disk,
		     uint32_t job_count, uint32_t status)
{
	int error_code, i;
	struct config_record *config_ptr;
	struct node_record *node_ptr;
	char *reason_down = NULL;
	uint16_t base_state, node_flags;

	node_ptr = find_node_record (node_name);
	if (node_ptr == NULL)
		return ENOENT;
	/* Record the response time regardless of validation outcome */
	node_ptr->last_response = time (NULL);

	config_ptr = node_ptr->config_ptr;
	error_code = 0;

	/* Reported CPU count below the configured value marks the node
	 * bad; the last failing check wins for reason_down */
	if (cpus < config_ptr->cpus) {
		error ("Node %s has low cpu count %u", node_name, cpus);
		error_code = EINVAL;
		reason_down = "Low CPUs";
	}
	/* With fast_schedule disabled, partition CPU totals track the
	 * actual (reported) CPU count rather than the configured one */
	if ((node_ptr->cpus != cpus)
	&&  (slurmctld_conf.fast_schedule == 0)) {
		for (i=0; i<node_ptr->part_cnt; i++) {
			node_ptr->part_pptr[i]->total_cpus +=
				(cpus - node_ptr->cpus);
		}
	}
	node_ptr->cpus = cpus;

	if (real_memory < config_ptr->real_memory) {
		error ("Node %s has low real_memory size %u",
		       node_name, real_memory);
		error_code = EINVAL;
		reason_down = "Low RealMemory";
	}
	node_ptr->real_memory = real_memory;

	if (tmp_disk < config_ptr->tmp_disk) {
		error ("Node %s has low tmp_disk size %u",
		       node_name, tmp_disk);
		error_code = EINVAL;
		reason_down = "Low TmpDisk";
	}
	node_ptr->tmp_disk = tmp_disk;

	/* The node is clearly responding; clear NO_RESPOND and let job
	 * priorities be recomputed now that resources changed */
	if (node_ptr->node_state & NODE_STATE_NO_RESPOND) {
		last_node_update = time (NULL);
		reset_job_priority();
		node_ptr->node_state &= (uint16_t) (~NODE_STATE_NO_RESPOND);
	}
	base_state = node_ptr->node_state & NODE_STATE_BASE;
	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
	if (error_code) {
		/* Configuration too small: take the node DOWN */
		if (base_state != NODE_STATE_DOWN)
			error ("Setting node %s state to DOWN", node_name);
		last_node_update = time (NULL);
		set_node_down(node_name, reason_down);
		_sync_bitmaps(node_ptr, job_count);
	} else if (status == ESLURMD_PROLOG_FAILED) {
		/* Prolog failure downs the node unless it is draining */
		if ((node_flags & NODE_STATE_DRAIN) == 0) {
			last_node_update = time (NULL);
			error ("Prolog failure on node %s, state to DOWN",
			       node_name);
			set_node_down(node_name, "Prolog failed");
		}
	} else {
		if (node_ptr->node_state == NODE_STATE_UNKNOWN) {
			/* First registration: pick ALLOCATED or IDLE
			 * based on the reported job count */
			last_node_update = time (NULL);
			reset_job_priority();
			debug("validate_node_specs: node %s has registered",
				node_name);
			if (job_count) {
				node_ptr->node_state = NODE_STATE_ALLOCATED |
					node_flags;
			} else {
				node_ptr->node_state = NODE_STATE_IDLE |
					node_flags;
			}
			xfree(node_ptr->reason);
		} else if ((base_state == NODE_STATE_DOWN) &&
			   (slurmctld_conf.ret2service == 1) &&
			   (node_ptr->reason != NULL) &&
			   (strncmp(node_ptr->reason, "Not responding", 14)
					== 0)) {
			/* Auto-return to service, but only when the node
			 * went DOWN for not responding (admin-downed
			 * nodes keep their state and reason) */
			last_node_update = time (NULL);
			if (job_count) {
				node_ptr->node_state = NODE_STATE_ALLOCATED |
					node_flags;
			} else {
				node_ptr->node_state = NODE_STATE_IDLE |
					node_flags;
			}
			info ("node %s returned to service", node_name);
			xfree(node_ptr->reason);
			reset_job_priority();
		} else if ((base_state == NODE_STATE_ALLOCATED) &&
			   (job_count == 0)) {	/* job vanished */
			last_node_update = time (NULL);
			node_ptr->node_state = NODE_STATE_IDLE | node_flags;
		} else if ((node_flags & NODE_STATE_COMPLETING) &&
	                   (job_count == 0)) {	/* job already done */
			last_node_update = time (NULL);
			node_ptr->node_state &= (~NODE_STATE_COMPLETING);
		}
		/* Propagate the (possibly new) state to the select
		 * plugin and reconcile the scheduling bitmaps */
		select_g_update_node_state((node_ptr - node_record_table_ptr),
					   node_ptr->node_state);
		_sync_bitmaps(node_ptr, job_count);
	}

	return error_code;
}
| |
| /* |
| * validate_nodes_via_front_end - validate all nodes on a cluster as having |
| * a valid configuration as soon as the front-end registers. Individual |
| * nodes will not register with this configuration |
| * IN job_count - number of jobs which should be running on cluster |
| * IN job_id_ptr - pointer to array of job_ids that should be on cluster |
| * IN step_id_ptr - pointer to array of job step ids that should be on cluster |
| * IN status - cluster status code |
| * RET 0 if no error, SLURM error code otherwise |
| * global: node_record_table_ptr - pointer to global node table |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
extern int validate_nodes_via_front_end(uint32_t job_count,
			uint32_t *job_id_ptr, uint16_t *step_id_ptr,
			uint32_t status)
{
	int error_code = 0, i, jobs_on_node;
	bool updated_job = false;
	struct job_record *job_ptr;
	struct config_record *config_ptr;
	struct node_record *node_ptr;
	time_t now = time(NULL);
	ListIterator job_iterator;
	hostlist_t return_hostlist = NULL, reg_hostlist = NULL;
	hostlist_t prolog_hostlist = NULL;
	char host_str[64];
	uint16_t base_state, node_flags;

	/* First validate the job info */
	node_ptr = &node_record_table_ptr[0];	/* All msg send to node zero,
				 * the front-end for the whole cluster */
	for (i = 0; i < job_count; i++) {
		/* NoAllocate jobs are outside slurmctld's bookkeeping;
		 * just log them */
		if ( (job_id_ptr[i] >= MIN_NOALLOC_JOBID) &&
		     (job_id_ptr[i] <= MAX_NOALLOC_JOBID) ) {
			info("NoAllocate job %u.%u reported",
				job_id_ptr[i], step_id_ptr[i]);
			continue;
		}

		job_ptr = find_job_record(job_id_ptr[i]);
		if (job_ptr == NULL) {
			/* Job unknown to the controller: kill it.
			 * NOTE(review): job_ptr is NULL here; presumably
			 * kill_job_on_node tolerates that — confirm */
			error("Orphan job %u.%u reported",
			      job_id_ptr[i], step_id_ptr[i]);
			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
		}

		else if ((job_ptr->job_state == JOB_RUNNING)
		||       (job_ptr->job_state == JOB_SUSPENDED)) {
			debug3("Registered job %u.%u",
			       job_id_ptr[i], step_id_ptr[i]);
			if (job_ptr->batch_flag) {
				/* NOTE: Used for purging defunct batch jobs */
				job_ptr->time_last_active = now;
			}
		}

		else if (job_ptr->job_state & JOB_COMPLETING) {
			/* Re-send kill request as needed,
			 * not necessarily an error */
			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
		}


		else if (job_ptr->job_state == JOB_PENDING) {
			/* A job the controller thinks is pending cannot
			 * be running on the cluster: fail it */
			error("Registered PENDING job %u.%u",
				job_id_ptr[i], step_id_ptr[i]);
			/* FIXME: Could possibly recover the job */
			job_ptr->job_state = JOB_FAILED;
			job_ptr->exit_code = 1;
			last_job_update = now;
			job_ptr->start_time = job_ptr->end_time  = now;
			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
			job_completion_logger(job_ptr);
			delete_job_details(job_ptr);
		}

		else {		/* else job is supposed to be done */
			error("Registered job %u.%u in state %s",
				job_id_ptr[i], step_id_ptr[i],
				job_state_string(job_ptr->job_state));
			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
		}
	}

	/* purge orphan batch jobs: running batch jobs that the cluster
	 * did not report within the grace period above */
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		if ((job_ptr->job_state != JOB_RUNNING)
		||  (job_ptr->batch_flag == 0))
			continue;
#ifdef HAVE_BG
		/* slurmd does not report job presence until after prolog
		 * completes which waits for bgblock boot to complete.
		 * This can take several minutes on BlueGene. */
		if (difftime(now, job_ptr->time_last_active) <=
		    (1400 + 5 * job_ptr->node_cnt))
			continue;
#else
		if (difftime(now, job_ptr->time_last_active) <= 5)
			continue;
#endif

		info("Killing orphan batch job %u", job_ptr->job_id);
		job_complete(job_ptr->job_id, 0, false, 0);
	}
	list_iterator_destroy(job_iterator);

	/* Now validate the node info: since the front-end registers on
	 * behalf of all nodes, walk the entire node table */
	for (i=0; i<node_record_count; i++) {
		node_ptr = &node_record_table_ptr[i];
		config_ptr = node_ptr->config_ptr;
		jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
		node_ptr->last_response = time (NULL);

		if (node_ptr->node_state & NODE_STATE_NO_RESPOND) {
			updated_job = true;
			node_ptr->node_state &= (uint16_t)
				(~NODE_STATE_NO_RESPOND);
		}

		if (status == ESLURMD_PROLOG_FAILED) {
			/* Down every non-draining node; accumulate names
			 * for a single consolidated error message */
			if (!(node_ptr->node_state & NODE_STATE_DRAIN)) {
				updated_job = true;
				if (prolog_hostlist)
					(void) hostlist_push_host(
						prolog_hostlist,
						node_ptr->name);
				else
					prolog_hostlist = hostlist_create(
						node_ptr->name);
				set_node_down(node_ptr->name, "Prolog failed");
			}
		} else {
			base_state = node_ptr->node_state & NODE_STATE_BASE;
			node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
			if (base_state == NODE_STATE_UNKNOWN) {
				/* First registration of this node */
				updated_job = true;
				if (reg_hostlist)
					(void) hostlist_push_host(
						reg_hostlist, node_ptr->name);
				else
					reg_hostlist = hostlist_create(
						node_ptr->name);
				if (jobs_on_node) {
					node_ptr->node_state =
						NODE_STATE_ALLOCATED |
						node_flags;
				} else
					node_ptr->node_state =
						NODE_STATE_IDLE |
						node_flags;
				xfree(node_ptr->reason);
			} else if ((base_state == NODE_STATE_DOWN) &&
			           (slurmctld_conf.ret2service == 1)) {
				/* Return DOWN node to service */
				updated_job = true;
				if (jobs_on_node) {
					node_ptr->node_state =
						NODE_STATE_ALLOCATED |
						node_flags;
				} else {
					node_ptr->node_state =
						NODE_STATE_IDLE |
						node_flags;
				}
				if (return_hostlist)
					(void) hostlist_push_host(
						return_hostlist, node_ptr->name);
				else
					return_hostlist = hostlist_create(
						node_ptr->name);
				xfree(node_ptr->reason);
			} else if ((base_state == NODE_STATE_ALLOCATED) &&
				   (jobs_on_node == 0)) {	/* job vanished */
				updated_job = true;
				node_ptr->node_state = NODE_STATE_IDLE |
					node_flags;
			} else if ((node_flags & NODE_STATE_COMPLETING) &&
			           (jobs_on_node == 0)) {  /* job already done */
				updated_job = true;
				node_ptr->node_state &= (~NODE_STATE_COMPLETING);
			}
			select_g_update_node_state(
				(node_ptr - node_record_table_ptr),
				node_ptr->node_state);
			_sync_bitmaps(node_ptr, jobs_on_node);
		}
	}

	/* Emit one consolidated message per category of node change */
	if (prolog_hostlist) {
		hostlist_uniq(prolog_hostlist);
		hostlist_ranged_string(prolog_hostlist, sizeof(host_str),
			host_str);
		error("Prolog failure on nodes %s, set to DOWN", host_str);
		hostlist_destroy(prolog_hostlist);
	}
	if (reg_hostlist) {
		hostlist_uniq(reg_hostlist);
		hostlist_ranged_string(reg_hostlist, sizeof(host_str),
			host_str);
		debug("Nodes %s have registered", host_str);
		hostlist_destroy(reg_hostlist);
	}
	if (return_hostlist) {
		hostlist_uniq(return_hostlist);
		hostlist_ranged_string(return_hostlist, sizeof(host_str),
			host_str);
		info("Nodes %s returned to service", host_str);
		hostlist_destroy(return_hostlist);
	}

	if (updated_job) {
		last_node_update = time (NULL);
		reset_job_priority();
	}
	return error_code;;
}
| |
| /* Sync idle, share, and avail_node_bitmaps for a given node */ |
| static void _sync_bitmaps(struct node_record *node_ptr, int job_count) |
| { |
| uint16_t base_state; |
| int node_inx = node_ptr - node_record_table_ptr; |
| |
| if (job_count == 0) { |
| bit_set (idle_node_bitmap, node_inx); |
| bit_set (share_node_bitmap, node_inx); |
| } |
| base_state = node_ptr->node_state & NODE_STATE_BASE; |
| if ((base_state == NODE_STATE_DOWN) |
| || (node_ptr->node_state & NODE_STATE_DRAIN)) |
| bit_clear (avail_node_bitmap, node_inx); |
| else |
| bit_set (avail_node_bitmap, node_inx); |
| if (base_state == NODE_STATE_DOWN) |
| bit_clear (up_node_bitmap, node_inx); |
| else |
| bit_set (up_node_bitmap, node_inx); |
| } |
| |
| /* |
| * node_did_resp - record that the specified node is responding |
| * IN name - name of the node |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
void node_did_resp (char *name)
{
	struct node_record *node_ptr;
#ifdef HAVE_FRONT_END		/* Fake all other nodes */
	int i;

	/* Only the front-end node communicates, so a response from it
	 * is treated as a response from every node in the table */
	for (i=0; i<node_record_count; i++) {
		node_ptr = &node_record_table_ptr[i];
		_node_did_resp(node_ptr);
	}
	debug2("node_did_resp %s",name);
#else
	node_ptr = find_node_record (name);
	if (node_ptr == NULL) {
		error ("node_did_resp unable to find node %s", name);
		return;
	}
	_node_did_resp(node_ptr);
	debug2("node_did_resp %s",name);
#endif
}
| |
/* Record that one specific node responded: clear NO_RESPOND, resolve
 * UNKNOWN state, optionally return a DOWN node to service, and bring
 * the scheduling bitmaps into agreement with the resulting state */
static void _node_did_resp(struct node_record *node_ptr)
{
	int node_inx;
	uint16_t resp_state, base_state, node_flags;

	node_inx = node_ptr - node_record_table_ptr;
	node_ptr->last_response = time (NULL);
	resp_state = node_ptr->node_state & NODE_STATE_NO_RESPOND;
	if (resp_state) {
		info("Node %s now responding", node_ptr->name);
		last_node_update = time (NULL);
		reset_job_priority();
		node_ptr->node_state &= (uint16_t) (~NODE_STATE_NO_RESPOND);
	}
	base_state = node_ptr->node_state & NODE_STATE_BASE;
	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
	/* First contact from a node in UNKNOWN state: mark it IDLE */
	if (base_state == NODE_STATE_UNKNOWN) {
		last_node_update = time (NULL);
		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
	}
	/* Auto-return to service only applies when the node went DOWN
	 * for not responding; admin-downed nodes stay DOWN */
	if ((base_state == NODE_STATE_DOWN) &&
	    (slurmctld_conf.ret2service == 1) &&
	    (node_ptr->reason != NULL) &&
	    (strncmp(node_ptr->reason, "Not responding", 14) == 0)) {
		last_node_update = time (NULL);
		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
		info("node_did_resp: node %s returned to service",
			node_ptr->name);
		xfree(node_ptr->reason);
	}
	/* Re-read the base state (it may have just changed above), then
	 * update the idle/share/avail/up bitmaps to match */
	base_state = node_ptr->node_state & NODE_STATE_BASE;
	if ((base_state == NODE_STATE_IDLE)
	&&  ((node_flags & NODE_STATE_COMPLETING) == 0)) {
		bit_set (idle_node_bitmap, node_inx);
		bit_set (share_node_bitmap, node_inx);
	}
	if ((base_state == NODE_STATE_DOWN)
	||  (node_flags & NODE_STATE_DRAIN))
		bit_clear (avail_node_bitmap, node_inx);
	else
		bit_set (avail_node_bitmap, node_inx);
	if (base_state == NODE_STATE_DOWN)
		bit_clear (up_node_bitmap, node_inx);
	else
		bit_set (up_node_bitmap, node_inx);
	return;
}
| |
| /* |
| * node_not_resp - record that the specified node is not responding |
| * IN name - name of the node |
| * IN msg_time - time message was sent |
| */ |
void node_not_resp (char *name, time_t msg_time)
{
	struct node_record *node_ptr;
#ifdef HAVE_FRONT_END		/* Fake all other nodes */
	int i;
	char host_str[64];
	hostlist_t no_resp_hostlist = hostlist_create("");

	/* Front-end systems: a non-response covers every node; report
	 * them all in one consolidated message */
	for (i=0; i<node_record_count; i++) {
		node_ptr = &node_record_table_ptr[i];
		(void) hostlist_push_host(no_resp_hostlist, node_ptr->name);
		_node_not_resp(node_ptr, msg_time);
	}
	hostlist_uniq(no_resp_hostlist);
	hostlist_ranged_string(no_resp_hostlist, sizeof(host_str), host_str);
	error("Nodes %s not responding", host_str);
	hostlist_destroy(no_resp_hostlist);
#else
	node_ptr = find_node_record (name);
	if (node_ptr == NULL) {
		error ("node_not_resp unable to find node %s", name);
		return;
	}
	error("Node %s not responding", node_ptr->name);
	_node_not_resp(node_ptr, msg_time);
#endif
}
| |
| static void _node_not_resp (struct node_record *node_ptr, time_t msg_time) |
| { |
| int i; |
| |
| i = node_ptr - node_record_table_ptr; |
| if (node_ptr->node_state & NODE_STATE_NO_RESPOND) |
| return; /* Already known to be not responding */ |
| |
| if (node_ptr->last_response >= msg_time) { |
| debug("node_not_resp: node %s responded since msg sent", |
| node_ptr->name); |
| return; |
| } |
| last_node_update = time (NULL); |
| bit_clear (avail_node_bitmap, i); |
| node_ptr->node_state |= NODE_STATE_NO_RESPOND; |
| return; |
| } |
| |
| /* |
| * set_node_down - make the specified node's state DOWN if possible |
| * (not in a DRAIN state), kill jobs as needed |
| * IN name - name of the node |
| * IN reason - why the node is DOWN |
| */ |
| void set_node_down (char *name, char *reason) |
| { |
| struct node_record *node_ptr; |
| |
| node_ptr = find_node_record (name); |
| if (node_ptr == NULL) { |
| error ("node_not_resp unable to find node %s", name); |
| return; |
| } |
| |
| _make_node_down(node_ptr); |
| (void) kill_running_job_by_node_name(name, false); |
| if ((node_ptr->reason == NULL) |
| || (strncmp(node_ptr->reason, "Not responding", 14) == 0)) { |
| time_t now; |
| char time_buf[64], time_str[32]; |
| |
| now = time (NULL); |
| slurm_make_time_str(&now, time_str, sizeof(time_str)); |
| snprintf(time_buf, sizeof(time_buf), " [slurm@%s]", |
| time_str); |
| xfree(node_ptr->reason); |
| node_ptr->reason = xstrdup(reason); |
| xstrcat(node_ptr->reason, time_buf); |
| } |
| |
| return; |
| } |
| |
| /* |
| * is_node_down - determine if the specified node's state is DOWN |
| * IN name - name of the node |
| * RET true if node exists and is down, otherwise false |
| */ |
| bool is_node_down (char *name) |
| { |
| struct node_record *node_ptr; |
| uint16_t base_state; |
| |
| node_ptr = find_node_record (name); |
| if (node_ptr == NULL) { |
| error ("is_node_down unable to find node %s", name); |
| return false; |
| } |
| |
| base_state = node_ptr->node_state & NODE_STATE_BASE; |
| if (base_state == NODE_STATE_DOWN) |
| return true; |
| return false; |
| } |
| |
| /* |
| * is_node_resp - determine if the specified node's state is responding |
| * IN name - name of the node |
| * RET true if node exists and is responding, otherwise false |
| */ |
| bool is_node_resp (char *name) |
| { |
| struct node_record *node_ptr; |
| |
| node_ptr = find_node_record (name); |
| if (node_ptr == NULL) { |
| error ("is_node_resp unable to find node %s", name); |
| return false; |
| } |
| |
| if (node_ptr->node_state & NODE_STATE_NO_RESPOND) |
| return false; |
| return true; |
| } |
| |
| /* |
| * find_first_node_record - find a record for first node in the bitmap |
| * IN node_bitmap |
| */ |
| struct node_record * |
| find_first_node_record (bitstr_t *node_bitmap) |
| { |
| int inx; |
| |
| if (node_bitmap == NULL) { |
| error ("find_first_node_record passed null bitstring"); |
| return NULL; |
| } |
| |
| inx = bit_ffs (node_bitmap); |
| if (inx < 0) |
| return NULL; |
| else |
| return &node_record_table_ptr[inx]; |
| } |
| |
| #if _DEBUG |
| /* |
| * _dump_hash - print the node_hash_table contents, used for debugging |
| * or analysis of hash technique |
| * global: node_record_table_ptr - pointer to global node table |
| * node_hash_table - table of hash indecies |
| */ |
| static void _dump_hash (void) |
| { |
| int i, inx; |
| struct node_record *node_ptr; |
| |
| if (node_hash_table == NULL) |
| return; |
| for (i = 0; i < node_record_count; i++) { |
| node_ptr = node_hash_table[i]; |
| while (node_ptr) { |
| inx = node_ptr - node_record_table_ptr; |
| debug3("node_hash[%d]:%d", i, inx); |
| node_ptr = node_ptr->node_next; |
| } |
| } |
| } |
| #endif |
| |
| /* msg_to_slurmd - send given msg_type (REQUEST_RECONFIGURE or |
| * REQUEST_SHUTDOWN) to every slurmd, no args */ |
void msg_to_slurmd (slurm_msg_type_t msg_type)
{
	int i, pos;
	shutdown_msg_t *shutdown_req;
	agent_arg_t *kill_agent_args;

	/* Build address and name arrays sized for every node */
	kill_agent_args = xmalloc (sizeof (agent_arg_t));
	kill_agent_args->msg_type = msg_type;
	kill_agent_args->retry = 0;
	kill_agent_args->slurm_addr = xmalloc (
			sizeof (struct sockaddr_in) *
			(node_record_count + 1));
	kill_agent_args->node_names = xmalloc (MAX_SLURM_NAME *
			(node_record_count + 1));
	if (msg_type == REQUEST_SHUTDOWN) {
		/* Shutdown requests carry a payload; core=0 means a
		 * normal (non-core-dump) shutdown */
	 	shutdown_req = xmalloc(sizeof(shutdown_msg_t));
		shutdown_req->core = 0;
		kill_agent_args->msg_args = shutdown_req;
	}

	for (i = 0; i < node_record_count; i++) {
		kill_agent_args->slurm_addr[kill_agent_args->node_count] =
				node_record_table_ptr[i].slurm_addr;
		pos = MAX_SLURM_NAME * kill_agent_args->node_count;
		/* NOTE(review): strncpy does not NUL-terminate when the
		 * name fills MAX_SLURM_NAME bytes — presumably node
		 * names are always shorter; confirm */
		strncpy (&kill_agent_args->node_names[pos],
			node_record_table_ptr[i].name, MAX_SLURM_NAME);
		kill_agent_args->node_count++;
#ifdef HAVE_FRONT_END	/* Operate only on front-end */
		break;
#endif
	}

	if (kill_agent_args->node_count == 0) {
		/* Nothing to send: release everything we allocated */
		xfree (kill_agent_args->slurm_addr);
		xfree (kill_agent_args->node_names);
		xfree (kill_agent_args);
	} else {
		/* agent_queue_request takes ownership of the args */
		debug ("Spawning agent msg_type=%d", msg_type);
		agent_queue_request(kill_agent_args);
	}
}
| |
| |
| /* make_node_alloc - flag specified node as allocated to a job |
| * IN node_ptr - pointer to node being allocated |
| * IN job_ptr - pointer to job that is starting |
| */ |
| extern void make_node_alloc(struct node_record *node_ptr, |
| struct job_record *job_ptr) |
| { |
| int inx = node_ptr - node_record_table_ptr; |
| uint16_t node_flags; |
| |
| last_node_update = time (NULL); |
| |
| (node_ptr->run_job_cnt)++; |
| bit_clear(idle_node_bitmap, inx); |
| if (job_ptr->details && (job_ptr->details->shared == 0)) { |
| bit_clear(share_node_bitmap, inx); |
| (node_ptr->no_share_job_cnt)++; |
| } |
| |
| node_flags = node_ptr->node_state & NODE_STATE_FLAGS; |
| node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags; |
| xfree(node_ptr->reason); |
| } |
| |
| /* make_node_comp - flag specified node as completing a job |
| * IN node_ptr - pointer to node marked for completion of job |
| * IN job_ptr - pointer to job that is completing |
| * IN suspended - true if job was previously suspended |
| */ |
extern void make_node_comp(struct node_record *node_ptr,
			   struct job_record *job_ptr, bool suspended)
{
	int inx = node_ptr - node_record_table_ptr;
	uint16_t node_flags, base_state;

	xassert(node_ptr);
	last_node_update = time (NULL);
	/* Suspended jobs were already removed from the run counts */
	if (!suspended) {
		if (node_ptr->run_job_cnt)
			(node_ptr->run_job_cnt)--;
		else
			error("Node %s run_job_cnt underflow in "
				"make_node_comp", node_ptr->name);

		if (job_ptr->details && (job_ptr->details->shared == 0)) {
			if (node_ptr->no_share_job_cnt)
				(node_ptr->no_share_job_cnt)--;
			else
				error("Node %s no_share_job_cnt underflow in "
					"make_node_comp", node_ptr->name);
			/* Last exclusive job gone: node is shareable again */
			if (node_ptr->no_share_job_cnt == 0)
				bit_set(share_node_bitmap, inx);
		}
	}

	base_state = node_ptr->node_state & NODE_STATE_BASE;
	if (base_state != NODE_STATE_DOWN) {
		/* Don't verify RPC if DOWN */
		(node_ptr->comp_job_cnt)++;
		node_ptr->node_state |= NODE_STATE_COMPLETING;
	}
	/* Snapshot flags AFTER COMPLETING may have been set above */
	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;

	if ((node_ptr->run_job_cnt  == 0)
	&&  (node_ptr->comp_job_cnt == 0)) {
		bit_set(idle_node_bitmap, inx);
	}

	/* Base state: left DOWN if DOWN, else ALLOCATED while other
	 * jobs still run here, else IDLE */
	if (base_state == NODE_STATE_DOWN) {
		debug3("make_node_comp: Node %s being left DOWN",
		       node_ptr->name);
	} else if (node_ptr->run_job_cnt)
		node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
	else
		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
}
| |
| /* _make_node_down - flag specified node as down */ |
| static void _make_node_down(struct node_record *node_ptr) |
| { |
| int inx = node_ptr - node_record_table_ptr; |
| uint16_t node_flags; |
| |
| xassert(node_ptr); |
| last_node_update = time (NULL); |
| node_flags = node_ptr->node_state & NODE_STATE_FLAGS; |
| node_flags &= (~NODE_STATE_COMPLETING); |
| node_ptr->node_state = NODE_STATE_DOWN | node_flags; |
| bit_clear (avail_node_bitmap, inx); |
| bit_set (idle_node_bitmap, inx); |
| bit_set (share_node_bitmap, inx); |
| bit_clear (up_node_bitmap, inx); |
| select_g_update_node_state(inx, node_ptr->node_state); |
| } |
| |
| /* |
| * make_node_idle - flag specified node as having finished with a job |
| * IN node_ptr - pointer to node reporting job completion |
| * IN job_ptr - pointer to job that just completed |
| */ |
void make_node_idle(struct node_record *node_ptr,
		    struct job_record *job_ptr)
{
	int inx = node_ptr - node_record_table_ptr;
	uint16_t node_flags, base_state;

	xassert(node_ptr);
	if (job_ptr	/* Specific job completed */
	&&  (job_ptr->job_state & JOB_COMPLETING)	/* Not a replay */
	&&  (bit_test(job_ptr->node_bitmap, inx))) {	/* Not a replay */
		last_job_update = time (NULL);
		bit_clear(job_ptr->node_bitmap, inx);
		if (job_ptr->node_cnt) {
			/* Last node done: clear the job's COMPLETING
			 * flag and trigger a scheduling pass */
			if ((--job_ptr->node_cnt) == 0) {
				time_t delay;
				delay = last_job_update - job_ptr->end_time;
				if (delay > 60)
					info("Job %u completion process took "
						"%ld seconds", job_ptr->job_id,
						(long) delay);
				job_ptr->job_state &= (~JOB_COMPLETING);
				slurm_sched_schedule();
			}
		} else {
			error("node_cnt underflow on job_id %u",
			      job_ptr->job_id);
		}

		if (job_ptr->job_state == JOB_RUNNING) {
			/* Remove node from running job */
			if (node_ptr->run_job_cnt)
				(node_ptr->run_job_cnt)--;
			else
				error("Node %s run_job_cnt underflow in "
					"make_node_idle", node_ptr->name);
		} else {
			if (node_ptr->comp_job_cnt)
				(node_ptr->comp_job_cnt)--;
			else
				error("Node %s comp_job_cnt underflow in "
					"make_node_idle, job_id %u",
					node_ptr->name, job_ptr->job_id);
			/* Other jobs are still completing on this node,
			 * so leave its state alone for now */
			if (node_ptr->comp_job_cnt > 0)
				return;		/* More jobs completing */
		}
	}

	last_node_update = time (NULL);
	if (node_ptr->comp_job_cnt == 0)
		node_ptr->node_state &= (~NODE_STATE_COMPLETING);
	/* Snapshot state AFTER COMPLETING may have been cleared above */
	base_state = node_ptr->node_state & NODE_STATE_BASE;
	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
	if (base_state == NODE_STATE_DOWN) {
		debug3("make_node_idle: Node %s being left DOWN",
			node_ptr->name);
	} else if ((node_ptr->node_state & NODE_STATE_DRAIN) &&
	           (node_ptr->run_job_cnt == 0) &&
	           (node_ptr->comp_job_cnt == 0)) {
		/* Drain complete: idle but unavailable for new work */
		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
		bit_set(idle_node_bitmap, inx);
		bit_clear(avail_node_bitmap, inx);
		debug3("make_node_idle: Node %s is DRAINED",
		       node_ptr->name);
	} else if (node_ptr->run_job_cnt) {
		node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
	} else {
		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
		/* Only mark idle when responding and not completing */
		if (((node_flags & NODE_STATE_NO_RESPOND) == 0)
		&&  ((node_flags & NODE_STATE_COMPLETING) == 0))
			bit_set(idle_node_bitmap, inx);
	}
}
| |
| /* node_fini - free all memory associated with node records */ |
| void node_fini(void) |
| { |
| int i; |
| |
| if (config_list) { |
| list_destroy(config_list); |
| config_list = NULL; |
| } |
| |
| for (i=0; i< node_record_count; i++) { |
| xfree(node_record_table_ptr[i].part_pptr); |
| xfree(node_record_table_ptr[i].reason); |
| } |
| |
| FREE_NULL_BITMAP(idle_node_bitmap); |
| FREE_NULL_BITMAP(avail_node_bitmap); |
| FREE_NULL_BITMAP(share_node_bitmap); |
| FREE_NULL_BITMAP(up_node_bitmap); |
| |
| xfree(node_record_table_ptr); |
| xfree(node_hash_table); |
| node_record_count = 0; |
| } |