| /*****************************************************************************\ |
| * ping_nodes.c - ping the slurmd daemons to test if they respond |
| ***************************************************************************** |
| * Copyright (C) 2003-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2011 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> et. al. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include <pthread.h> |
| #include <string.h> |
| #include <time.h> |
| |
| #include "src/common/hostlist.h" |
| #include "src/common/read_config.h" |
| |
| #include "src/interfaces/select.h" |
| |
| #include "src/slurmctld/agent.h" |
| #include "src/slurmctld/ping_nodes.h" |
| #include "src/slurmctld/slurmctld.h" |
| |
/* Request that nodes re-register at most every MAX_REG_FREQUENCY pings */
#define MAX_REG_FREQUENCY 20

/* Log an error for ping that takes more than 100 seconds to complete */
#define PING_TIMEOUT 100

/* lock_mutex serializes access to all of the ping-cycle state below */
static pthread_mutex_t lock_mutex = PTHREAD_MUTEX_INITIALIZER;
/* true once _agent_nodes_update has recorded the last cycle's responses */
static bool ping_updated = false;
/* number of ping/registration/health-check cycles currently in flight */
static int ping_count = 0;
/* start time of the most recently begun cycle (0 when none running) */
static time_t ping_start = 0;
| |
| /* |
| * is_ping_done - test if the last node ping cycle has completed. |
| * Use this to avoid starting a new set of ping requests before the |
| * previous one completes |
| * RET true if ping process is done, false otherwise |
| */ |
| bool is_ping_done (void) |
| { |
| static bool ping_msg_sent = false; |
| bool is_done = true; |
| |
| slurm_mutex_lock(&lock_mutex); |
| if (ping_count) { |
| is_done = false; |
| if (!ping_msg_sent && |
| (difftime(time(NULL), ping_start) >= PING_TIMEOUT)) { |
| error("A node ping cycle took more than %d seconds. Node RPC requests like ping, register status, health check and/or accounting gather update are triggered less frequently than configured. Either many nodes are non-responsive or one of SlurmdTimeout, HealthCheckInterval, JobAcctGatherFrequency should be increased.", |
| PING_TIMEOUT); |
| ping_msg_sent = true; |
| } |
| } else { |
| ping_msg_sent = false; |
| /* |
| * We can only consider the last node ping cycle to be fully |
| * completed if ping_updated is true, meaning the |
| * _agent_nodes_update thread finished updating the nodes |
| * response. Otherwise, we could hit a race and incorrectly set |
| * responding/healthy nodes to DOWN. |
| */ |
| is_done = ping_updated; |
| } |
| slurm_mutex_unlock(&lock_mutex); |
| |
| return is_done; |
| } |
| |
| /* |
| * ping_begin - record that a ping cycle has begin. This can be called more |
| * than once (for REQUEST_PING and simultaneous REQUEST_NODE_REGISTRATION |
| * for selected nodes). Matching ping_end calls must be made for each |
| * before is_ping_done returns true. |
| */ |
| void ping_begin (void) |
| { |
| slurm_mutex_lock(&lock_mutex); |
| ping_count++; |
| ping_start = time(NULL); |
| slurm_mutex_unlock(&lock_mutex); |
| } |
| |
/*
 * ping_nodes_update - A ping cycle can end but the update can still be pending
 * for the _agent_nodes_update thread. This call will confirm node info was
 * updated.
 *
 * Sets ping_updated under lock_mutex; is_ping_done() reads this flag to
 * decide whether the last cycle is truly complete.
 */
void ping_nodes_update(void)
{
	slurm_mutex_lock(&lock_mutex);
	ping_updated = true;
	slurm_mutex_unlock(&lock_mutex);
}
| |
| /* |
| * ping_end - record that a ping cycle has ended. This can be called more |
| * than once (for REQUEST_PING and simultaneous REQUEST_NODE_REGISTRATION |
| * for selected nodes). Matching ping_end calls must be made for each |
| * before is_ping_done returns true. |
| */ |
| void ping_end (void) |
| { |
| slurm_mutex_lock(&lock_mutex); |
| |
| if (ping_count > 0) |
| ping_count--; |
| else |
| error("%s: ping_count < 0", __func__); |
| |
| if (ping_count == 0) /* no more running ping cycles */ |
| ping_start = 0; |
| ping_updated = false; |
| slurm_mutex_unlock(&lock_mutex); |
| } |
| |
/*
 * ping_nodes - check that all nodes and daemons are alive,
 *	get nodes in UNKNOWN state to register
 *
 * Builds up to two agent requests per call:
 *   - REQUEST_PING for nodes whose last response, CPU load or free memory
 *     data has gone stale
 *   - REQUEST_NODE_REGISTRATION_STATUS for UNKNOWN nodes and for a
 *     rotating window of nodes (periodic re-registration)
 * Nodes that have not responded since node_dead_time are set DOWN.
 * NOTE(review): the reg_offset comment below implies the caller holds the
 * node table write lock — confirm against callers.
 */
void ping_nodes (void)
{
	static bool restart_flag = true;	/* system just restarted */
	static int reg_offset = 0;	/* mutex via node table write lock on entry */
	static int max_reg_threads = 0;	/* max node registration threads
					 * this can include DOWN nodes, so
					 * limit the number to avoid huge
					 * communication delays */
	int i;
	time_t now = time(NULL), still_live_time, node_dead_time;
	static time_t last_ping_time = (time_t) 0;
	static time_t last_ping_timeout = (time_t) 0;
	hostlist_t *down_hostlist = NULL;
	char *host_str = NULL;
	agent_arg_t *ping_agent_args = NULL;
	agent_arg_t *reg_agent_args = NULL;
	node_record_t *node_ptr = NULL;
	/* Load/memory data older than these thresholds forces a re-ping */
	time_t old_cpu_load_time = now - slurm_conf.slurmd_timeout;
	time_t old_free_mem_time = now - slurm_conf.slurmd_timeout;
	int node_offset = 0;

	/* Agent used to ping nodes with stale status information */
	ping_agent_args = xmalloc (sizeof (agent_arg_t));
	ping_agent_args->msg_type = REQUEST_PING;
	ping_agent_args->retry = 0;
	ping_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	ping_agent_args->hostlist = hostlist_create(NULL);

	/* Agent used to request full node registration information */
	reg_agent_args = xmalloc (sizeof (agent_arg_t));
	reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
	reg_agent_args->retry = 0;
	reg_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	reg_agent_args->hostlist = hostlist_create(NULL);

	/*
	 * If there are a large number of down nodes, the node ping
	 * can take a long time to complete:
	 *  ping_time = down_nodes * agent_timeout / agent_parallelism
	 *  ping_time = down_nodes * 10_seconds / 10
	 *  ping_time = down_nodes (seconds)
	 * Because of this, we extend the SlurmdTimeout by the
	 * time needed to complete a ping of all nodes.
	 */
	if ((last_ping_timeout == 0) ||
	    (last_ping_time == (time_t) 0)) {
		/* First pass (startup): do not mark anything DOWN yet */
		node_dead_time = (time_t) 0;
	} else {
		node_dead_time = last_ping_time - last_ping_timeout;
	}
	/* Responses newer than this are fresh enough to skip pinging */
	still_live_time = now - (slurm_conf.slurmd_timeout / 3);
	last_ping_time = now;
	last_ping_timeout = slurm_conf.slurmd_timeout;

	if (max_reg_threads == 0) {
		max_reg_threads = MAX(slurm_conf.tree_width, 1);
		max_reg_threads = MIN(max_reg_threads, 50);
	}
	/* Advance the re-registration window by one window width; wrap to
	 * the start once all nodes have been covered AND at least
	 * MAX_REG_FREQUENCY windows have elapsed */
	reg_offset += max_reg_threads;
	if ((reg_offset > active_node_record_count) &&
	    (reg_offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
		reg_offset = 0;

	for (i = 0; (node_ptr = next_node(&i)); i++) {
		node_offset++;
		/* States for which no RPC should be sent at all */
		if (IS_NODE_FUTURE(node_ptr) ||
		    IS_NODE_EXTERNAL(node_ptr) ||
		    IS_NODE_POWERED_DOWN(node_ptr) ||
		    IS_NODE_POWERING_DOWN(node_ptr) ||
		    IS_NODE_POWERING_UP(node_ptr) ||
		    IS_NODE_INVALID_REG(node_ptr) ||
		    IS_NODE_REBOOT_ISSUED(node_ptr))
			continue;
		/* With SlurmdTimeout=0, only contact nodes we must hear
		 * from (UNKNOWN/non-responding, or right after restart) */
		if ((slurm_conf.slurmd_timeout == 0) && (!restart_flag) &&
		    (!IS_NODE_UNKNOWN(node_ptr)) &&
		    (!IS_NODE_NO_RESPOND(node_ptr)))
			continue;

		/* Node responded at some point but not since node_dead_time:
		 * mark it DOWN and collect its name for one combined log */
		if ((node_ptr->last_response != (time_t) 0)     &&
		    (node_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(node_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
					node_ptr->name);
			else {
				down_hostlist =
					hostlist_create(node_ptr->name);
				if (!down_hostlist) {
					fatal("Invalid host name: %s",
					      node_ptr->name);
				}
			}
			set_node_down_ptr(node_ptr, "Not responding");
			node_ptr->not_responding = false;  /* logged below */
			continue;
		}

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(node_ptr) || (node_ptr->boot_time == 0) ||
		    ((node_offset >= reg_offset) &&
		     (node_offset < (reg_offset + max_reg_threads)))) {
			/* Agent message version is capped at the oldest
			 * node's protocol version */
			if (reg_agent_args->protocol_version >
			    node_ptr->protocol_version)
				reg_agent_args->protocol_version =
					node_ptr->protocol_version;
			hostlist_push_host(reg_agent_args->hostlist,
					   node_ptr->name);
			reg_agent_args->node_count++;
			if (PACK_FANOUT_ADDRS(node_ptr))
				reg_agent_args->msg_flags |= SLURM_PACK_ADDRS;
			continue;
		}

		/* All status data is still fresh: nothing to do */
		if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
		    (node_ptr->last_response >= still_live_time) &&
		    (node_ptr->cpu_load_time >= old_cpu_load_time) &&
		    (node_ptr->free_mem_time >= old_free_mem_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
			continue;

		if (ping_agent_args->protocol_version >
		    node_ptr->protocol_version)
			ping_agent_args->protocol_version =
				node_ptr->protocol_version;
		hostlist_push_host(ping_agent_args->hostlist, node_ptr->name);
		ping_agent_args->node_count++;
		if (PACK_FANOUT_ADDRS(node_ptr))
			ping_agent_args->msg_flags |= SLURM_PACK_ADDRS;
	}

	restart_flag = false;
	if (ping_agent_args->node_count == 0) {
		/* Nothing to ping: free the unused agent request */
		hostlist_destroy(ping_agent_args->hostlist);
		xfree (ping_agent_args);
	} else {
		hostlist_uniq(ping_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				ping_agent_args->hostlist);
		debug("Spawning ping agent for %s", host_str);
		xfree(host_str);
		/* Matching ping_end() is made when the agent completes */
		ping_begin();
		set_agent_arg_r_uid(ping_agent_args, SLURM_AUTH_UID_ANY);
		agent_queue_request(ping_agent_args);
	}

	if (reg_agent_args->node_count == 0) {
		hostlist_destroy(reg_agent_args->hostlist);
		xfree (reg_agent_args);
	} else {
		hostlist_uniq(reg_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				reg_agent_args->hostlist);
		debug("Spawning registration agent for %s %d hosts",
		      host_str, reg_agent_args->node_count);
		xfree(host_str);
		ping_begin();
		set_agent_arg_r_uid(reg_agent_args, SLURM_AUTH_UID_ANY);
		agent_queue_request(reg_agent_args);
	}

	if (down_hostlist) {
		/* One consolidated log line for all newly DOWN nodes */
		hostlist_uniq(down_hostlist);
		host_str = hostlist_ranged_string_xmalloc(down_hostlist);
		error("Nodes %s not responding, setting DOWN", host_str);
		xfree(host_str);
		hostlist_destroy(down_hostlist);
	}
}
| |
/*
 * Spawn health check function for every node that is not DOWN.
 *
 * When HealthCheckNodeState includes CYCLE, only a slice of the node table
 * is tested per call (node_limit nodes, resuming at base_node_loc) so the
 * work is spread across HealthCheckInterval; otherwise all eligible nodes
 * are tested at once. node_states filters which allocation states
 * (IDLE/MIXED/ALLOC/...) are eligible.
 */
extern void run_health_check(void)
{
	char *host_str = NULL;
	agent_arg_t *check_agent_args = NULL;
	node_record_t *node_ptr;
	int node_test_cnt = 0;
	int node_limit = 0;
	/* State-filter bits with the CYCLE scheduling flag masked out */
	int node_states = slurm_conf.health_check_node_state &
			  (~HEALTH_CHECK_CYCLE);
	int run_cyclic = slurm_conf.health_check_node_state &
			 HEALTH_CHECK_CYCLE;
	static int base_node_loc = 0;	/* resume point for cyclic scans */
	static time_t cycle_start_time = (time_t) 0;

	if (run_cyclic) {
		time_t now = time(NULL);
		if (cycle_start_time == (time_t) 0)
			cycle_start_time = now;
		else if (base_node_loc > 0)
			;	/* mid-cycle */
		else if (difftime(now, cycle_start_time) <
			 slurm_conf.health_check_interval)
			return;	/* Wait to start next cycle */
		cycle_start_time = now;
		/*
		 * Determine how many nodes we want to test on each call of
		 * run_health_check() to spread out the work.
		 */
		node_limit = (active_node_record_count * 2) /
			     slurm_conf.health_check_interval;
		node_limit = MAX(node_limit, 10);
	}

	check_agent_args = xmalloc (sizeof (agent_arg_t));
	check_agent_args->msg_type = REQUEST_HEALTH_CHECK;
	check_agent_args->retry = 0;
	check_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	check_agent_args->hostlist = hostlist_create(NULL);

	/* Sync plugin internal data with
	 * node select_nodeinfo. This is important
	 * after reconfig otherwise select_nodeinfo
	 * will not return the correct number of
	 * allocated cpus.
	 */
	select_g_select_nodeinfo_set_all();

	/* base_node_loc is static: a cyclic scan resumes where the
	 * previous call left off */
	for (; (node_ptr = next_node(&base_node_loc)); base_node_loc++) {
		if (run_cyclic &&
		    (node_test_cnt++ >= node_limit))
			break;	/* this call's slice is done */
		/* States which should not receive a health check RPC */
		if (IS_NODE_FUTURE(node_ptr) ||
		    IS_NODE_EXTERNAL(node_ptr) ||
		    IS_NODE_INVALID_REG(node_ptr) ||
		    IS_NODE_NO_RESPOND(node_ptr) ||
		    IS_NODE_POWERED_DOWN(node_ptr) ||
		    IS_NODE_POWERING_DOWN(node_ptr) ||
		    IS_NODE_POWERING_UP(node_ptr) ||
		    IS_NODE_REBOOT_ISSUED(node_ptr))
			continue;
		if (node_states != HEALTH_CHECK_NODE_ANY) {
			uint16_t cpus_total, cpus_used = 0;
			cpus_total = node_ptr->config_ptr->cpus;
			if (!IS_NODE_IDLE(node_ptr))
				cpus_used = node_ptr->alloc_cpus;
			/* Here the node state is inferred from
			 * the cpus allocated on it.
			 * - cpus_used == 0
			 *   means node is idle
			 * - cpus_used < cpus_total
			 *   means the node is in mixed state
			 * else cpus_used == cpus_total
			 *   means the node is allocated
			 */
			if (cpus_used == 0) {
				/* IDLE (or NONDRAINED_IDLE on a
				 * non-drained node) must be requested */
				if (!(node_states & HEALTH_CHECK_NODE_IDLE) &&
				    (!(node_states & HEALTH_CHECK_NODE_NONDRAINED_IDLE) ||
				     IS_NODE_DRAIN(node_ptr))) {
					continue;
				}
				if (!IS_NODE_IDLE(node_ptr))
					continue;
			} else if (cpus_used < cpus_total) {
				if (!(node_states & HEALTH_CHECK_NODE_MIXED))
					continue;
			} else {
				if (!(node_states & HEALTH_CHECK_NODE_ALLOC))
					continue;
			}
		}
		/* Agent message version is capped at the oldest node's
		 * protocol version */
		if (check_agent_args->protocol_version >
		    node_ptr->protocol_version)
			check_agent_args->protocol_version =
				node_ptr->protocol_version;
		hostlist_push_host(check_agent_args->hostlist, node_ptr->name);
		check_agent_args->node_count++;
		if (PACK_FANOUT_ADDRS(node_ptr))
			check_agent_args->msg_flags |= SLURM_PACK_ADDRS;
	}
	/* Reached the end of the node table: restart the cycle next call */
	if (!node_ptr)
		base_node_loc = 0;

	if (check_agent_args->node_count == 0) {
		/* No eligible nodes: free the unused agent request */
		hostlist_destroy(check_agent_args->hostlist);
		xfree (check_agent_args);
	} else {
		hostlist_uniq(check_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				check_agent_args->hostlist);
		debug("Spawning health check agent for %s", host_str);
		xfree(host_str);
		/* Matching ping_end() is made when the agent completes */
		ping_begin();
		set_agent_arg_r_uid(check_agent_args, SLURM_AUTH_UID_ANY);
		agent_queue_request(check_agent_args);
	}
}
| |
| /* Update acct_gather data for every node that is not DOWN */ |
| extern void update_nodes_acct_gather_data(void) |
| { |
| node_record_t *node_ptr; |
| int i; |
| char *host_str = NULL; |
| agent_arg_t *agent_args = NULL; |
| |
| agent_args = xmalloc (sizeof (agent_arg_t)); |
| agent_args->msg_type = REQUEST_ACCT_GATHER_UPDATE; |
| agent_args->retry = 0; |
| agent_args->protocol_version = SLURM_PROTOCOL_VERSION; |
| agent_args->hostlist = hostlist_create(NULL); |
| |
| for (i = 0; (node_ptr = next_node(&i)); i++) { |
| if (IS_NODE_FUTURE(node_ptr) || |
| IS_NODE_EXTERNAL(node_ptr) || |
| IS_NODE_INVALID_REG(node_ptr) || |
| IS_NODE_NO_RESPOND(node_ptr) || |
| IS_NODE_POWERED_DOWN(node_ptr) || |
| IS_NODE_POWERING_DOWN(node_ptr) || |
| IS_NODE_POWERING_UP(node_ptr) || |
| IS_NODE_REBOOT_ISSUED(node_ptr)) |
| continue; |
| if (agent_args->protocol_version > node_ptr->protocol_version) |
| agent_args->protocol_version = |
| node_ptr->protocol_version; |
| hostlist_push_host(agent_args->hostlist, node_ptr->name); |
| agent_args->node_count++; |
| if (PACK_FANOUT_ADDRS(node_ptr)) |
| agent_args->msg_flags |= SLURM_PACK_ADDRS; |
| } |
| |
| if (agent_args->node_count == 0) { |
| hostlist_destroy(agent_args->hostlist); |
| xfree (agent_args); |
| } else { |
| hostlist_uniq(agent_args->hostlist); |
| host_str = hostlist_ranged_string_xmalloc(agent_args->hostlist); |
| log_flag(ENERGY, "Updating acct_gather data for %s", host_str); |
| xfree(host_str); |
| ping_begin(); |
| set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY); |
| agent_queue_request(agent_args); |
| } |
| } |