/* src/slurmctld/ping_nodes.c - SchedMD/slurm - Git at Google */

 /*****************************************************************************\
  *  ping_nodes.c - ping the slurmd daemons to test if they respond
  *****************************************************************************
  *  Copyright (C) 2003-2007 The Regents of the University of California.
  *  Copyright (C) 2008-2011 Lawrence Livermore National Security.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Morris Jette <jette1@llnl.gov> et. al.
  *  CODE-OCEC-09-009. All rights reserved.
  *
  *  This file is part of Slurm, a resource management program.
  *  For details, see <https://slurm.schedmd.com/>.
  *  Please also read the included file: DISCLAIMER.
  *
  *  Slurm is free software; you can redistribute it and/or modify it under
  *  the terms of the GNU General Public License as published by the Free
  *  Software Foundation; either version 2 of the License, or (at your option)
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission
  *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and
  *  distribute linked combinations including the two. You must obey the GNU
  *  General Public License in all respects for all of the code used other than
  *  OpenSSL. If you modify file(s) with this exception, you may extend this
  *  exception to your version of the file(s), but you are not obligated to do
  *  so. If you do not wish to do so, delete this exception statement from your
  *  version.  If you delete this exception statement from all source files in
  *  the program, then also delete it here.
  *
  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
  *  details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
 \*****************************************************************************/

 #include "config.h"

 #include <pthread.h>
 #include <string.h>
 #include <time.h>

 #include "src/common/hostlist.h"
 #include "src/common/read_config.h"

 #include "src/interfaces/select.h"

 #include "src/slurmctld/agent.h"
 #include "src/slurmctld/ping_nodes.h"
 #include "src/slurmctld/slurmctld.h"

 /* Request that nodes re-register at most every MAX_REG_FREQUENCY pings */
 #define MAX_REG_FREQUENCY 20

 /* Log an error for ping that takes more than 100 seconds to complete */
 #define PING_TIMEOUT 100

 /*
  * Ping cycle bookkeeping shared by is_ping_done(), ping_begin(),
  * ping_nodes_update() and ping_end(). Within this file the three variables
  * below are only read or written while lock_mutex is held.
  */
 static pthread_mutex_t lock_mutex = PTHREAD_MUTEX_INITIALIZER;
 /* Set by ping_nodes_update(), cleared again by every ping_end() call */
 static bool ping_updated = false;
 /* Number of ping_begin() calls not yet matched by a ping_end() call */
 static int ping_count = 0;
 /* Time of the latest ping_begin(); reset to 0 when ping_count drops to 0 */
 static time_t ping_start = 0;

 /*
  * is_ping_done - report whether the previous node ping cycle has finished
  *
  * Callers use this to hold off a new round of requests until the prior one
  * has drained. While requests are still outstanding and PING_TIMEOUT seconds
  * or more have passed since ping_start, a single error is logged; the
  * message is re-armed the next time this function sees no outstanding
  * requests.
  *
  * RET true if nothing is outstanding and ping_nodes_update() has confirmed
  *	the node update, false otherwise
  */
 bool is_ping_done (void)
 {
 	static bool timeout_logged = false;
 	bool done;

 	slurm_mutex_lock(&lock_mutex);
 	if (!ping_count) {
 		timeout_logged = false;
 		/*
 		 * An empty counter is not enough on its own: the cycle only
 		 * counts as complete once ping_updated is true, i.e. the
 		 * _agent_nodes_update thread has finished recording the node
 		 * responses. Reporting "done" earlier could race with that
 		 * thread and get responding/healthy nodes set DOWN.
 		 */
 		done = ping_updated;
 	} else {
 		done = false;
 		if (!timeout_logged &&
 		    (difftime(time(NULL), ping_start) >= PING_TIMEOUT)) {
 			error("A node ping cycle took more than %d seconds. Node RPC requests like ping, register status, health check and/or accounting gather update are triggered less frequently than configured. Either many nodes are non-responsive or one of SlurmdTimeout, HealthCheckInterval, JobAcctGatherFrequency should be increased.",
 			      PING_TIMEOUT);
 			timeout_logged = true;
 		}
 	}
 	slurm_mutex_unlock(&lock_mutex);

 	return done;
 }

 /*
  * ping_begin - record that a ping cycle has begun
  *
  * May be called several times in a row (e.g. for REQUEST_PING and a
  * simultaneous REQUEST_NODE_REGISTRATION for selected nodes). Each call
  * needs a matching ping_end() call before is_ping_done() can return true.
  * Every call also restarts the clock that is_ping_done() compares against
  * PING_TIMEOUT.
  */
 void ping_begin (void)
 {
 	slurm_mutex_lock(&lock_mutex);
 	ping_start = time(NULL);
 	ping_count += 1;
 	slurm_mutex_unlock(&lock_mutex);
 }

 /*
  * ping_nodes_update - confirm that node info was updated after a ping cycle
  *
  * The outstanding-request counter can reach zero while the
  * _agent_nodes_update thread still has the node update pending. Calling
  * this sets the flag that is_ping_done() reports once nothing is
  * outstanding. The flag is cleared again by the next ping_end().
  */
 void ping_nodes_update(void)
 {
 	slurm_mutex_lock(&lock_mutex);
 	ping_updated = true;
 	slurm_mutex_unlock(&lock_mutex);
 }

 /*
  * ping_end - record that a ping cycle has ended
  *
  * May be called several times (e.g. for REQUEST_PING and a simultaneous
  * REQUEST_NODE_REGISTRATION for selected nodes); each call pairs with one
  * earlier ping_begin() call. An unmatched call is logged as an error and
  * leaves the counter untouched. Every call clears the "nodes updated" flag,
  * so is_ping_done() stays false until ping_nodes_update() runs again.
  */
 void ping_end (void)
 {
 	slurm_mutex_lock(&lock_mutex);

 	if (ping_count <= 0)
 		error("%s: ping_count < 0", __func__);
 	else
 		ping_count--;

 	ping_updated = false;
 	if (!ping_count)	/* no more running ping cycles */
 		ping_start = 0;

 	slurm_mutex_unlock(&lock_mutex);
 }

 /*
  * ping_nodes - check that all nodes and daemons are alive,
  *	get nodes in UNKNOWN state to register
  *
  * A single pass over the node table puts each node into one of four groups:
  * skipped, set DOWN because it stopped responding, added to a
  * REQUEST_NODE_REGISTRATION_STATUS request, or added to a REQUEST_PING
  * request. Every non-empty request is passed to agent_queue_request() right
  * after a ping_begin() call; an empty request is released here instead.
  *
  * NOTE(review): the reg_offset comment below says the caller holds the node
  * table write lock on entry - confirm against the caller.
  */
 void ping_nodes (void)
 {
 	static bool restart_flag = true;	/* system just restarted */
 	static int reg_offset = 0;	/* mutex via node table write lock on entry */
 	static int max_reg_threads = 0;	/* max node registration threads
 					 * this can include DOWN nodes, so
 					 * limit the number to avoid huge
 					 * communication delays */
 	int i;
 	time_t now = time(NULL), still_live_time, node_dead_time;
 	/* Values of "now" and SlurmdTimeout seen by the previous call */
 	static time_t last_ping_time = (time_t) 0;
 	static time_t last_ping_timeout = (time_t) 0;
 	hostlist_t *down_hostlist = NULL;
 	char *host_str = NULL;
 	agent_arg_t *ping_agent_args = NULL;
 	agent_arg_t *reg_agent_args = NULL;
 	node_record_t *node_ptr = NULL;
 	/* Load and free-memory samples older than these count as stale */
 	time_t old_cpu_load_time = now - slurm_conf.slurmd_timeout;
 	time_t old_free_mem_time = now - slurm_conf.slurmd_timeout;
 	int node_offset = 0;	/* 1-based count of nodes visited so far */

 	/* Both requests are built up front and dropped later if left empty */
 	ping_agent_args = xmalloc (sizeof (agent_arg_t));
 	ping_agent_args->msg_type = REQUEST_PING;
 	ping_agent_args->retry = 0;
 	ping_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
 	ping_agent_args->hostlist = hostlist_create(NULL);

 	reg_agent_args = xmalloc (sizeof (agent_arg_t));
 	reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
 	reg_agent_args->retry = 0;
 	reg_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
 	reg_agent_args->hostlist = hostlist_create(NULL);

 	/*
 	 * If there are a large number of down nodes, the node ping
 	 * can take a long time to complete:
 	 *  ping_time = down_nodes * agent_timeout / agent_parallelism
 	 *  ping_time = down_nodes * 10_seconds / 10
 	 *  ping_time = down_nodes (seconds)
 	 * Because of this, we extend the SlurmdTimeout by the
 	 * time needed to complete a ping of all nodes.
 	 *
 	 * The dead line is therefore measured from the previous call's
 	 * time and timeout, not from "now". On the first call, or when the
 	 * previous timeout was zero, it is 0 and no node is set DOWN.
 	 */
 	if ((last_ping_timeout == 0) ||
 	    (last_ping_time == (time_t) 0)) {
 		node_dead_time = (time_t) 0;
 	} else {
 		node_dead_time = last_ping_time - last_ping_timeout;
 	}
 	/* A response within the last third of SlurmdTimeout counts as live */
 	still_live_time = now - (slurm_conf.slurmd_timeout / 3);
 	last_ping_time  = now;
 	last_ping_timeout = slurm_conf.slurmd_timeout;

 	/* Computed once: slurm_conf.tree_width clamped to the range 1..50 */
 	if (max_reg_threads == 0) {
 		max_reg_threads = MAX(slurm_conf.tree_width, 1);
 		max_reg_threads = MIN(max_reg_threads, 50);
 	}
 	/*
 	 * Slide the registration window forward by one window per call.
 	 * Wrap to zero once it is past the active node count and at least
 	 * MAX_REG_FREQUENCY windows have gone by.
 	 */
 	reg_offset += max_reg_threads;
 	if ((reg_offset > active_node_record_count) &&
 	    (reg_offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
 		reg_offset = 0;

 	for (i = 0; (node_ptr = next_node(&i)); i++) {
 		/* Counted before any filter, so skipped nodes use a slot */
 		node_offset++;
 		/* Nodes in these states are not contacted at all */
 		if (IS_NODE_FUTURE(node_ptr) ||
 		    IS_NODE_EXTERNAL(node_ptr) ||
 		    IS_NODE_POWERED_DOWN(node_ptr) ||
 		    IS_NODE_POWERING_DOWN(node_ptr) ||
 		    IS_NODE_POWERING_UP(node_ptr) ||
 		    IS_NODE_INVALID_REG(node_ptr) ||
 		    IS_NODE_REBOOT_ISSUED(node_ptr))
 			continue;
 		/*
 		 * With a SlurmdTimeout of zero, only the first pass after a
 		 * restart and nodes that are UNKNOWN or not responding get
 		 * past this point.
 		 */
 		if ((slurm_conf.slurmd_timeout == 0) && (!restart_flag) &&
 		    (!IS_NODE_UNKNOWN(node_ptr)) &&
 		    (!IS_NODE_NO_RESPOND(node_ptr)))
 			continue;

 		/*
 		 * The node has responded at some point, but not since the
 		 * dead line, and is not DOWN yet: set it DOWN and collect
 		 * its name for the summary error logged after the loop.
 		 */
 		if ((node_ptr->last_response != (time_t) 0)     &&
 		    (node_ptr->last_response <= node_dead_time) &&
 		    (!IS_NODE_DOWN(node_ptr))) {
 			if (down_hostlist)
 				(void) hostlist_push_host(down_hostlist,
 					node_ptr->name);
 			else {
 				down_hostlist =
 					hostlist_create(node_ptr->name);
 				if (!down_hostlist) {
 					fatal("Invalid host name: %s",
 					      node_ptr->name);
 				}
 			}
 			set_node_down_ptr(node_ptr, "Not responding");
 			node_ptr->not_responding = false;  /* logged below */
 			continue;
 		}

 		/* Request a node registration if its state is UNKNOWN or
 		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
 		 * this mechanism avoids an additional (per node) timer or
 		 * counter and gets updated configuration information
 		 * once in a while). We limit these requests since they
 		 * can generate a flood of incoming RPCs.
 		 * A node with no recorded boot_time is also asked to
 		 * register. */
 		if (IS_NODE_UNKNOWN(node_ptr) || (node_ptr->boot_time == 0) ||
 		    ((node_offset >= reg_offset) &&
 		     (node_offset < (reg_offset + max_reg_threads)))) {
 			/* Lower the request to the oldest version in it */
 			if (reg_agent_args->protocol_version >
 			    node_ptr->protocol_version)
 				reg_agent_args->protocol_version =
 					node_ptr->protocol_version;
 			hostlist_push_host(reg_agent_args->hostlist,
 					   node_ptr->name);
 			reg_agent_args->node_count++;
 			if (PACK_FANOUT_ADDRS(node_ptr))
 				reg_agent_args->msg_flags |= SLURM_PACK_ADDRS;
 			continue;
 		}

 		/*
 		 * No ping needed: the node is responding, was heard from
 		 * recently, and its CPU load and free memory data are fresh.
 		 */
 		if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
 		    (node_ptr->last_response >= still_live_time) &&
 		    (node_ptr->cpu_load_time >= old_cpu_load_time) &&
 		    (node_ptr->free_mem_time >= old_free_mem_time))
 			continue;

 		/* Do not keep pinging down nodes since this can induce
 		 * huge delays in hierarchical communication fail-over */
 		if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
 			continue;

 		/* Everything left is pinged; same version lowering as above */
 		if (ping_agent_args->protocol_version >
 		    node_ptr->protocol_version)
 			ping_agent_args->protocol_version =
 				node_ptr->protocol_version;
 		hostlist_push_host(ping_agent_args->hostlist, node_ptr->name);
 		ping_agent_args->node_count++;
 		if (PACK_FANOUT_ADDRS(node_ptr))
 			ping_agent_args->msg_flags |= SLURM_PACK_ADDRS;
 	}

 	/* The "just restarted" special case only applies to the first pass */
 	restart_flag = false;
 	if (ping_agent_args->node_count == 0) {
 		/* Nothing to ping: release the unused request */
 		hostlist_destroy(ping_agent_args->hostlist);
 		xfree (ping_agent_args);
 	} else {
 		hostlist_uniq(ping_agent_args->hostlist);
 		host_str = hostlist_ranged_string_xmalloc(
 				ping_agent_args->hostlist);
 		debug("Spawning ping agent for %s", host_str);
 		xfree(host_str);
 		ping_begin();
 		set_agent_arg_r_uid(ping_agent_args, SLURM_AUTH_UID_ANY);
 		/* Not freed here - presumably the agent owns it from now on */
 		agent_queue_request(ping_agent_args);
 	}

 	if (reg_agent_args->node_count == 0) {
 		/* Nothing to register: release the unused request */
 		hostlist_destroy(reg_agent_args->hostlist);
 		xfree (reg_agent_args);
 	} else {
 		hostlist_uniq(reg_agent_args->hostlist);
 		host_str = hostlist_ranged_string_xmalloc(
 				reg_agent_args->hostlist);
 		debug("Spawning registration agent for %s %d hosts",
 		      host_str, reg_agent_args->node_count);
 		xfree(host_str);
 		/* Second ping_begin(): needs its own matching ping_end() */
 		ping_begin();
 		set_agent_arg_r_uid(reg_agent_args, SLURM_AUTH_UID_ANY);
 		agent_queue_request(reg_agent_args);
 	}

 	/* One summary error for all nodes set DOWN during this pass */
 	if (down_hostlist) {
 		hostlist_uniq(down_hostlist);
 		host_str = hostlist_ranged_string_xmalloc(down_hostlist);
 		error("Nodes %s not responding, setting DOWN", host_str);
 		xfree(host_str);
 		hostlist_destroy(down_hostlist);
 	}
 }

 /* Spawn health check function for every node that is not DOWN */
 extern void run_health_check(void)
 {
 	char *host_str = NULL;
 	agent_arg_t *check_agent_args = NULL;
 	node_record_t *node_ptr;
 	int node_test_cnt = 0;
 	int node_limit = 0;
 	int node_states = slurm_conf.health_check_node_state &
 		(~HEALTH_CHECK_CYCLE);
 	int run_cyclic = slurm_conf.health_check_node_state &
 		HEALTH_CHECK_CYCLE;
 	static int base_node_loc = 0;
 	static time_t cycle_start_time = (time_t) 0;

 	if (run_cyclic) {
 		time_t now = time(NULL);
 		if (cycle_start_time == (time_t) 0)
 			cycle_start_time = now;
 		else if (base_node_loc > 0)
 			;	/* mid-cycle */
 		else if (difftime(now, cycle_start_time) <
 		         slurm_conf.health_check_interval)
 			return;	/* Wait to start next cycle */
 		cycle_start_time = now;
 		/*
 		 * Determine how many nodes we want to test on each call of
 		 * run_health_check() to spread out the work.
 		 */
 		node_limit = (active_node_record_count * 2) /
 		             slurm_conf.health_check_interval;
 		node_limit = MAX(node_limit, 10);
 	}

 	check_agent_args = xmalloc (sizeof (agent_arg_t));
 	check_agent_args->msg_type = REQUEST_HEALTH_CHECK;
 	check_agent_args->retry = 0;
 	check_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
 	check_agent_args->hostlist = hostlist_create(NULL);

 	/* Sync plugin internal data with
 	 * node select_nodeinfo. This is important
 	 * after reconfig otherwise select_nodeinfo
 	 * will not return the correct number of
 	 * allocated cpus.
 	 */
 	select_g_select_nodeinfo_set_all();

 	for (; (node_ptr = next_node(&base_node_loc)); base_node_loc++) {
 		if (run_cyclic &&
 		    (node_test_cnt++ >= node_limit))
 				break;
 		if (IS_NODE_FUTURE(node_ptr) ||
 		    IS_NODE_EXTERNAL(node_ptr) ||
 		    IS_NODE_INVALID_REG(node_ptr) ||
 		    IS_NODE_NO_RESPOND(node_ptr) ||
 		    IS_NODE_POWERED_DOWN(node_ptr) ||
 		    IS_NODE_POWERING_DOWN(node_ptr) ||
 		    IS_NODE_POWERING_UP(node_ptr) ||
 		    IS_NODE_REBOOT_ISSUED(node_ptr))
 			continue;
 		if (node_states != HEALTH_CHECK_NODE_ANY) {
 			uint16_t cpus_total, cpus_used = 0;
 			cpus_total = node_ptr->config_ptr->cpus;
 			if (!IS_NODE_IDLE(node_ptr))
 				cpus_used = node_ptr->alloc_cpus;
 			/* Here the node state is inferred from
 			 * the cpus allocated on it.
 			 * - cpus_used == 0
 			 *       means node is idle
 			 * - cpus_used < cpus_total
 			 *       means the node is in mixed state
 			 * else cpus_used == cpus_total
 			 *       means the node is allocated
 			 */
 			if (cpus_used == 0) {
 				if (!(node_states & HEALTH_CHECK_NODE_IDLE) &&
 				    (!(node_states & HEALTH_CHECK_NODE_NONDRAINED_IDLE) ||
 				     IS_NODE_DRAIN(node_ptr))) {
 					continue;
 				}
 				if (!IS_NODE_IDLE(node_ptr))
 					continue;
 			} else if (cpus_used < cpus_total) {
 				if (!(node_states & HEALTH_CHECK_NODE_MIXED))
 					continue;
 			} else {
 				if (!(node_states & HEALTH_CHECK_NODE_ALLOC))
 					continue;
 			}
 		}
 		if (check_agent_args->protocol_version >
 		    node_ptr->protocol_version)
 			check_agent_args->protocol_version =
 				node_ptr->protocol_version;
 		hostlist_push_host(check_agent_args->hostlist, node_ptr->name);
 		check_agent_args->node_count++;
 		if (PACK_FANOUT_ADDRS(node_ptr))
 			check_agent_args->msg_flags |= SLURM_PACK_ADDRS;
 	}
 	if (!node_ptr)
 		base_node_loc = 0;

 	if (check_agent_args->node_count == 0) {
 		hostlist_destroy(check_agent_args->hostlist);
 		xfree (check_agent_args);
 	} else {
 		hostlist_uniq(check_agent_args->hostlist);
 		host_str = hostlist_ranged_string_xmalloc(
 				check_agent_args->hostlist);
 		debug("Spawning health check agent for %s", host_str);
 		xfree(host_str);
 		ping_begin();
 		set_agent_arg_r_uid(check_agent_args, SLURM_AUTH_UID_ANY);
 		agent_queue_request(check_agent_args);
 	}
 }

 /* Update acct_gather data for every node that is not DOWN */
 extern void update_nodes_acct_gather_data(void)
 {
 	node_record_t *node_ptr;
 	int i;
 	char *host_str = NULL;
 	agent_arg_t *agent_args = NULL;

 	agent_args = xmalloc (sizeof (agent_arg_t));
 	agent_args->msg_type = REQUEST_ACCT_GATHER_UPDATE;
 	agent_args->retry = 0;
 	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
 	agent_args->hostlist = hostlist_create(NULL);

 	for (i = 0; (node_ptr = next_node(&i)); i++) {
 		if (IS_NODE_FUTURE(node_ptr) ||
 		    IS_NODE_EXTERNAL(node_ptr) ||
 		    IS_NODE_INVALID_REG(node_ptr) ||
 		    IS_NODE_NO_RESPOND(node_ptr) ||
 		    IS_NODE_POWERED_DOWN(node_ptr) ||
 		    IS_NODE_POWERING_DOWN(node_ptr) ||
 		    IS_NODE_POWERING_UP(node_ptr) ||
 		    IS_NODE_REBOOT_ISSUED(node_ptr))
 			continue;
 		if (agent_args->protocol_version > node_ptr->protocol_version)
 			agent_args->protocol_version =
 				node_ptr->protocol_version;
 		hostlist_push_host(agent_args->hostlist, node_ptr->name);
 		agent_args->node_count++;
 		if (PACK_FANOUT_ADDRS(node_ptr))
 			agent_args->msg_flags |= SLURM_PACK_ADDRS;
 	}

 	if (agent_args->node_count == 0) {
 		hostlist_destroy(agent_args->hostlist);
 		xfree (agent_args);
 	} else {
 		hostlist_uniq(agent_args->hostlist);
 		host_str = hostlist_ranged_string_xmalloc(agent_args->hostlist);
 		log_flag(ENERGY, "Updating acct_gather data for %s", host_str);
 		xfree(host_str);
 		ping_begin();
 		set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
 		agent_queue_request(agent_args);
 	}
 }
	/*****************************************************************************\
	* ping_nodes.c - ping the slurmd daemons to test if they respond
	*****************************************************************************
	* Copyright (C) 2003-2007 The Regents of the University of California.
	* Copyright (C) 2008-2011 Lawrence Livermore National Security.
	* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
	* Written by Morris Jette <jette1@llnl.gov> et. al.
	* CODE-OCEC-09-009. All rights reserved.
	*
	* This file is part of Slurm, a resource management program.
	* For details, see <https://slurm.schedmd.com/>.
	* Please also read the included file: DISCLAIMER.
	*
	* Slurm is free software; you can redistribute it and/or modify it under
	* the terms of the GNU General Public License as published by the Free
	* Software Foundation; either version 2 of the License, or (at your option)
	* any later version.
	*
	* In addition, as a special exception, the copyright holders give permission
	* to link the code of portions of this program with the OpenSSL library under
	* certain conditions as described in each individual source file, and
	* distribute linked combinations including the two. You must obey the GNU
	* General Public License in all respects for all of the code used other than
	* OpenSSL. If you modify file(s) with this exception, you may extend this
	* exception to your version of the file(s), but you are not obligated to do
	* so. If you do not wish to do so, delete this exception statement from your
	* version. If you delete this exception statement from all source files in
	* the program, then also delete it here.
	*
	* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
	* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
	* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
	* details.
	*
	* You should have received a copy of the GNU General Public License along
	* with Slurm; if not, write to the Free Software Foundation, Inc.,
	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
	\*****************************************************************************/

	#include "config.h"

	#include <pthread.h>
	#include <string.h>
	#include <time.h>

	#include "src/common/hostlist.h"
	#include "src/common/read_config.h"

	#include "src/interfaces/select.h"

	#include "src/slurmctld/agent.h"
	#include "src/slurmctld/ping_nodes.h"
	#include "src/slurmctld/slurmctld.h"

	/* Request that nodes re-register at most every MAX_REG_FREQUENCY pings */
	#define MAX_REG_FREQUENCY 20

	/* Log an error for ping that takes more than 100 seconds to complete */
	#define PING_TIMEOUT 100

	/*
	 * Ping cycle bookkeeping shared by is_ping_done(), ping_begin(),
	 * ping_nodes_update() and ping_end(). Within this file the three
	 * variables below are only read or written while lock_mutex is held.
	 */
	static pthread_mutex_t lock_mutex = PTHREAD_MUTEX_INITIALIZER;
	/* Set by ping_nodes_update(), cleared again by every ping_end() call */
	static bool ping_updated = false;
	/* Number of ping_begin() calls not yet matched by a ping_end() call */
	static int ping_count = 0;
	/* Time of the latest ping_begin(); reset to 0 when ping_count drops to 0 */
	static time_t ping_start = 0;

	/*
	 * is_ping_done - report whether the previous node ping cycle has finished
	 *
	 * Callers use this to hold off a new round of requests until the prior
	 * one has drained. While requests are still outstanding and PING_TIMEOUT
	 * seconds or more have passed since ping_start, a single error is logged;
	 * the message is re-armed the next time this function sees no
	 * outstanding requests.
	 *
	 * RET true if nothing is outstanding and ping_nodes_update() has
	 *	confirmed the node update, false otherwise
	 */
	bool is_ping_done (void)
	{
		static bool timeout_logged = false;
		bool done;

		slurm_mutex_lock(&lock_mutex);
		if (!ping_count) {
			timeout_logged = false;
			/*
			 * An empty counter is not enough on its own: the cycle
			 * only counts as complete once ping_updated is true,
			 * i.e. the _agent_nodes_update thread has finished
			 * recording the node responses. Reporting "done"
			 * earlier could race with that thread and get
			 * responding/healthy nodes set DOWN.
			 */
			done = ping_updated;
		} else {
			done = false;
			if (!timeout_logged &&
			    (difftime(time(NULL), ping_start) >= PING_TIMEOUT)) {
				error("A node ping cycle took more than %d seconds. Node RPC requests like ping, register status, health check and/or accounting gather update are triggered less frequently than configured. Either many nodes are non-responsive or one of SlurmdTimeout, HealthCheckInterval, JobAcctGatherFrequency should be increased.",
				      PING_TIMEOUT);
				timeout_logged = true;
			}
		}
		slurm_mutex_unlock(&lock_mutex);

		return done;
	}

	/*
	 * ping_begin - record that a ping cycle has begun
	 *
	 * May be called several times in a row (e.g. for REQUEST_PING and a
	 * simultaneous REQUEST_NODE_REGISTRATION for selected nodes). Each call
	 * needs a matching ping_end() call before is_ping_done() can return
	 * true. Every call also restarts the clock that is_ping_done() compares
	 * against PING_TIMEOUT.
	 */
	void ping_begin (void)
	{
		slurm_mutex_lock(&lock_mutex);
		ping_start = time(NULL);
		ping_count += 1;
		slurm_mutex_unlock(&lock_mutex);
	}

	/*
	 * ping_nodes_update - confirm that node info was updated after a ping
	 *	cycle
	 *
	 * The outstanding-request counter can reach zero while the
	 * _agent_nodes_update thread still has the node update pending. Calling
	 * this sets the flag that is_ping_done() reports once nothing is
	 * outstanding. The flag is cleared again by the next ping_end().
	 */
	void ping_nodes_update(void)
	{
		slurm_mutex_lock(&lock_mutex);
		ping_updated = true;
		slurm_mutex_unlock(&lock_mutex);
	}

	/*
	 * ping_end - record that a ping cycle has ended
	 *
	 * May be called several times (e.g. for REQUEST_PING and a simultaneous
	 * REQUEST_NODE_REGISTRATION for selected nodes); each call pairs with
	 * one earlier ping_begin() call. An unmatched call is logged as an error
	 * and leaves the counter untouched. Every call clears the "nodes
	 * updated" flag, so is_ping_done() stays false until
	 * ping_nodes_update() runs again.
	 */
	void ping_end (void)
	{
		slurm_mutex_lock(&lock_mutex);

		if (ping_count <= 0)
			error("%s: ping_count < 0", __func__);
		else
			ping_count--;

		ping_updated = false;
		if (!ping_count)	/* no more running ping cycles */
			ping_start = 0;

		slurm_mutex_unlock(&lock_mutex);
	}

	/*
	* ping_nodes - check that all nodes and daemons are alive,
	* get nodes in UNKNOWN state to register
	*/
	void ping_nodes (void)
	{
	static bool restart_flag = true; /* system just restarted */
	static int reg_offset = 0; /* mutex via node table write lock on entry */
	static int max_reg_threads = 0; /* max node registration threads
	* this can include DOWN nodes, so
	* limit the number to avoid huge
	* communication delays */
	int i;
	time_t now = time(NULL), still_live_time, node_dead_time;
	static time_t last_ping_time = (time_t) 0;
	static time_t last_ping_timeout = (time_t) 0;
	hostlist_t *down_hostlist = NULL;
	char *host_str = NULL;
	agent_arg_t *ping_agent_args = NULL;
	agent_arg_t *reg_agent_args = NULL;
	node_record_t *node_ptr = NULL;
	time_t old_cpu_load_time = now - slurm_conf.slurmd_timeout;
	time_t old_free_mem_time = now - slurm_conf.slurmd_timeout;
	int node_offset = 0;

	ping_agent_args = xmalloc (sizeof (agent_arg_t));
	ping_agent_args->msg_type = REQUEST_PING;
	ping_agent_args->retry = 0;
	ping_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	ping_agent_args->hostlist = hostlist_create(NULL);

	reg_agent_args = xmalloc (sizeof (agent_arg_t));
	reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
	reg_agent_args->retry = 0;
	reg_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	reg_agent_args->hostlist = hostlist_create(NULL);

	/*
	* If there are a large number of down nodes, the node ping
	* can take a long time to complete:
	* ping_time = down_nodes * agent_timeout / agent_parallelism
	* ping_time = down_nodes * 10_seconds / 10
	* ping_time = down_nodes (seconds)
	* Because of this, we extend the SlurmdTimeout by the
	* time needed to complete a ping of all nodes.
	*/
	if ((last_ping_timeout == 0) \|\|
	(last_ping_time == (time_t) 0)) {
	node_dead_time = (time_t) 0;
	} else {
	node_dead_time = last_ping_time - last_ping_timeout;
	}
	still_live_time = now - (slurm_conf.slurmd_timeout / 3);
	last_ping_time = now;
	last_ping_timeout = slurm_conf.slurmd_timeout;

	if (max_reg_threads == 0) {
	max_reg_threads = MAX(slurm_conf.tree_width, 1);
	max_reg_threads = MIN(max_reg_threads, 50);
	}
	reg_offset += max_reg_threads;
	if ((reg_offset > active_node_record_count) &&
	(reg_offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
	reg_offset = 0;

	for (i = 0; (node_ptr = next_node(&i)); i++) {
	node_offset++;
	if (IS_NODE_FUTURE(node_ptr) \|\|
	IS_NODE_EXTERNAL(node_ptr) \|\|
	IS_NODE_POWERED_DOWN(node_ptr) \|\|
	IS_NODE_POWERING_DOWN(node_ptr) \|\|
	IS_NODE_POWERING_UP(node_ptr) \|\|
	IS_NODE_INVALID_REG(node_ptr) \|\|
	IS_NODE_REBOOT_ISSUED(node_ptr))
	continue;
	if ((slurm_conf.slurmd_timeout == 0) && (!restart_flag) &&
	(!IS_NODE_UNKNOWN(node_ptr)) &&
	(!IS_NODE_NO_RESPOND(node_ptr)))
	continue;

	if ((node_ptr->last_response != (time_t) 0) &&
	(node_ptr->last_response <= node_dead_time) &&
	(!IS_NODE_DOWN(node_ptr))) {
	if (down_hostlist)
	(void) hostlist_push_host(down_hostlist,
	node_ptr->name);
	else {
	down_hostlist =
	hostlist_create(node_ptr->name);
	if (!down_hostlist) {
	fatal("Invalid host name: %s",
	node_ptr->name);
	}
	}
	set_node_down_ptr(node_ptr, "Not responding");
	node_ptr->not_responding = false; /* logged below */
	continue;
	}

	/* Request a node registration if its state is UNKNOWN or
	* on a periodic basis (about every MAX_REG_FREQUENCY ping,
	* this mechanism avoids an additional (per node) timer or
	* counter and gets updated configuration information
	* once in a while). We limit these requests since they
	* can generate a flood of incoming RPCs. */
	if (IS_NODE_UNKNOWN(node_ptr) \|\| (node_ptr->boot_time == 0) \|\|
	((node_offset >= reg_offset) &&
	(node_offset < (reg_offset + max_reg_threads)))) {
	if (reg_agent_args->protocol_version >
	node_ptr->protocol_version)
	reg_agent_args->protocol_version =
	node_ptr->protocol_version;
	hostlist_push_host(reg_agent_args->hostlist,
	node_ptr->name);
	reg_agent_args->node_count++;
	if (PACK_FANOUT_ADDRS(node_ptr))
	reg_agent_args->msg_flags \|= SLURM_PACK_ADDRS;
	continue;
	}

	if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
	(node_ptr->last_response >= still_live_time) &&
	(node_ptr->cpu_load_time >= old_cpu_load_time) &&
	(node_ptr->free_mem_time >= old_free_mem_time))
	continue;

	/* Do not keep pinging down nodes since this can induce
	* huge delays in hierarchical communication fail-over */
	if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
	continue;

	if (ping_agent_args->protocol_version >
	node_ptr->protocol_version)
	ping_agent_args->protocol_version =
	node_ptr->protocol_version;
	hostlist_push_host(ping_agent_args->hostlist, node_ptr->name);
	ping_agent_args->node_count++;
	if (PACK_FANOUT_ADDRS(node_ptr))
	ping_agent_args->msg_flags \|= SLURM_PACK_ADDRS;
	}

	restart_flag = false;
	if (ping_agent_args->node_count == 0) {
	hostlist_destroy(ping_agent_args->hostlist);
	xfree (ping_agent_args);
	} else {
	hostlist_uniq(ping_agent_args->hostlist);
	host_str = hostlist_ranged_string_xmalloc(
	ping_agent_args->hostlist);
	debug("Spawning ping agent for %s", host_str);
	xfree(host_str);
	ping_begin();
	set_agent_arg_r_uid(ping_agent_args, SLURM_AUTH_UID_ANY);
	agent_queue_request(ping_agent_args);
	}

	if (reg_agent_args->node_count == 0) {
	hostlist_destroy(reg_agent_args->hostlist);
	xfree (reg_agent_args);
	} else {
	hostlist_uniq(reg_agent_args->hostlist);
	host_str = hostlist_ranged_string_xmalloc(
	reg_agent_args->hostlist);
	debug("Spawning registration agent for %s %d hosts",
	host_str, reg_agent_args->node_count);
	xfree(host_str);
	ping_begin();
	set_agent_arg_r_uid(reg_agent_args, SLURM_AUTH_UID_ANY);
	agent_queue_request(reg_agent_args);
	}

	if (down_hostlist) {
	hostlist_uniq(down_hostlist);
	host_str = hostlist_ranged_string_xmalloc(down_hostlist);
	error("Nodes %s not responding, setting DOWN", host_str);
	xfree(host_str);
	hostlist_destroy(down_hostlist);
	}
	}

	/* Spawn health check function for every node that is not DOWN */
	extern void run_health_check(void)
	{
	char *host_str = NULL;
	agent_arg_t *check_agent_args = NULL;
	node_record_t *node_ptr;
	int node_test_cnt = 0;
	int node_limit = 0;
	int node_states = slurm_conf.health_check_node_state &
	(~HEALTH_CHECK_CYCLE);
	int run_cyclic = slurm_conf.health_check_node_state &
	HEALTH_CHECK_CYCLE;
	static int base_node_loc = 0;
	static time_t cycle_start_time = (time_t) 0;

	if (run_cyclic) {
	time_t now = time(NULL);
	if (cycle_start_time == (time_t) 0)
	cycle_start_time = now;
	else if (base_node_loc > 0)
	; /* mid-cycle */
	else if (difftime(now, cycle_start_time) <
	slurm_conf.health_check_interval)
	return; /* Wait to start next cycle */
	cycle_start_time = now;
	/*
	* Determine how many nodes we want to test on each call of
	* run_health_check() to spread out the work.
	*/
	node_limit = (active_node_record_count * 2) /
	slurm_conf.health_check_interval;
	node_limit = MAX(node_limit, 10);
	}

	check_agent_args = xmalloc (sizeof (agent_arg_t));
	check_agent_args->msg_type = REQUEST_HEALTH_CHECK;
	check_agent_args->retry = 0;
	check_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	check_agent_args->hostlist = hostlist_create(NULL);

	/* Sync plugin internal data with
	* node select_nodeinfo. This is important
	* after reconfig otherwise select_nodeinfo
	* will not return the correct number of
	* allocated cpus.
	*/
	select_g_select_nodeinfo_set_all();

	for (; (node_ptr = next_node(&base_node_loc)); base_node_loc++) {
	if (run_cyclic &&
	(node_test_cnt++ >= node_limit))
	break;
	if (IS_NODE_FUTURE(node_ptr) \|\|
	IS_NODE_EXTERNAL(node_ptr) \|\|
	IS_NODE_INVALID_REG(node_ptr) \|\|
	IS_NODE_NO_RESPOND(node_ptr) \|\|
	IS_NODE_POWERED_DOWN(node_ptr) \|\|
	IS_NODE_POWERING_DOWN(node_ptr) \|\|
	IS_NODE_POWERING_UP(node_ptr) \|\|
	IS_NODE_REBOOT_ISSUED(node_ptr))
	continue;
	if (node_states != HEALTH_CHECK_NODE_ANY) {
	uint16_t cpus_total, cpus_used = 0;
	cpus_total = node_ptr->config_ptr->cpus;
	if (!IS_NODE_IDLE(node_ptr))
	cpus_used = node_ptr->alloc_cpus;
	/* Here the node state is inferred from
	* the cpus allocated on it.
	* - cpus_used == 0
	* means node is idle
	* - cpus_used < cpus_total
	* means the node is in mixed state
	* else cpus_used == cpus_total
	* means the node is allocated
	*/
	if (cpus_used == 0) {
	if (!(node_states & HEALTH_CHECK_NODE_IDLE) &&
	(!(node_states & HEALTH_CHECK_NODE_NONDRAINED_IDLE) \|\|
	IS_NODE_DRAIN(node_ptr))) {
	continue;
	}
	if (!IS_NODE_IDLE(node_ptr))
	continue;
	} else if (cpus_used < cpus_total) {
	if (!(node_states & HEALTH_CHECK_NODE_MIXED))
	continue;
	} else {
	if (!(node_states & HEALTH_CHECK_NODE_ALLOC))
	continue;
	}
	}
	if (check_agent_args->protocol_version >
	node_ptr->protocol_version)
	check_agent_args->protocol_version =
	node_ptr->protocol_version;
	hostlist_push_host(check_agent_args->hostlist, node_ptr->name);
	check_agent_args->node_count++;
	if (PACK_FANOUT_ADDRS(node_ptr))
	check_agent_args->msg_flags \|= SLURM_PACK_ADDRS;
	}
	if (!node_ptr)
	base_node_loc = 0;

	if (check_agent_args->node_count == 0) {
	hostlist_destroy(check_agent_args->hostlist);
	xfree (check_agent_args);
	} else {
	hostlist_uniq(check_agent_args->hostlist);
	host_str = hostlist_ranged_string_xmalloc(
	check_agent_args->hostlist);
	debug("Spawning health check agent for %s", host_str);
	xfree(host_str);
	ping_begin();
	set_agent_arg_r_uid(check_agent_args, SLURM_AUTH_UID_ANY);
	agent_queue_request(check_agent_args);
	}
	}

	/* Update acct_gather data for every node that is not DOWN */
	extern void update_nodes_acct_gather_data(void)
	{
	node_record_t *node_ptr;
	int i;
	char *host_str = NULL;
	agent_arg_t *agent_args = NULL;

	agent_args = xmalloc (sizeof (agent_arg_t));
	agent_args->msg_type = REQUEST_ACCT_GATHER_UPDATE;
	agent_args->retry = 0;
	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	agent_args->hostlist = hostlist_create(NULL);

	for (i = 0; (node_ptr = next_node(&i)); i++) {
	if (IS_NODE_FUTURE(node_ptr) \|\|
	IS_NODE_EXTERNAL(node_ptr) \|\|
	IS_NODE_INVALID_REG(node_ptr) \|\|
	IS_NODE_NO_RESPOND(node_ptr) \|\|
	IS_NODE_POWERED_DOWN(node_ptr) \|\|
	IS_NODE_POWERING_DOWN(node_ptr) \|\|
	IS_NODE_POWERING_UP(node_ptr) \|\|
	IS_NODE_REBOOT_ISSUED(node_ptr))
	continue;
	if (agent_args->protocol_version > node_ptr->protocol_version)
	agent_args->protocol_version =
	node_ptr->protocol_version;
	hostlist_push_host(agent_args->hostlist, node_ptr->name);
	agent_args->node_count++;
	if (PACK_FANOUT_ADDRS(node_ptr))
	agent_args->msg_flags \|= SLURM_PACK_ADDRS;
	}

	if (agent_args->node_count == 0) {
	hostlist_destroy(agent_args->hostlist);
	xfree (agent_args);
	} else {
	hostlist_uniq(agent_args->hostlist);
	host_str = hostlist_ranged_string_xmalloc(agent_args->hostlist);
	log_flag(ENERGY, "Updating acct_gather data for %s", host_str);
	xfree(host_str);
	ping_begin();
	set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
	agent_queue_request(agent_args);
	}
	}