/****************************************************************************\
* slurm_pmi.c - PMI support functions internal to SLURM
*****************************************************************************
* Copyright (C) 2005-2006 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"

#include "src/api/slurm_pmi.h"

#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/forward.h"
#include "src/common/read_config.h"
#include "src/common/strlcpy.h"
#include "src/common/xmalloc.h"
#include "src/common/fd.h"

#include "src/interfaces/auth.h"
#include "src/interfaces/conn.h"

#define DEFAULT_PMI_TIME 500
#define MAX_RETRIES 5

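/* State shared across calls: pmi_fd is the listening socket used to
 * receive PMI_KVS_GET_RESP messages, pmi_time is the per-rank RPC
 * spacing (usec) taken from PMI_TIME, and srun_addr/srun_port cache
 * srun's communication address. */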
int pmi_fd = -1;
int pmi_time = 0;
uint16_t srun_port = 0;
slurm_addr_t srun_addr;

static void _delay_rpc(int pmi_rank, int pmi_size);
static int _forward_comm_set(kvs_comm_set_t *kvs_set_ptr);
static int _get_addr(void);
static void _set_pmi_time(void);

/* Delay an RPC to srun in order to avoid overwhelming the srun command.
 * The delay is based upon the number of tasks, this task's rank, and
 * PMI_TIME. This logic depends upon synchronized clocks across the cluster. */
static void _delay_rpc(int pmi_rank, int pmi_size)
{
	struct timeval tv1, tv2;
	uint32_t cur_time;	/* current time in usec (just 9 digits) */
	uint32_t tot_time;	/* total time expected for all RPCs */
	uint32_t offset_time;	/* relative time within tot_time */
	uint32_t target_time;	/* desired time to issue the RPC */
	uint32_t delta_time, error_time;
	int retries = 0;

	if (pmi_rank == 0)	/* Rank 0 has extra communications with no */
		return;		/* risk of induced packet storm */

	_set_pmi_time();

again:	if (gettimeofday(&tv1, NULL)) {
		usleep(pmi_rank * pmi_time);
		return;
	}

	cur_time = ((tv1.tv_sec % 1000) * 1000000) + tv1.tv_usec;
	tot_time = pmi_size * pmi_time;
	offset_time = cur_time % tot_time;
	target_time = pmi_rank * pmi_time;
	if (target_time < offset_time)
		delta_time = target_time - offset_time + tot_time;
	else
		delta_time = target_time - offset_time;
	if (usleep(delta_time)) {
		if (errno == EINVAL)
			usleep(900000);
		/* errno == EINTR */
		goto again;
	}

	/* Verify we are active at the right time. If the current time is
	 * different from the target by more than 15*pmi_time, then start
	 * over. If PMI_TIME is set appropriately, then srun should have no
	 * more than 30 RPCs in the queue at one time in the worst case. */
	if (gettimeofday(&tv2, NULL))
		return;
	tot_time = (tv2.tv_sec - tv1.tv_sec) * 1000000;
	tot_time += tv2.tv_usec;
	tot_time -= tv1.tv_usec;
	if (tot_time >= delta_time)
		error_time = tot_time - delta_time;
	else
		error_time = delta_time - tot_time;
	if (error_time > (15 * pmi_time)) {	/* too far off */
#if 0
		info("delta=%u tot=%u err=%u",
		     delta_time, tot_time, error_time);
#endif
		if ((++retries) <= 2)
			goto again;
	}
}
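
/* Worked example of the spreading arithmetic in _delay_rpc() above
 * (illustrative only, assuming PMI_TIME is unset so that
 * pmi_time = DEFAULT_PMI_TIME = 500 usec): with pmi_size = 1000 tasks
 * the RPCs are spread over a window of tot_time = 1000 * 500 = 500000
 * usec (0.5 sec), and rank 42 sleeps until the wall clock, modulo that
 * window, reaches target_time = 42 * 500 = 21000 usec. Because every
 * task computes its offset from synchronized clocks, each 500 usec
 * slot should be targeted by at most one task. */

/* Return srun's communication address, taken from the
 * SLURM_SRUN_COMM_HOST and SLURM_SRUN_COMM_PORT environment variables
 * set by srun; the result is cached in srun_addr/srun_port. */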
static int _get_addr(void)
{
	char *env_host, *env_port;

	if (srun_port)
		return SLURM_SUCCESS;

	env_host = getenv("SLURM_SRUN_COMM_HOST");
	env_port = getenv("SLURM_SRUN_COMM_PORT");
	if (!env_host || !env_port)
		return SLURM_ERROR;

	srun_port = (uint16_t) atol(env_port);
	slurm_set_addr(&srun_addr, srun_port, env_host);
	return SLURM_SUCCESS;
}
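
/* Set pmi_time, the per-rank RPC spacing in usec, from the PMI_TIME
 * environment variable, falling back to DEFAULT_PMI_TIME if it is
 * unset or invalid. */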
static void _set_pmi_time(void)
{
	char *tmp, *endptr;

	if (pmi_time)
		return;

	tmp = getenv("PMI_TIME");
	if (tmp == NULL) {
		pmi_time = DEFAULT_PMI_TIME;
		return;
	}

	pmi_time = strtol(tmp, &endptr, 10);
	if ((pmi_time <= 0) || (endptr[0] != '\0')) {
		error("Invalid PMI_TIME: %s", tmp);
		pmi_time = DEFAULT_PMI_TIME;
	}
}

/* Transmit PMI Keyval space data */
extern int slurm_pmi_send_kvs_comm_set(kvs_comm_set_t *kvs_set_ptr,
				       int pmi_rank, int pmi_size)
{
	slurm_msg_t msg_send;
	int rc, retries = 0, timeout = 0;

	if (kvs_set_ptr == NULL)
		return EINVAL;

	slurm_init(NULL);

	if ((rc = _get_addr()) != SLURM_SUCCESS)
		return rc;
	_set_pmi_time();

	slurm_msg_t_init(&msg_send);
	slurm_msg_set_r_uid(&msg_send, SLURM_AUTH_UID_ANY);
	msg_send.address = srun_addr;
	msg_send.msg_type = PMI_KVS_PUT_REQ;
	msg_send.data = (void *) kvs_set_ptr;

	/* Send the RPC to the local srun communication manager.
	 * Since the srun can be sent thousands of messages at
	 * the same time and refuse some connections, retry as
	 * needed. Spread out messages by task's rank. Also
	 * increase the timeout if many tasks since the srun
	 * command can get very overloaded (the default timeout
	 * is 10 secs). */
	_delay_rpc(pmi_rank, pmi_size);
	if (pmi_size > 4000)		/* 240 secs */
		timeout = slurm_conf.msg_timeout * 24000;
	else if (pmi_size > 1000)	/* 120 secs */
		timeout = slurm_conf.msg_timeout * 12000;
	else if (pmi_size > 100)	/* 50 secs */
		timeout = slurm_conf.msg_timeout * 5000;
	else if (pmi_size > 10)		/* 20 secs */
		timeout = slurm_conf.msg_timeout * 2000;

	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
		if (retries++ > MAX_RETRIES) {
			error("slurm_send_kvs_comm_set: %m");
			return SLURM_ERROR;
		} else
			debug("send_kvs retry %d", retries);
		_delay_rpc(pmi_rank, pmi_size);
	}
	return rc;
}
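
/* Note on the scaling above (assuming timeout is in msec, as the
 * "secs" comments imply): with the default slurm_conf.msg_timeout of
 * 10 secs, a job of more than 4000 tasks uses 10 * 24000 = 240000 msec
 * (240 secs), while jobs of 10 or fewer tasks keep timeout = 0, i.e.
 * the configured default. */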

/* Wait for barrier and get full PMI Keyval space data */
extern int slurm_pmi_get_kvs_comm_set(kvs_comm_set_t **kvs_set_ptr,
				      int pmi_rank, int pmi_size)
{
	int rc, retries = 0, timeout = 0;
	void *tls_conn = NULL;
	slurm_msg_t msg_send, msg_rcv;
	slurm_addr_t slurm_addr, srun_reply_addr;
	char hostname[HOST_NAME_MAX];
	kvs_get_msg_t data;
	char *env_pmi_ifhn;

	if (kvs_set_ptr == NULL)
		return EINVAL;

	slurm_init(NULL);

	*kvs_set_ptr = NULL;	/* initialization */
	if ((rc = _get_addr()) != SLURM_SUCCESS) {
		error("_get_addr: %m");
		return rc;
	}
	_set_pmi_time();

	if (pmi_fd < 0) {
		if ((pmi_fd = slurm_init_msg_engine_port(0)) < 0) {
			error("slurm_init_msg_engine_port: %m");
			return SLURM_ERROR;
		}
		fd_set_blocking(pmi_fd);
	}
	if (slurm_get_stream_addr(pmi_fd, &slurm_addr) < 0) {
		error("slurm_get_stream_addr: %m");
		return SLURM_ERROR;
	}

	if ((env_pmi_ifhn = getenv("SLURM_PMI_RESP_IFHN")))
		strlcpy(hostname, env_pmi_ifhn, sizeof(hostname));
	else
		gethostname_short(hostname, sizeof(hostname));

	memset(&data, 0, sizeof(data));
	data.task_id = pmi_rank;
	data.size = pmi_size;
	data.port = slurm_get_port(&slurm_addr);
	data.hostname = hostname;

	slurm_msg_t_init(&msg_send);
	slurm_msg_set_r_uid(&msg_send, SLURM_AUTH_UID_ANY);
	slurm_msg_t_init(&msg_rcv);
	msg_send.address = srun_addr;
	msg_send.msg_type = PMI_KVS_GET_REQ;
	msg_send.data = &data;

	/* Send the RPC to the local srun communication manager.
	 * Since the srun can be sent thousands of messages at
	 * the same time and refuse some connections, retry as
	 * needed. Wait until all key-pairs have been sent by
	 * all tasks then spread out messages by task's rank.
	 * Also increase the message timeout if many tasks
	 * since the srun command can get very overloaded (the
	 * default timeout is 10 secs). */
	_delay_rpc(pmi_rank, pmi_size);
	if (pmi_size > 4000)		/* 240 secs */
		timeout = slurm_conf.msg_timeout * 24000;
	else if (pmi_size > 1000)	/* 120 secs */
		timeout = slurm_conf.msg_timeout * 12000;
	else if (pmi_size > 100)	/* 60 secs */
		timeout = slurm_conf.msg_timeout * 6000;
	else if (pmi_size > 10)		/* 20 secs */
		timeout = slurm_conf.msg_timeout * 2000;

	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
		if (retries++ > MAX_RETRIES) {
			error("slurm_get_kvs_comm_set: %m");
			return SLURM_ERROR;
		} else
			debug("get kvs retry %d", retries);
		_delay_rpc(pmi_rank, pmi_size);
	}
	if (rc != SLURM_SUCCESS) {
		error("slurm_get_kvs_comm_set error_code=%d", rc);
		return rc;
	}

	/* get the message after all tasks reach the barrier */
	if (!(tls_conn = slurm_accept_msg_conn(pmi_fd, &srun_reply_addr))) {
		error("slurm_accept_msg_conn: %m");
		return errno;
	}

	while ((rc = slurm_receive_msg(tls_conn, &msg_rcv, timeout)) != 0) {
		if (errno == EINTR)
			continue;
		error("slurm_receive_msg: %m");
		conn_g_destroy(tls_conn, true);
		return errno;
	}
	if (msg_rcv.auth_cred)
		auth_g_destroy(msg_rcv.auth_cred);

	if (msg_rcv.msg_type != PMI_KVS_GET_RESP) {
		error("slurm_get_kvs_comm_set msg_type=%s",
		      rpc_num2string(msg_rcv.msg_type));
		conn_g_destroy(tls_conn, true);
		return SLURM_UNEXPECTED_MSG_ERROR;
	}
	if (slurm_send_rc_msg(&msg_rcv, SLURM_SUCCESS) < 0)
		error("slurm_send_rc_msg: %m");
	conn_g_destroy(tls_conn, true);

	*kvs_set_ptr = msg_rcv.data;

	rc = _forward_comm_set(*kvs_set_ptr);
	return rc;
}
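
/* Illustrative call sequence (a sketch, not code from this file;
 * my_set below stands for a hypothetical, already-populated
 * kvs_comm_set_t holding this task's key-pairs):
 *
 *	kvs_comm_set_t *kvs_set = NULL;
 *	// 1. push this task's key-pairs to srun
 *	slurm_pmi_send_kvs_comm_set(my_set, pmi_rank, pmi_size);
 *	// 2. block until all tasks have sent, then fetch the merged set
 *	slurm_pmi_get_kvs_comm_set(&kvs_set, pmi_rank, pmi_size);
 *	// 3. release the returned structure
 *	slurm_pmi_free_kvs_comm_set(kvs_set);
 */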

/* Forward keypair info to other tasks as required.
 * Clear message forward structure upon completion.
 * The messages are forwarded sequentially. */
static int _forward_comm_set(kvs_comm_set_t *kvs_set_ptr)
{
	int i, rc = SLURM_SUCCESS;
	int tmp_host_cnt = kvs_set_ptr->host_cnt;
	slurm_msg_t msg_send;
	int msg_rc;

	kvs_set_ptr->host_cnt = 0;
	for (i = 0; i < tmp_host_cnt; i++) {
		if (kvs_set_ptr->kvs_host_ptr[i].port == 0)
			continue;	/* empty */
		slurm_msg_t_init(&msg_send);
		slurm_msg_set_r_uid(&msg_send, SLURM_AUTH_UID_ANY);
		msg_send.msg_type = PMI_KVS_GET_RESP;
		msg_send.data = (void *) kvs_set_ptr;
		slurm_set_addr(&msg_send.address,
			       kvs_set_ptr->kvs_host_ptr[i].port,
			       kvs_set_ptr->kvs_host_ptr[i].hostname);
		if (slurm_send_recv_rc_msg_only_one(&msg_send,
						    &msg_rc, 0) < 0) {
			error("Could not forward msg to %s",
			      kvs_set_ptr->kvs_host_ptr[i].hostname);
			msg_rc = 1;
		}
		rc = MAX(rc, msg_rc);
		xfree(kvs_set_ptr->kvs_host_ptr[i].hostname);
	}
	xfree(kvs_set_ptr->kvs_host_ptr);
	return rc;
}
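
/* Free a kvs_comm_set_t previously returned by
 * slurm_pmi_get_kvs_comm_set() */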
extern void slurm_pmi_free_kvs_comm_set(kvs_comm_set_t *msg)
{
	slurm_free_kvs_comm_set(msg);
}

/* Finalization processing */
extern void slurm_pmi_finalize(void)
{
	if (pmi_fd >= 0) {
		close(pmi_fd);
		pmi_fd = -1;
	}
	srun_port = 0;
}

/*
 * Wrapper for slurm_kill_job_step().
 * We must keep this function signature intact even if we change that function.
 */
extern int slurm_pmi_kill_job_step(uint32_t job_id, uint32_t step_id,
				   uint16_t signal)
{
	return slurm_kill_job_step(job_id, step_id, signal, 0);
}