/*****************************************************************************\
 *  slurm_pmi.c - PMI support functions internal to SLURM
 *****************************************************************************
 *  Copyright (C) 2005-2006 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>.
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>

#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"

#include "src/api/slurm_pmi.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/forward.h"
#include "src/common/read_config.h"
#include "src/common/strlcpy.h"
#include "src/common/xmalloc.h"
#include "src/common/fd.h"
#include "src/interfaces/auth.h"
#include "src/interfaces/conn.h"

#define DEFAULT_PMI_TIME 500	/* usec between RPCs from successive ranks */
#define MAX_RETRIES 5		/* attempts to contact srun before giving up */

int pmi_fd = -1;		/* listening socket for PMI_KVS_GET_RESP */
int pmi_time = 0;		/* usec of spacing per rank, from PMI_TIME */
uint16_t srun_port = 0;		/* srun communication port */
slurm_addr_t srun_addr;		/* srun communication address */

static void _delay_rpc(int pmi_rank, int pmi_size);
static int _forward_comm_set(kvs_comm_set_t *kvs_set_ptr);
static int _get_addr(void);
static void _set_pmi_time(void);

/* Delay an RPC to srun in order to avoid overwhelming the srun command.
 * The delay is based upon the number of tasks, this task's rank, and
 * PMI_TIME. This logic depends upon synchronized clocks across the cluster. */
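/* Worked example (illustrative numbers, assuming the default PMI_TIME of
 * 500 usec): with pmi_size=1000, tot_time is 500000 usec, so the ranks
 * are spread across a repeating half-second window; rank 700 sleeps until
 * offset 700 * 500 = 350000 usec within that window before sending. */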
static void _delay_rpc(int pmi_rank, int pmi_size)
{
	struct timeval tv1, tv2;
	uint32_t cur_time;	/* current time in usec (just 9 digits) */
	uint32_t tot_time;	/* total time expected for all RPCs */
	uint32_t offset_time;	/* relative time within tot_time */
	uint32_t target_time;	/* desired time to issue the RPC */
	uint32_t delta_time, error_time;
	int retries = 0;

	if (pmi_rank == 0)	/* Rank 0 has extra communications with no */
		return;		/* risk of inducing a packet storm */

	_set_pmi_time();

again:	if (gettimeofday(&tv1, NULL)) {
		usleep(pmi_rank * pmi_time);
		return;
	}

	cur_time = ((tv1.tv_sec % 1000) * 1000000) + tv1.tv_usec;
	tot_time = pmi_size * pmi_time;
	offset_time = cur_time % tot_time;
	target_time = pmi_rank * pmi_time;
	if (target_time < offset_time)
		delta_time = target_time - offset_time + tot_time;
	else
		delta_time = target_time - offset_time;
	if (usleep(delta_time)) {
		if (errno == EINVAL)	/* delta_time too long for usleep() */
			usleep(900000);
		/* else errno == EINTR; recompute the delay and retry */
		goto again;
	}

	/* Verify we are active at the right time. If the current time
	 * differs from the target by more than 15*pmi_time, start over.
	 * If PMI_TIME is set appropriately, srun should then have no more
	 * than 30 RPCs in its queue at one time in the worst case. */
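	/* For example, at the default PMI_TIME of 500 usec the threshold
	 * below works out to 15 * 500 = 7500 usec of measured error. */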
	if (gettimeofday(&tv2, NULL))
		return;
	tot_time = (tv2.tv_sec - tv1.tv_sec) * 1000000;
	tot_time += tv2.tv_usec;
	tot_time -= tv1.tv_usec;
	if (tot_time >= delta_time)
		error_time = tot_time - delta_time;
	else
		error_time = delta_time - tot_time;
	if (error_time > (15 * pmi_time)) {	/* too far off */
#if 0
		info("delta=%u tot=%u err=%u",
		     delta_time, tot_time, error_time);
#endif
		if ((++retries) <= 2)
			goto again;
	}
}

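/* Cache srun's communication address from the SLURM_SRUN_COMM_HOST and
 * SLURM_SRUN_COMM_PORT environment variables that srun sets for each
 * spawned task. No-op once the address has been resolved. */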
static int _get_addr(void)
{
	char *env_host, *env_port;

	if (srun_port)
		return SLURM_SUCCESS;

	env_host = getenv("SLURM_SRUN_COMM_HOST");
	env_port = getenv("SLURM_SRUN_COMM_PORT");
	if (!env_host || !env_port)
		return SLURM_ERROR;

	srun_port = (uint16_t) atol(env_port);
	slurm_set_addr(&srun_addr, srun_port, env_host);
	return SLURM_SUCCESS;
}

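/* Set pmi_time (usec of spacing per rank) from the PMI_TIME environment
 * variable, falling back to DEFAULT_PMI_TIME if it is unset or invalid.
 * For example, a user might export PMI_TIME=1000 (an illustrative value)
 * before launching a very large job to spread the RPCs out further. */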
static void _set_pmi_time(void)
{
	char *tmp, *endptr;

	if (pmi_time)
		return;

	tmp = getenv("PMI_TIME");
	if (tmp == NULL) {
		pmi_time = DEFAULT_PMI_TIME;
		return;
	}

	pmi_time = strtol(tmp, &endptr, 10);
	if ((pmi_time <= 0) || (endptr[0] != '\0')) {
		error("Invalid PMI_TIME: %s", tmp);
		pmi_time = DEFAULT_PMI_TIME;
	}
}

/* Transmit PMI key-value space data to srun */
extern int slurm_pmi_send_kvs_comm_set(kvs_comm_set_t *kvs_set_ptr,
				       int pmi_rank, int pmi_size)
{
	slurm_msg_t msg_send;
	int rc, retries = 0, timeout = 0;

	if (kvs_set_ptr == NULL)
		return EINVAL;

	slurm_init(NULL);

	if ((rc = _get_addr()) != SLURM_SUCCESS)
		return rc;
	_set_pmi_time();

	slurm_msg_t_init(&msg_send);
	slurm_msg_set_r_uid(&msg_send, SLURM_AUTH_UID_ANY);
	msg_send.address = srun_addr;
	msg_send.msg_type = PMI_KVS_PUT_REQ;
	msg_send.data = (void *) kvs_set_ptr;

	/* Send the RPC to the local srun communication manager. Since
	 * srun can be sent thousands of messages at the same time and
	 * may refuse some connections, retry as needed and spread the
	 * messages out by task rank. Also scale the message timeout up
	 * with the task count, since a large srun can be heavily
	 * overloaded. The timeout below is in milliseconds; the per-size
	 * comments assume the default 10 second msg_timeout. */
	_delay_rpc(pmi_rank, pmi_size);
	if (pmi_size > 4000)		/* 240 secs */
		timeout = slurm_conf.msg_timeout * 24000;
	else if (pmi_size > 1000)	/* 120 secs */
		timeout = slurm_conf.msg_timeout * 12000;
	else if (pmi_size > 100)	/* 50 secs */
		timeout = slurm_conf.msg_timeout * 5000;
	else if (pmi_size > 10)		/* 20 secs */
		timeout = slurm_conf.msg_timeout * 2000;

	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
		if (retries++ > MAX_RETRIES) {
			error("slurm_pmi_send_kvs_comm_set: %m");
			return SLURM_ERROR;
		} else
			debug("send_kvs retry %d", retries);
		_delay_rpc(pmi_rank, pmi_size);
	}

	return rc;
}

/* Wait for the barrier to complete, then get the full PMI key-value
 * space data */
extern int slurm_pmi_get_kvs_comm_set(kvs_comm_set_t **kvs_set_ptr,
				      int pmi_rank, int pmi_size)
{
	int rc, retries = 0, timeout = 0;
	void *tls_conn = NULL;
	slurm_msg_t msg_send, msg_rcv;
	slurm_addr_t slurm_addr, srun_reply_addr;
	char hostname[HOST_NAME_MAX];
	kvs_get_msg_t data;
	char *env_pmi_ifhn;

	if (kvs_set_ptr == NULL)
		return EINVAL;

	slurm_init(NULL);

	*kvs_set_ptr = NULL;	/* initialize the output for early returns */

	if ((rc = _get_addr()) != SLURM_SUCCESS) {
		error("_get_addr: %m");
		return rc;
	}

	_set_pmi_time();

	if (pmi_fd < 0) {
		if ((pmi_fd = slurm_init_msg_engine_port(0)) < 0) {
			error("slurm_init_msg_engine_port: %m");
			return SLURM_ERROR;
		}
		fd_set_blocking(pmi_fd);
	}
	if (slurm_get_stream_addr(pmi_fd, &slurm_addr) < 0) {
		error("slurm_get_stream_addr: %m");
		return SLURM_ERROR;
	}
	if ((env_pmi_ifhn = getenv("SLURM_PMI_RESP_IFHN")))
		strlcpy(hostname, env_pmi_ifhn, sizeof(hostname));
	else
		gethostname_short(hostname, sizeof(hostname));

	memset(&data, 0, sizeof(data));
	data.task_id = pmi_rank;
	data.size = pmi_size;
	data.port = slurm_get_port(&slurm_addr);
	data.hostname = hostname;
	slurm_msg_t_init(&msg_send);
	slurm_msg_set_r_uid(&msg_send, SLURM_AUTH_UID_ANY);
	slurm_msg_t_init(&msg_rcv);
	msg_send.address = srun_addr;
	msg_send.msg_type = PMI_KVS_GET_REQ;
	msg_send.data = &data;

	/* Send the RPC to the local srun communication manager. Since
	 * srun can be sent thousands of messages at the same time and
	 * may refuse some connections, retry as needed. Wait until all
	 * key-value pairs have been sent by all tasks, then spread the
	 * messages out by task rank. Also scale the message timeout up
	 * with the task count, since a large srun can get very
	 * overloaded (the default msg_timeout is 10 secs). */
	_delay_rpc(pmi_rank, pmi_size);
	if (pmi_size > 4000)		/* 240 secs */
		timeout = slurm_conf.msg_timeout * 24000;
	else if (pmi_size > 1000)	/* 120 secs */
		timeout = slurm_conf.msg_timeout * 12000;
	else if (pmi_size > 100)	/* 60 secs */
		timeout = slurm_conf.msg_timeout * 6000;
	else if (pmi_size > 10)		/* 20 secs */
		timeout = slurm_conf.msg_timeout * 2000;

	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
		if (retries++ > MAX_RETRIES) {
			error("slurm_pmi_get_kvs_comm_set: %m");
			return SLURM_ERROR;
		} else
			debug("get kvs retry %d", retries);
		_delay_rpc(pmi_rank, pmi_size);
	}
	if (rc != SLURM_SUCCESS) {
		error("slurm_pmi_get_kvs_comm_set error_code=%d", rc);
		return rc;
	}

	/* get the response message after all tasks reach the barrier */
	if (!(tls_conn = slurm_accept_msg_conn(pmi_fd, &srun_reply_addr))) {
		error("slurm_accept_msg_conn: %m");
		return errno;
	}

	while ((rc = slurm_receive_msg(tls_conn, &msg_rcv, timeout)) != 0) {
		if (errno == EINTR)
			continue;
		error("slurm_receive_msg: %m");
		conn_g_destroy(tls_conn, true);
		return errno;
	}
	if (msg_rcv.auth_cred)
		auth_g_destroy(msg_rcv.auth_cred);

	if (msg_rcv.msg_type != PMI_KVS_GET_RESP) {
		error("slurm_pmi_get_kvs_comm_set msg_type=%s",
		      rpc_num2string(msg_rcv.msg_type));
		conn_g_destroy(tls_conn, true);
		return SLURM_UNEXPECTED_MSG_ERROR;
	}
	if (slurm_send_rc_msg(&msg_rcv, SLURM_SUCCESS) < 0)
		error("slurm_send_rc_msg: %m");

	conn_g_destroy(tls_conn, true);
	*kvs_set_ptr = msg_rcv.data;

	rc = _forward_comm_set(*kvs_set_ptr);
	return rc;
}

/* Forward key-value pair info to the other tasks listed in the message as
 * required; entries with a port of zero have no pending request. The
 * messages are forwarded sequentially, and the message forward structure
 * is cleared upon completion. */
static int _forward_comm_set(kvs_comm_set_t *kvs_set_ptr)
{
	int i, rc = SLURM_SUCCESS;
	int tmp_host_cnt = kvs_set_ptr->host_cnt;
	slurm_msg_t msg_send;
	int msg_rc;

	kvs_set_ptr->host_cnt = 0;
	for (i = 0; i < tmp_host_cnt; i++) {
		if (kvs_set_ptr->kvs_host_ptr[i].port == 0)
			continue;	/* empty */
		slurm_msg_t_init(&msg_send);
		slurm_msg_set_r_uid(&msg_send, SLURM_AUTH_UID_ANY);
		msg_send.msg_type = PMI_KVS_GET_RESP;
		msg_send.data = (void *) kvs_set_ptr;
		slurm_set_addr(&msg_send.address,
			       kvs_set_ptr->kvs_host_ptr[i].port,
			       kvs_set_ptr->kvs_host_ptr[i].hostname);
		if (slurm_send_recv_rc_msg_only_one(&msg_send,
						    &msg_rc, 0) < 0) {
			error("Could not forward msg to %s",
			      kvs_set_ptr->kvs_host_ptr[i].hostname);
			msg_rc = 1;
		}
		rc = MAX(rc, msg_rc);
		xfree(kvs_set_ptr->kvs_host_ptr[i].hostname);
	}
	xfree(kvs_set_ptr->kvs_host_ptr);
	return rc;
}

extern void slurm_pmi_free_kvs_comm_set(kvs_comm_set_t *msg)
{
	slurm_free_kvs_comm_set(msg);
}
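
/* Typical caller pattern (an illustrative sketch, not code from this
 * file; "rank" and "size" stand in for the caller's PMI task id and
 * task count):
 *
 *	kvs_comm_set_t *kvs = NULL;
 *	if (slurm_pmi_get_kvs_comm_set(&kvs, rank, size) == SLURM_SUCCESS) {
 *		... consume the key-value pairs ...
 *		slurm_pmi_free_kvs_comm_set(kvs);
 *	}
 */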

/* Finalization processing: close the PMI listening socket and reset state */
extern void slurm_pmi_finalize(void)
{
	if (pmi_fd >= 0) {
		close(pmi_fd);
		pmi_fd = -1;
	}
	srun_port = 0;
}

/*
 * Wrapper for slurm_kill_job_step().
 * We must keep this function signature intact even if we change that function.
 */
extern int slurm_pmi_kill_job_step(uint32_t job_id, uint32_t step_id,
				   uint16_t signal)
{
	return slurm_kill_job_step(job_id, step_id, signal, 0);
}