/* blob: 0b03f1a21f07deb5c337185ff5e08ad806cba11a [file] [log] [blame] (source-viewer residue) */
/*****************************************************************************\
* port_mgr.c - manage the reservation of I/O ports on the nodes.
* Design for use with OpenMPI.
*****************************************************************************
* Copyright (C) 2009 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <stdlib.h>
#include <string.h>
#include "src/common/bitstring.h"
#include "src/common/hostlist.h"
#include "src/common/job_record.h"
#include "src/common/node_conf.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
/* Set non-zero to have _dump_resv_port_info() log the reservation table */
#define _DEBUG 0
/*
 * Port reservation table: one slot per reservable port, indexed by
 * (port number - port_resv_min). A non-NULL slot is a bitmap into which
 * node bitmaps are OR'ed when the port is reserved and from which they
 * are cleared when it is released.
 */
bitstr_t **port_resv_table = (bitstr_t **) NULL;
/* Number of slots in port_resv_table (port_resv_max - port_resv_min + 1) */
int port_resv_cnt = 0;
/* Lowest and highest port numbers covered by port_resv_table (inclusive) */
int port_resv_min = 0;
int port_resv_max = 0;
/* Forward declarations of file-local helpers */
static void _dump_resv_port_info(void);
static void _make_all_resv(list_t *job_list);
static void _make_step_resv(step_record_t *step_ptr);
static int _rebuild_port_array(const char *resv_ports,
uint16_t *resv_port_cnt,
int **resv_port_array);
/*
 * Debug aid: for every slot of port_resv_table that exists and has at least
 * one bit set, log the port number together with the string returned by
 * bitmap2node_name() for that slot. Compiles to an empty function unless
 * _DEBUG is non-zero.
 */
static void _dump_resv_port_info(void)
{
#if _DEBUG
	int slot;
	char *node_names;

	for (slot = 0; slot < port_resv_cnt; slot++) {
		/* Only report ports that are actually reserved somewhere */
		if (port_resv_table[slot] &&
		    (bit_set_count(port_resv_table[slot]) != 0)) {
			node_names = bitmap2node_name(port_resv_table[slot]);
			info("Port %d: %s", (slot + port_resv_min), node_names);
			xfree(node_names);
		}
	}
#endif
}
/*
 * Build resv_port_array from resv_ports (a string).
 *
 * The string is wrapped in "[...]" and expanded with the hostlist parser;
 * every expanded entry that converts to a positive integer is stored as
 * one port number.
 *
 * IN resv_ports - bracket-less port list/range string
 * IN/OUT resv_port_cnt - on entry the expected number of ports (used as
 *	the capacity of the new array); on return the number of ports
 *	actually stored
 * OUT resv_port_array - newly allocated array of port numbers. It is
 *	left allocated even when ESLURM_PORTS_INVALID is returned.
 * RET SLURM_SUCCESS, SLURM_ERROR if the string can not be parsed, or
 *	ESLURM_PORTS_INVALID if no valid port was found
 */
static int _rebuild_port_array(const char *resv_ports,
			       uint16_t *resv_port_cnt,
			       int **resv_port_array)
{
	uint16_t max_cnt = *resv_port_cnt;
	int port;
	char *tmp_char;
	hostlist_t *hl;

	tmp_char = xstrdup_printf("[%s]", resv_ports);
	hl = hostlist_create(tmp_char);
	xfree(tmp_char);
	if (!hl)
		return SLURM_ERROR;

	/* One int per expected port (element size, not the count squared) */
	*resv_port_array = xcalloc(max_cnt, sizeof(int));
	*resv_port_cnt = 0;
	while ((tmp_char = hostlist_shift(hl))) {
		port = atoi(tmp_char);
		/*
		 * Never store past the allocated capacity, even if the
		 * string expands to more ports than the caller's count.
		 */
		if ((port > 0) && (*resv_port_cnt < max_cnt))
			(*resv_port_array)[(*resv_port_cnt)++] = port;
		/* hostlist_shift() strings are released with free() */
		free(tmp_char);
	}
	hostlist_destroy(hl);

	if (*resv_port_cnt == 0)
		return ESLURM_PORTS_INVALID;
	return SLURM_SUCCESS;
}
/*
 * Record a set of reserved ports in the local reservation table.
 * If *resv_port_array is NULL it is first rebuilt from resv_ports (a
 * string) via _rebuild_port_array().
 *
 * IN node_bitmap - nodes on which the ports are reserved
 * IN resv_ports - port string; nothing is done if NULL or empty
 * IN/OUT resv_port_cnt - port count; nothing is done if zero
 * IN/OUT resv_port_array - port numbers, rebuilt when NULL
 * RET SLURM_SUCCESS or the error returned by _rebuild_port_array()
 */
static int _make_resv(bitstr_t *node_bitmap,
		      const char *resv_ports,
		      uint16_t *resv_port_cnt,
		      int **resv_port_array)
{
	int rc = SLURM_SUCCESS;
	int inx, port;

	/* No count or no port string means there is nothing to record */
	if (!*resv_port_cnt || !resv_ports || !resv_ports[0])
		return rc;

	if (!*resv_port_array) {
		rc = _rebuild_port_array(resv_ports, resv_port_cnt,
					 resv_port_array);
		if (rc)
			return rc;
	}

	for (inx = 0; inx < *resv_port_cnt; inx++) {
		port = (*resv_port_array)[inx];
		/* Ports outside the configured range have no table slot */
		if ((port >= port_resv_min) && (port <= port_resv_max))
			bit_or(port_resv_table[port - port_resv_min],
			       node_bitmap);
	}

	return rc;
}
/*
 * Record one job step's reserved ports in the local reservation table,
 * rebuilding the step's resv_port_array from its resv_ports string when
 * needed. On failure the problem is logged and the step's resv_ports
 * string is released.
 */
static void _make_step_resv(step_record_t *step_ptr)
{
	switch (_make_resv(step_ptr->step_node_bitmap, step_ptr->resv_ports,
			   &step_ptr->resv_port_cnt,
			   &step_ptr->resv_port_array)) {
	case SLURM_SUCCESS:
		return;
	case ESLURM_PORTS_INVALID:
		error("%pS has invalid reserved ports: %s",
		      step_ptr, step_ptr->resv_ports);
		break;
	default:
		error("Problem recovering resv_port_array for %pS: %s",
		      step_ptr, step_ptr->resv_ports);
		break;
	}

	/* The port string could not be used; drop it */
	xfree(step_ptr->resv_ports);
}
/*
 * Record the reserved ports of one running, stepmgr-enabled job in the
 * local reservation table, rebuilding the job's resv_port_array from its
 * resv_ports string when needed. Jobs that are not running or do not have
 * STEPMGR_ENABLED set are ignored. On failure the problem is logged and the
 * job's resv_ports string is released.
 */
static void _make_job_resv(job_record_t *job_ptr)
{
	int rc;

	if (!(IS_JOB_RUNNING(job_ptr) &&
	      (job_ptr->bit_flags & STEPMGR_ENABLED)))
		return;

	rc = _make_resv(job_ptr->node_bitmap, job_ptr->resv_ports,
			&job_ptr->resv_port_cnt, &job_ptr->resv_port_array);
	if (rc != SLURM_SUCCESS) {
		if (rc == ESLURM_PORTS_INVALID) {
			error("%pJ has invalid reserved ports: %s",
			      job_ptr, job_ptr->resv_ports);
		} else {
			error("Problem recovering resv_port_array for %pJ: %s",
			      job_ptr, job_ptr->resv_ports);
		}
		/* The port string could not be used; drop it */
		xfree(job_ptr->resv_ports);
	}
}
/*
 * Walk every job in job_list and load its port reservations into the local
 * reservation table: first the job-level reservation (see _make_job_resv()),
 * then the reservation of each of its steps whose state is at least
 * JOB_RUNNING.
 */
static void _make_all_resv(list_t *job_list)
{
	list_itr_t *job_itr, *step_itr;
	job_record_t *job;
	step_record_t *step;

	job_itr = list_iterator_create(job_list);
	for (job = list_next(job_itr); job; job = list_next(job_itr)) {
		_make_job_resv(job);

		step_itr = list_iterator_create(job->step_list);
		for (step = list_next(step_itr); step;
		     step = list_next(step_itr)) {
			/* Steps below JOB_RUNNING are skipped */
			if (step->state >= JOB_RUNNING)
				_make_step_resv(step);
		}
		list_iterator_destroy(step_itr);
	}
	list_iterator_destroy(job_itr);
}
/*
 * Configure reserved ports.
 *
 * Looks for "ports=<min>-<max>" in mpi_params and (re)builds the port
 * reservation table for that range, then reloads the reservations of the
 * jobs and steps in job_list. If the range is unchanged nothing is rebuilt.
 *
 * IN mpi_params - MpiParams string; NULL, or a string without "ports=",
 *	clears the table and frees its memory
 * IN job_list - jobs whose existing reservations are reloaded into a new
 *	table
 * RET SLURM_SUCCESS, or SLURM_ERROR if the port range is malformed
 */
extern int reserve_port_config(char *mpi_params, list_t *job_list)
{
	char *tmp_e = NULL, *tmp_p = NULL;
	int i, p_min, p_max;

	if (mpi_params)
		tmp_p = strstr(mpi_params, "ports=");
	if (tmp_p == NULL) {
		if (port_resv_table) {
			info("Clearing port reservations");
			for (i = 0; i < port_resv_cnt; i++)
				FREE_NULL_BITMAP(port_resv_table[i]);
			xfree(port_resv_table);
			port_resv_cnt = 0;
			port_resv_min = port_resv_max = 0;
		}
		return SLURM_SUCCESS;
	}

	/* Step over the "ports=" keyword to reach "<min>-<max>" */
	tmp_p += strlen("ports=");
	p_min = strtol(tmp_p, &tmp_e, 10);
	if ((p_min < 1) || (tmp_e[0] != '-')) {
		info("invalid MpiParams: %s", mpi_params);
		return SLURM_ERROR;
	}
	tmp_e++;
	p_max = strtol(tmp_e, NULL, 10);
	if (p_max < p_min) {
		info("invalid MpiParams: %s", mpi_params);
		return SLURM_ERROR;
	}

	if ((p_min == port_resv_min) && (p_max == port_resv_max)) {
		_dump_resv_port_info();
		return SLURM_SUCCESS;	/* No change */
	}

	/*
	 * Release the bitmaps of the previous table before dropping it.
	 * This must happen while port_resv_cnt still holds the old size;
	 * freeing only the pointer array would leak every bitmap.
	 */
	if (port_resv_table) {
		for (i = 0; i < port_resv_cnt; i++)
			FREE_NULL_BITMAP(port_resv_table[i]);
		xfree(port_resv_table);
	}

	port_resv_min = p_min;
	port_resv_max = p_max;
	port_resv_cnt = p_max - p_min + 1;
	debug("Ports available for reservation %u-%u",
	      port_resv_min, port_resv_max);

	port_resv_table = xmalloc(sizeof(bitstr_t *) * port_resv_cnt);
	for (i = 0; i < port_resv_cnt; i++)
		port_resv_table[i] = bit_alloc(node_record_count);

	/* Re-populate the fresh table from the existing jobs and steps */
	_make_all_resv(job_list);
	_dump_resv_port_info();
	return SLURM_SUCCESS;
}
/*
 * Initialize the port reservation table from a job's own reserved ports.
 *
 * The table spans the first through last entry of the job's
 * resv_port_array; only slots whose port number appears in that array get
 * a bitmap (sized like job_ptr->node_bitmap), the others stay NULL.
 *
 * IN job_ptr - job whose resv_ports/resv_port_array define the table. If
 *	resv_ports is NULL the table is cleared and its memory freed. If
 *	resv_port_array is NULL it is rebuilt from resv_ports.
 * RET SLURM_SUCCESS, or SLURM_ERROR if the port array can not be rebuilt
 *	(in which case job_ptr->resv_ports is released)
 */
extern int reserve_port_stepmgr_init(job_record_t *job_ptr)
{
	int p_min, p_max;
	int i, j = 0;
	int rc;

	if (job_ptr->resv_ports == NULL) {
		if (port_resv_table) {
			info("Clearing port reservations");
			for (i = 0; i < port_resv_cnt; i++)
				FREE_NULL_BITMAP(port_resv_table[i]);
			xfree(port_resv_table);
			port_resv_cnt = 0;
			port_resv_min = port_resv_max = 0;
		}
		return SLURM_SUCCESS;
	}

	if (!job_ptr->resv_port_array &&
	    (rc = _rebuild_port_array(job_ptr->resv_ports,
				      &job_ptr->resv_port_cnt,
				      &job_ptr->resv_port_array))) {
		if (rc == ESLURM_PORTS_INVALID)
			error("%pJ has invalid reserved ports: %s",
			      job_ptr, job_ptr->resv_ports);
		else
			error("Problem recovering resv_port_array for %pJ: %s",
			      job_ptr, job_ptr->resv_ports);
		xfree(job_ptr->resv_ports);
		return SLURM_ERROR;
	}

	/* First and last array entries are taken as the range bounds */
	p_min = job_ptr->resv_port_array[0];
	p_max = job_ptr->resv_port_array[job_ptr->resv_port_cnt - 1];
	if ((p_min == port_resv_min) && (p_max == port_resv_max)) {
		_dump_resv_port_info();
		return SLURM_SUCCESS;	/* No change */
	}

	/*
	 * Release the bitmaps of the previous table before dropping it.
	 * This must happen while port_resv_cnt still holds the old size;
	 * freeing only the pointer array would leak every bitmap.
	 */
	if (port_resv_table) {
		for (i = 0; i < port_resv_cnt; i++)
			FREE_NULL_BITMAP(port_resv_table[i]);
		xfree(port_resv_table);
	}

	port_resv_min = p_min;
	port_resv_max = p_max;
	port_resv_cnt = p_max - p_min + 1;
	debug("Ports available for reservation %u-%u",
	      port_resv_min, port_resv_max);

	port_resv_table = xmalloc(sizeof(bitstr_t *) * port_resv_cnt);
	for (i = 0; i < port_resv_cnt; i++) {
		/* Ports in the range but not owned by the job stay NULL */
		if (job_ptr->resv_port_array[j] != i + port_resv_min)
			continue;
		port_resv_table[i] = bit_alloc(bit_size(job_ptr->node_bitmap));
		j++;
	}

	_dump_resv_port_info();
	return SLURM_SUCCESS;
}
/*
 * Reserve resv_port_cnt ports on the nodes in node_bitmap.
 *
 * NOTE: The search position is kept in a static variable so successive
 *	 calls continue round-robin through the full set of available
 *	 ports. This helps avoid re-using busy ports when restarting job
 *	 steps.
 *
 * IN resv_port_cnt - number of ports wanted
 * IN node_bitmap - nodes on which each port must be free
 * OUT resv_ports - bracket-less ranged string of the reserved ports
 *	(must point to NULL on entry)
 * OUT resv_port_array - reserved port numbers (must point to NULL on entry)
 * OUT port_inx - number of free ports found; not set when
 *	ESLURM_PORTS_INVALID is returned
 * RET SLURM_SUCCESS, ESLURM_PORTS_INVALID if more ports are requested than
 *	exist, or ESLURM_PORTS_BUSY if too few ports are free
 */
static int _resv_port_alloc(uint16_t resv_port_cnt,
			    bitstr_t *node_bitmap,
			    char **resv_ports,
			    int **resv_port_array,
			    int *port_inx)
{
	static int last_port_alloc = 0;
	int *picked = NULL;
	hostlist_t *port_hl;
	char port_name[16];
	int scan, n;

	xassert(!*resv_ports);
	xassert(!*resv_port_array);

	if (resv_port_cnt > port_resv_cnt)
		return ESLURM_PORTS_INVALID;

	/* Collect table slots whose port is free on all requested nodes */
	picked = xmalloc(sizeof(int) * resv_port_cnt);
	*port_inx = 0;
	for (scan = 0; scan < port_resv_cnt; scan++) {
		/* Advance the round-robin cursor, wrapping at the end */
		last_port_alloc++;
		if (last_port_alloc >= port_resv_cnt)
			last_port_alloc = 0;

		if (port_resv_table[last_port_alloc] &&
		    !bit_overlap_any(node_bitmap,
				     port_resv_table[last_port_alloc])) {
			picked[*port_inx] = last_port_alloc;
			(*port_inx)++;
			if (*port_inx >= resv_port_cnt)
				break;
		}
	}

	if (*port_inx < resv_port_cnt) {
		xfree(picked);
		return ESLURM_PORTS_BUSY;
	}

	/* Mark the chosen ports as reserved and convert slots to ports */
	port_hl = hostlist_create(NULL);
	for (n = 0; n < *port_inx; n++) {
		bit_or(port_resv_table[picked[n]], node_bitmap);
		picked[n] += port_resv_min;
		snprintf(port_name, sizeof(port_name), "%d", picked[n]);
		hostlist_push_host(port_hl, port_name);
	}
	hostlist_sort(port_hl);

	/* get the ranged string with no brackets on it */
	*resv_ports = hostlist_ranged_string_xmalloc_dims(port_hl, 1, 0);
	hostlist_destroy(port_hl);

	*resv_port_array = picked;
	return SLURM_SUCCESS;
}
/*
 * Reserve ports for a job step.
 *
 * Reserves step_ptr->resv_port_cnt ports on the step's nodes and stores
 * the result in step_ptr->resv_ports and step_ptr->resv_port_array.
 *
 * IN/OUT step_ptr - step needing ports
 * RET SLURM_SUCCESS, ESLURM_PORTS_INVALID or ESLURM_PORTS_BUSY
 */
extern int resv_port_step_alloc(step_record_t *step_ptr)
{
	int rc;
	int port_inx;

	if (step_ptr->resv_port_array || step_ptr->resv_ports) {
		/*
		 * Both resv_ports and resv_port_array need to be NULL.
		 * If they are not that could lead to resv_ports never being
		 * freed on nodes, eventually making those nodes unable to
		 * schedule jobs since their ports could have been allocated
		 * without being freed. By setting resv_ports and
		 * resv_port_array to NULL in job_array_split() guarantees that,
		 * but try to catch this issue if it happens in future.
		 */
		error("%pS allocated reserved ports while it already had reserved ports %s",
		      step_ptr, step_ptr->resv_ports);

		/*
		 * We can't just call _resv_port_free() because it is not
		 * guaranteed that the node_bitmap or resv_port_cnt is the same
		 * from when resv_port_array was allocated.
		 */
		xfree(step_ptr->resv_port_array);
		xfree(step_ptr->resv_ports);
	}

	rc = _resv_port_alloc(step_ptr->resv_port_cnt,
			      step_ptr->step_node_bitmap, &step_ptr->resv_ports,
			      &step_ptr->resv_port_array, &port_inx);

	if (rc == ESLURM_PORTS_INVALID) {
		info("%pS needs %u reserved ports, but only %d exist",
		     step_ptr, step_ptr->resv_port_cnt, port_resv_cnt);
	} else if (rc == ESLURM_PORTS_BUSY) {
		info("insufficient ports for %pS to reserve (%d of %u)",
		     step_ptr, port_inx, step_ptr->resv_port_cnt);
	} else if (rc == SLURM_SUCCESS) {
		/*
		 * Only report a reservation when one was made; on failure
		 * resv_ports is NULL and nothing was reserved.
		 */
		debug("reserved ports %s for %pS",
		      step_ptr->resv_ports, step_ptr);
	}

	return rc;
}
/*
 * Reserve ports for a job allocation.
 *
 * Reserves job_ptr->resv_port_cnt ports on the job's nodes and stores the
 * result in job_ptr->resv_ports and job_ptr->resv_port_array.
 *
 * IN/OUT job_ptr - job needing ports
 * RET SLURM_SUCCESS, ESLURM_PORTS_INVALID or ESLURM_PORTS_BUSY
 */
extern int resv_port_job_alloc(job_record_t *job_ptr)
{
	int rc;
	int port_inx;

	if (job_ptr->resv_port_array || job_ptr->resv_ports) {
		/*
		 * Both resv_ports and resv_port_array need to be NULL.
		 * If they are not that could lead to resv_ports never being
		 * freed on nodes, eventually making those nodes unable to
		 * schedule jobs since their ports could have been allocated
		 * without being freed. By setting resv_ports and
		 * resv_port_array to NULL in job_array_split() guarantees that,
		 * but try to catch this issue if it happens in future.
		 */
		error("%pJ allocated reserved ports while it already had reserved ports %s. Ports may be lost, which will require a restart of the slurmctld daemon to resolve.",
		      job_ptr, job_ptr->resv_ports);

		/*
		 * We can't just call _resv_port_free() because it is not
		 * guaranteed that the node_bitmap or resv_port_cnt is the same
		 * from when resv_port_array was allocated. A restart of the
		 * controller will restore any lost ports.
		 */
		xfree(job_ptr->resv_port_array);
		xfree(job_ptr->resv_ports);
	}

	rc = _resv_port_alloc(job_ptr->resv_port_cnt,
			      job_ptr->node_bitmap, &job_ptr->resv_ports,
			      &job_ptr->resv_port_array, &port_inx);

	if (rc == ESLURM_PORTS_INVALID) {
		info("%pJ needs %u reserved ports, but only %d exist",
		     job_ptr, job_ptr->resv_port_cnt, port_resv_cnt);
	} else if (rc == ESLURM_PORTS_BUSY) {
		info("insufficient ports for %pJ to reserve (%d of %u)",
		     job_ptr, port_inx, job_ptr->resv_port_cnt);
	} else if (rc == SLURM_SUCCESS) {
		/*
		 * Only report a reservation when one was made; on failure
		 * resv_ports is NULL and nothing was reserved.
		 */
		debug("reserved ports %s for %pJ",
		      job_ptr->resv_ports, job_ptr);
	}

	return rc;
}
/*
 * Validate a job's requested reserved port count.
 *
 * A non-zero count is rejected when the job does not have STEPMGR_ENABLED
 * set and "enable_stepmgr" is absent from slurm_conf.slurmctld_params.
 * Any count larger than the number of reservable ports is also rejected.
 *
 * IN job_ptr - job to check
 * RET SLURM_SUCCESS or ESLURM_PORTS_INVALID
 */
extern int resv_port_check_job_request_cnt(job_record_t *job_ptr)
{
	/* Ports for the allocation itself require step management */
	if (job_ptr->resv_port_cnt &&
	    !((job_ptr->bit_flags & STEPMGR_ENABLED) ||
	      xstrstr(slurm_conf.slurmctld_params, "enable_stepmgr"))) {
		error("%pJ requested a reserve port count for the allocation but slurmstepd step management isn't be enabled.",
		      job_ptr);
		return ESLURM_PORTS_INVALID;
	}

	if (job_ptr->resv_port_cnt <= port_resv_cnt)
		return SLURM_SUCCESS;

	info("%pJ needs %u reserved ports, but only %d exist",
	     job_ptr, job_ptr->resv_port_cnt, port_resv_cnt);
	return ESLURM_PORTS_INVALID;
}
/*
 * Get the number of ports in the reservation table.
 * RET port_resv_cnt (zero when no table is configured)
 */
extern int resv_port_get_resv_port_cnt(void)
{
	return port_resv_cnt;
}
/*
 * Release reserved ports: clear the bits of node_bitmap from the table
 * slot of every port in resv_port_array.
 *
 * IN resv_port_cnt - number of entries in resv_port_array
 * IN resv_port_array - port numbers to release; no-op if NULL
 * IN node_bitmap - nodes on which the ports were reserved
 */
static void _resv_port_free(uint16_t resv_port_cnt,
			    int *resv_port_array,
			    bitstr_t *node_bitmap)
{
	int i, j;

	if (resv_port_array == NULL)
		return;

	for (i = 0; i < resv_port_cnt; i++) {
		/* Ports outside the configured range have no table slot */
		if ((resv_port_array[i] < port_resv_min) ||
		    (resv_port_array[i] > port_resv_max))
			continue;
		j = resv_port_array[i] - port_resv_min;
		/*
		 * Check the port's own slot (index j), not the array
		 * position i: slots may be NULL when the table was built by
		 * reserve_port_stepmgr_init().
		 */
		if (!port_resv_table[j])
			continue;
		bit_and_not(port_resv_table[j], node_bitmap);
	}
}
/*
 * Release reserved ports for a job step.
 * Clears the step's ports from the reservation table and frees its
 * resv_port_array; the resv_ports string is left untouched. No-op when the
 * step has no resv_port_array.
 */
extern void resv_port_step_free(step_record_t *step_ptr)
{
	if (step_ptr->resv_port_array) {
		_resv_port_free(step_ptr->resv_port_cnt,
				step_ptr->resv_port_array,
				step_ptr->step_node_bitmap);
		xfree(step_ptr->resv_port_array);

		debug2("freed ports %s for %pS",
		       step_ptr->resv_ports, step_ptr);
	}
}
/*
 * Release reserved ports for a job.
 * Clears the job's ports from the reservation table and frees its
 * resv_port_array; the resv_ports string is left untouched. No-op when the
 * job has no resv_port_array.
 */
extern void resv_port_job_free(job_record_t *job_ptr)
{
	if (job_ptr->resv_port_array) {
		_resv_port_free(job_ptr->resv_port_cnt,
				job_ptr->resv_port_array,
				job_ptr->node_bitmap);
		xfree(job_ptr->resv_port_array);

		debug2("freed ports %s for %pJ",
		       job_ptr->resv_ports, job_ptr);
	}
}