/*****************************************************************************\
 *  port_mgr.c - manage the reservation of I/O ports on the nodes.
 *	Design for use with OpenMPI.
 *****************************************************************************
 *  Copyright (C) 2009 Lawrence Livermore National Security.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include <stdlib.h>
#include <string.h>

#include "src/common/bitstring.h"
#include "src/common/hostlist.h"
#include "src/common/job_record.h"
#include "src/common/node_conf.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"

#define  _DEBUG 0

bitstr_t **port_resv_table = (bitstr_t **) NULL;
int        port_resv_cnt   = 0;
int        port_resv_min   = 0;
int        port_resv_max   = 0;

static void _dump_resv_port_info(void);
static void _make_all_resv(list_t *job_list);
static void _make_step_resv(step_record_t *step_ptr);
static int _rebuild_port_array(const char *resv_ports,
			       uint16_t *resv_port_cnt,
			       int **resv_port_array);

static void _dump_resv_port_info(void)
{
#if _DEBUG
	int i;
	char *tmp_char;

	for (i=0; i<port_resv_cnt; i++) {
		if (!port_resv_table[i] ||
		    bit_set_count(port_resv_table[i]) == 0)
			continue;

		tmp_char = bitmap2node_name(port_resv_table[i]);
		info("Port %d: %s", (i+port_resv_min), tmp_char);
		xfree(tmp_char);
	}
#endif
}

/* Builds the resv_port_array based upon resv_ports (a string) */
static int _rebuild_port_array(const char *resv_ports,
			       uint16_t *resv_port_cnt,
			       int **resv_port_array)
{
	int i;
	char *tmp_char;
	hostlist_t *hl;

	tmp_char = xstrdup_printf("[%s]", resv_ports);
	hl = hostlist_create(tmp_char);
	xfree(tmp_char);
	if (!hl)
		return SLURM_ERROR;

	*resv_port_array = xcalloc(*resv_port_cnt, *resv_port_cnt);
	*resv_port_cnt = 0;
	while ((tmp_char = hostlist_shift(hl))) {
		i = atoi(tmp_char);
		if (i > 0)
			(*resv_port_array)[(*resv_port_cnt)++]=i;
		free(tmp_char);
	}
	hostlist_destroy(hl);
	if (*resv_port_cnt == 0)
		return ESLURM_PORTS_INVALID;

	return SLURM_SUCCESS;
}

/*
 * Update the local reservation table
 * Builds resv_port_array if NULL based upon resv_ports (a string)
 */
static int _make_resv(bitstr_t *node_bitmap,
		      const char *resv_ports,
		      uint16_t *resv_port_cnt,
		      int **resv_port_array)
{
	int i, j;
	int rc = SLURM_SUCCESS;

	if ((*resv_port_cnt == 0) ||
	    (resv_ports == NULL) ||
	    (resv_ports[0] == '\0'))
		return rc;

	if ((*resv_port_array == NULL) &&
	     (rc = _rebuild_port_array(resv_ports, resv_port_cnt,
				       resv_port_array)))
			return rc;

	for (i=0; i < *resv_port_cnt; i++) {
		if (((*resv_port_array)[i] < port_resv_min) ||
		    ((*resv_port_array)[i] > port_resv_max))
			continue;
		j = (*resv_port_array)[i] - port_resv_min;
		bit_or(port_resv_table[j], node_bitmap);
	}

	return rc;
}

/* Update the local reservation table for one job step.
 * Builds the job step's resv_port_array based upon resv_ports (a string) */
static void _make_step_resv(step_record_t *step_ptr)
{
	int rc = _make_resv(step_ptr->step_node_bitmap, step_ptr->resv_ports,
			    &step_ptr->resv_port_cnt,
			    &step_ptr->resv_port_array);
	if (rc == SLURM_SUCCESS)
		return;

	if (rc == ESLURM_PORTS_INVALID)
		error("%pS has invalid reserved ports: %s",
		      step_ptr, step_ptr->resv_ports);
	else
		error("Problem recovering resv_port_array for %pS: %s",
		      step_ptr, step_ptr->resv_ports);
	xfree(step_ptr->resv_ports);
	return;
}

/* Update the local reservation table for one stepmgr enabled job
 * Builds the job resv_port_array based upon resv_ports (a string) */
static void _make_job_resv(job_record_t *job_ptr)
{
	int rc;

	if (!IS_JOB_RUNNING(job_ptr) ||
	    !(job_ptr->bit_flags & STEPMGR_ENABLED))
		return;
	rc = _make_resv(job_ptr->node_bitmap, job_ptr->resv_ports,
			&job_ptr->resv_port_cnt, &job_ptr->resv_port_array);
	if (rc == SLURM_SUCCESS)
		return;

	if (rc == ESLURM_PORTS_INVALID)
		error("%pJ has invalid reserved ports: %s",
		      job_ptr, job_ptr->resv_ports);
	else
		error("Problem recovering resv_port_array for %pJ: %s",
		      job_ptr, job_ptr->resv_ports);
	xfree(job_ptr->resv_ports);
	return;
}

/* Identify every job step with a port reservation and put the
 * reservation into the local reservation table. */
static void _make_all_resv(list_t *job_list)
{
	job_record_t *job_ptr;
	step_record_t *step_ptr;
	list_itr_t *job_iterator, *step_iterator;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = list_next(job_iterator))) {
		_make_job_resv(job_ptr);

		step_iterator = list_iterator_create(job_ptr->step_list);
		while ((step_ptr = list_next(step_iterator))) {
			if (step_ptr->state < JOB_RUNNING)
				continue;
			_make_step_resv(step_ptr);
		}
		list_iterator_destroy(step_iterator);
	}
	list_iterator_destroy(job_iterator);
}

/* Configure reserved ports.
 * Call with mpi_params==NULL to free memory */
extern int reserve_port_config(char *mpi_params, list_t *job_list)
{
	char *tmp_e=NULL, *tmp_p=NULL;
	int i, p_min, p_max;

	if (mpi_params)
		tmp_p = strstr(mpi_params, "ports=");
	if (tmp_p == NULL) {
		if (port_resv_table) {
			info("Clearing port reservations");
			for (i=0; i<port_resv_cnt; i++)
				FREE_NULL_BITMAP(port_resv_table[i]);
			xfree(port_resv_table);
			port_resv_cnt = 0;
			port_resv_min = port_resv_max = 0;
		}
		return SLURM_SUCCESS;
	}

	tmp_p += 6;
	p_min = strtol(tmp_p, &tmp_e, 10);
	if ((p_min < 1) || (tmp_e[0] != '-')) {
		info("invalid MpiParams: %s", mpi_params);
		return SLURM_ERROR;
	}
	tmp_e++;
	p_max = strtol(tmp_e, NULL, 10);
	if (p_max < p_min) {
		info("invalid MpiParams: %s", mpi_params);
		return SLURM_ERROR;
	}

	if ((p_min == port_resv_min) && (p_max == port_resv_max)) {
		_dump_resv_port_info();
		return SLURM_SUCCESS;	/* No change */
	}

	port_resv_min = p_min;
	port_resv_max = p_max;
	port_resv_cnt = p_max - p_min + 1;
	debug("Ports available for reservation %u-%u",
	      port_resv_min, port_resv_max);

	xfree(port_resv_table);
	port_resv_table = xmalloc(sizeof(bitstr_t *) * port_resv_cnt);
	for (i=0; i<port_resv_cnt; i++)
		port_resv_table[i] = bit_alloc(node_record_count);

	_make_all_resv(job_list);
	_dump_resv_port_info();
	return SLURM_SUCCESS;
}

extern int reserve_port_stepmgr_init(job_record_t *job_ptr)
{
	int p_min, p_max;
	int i, j = 0;
	int rc;

	if (job_ptr->resv_ports == NULL) {
		if (port_resv_table) {
			info("Clearing port reservations");
			for (i = 0; i < port_resv_cnt; i++)
				FREE_NULL_BITMAP(port_resv_table[i]);
			xfree(port_resv_table);
			port_resv_cnt = 0;
			port_resv_min = port_resv_max = 0;
		}
		return SLURM_SUCCESS;
	}

	if (!job_ptr->resv_port_array &&
	    (rc = _rebuild_port_array(job_ptr->resv_ports,
				      &job_ptr->resv_port_cnt,
				      &job_ptr->resv_port_array))) {
		if (rc == ESLURM_PORTS_INVALID)
			error("%pJ has invalid reserved ports: %s",
				job_ptr, job_ptr->resv_ports);
		else
			error("Problem recovering resv_port_array for %pJ: %s",
				job_ptr, job_ptr->resv_ports);

		xfree(job_ptr->resv_ports);
		return SLURM_ERROR;
	}

	p_min = job_ptr->resv_port_array[0];
	p_max = job_ptr->resv_port_array[job_ptr->resv_port_cnt - 1];

	if ((p_min == port_resv_min) && (p_max == port_resv_max)) {
		_dump_resv_port_info();
		return SLURM_SUCCESS;	/* No change */
	}

	port_resv_min = p_min;
	port_resv_max = p_max;
	port_resv_cnt = p_max - p_min + 1;
	debug("Ports available for reservation %u-%u",
	      port_resv_min, port_resv_max);

	xfree(port_resv_table);
	port_resv_table = xmalloc(sizeof(bitstr_t *) * port_resv_cnt);
	for (i=0; i<port_resv_cnt; i++) {
		if (job_ptr->resv_port_array[j] != i + port_resv_min)
			continue;
		port_resv_table[i] = bit_alloc(bit_size(job_ptr->node_bitmap));
		j++;
	}

	_dump_resv_port_info();
	return SLURM_SUCCESS;
}

/* Reserve ports for a job step
 * NOTE: We keep track of last port reserved and go round-robin through full
 *       set of available ports. This helps avoid re-using busy ports when
 *       restarting job steps.
 * RET SLURM_SUCCESS or an error code */
static int _resv_port_alloc(uint16_t resv_port_cnt,
			    bitstr_t *node_bitmap,
			    char **resv_ports,
			    int **resv_port_array,
			    int *port_inx)
{
	int i;
	int *port_array = NULL;
	char port_str[16];
	hostlist_t *hl;
	static int last_port_alloc = 0;

	xassert(!*resv_ports);
	xassert(!*resv_port_array);

	if (resv_port_cnt > port_resv_cnt)
		return ESLURM_PORTS_INVALID;

	/* Identify available ports */
	port_array = xmalloc(sizeof(int) * resv_port_cnt);
	*port_inx = 0;
	for (i=0; i<port_resv_cnt; i++) {
		if (++last_port_alloc >= port_resv_cnt)
			last_port_alloc = 0;
		if (!port_resv_table[last_port_alloc] ||
		    bit_overlap_any(node_bitmap,
				    port_resv_table[last_port_alloc]))
			continue;
		port_array[(*port_inx)++] = last_port_alloc;
		if (*port_inx >= resv_port_cnt)
			break;
	}
	if (*port_inx < resv_port_cnt) {
		xfree(port_array);
		return ESLURM_PORTS_BUSY;
	}

	/* Reserve selected ports */
	hl = hostlist_create(NULL);
	for (i=0; i < *port_inx; i++) {
		bit_or(port_resv_table[port_array[i]], node_bitmap);
		port_array[i] += port_resv_min;
		snprintf(port_str, sizeof(port_str), "%d", port_array[i]);
		hostlist_push_host(hl, port_str);
	}
	hostlist_sort(hl);
	/* get the ranged string with no brackets on it */
	*resv_ports = hostlist_ranged_string_xmalloc_dims(hl, 1, 0);
	hostlist_destroy(hl);
	*resv_port_array = port_array;

	return SLURM_SUCCESS;
}

extern int resv_port_step_alloc(step_record_t *step_ptr)
{
	int rc;
	int port_inx;

	if (step_ptr->resv_port_array || step_ptr->resv_ports) {
		/*
		 * Both resv_ports and resv_port_array need to be NULL.
		 * If they are not that could lead to resv_ports never being
		 * freed on nodes, eventually making those nodes unable to
		 * schedule jobs since their ports could have been allocated
		 * without being freed. By setting resv_ports and
		 * resv_port_array to NULL in job_array_split() guarantees that,
		 * but try to catch this issue if it happens in future.
		 */
		error("%pS allocated reserved ports while it already had reserved ports %s",
		       step_ptr, step_ptr->resv_ports);

		/*
		 * We can't just call _resv_port_free() because it is not
		 * guaranteed that the node_bitmap or resv_port_cnt is the same
		 * from when resv_port_array was allocated.
		 */
		xfree(step_ptr->resv_port_array);
		xfree(step_ptr->resv_ports);
	}

	rc = _resv_port_alloc(step_ptr->resv_port_cnt,
			      step_ptr->step_node_bitmap, &step_ptr->resv_ports,
			      &step_ptr->resv_port_array, &port_inx);
	if (rc == ESLURM_PORTS_INVALID)
		info("%pS needs %u reserved ports, but only %d exist",
		     step_ptr, step_ptr->resv_port_cnt, port_resv_cnt);
	else if (rc == ESLURM_PORTS_BUSY)
		info("insufficient ports for %pS to reserve (%d of %u)",
		     step_ptr, port_inx, step_ptr->resv_port_cnt);

	debug("reserved ports %s for %pS", step_ptr->resv_ports, step_ptr);
	return rc;
}

extern int resv_port_job_alloc(job_record_t *job_ptr)
{
	int rc;
	int port_inx;

	if (job_ptr->resv_port_array || job_ptr->resv_ports) {
		/*
		 * Both resv_ports and resv_port_array need to be NULL.
		 * If they are not that could lead to resv_ports never being
		 * freed on nodes, eventually making those nodes unable to
		 * schedule jobs since their ports could have been allocated
		 * without being freed. By setting resv_ports and
		 * resv_port_array to NULL in job_array_split() guarantees that,
		 * but try to catch this issue if it happens in future.
		 */
		error("%pJ allocated reserved ports while it already had reserved ports %s. Ports may be lost, which will require a restart of the slurmctld daemon to resolve.",
		       job_ptr, job_ptr->resv_ports);

		/*
		 * We can't just call _resv_port_free() because it is not
		 * guaranteed that the node_bitmap or resv_port_cnt is the same
		 * from when resv_port_array was allocated. A restart of the
		 * controller will restore any lost ports.
		 */
		xfree(job_ptr->resv_port_array);
		xfree(job_ptr->resv_ports);
	}

	rc = _resv_port_alloc(job_ptr->resv_port_cnt,
			      job_ptr->node_bitmap, &job_ptr->resv_ports,
			      &job_ptr->resv_port_array, &port_inx);
	if (rc == ESLURM_PORTS_INVALID)
		info("%pJ needs %u reserved ports, but only %d exist",
		     job_ptr, job_ptr->resv_port_cnt, port_resv_cnt);
	else if (rc == ESLURM_PORTS_BUSY)
		info("insufficient ports for %pJ to reserve (%d of %u)",
		     job_ptr, port_inx, job_ptr->resv_port_cnt);

	debug("reserved ports %s for %pJ", job_ptr->resv_ports, job_ptr);
	return rc;
}

extern int resv_port_check_job_request_cnt(job_record_t *job_ptr)
{
	if (job_ptr->resv_port_cnt &&
	    !(job_ptr->bit_flags & STEPMGR_ENABLED) &&
	    !xstrstr(slurm_conf.slurmctld_params, "enable_stepmgr")) {
		error("%pJ requested a reserve port count for the allocation but slurmstepd step management isn't be enabled.",
		      job_ptr);
		return ESLURM_PORTS_INVALID;
	}

	if (job_ptr->resv_port_cnt > port_resv_cnt) {
		info("%pJ needs %u reserved ports, but only %d exist",
		     job_ptr, job_ptr->resv_port_cnt, port_resv_cnt);
		return ESLURM_PORTS_INVALID;
	}
	return SLURM_SUCCESS;
}

extern int resv_port_get_resv_port_cnt()
{
	return port_resv_cnt;
}

/*
 * Release reserved ports
 * RET SLURM_SUCCESS or an error code
 */
static void _resv_port_free(uint16_t resv_port_cnt,
			    int *resv_port_array,
			    bitstr_t *node_bitmap)
{
	int i, j;

	if (resv_port_array == NULL)
		return;

	for (i=0; i<resv_port_cnt; i++) {
		if ((resv_port_array[i] < port_resv_min) ||
		    (resv_port_array[i] > port_resv_max))
			continue;
		j = resv_port_array[i] - port_resv_min;
		if (!port_resv_table[i])
			continue;
		bit_and_not(port_resv_table[j], node_bitmap);

	}
}

/*
 * Release reserved ports for a job step
 * RET SLURM_SUCCESS or an error code
 */
extern void resv_port_step_free(step_record_t *step_ptr)
{
	if (step_ptr->resv_port_array == NULL)
		return;

	_resv_port_free(step_ptr->resv_port_cnt, step_ptr->resv_port_array,
			step_ptr->step_node_bitmap);
	xfree(step_ptr->resv_port_array);

	debug2("freed ports %s for %pS",
	       step_ptr->resv_ports, step_ptr);
}

extern void resv_port_job_free(job_record_t *job_ptr)
{
	if (job_ptr->resv_port_array == NULL)
		return;

	_resv_port_free(job_ptr->resv_port_cnt,
			job_ptr->resv_port_array,
			job_ptr->node_bitmap);
	xfree(job_ptr->resv_port_array);

	debug2("freed ports %s for %pJ",
	       job_ptr->resv_ports, job_ptr);
}
