blob: 257cf59b622bcdb9759654833f39bc8ed411ce5a [file] [log] [blame]
/*****************************************************************************\
* delayed.c - definitions for delayed work in connection manager
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <stdlib.h>
#include <time.h>
#include "src/common/macros.h"
#include "src/common/read_config.h"
#include "src/common/slurm_time.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/conmgr/conmgr.h"
#include "src/conmgr/delayed.h"
#include "src/conmgr/mgr.h"
/* Size of the stack buffers this file hands to timespec_ctime() */
#define CTIME_STR_LEN 72
/*
 * Arguments passed as the key to _inspect_work() while _inspect() walks
 * mgr.delayed_work.
 */
typedef struct {
#define MAGIC_FOREACH_DELAYED_WORK 0xB233443A
int magic; /* MAGIC_FOREACH_DELAYED_WORK */
/*
 * Work selected by _inspect_work() by comparing control.time_begin values.
 * Left NULL by the initializer in _inspect() and later handed to
 * _update_timer().
 */
work_t *shortest;
/* timestamp taken by _inspect() via timespec_now() before the walk */
timespec_t time;
} foreach_delayed_work_t;
#define MAGIC_FOREACH_CANCEL_WORK 0xA238483A
/*
 * Arguments passed as the key to _cancel_work() by cancel_delayed_work().
 */
typedef struct {
int magic; /* MAGIC_FOREACH_CANCEL_WORK */
/* when true, _cancel_work() skips work whose ref is not set */
bool connections_only;
} foreach_cancel_work_t;
/* timer to trigger SIGALRM */
static timer_t timer = {0};
/*
 * Mutex to protect timer.
 * Kept file-local (static) like the timer it guards so that the generic name
 * cannot collide with symbols in other translation units.
 */
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
/* Forward declarations of helpers that are defined later in this file */
static int _inspect_work(void *x, void *key);
static void _update_timer(work_t *shortest, const timespec_t time);
static bool _work_clear_time_delay(work_t *work);
/*
 * Delete callback registered on mgr.delayed_work by init_delayed_work().
 *
 * Strips the time delay dependency (when still present) from the work and
 * passes the work on to handle_work().
 *
 * WARNING: caller must hold mgr.mutex
 * IN x - work to release back to normal work handling.
 *	takes ownership of pointer.
 */
static void _release_work(void *x)
{
	work_t *delayed = x;

	xassert(delayed->magic == MAGIC_WORK);

	/*
	 * Whether the dependency was actually cleared does not matter here:
	 * the work is handed over either way.
	 */
	(void) _work_clear_time_delay(delayed);

	handle_work(true, delayed);
}
/*
 * Match callback used by cancel_delayed_work() on each delayed work.
 *
 * IN x - delayed work (work_t *)
 * IN key - foreach_cancel_work_t * describing which work to cancel
 * RET 1 after marking the work as cancelled, 0 when the work was skipped
 */
static int _cancel_work(void *x, void *key)
{
	foreach_cancel_work_t *filter = key;
	work_t *candidate = x;

	xassert(candidate->magic == MAGIC_WORK);
	xassert(filter->magic == MAGIC_FOREACH_CANCEL_WORK);

	/* cancel everything, or only work that has a ref when so requested */
	if (!filter->connections_only || candidate->ref) {
		candidate->status = CONMGR_WORK_STATUS_CANCELLED;
		return 1;
	}

	return 0;
}
/*
 * Cancel delayed work
 * IN connections_only - forwarded to _cancel_work(); when true only work with
 *	a ref set is cancelled
 */
extern void cancel_delayed_work(bool connections_only)
{
	foreach_cancel_work_t filter = {
		.magic = MAGIC_FOREACH_CANCEL_WORK,
		.connections_only = connections_only,
	};

	/* nothing to do without a list or with an empty one */
	if (!mgr.delayed_work)
		return;
	if (list_is_empty(mgr.delayed_work))
		return;

	log_flag(CONMGR, "%s: cancelling%s %d delayed work",
		 __func__, (connections_only ? " connection" : ""),
		 list_count(mgr.delayed_work));

	/* run everything immediately but with cancelled status */
	(void) list_delete_all(mgr.delayed_work, _cancel_work, &filter);
}
/*
 * Run _inspect_work() over every entry of mgr.delayed_work and then update the
 * timer based on the work selected during that walk.
 *
 * NOTE: _update_delayed_work() takes mgr.mutex around this call unless its
 * caller already holds it.
 */
static void _inspect(void)
{
	foreach_delayed_work_t walk = {
		.magic = MAGIC_FOREACH_DELAYED_WORK,
		.time = timespec_now(),
	};
	/* count before the walk so the log can report triggered/total */
	const int total = list_count(mgr.delayed_work);
	const int triggered =
		list_delete_all(mgr.delayed_work, _inspect_work, &walk);

	_update_timer(walk.shortest, walk.time);

	log_flag(CONMGR, "%s: checked all timers and triggered %d/%d delayed work",
		 __func__, triggered, total);
}
/*
 * Build the timer specification for the given work
 * IN shortest - work whose control.time_begin becomes the expiration time
 * IN time - not used by this function
 * RET timer specification with it_value set and it_interval left zeroed
 */
static struct itimerspec _calc_timer(work_t *shortest,
				     const timespec_t time)
{
	const timespec_t expires = shortest->control.time_begin;
	struct itimerspec spec = {
		.it_value = expires,
	};

	if (slurm_conf.debug_flags & DEBUG_FLAG_CONMGR) {
		char when[CTIME_STR_LEN];

		timespec_ctime(expires, true, when, sizeof(when));

		log_flag(CONMGR, "%s: setting conmgr timer for %s for %s()",
			 __func__, when, shortest->callback.func_name);
	}

	return spec;
}
/*
 * Arm the timer for the given work or disable it
 * IN shortest - work to set the timer for (see _calc_timer()) or NULL to
 *	disable the timer by passing a zeroed specification
 * IN time - forwarded to _calc_timer()
 *
 * Failure of timer_settime() is logged via error() and otherwise ignored.
 */
static void _update_timer(work_t *shortest, const timespec_t time)
{
	int rc;
	struct itimerspec spec = {{0}};

	if (shortest) {
		spec = _calc_timer(shortest, time);
	} else {
		log_flag(CONMGR, "%s: disabling conmgr timer", __func__);
	}

	slurm_mutex_lock(&mutex);
	rc = timer_settime(timer, TIMER_ABSTIME, &spec, NULL);
	/*
	 * Capture errno right away: any later call (including the unlock
	 * below) is allowed to overwrite it.
	 */
	if ((rc == -1) && errno)
		rc = errno;
	slurm_mutex_unlock(&mutex);

	if (rc)
		error("%s: timer_settime() failed: %s",
		      __func__, slurm_strerror(rc));
}
/*
 * Check begin time to see if the work delay has elapsed
 *
 * Match callback run by _inspect() on each entry of mgr.delayed_work.
 *
 * IN x - delayed work (work_t *)
 * IN key - foreach_delayed_work_t *. args->shortest is updated to the pending
 *	(not triggered) work with the earliest control.time_begin.
 * RET 1 if the delay has elapsed (work is to be triggered), 0 to keep deferring
 */
static int _inspect_work(void *x, void *key)
{
	work_t *work = x;
	const timespec_t begin = work->control.time_begin;
	foreach_delayed_work_t *args = key;
	const timespec_t now = timespec_now();
	const bool trigger = timespec_is_after(now, begin);

	xassert(args->magic == MAGIC_FOREACH_DELAYED_WORK);
	xassert(work->magic == MAGIC_WORK);

	if (slurm_conf.debug_flags & DEBUG_FLAG_CONMGR) {
		const timespec_diff_ns_t diff = timespec_diff_ns(begin, now);
		char str[CTIME_STR_LEN];

		timespec_ctime(diff.diff, false, str, sizeof(str));

		log_flag(CONMGR, "%s: %s delayed work ETA %s for %s@0x%"PRIxPTR,
			 __func__, (trigger ? "triggering" : "deferring"),
			 str, work->callback.func_name,
			 (uintptr_t) work->callback.func);
	}

	/*
	 * Triggered work leaves the delayed list and is handed over to
	 * _release_work(), which may reset its time_begin and passes ownership
	 * on. It must never be remembered as the timer candidate, otherwise
	 * _update_timer() would use work the list no longer owns (and, with a
	 * zeroed time_begin, would disable the timer while work is pending).
	 */
	if (trigger)
		return 1;

	if (!args->shortest ||
	    timespec_is_after(args->shortest->control.time_begin, begin))
		args->shortest = work;

	return 0;
}
extern timespec_t conmgr_calc_work_time_delay(
time_t delay_seconds,
long delay_nanoseconds)
{
/*
* Renormalize ns into seconds to only have partial seconds in
* nanoseconds. Nanoseconds won't matter with a larger number of
* seconds.
*/
return timespec_normalize(timespec_add((timespec_t) {
.tv_sec = delay_seconds,
.tv_nsec = delay_nanoseconds,
}, timespec_now()));
}
/*
 * Create the delayed work list and the timer that raises SIGALRM.
 *
 * timer_create() is retried for as long as it fails with EAGAIN. Any other
 * failure is fatal.
 */
extern void init_delayed_work(void)
{
	int rc;

	mgr.delayed_work = list_create(_release_work);

	do {
		struct sigevent sevp = {
			.sigev_notify = SIGEV_SIGNAL,
			.sigev_signo = SIGALRM,
			.sigev_value.sival_ptr = &timer,
		};

		slurm_mutex_lock(&mutex);
		rc = timer_create(TIMESPEC_CLOCK_TYPE, &sevp, &timer);
		/*
		 * Capture errno right away: any later call (including the
		 * unlock below) is allowed to overwrite it.
		 */
		if ((rc == -1) && errno)
			rc = errno;
		slurm_mutex_unlock(&mutex);
	} while (rc == EAGAIN);

	if (rc)
		fatal("%s: timer_create() failed: %s",
		      __func__, slurm_strerror(rc));
}
/*
 * Release the delayed work list and delete the timer.
 *
 * Does nothing when mgr.delayed_work is not set. Failure of timer_delete()
 * is fatal.
 */
extern void free_delayed_work(void)
{
	int rc, saved_errno;

	if (!mgr.delayed_work)
		return;

	FREE_NULL_LIST(mgr.delayed_work);

	slurm_mutex_lock(&mutex);
	rc = timer_delete(timer);
	/* keep errno from timer_delete() safe from the unlock below */
	saved_errno = errno;
	slurm_mutex_unlock(&mutex);

	if (rc) {
		/* %m formats errno: make sure it is the one from timer_delete() */
		errno = saved_errno;
		fatal("%s: timer_delete() failed: %m", __func__);
	}
}
/*
 * Run _inspect() while holding mgr.mutex
 * IN locked - true if the caller already holds mgr.mutex, false to have this
 *	function take and release it around the call
 */
static void _update_delayed_work(bool locked)
{
	if (locked) {
		_inspect();
		return;
	}

	slurm_mutex_lock(&mgr.mutex);
	_inspect();
	slurm_mutex_unlock(&mgr.mutex);
}
/*
 * Work callback for SIGALRM: re-check all delayed work
 * IN conmgr_args - callback arguments; nothing is done when the status is
 *	CONMGR_WORK_STATUS_CANCELLED
 * IN arg - not used
 */
extern void on_signal_alarm(conmgr_callback_args_t conmgr_args, void *arg)
{
	if (conmgr_args.status != CONMGR_WORK_STATUS_CANCELLED) {
		log_flag(CONMGR, "%s: caught SIGALRM", __func__);

		/* mgr.mutex is not held here: let the helper take it */
		_update_delayed_work(false);
	}
}
/*
 * Clear time delay dependency from work
 *
 * Only pending work that still carries CONMGR_WORK_DEP_TIME_DELAY is changed.
 *
 * IN work - work to remove CONMGR_WORK_DEP_TIME_DELAY flag
 * NOTE: caller must call update_timer() after to cause work to requeue
 * NOTE: caller must hold mgr.mutex lock
 * RET True if time delay removed
 */
static bool _work_clear_time_delay(work_t *work)
{
	xassert(work->magic == MAGIC_WORK);

	if ((work->status != CONMGR_WORK_STATUS_PENDING) ||
	    !(work->control.depend_type & CONMGR_WORK_DEP_TIME_DELAY))
		return false;

#ifndef NDEBUG
	/* builds without NDEBUG also reset the begin time */
	work->control.time_begin = (timespec_t) {0};
#endif /* !NDEBUG */

	work_mask_depend(work, ~CONMGR_WORK_DEP_TIME_DELAY);

	return true;
}
/*
 * Append work to mgr.delayed_work and re-inspect all delayed work
 * IN work - work to append to the delayed work list
 * NOTE: _update_delayed_work() is told that mgr.mutex is already held
 */
extern void add_work_delayed(work_t *work)
{
	const bool locked = true;

	list_append(mgr.delayed_work, work);

	_update_delayed_work(locked);
}
/*
 * Describe the time delay dependency of work
 * IN work - work to describe
 * RET string built with xstrfmtcat() holding the begin time, or NULL if work
 *	does not have CONMGR_WORK_DEP_TIME_DELAY set
 */
extern char *work_delayed_to_str(work_t *work)
{
	char *delay = NULL;

	if (work->control.depend_type & CONMGR_WORK_DEP_TIME_DELAY) {
		char str[CTIME_STR_LEN];

		timespec_ctime(work->control.time_begin, true, str,
			       sizeof(str));
		xstrfmtcat(delay, " time_begin=%s", str);
	}

	return delay;
}