blob: 56a452b32905d9d677f99c61a05fe72d61c29210 [file] [log] [blame]
/*****************************************************************************\
* preempt.c - Job preemption plugin function setup.
*****************************************************************************
* Copyright (C) 2009-2010 Lawrence Livermore National Security.
* Copyright (C) SchedMD LLC.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <pthread.h>
#include <signal.h>
#include "preempt.h"
#include "src/common/log.h"
#include "src/common/plugrack.h"
#include "src/common/proc_args.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/acct_policy.h"
/*
 * Preemption settings and plugin state, (re)loaded by preempt_g_init().
 */
/*
 * True when "youngest_first" appears in slurm_conf.preempt_params or
 * "preempt_youngest_first" appears in slurm_conf.sched_params; selects
 * _sort_by_youngest() instead of _sort_by_prio() for the candidate list.
 */
static bool youngest_order = false;
/*
 * Value parsed from "min_exempt_priority=" in slurm_conf.preempt_params.
 * A job whose priority is greater than this value is treated as exempt from
 * preemption. Defaults to NO_VAL.
 */
static uint32_t min_exempt_priority = NO_VAL;
/*
 * PLUGIN_NOT_INITED until preempt_g_init() runs, PLUGIN_NOOP when no
 * preempt_type is configured, PLUGIN_INITED once the plugin context exists.
 */
static plugin_init_t plugin_inited = PLUGIN_NOT_INITED;
/*
 * Table of function pointers filled in by plugin_context_create() from the
 * configured preempt plugin. The member order must match syms[].
 */
typedef struct slurm_preempt_ops {
/* Called by preempt_g_job_preempt_check() with queue records. */
bool (*job_preempt_check) (job_queue_rec_t *preemptor,
job_queue_rec_t *preemptee);
/*
 * Called by _is_job_preempt_exempt_internal().
 * NOTE(review): that call passes the preemptee first and the preemptor
 * second, the reverse of the parameter names declared here -- confirm
 * which order the plugins actually implement.
 */
bool (*preemptable) (job_record_t *preemptor,
job_record_t *preemptee);
/* Grace time for a job; logged in seconds by _job_check_grace_internal(). */
uint32_t (*get_grace_time)(job_record_t *job_ptr);
/* Preempt mode for a job; compared against PREEMPT_MODE_* values. */
uint16_t (*get_mode)(job_record_t *job_ptr);
/* Sort key used by _sort_by_prio() to order preemption candidates. */
uint32_t (*get_prio)(job_record_t *job_ptr);
} slurm_preempt_ops_t;
/*
 * Plugin symbol names handed to plugin_context_create() together with the
 * ops table.
 * Must be synchronized with slurm_preempt_ops_t above: one entry per member,
 * in the same order (presumably the symbols are resolved positionally).
 */
static const char *syms[] = {
"preempt_p_job_preempt_check",
"preempt_p_preemptable",
"preempt_p_get_grace_time",
"preempt_p_get_mode",
"preempt_p_get_prio",
};
/* Function table of the loaded preempt plugin (see syms[]). */
static slurm_preempt_ops_t ops;
/* Plugin context created by preempt_g_init(); NULL when no plugin is loaded. */
static plugin_context_t *g_context = NULL;
/* Held by preempt_g_init() while it sets up g_context and plugin_inited. */
static pthread_mutex_t g_context_lock = PTHREAD_MUTEX_INITIALIZER;
/*
 * Argument block passed to _add_preemptable_job() while walking job_list.
 */
typedef struct {
/* Pending job that wants resources (input). */
job_record_t *preemptor;
/* Candidate jobs found so far (output); created on first match, else NULL. */
list_t *preemptee_job_list;
} preempt_candidates_t;
static int _is_job_preempt_exempt_internal(void *x, void *key)
{
job_record_t *preemptee_ptr = (job_record_t *)x;
job_record_t *preemptor_ptr = (job_record_t *)key;
if (job_borrow_from_resv_check(preemptee_ptr, preemptor_ptr)) {
/*
* This job is on borrowed time from the reservation!
* Automatic preemption.
*/
} else if (!(*(ops.preemptable))(preemptee_ptr, preemptor_ptr))
return 1;
if (min_exempt_priority < preemptee_ptr->priority)
return 1;
if (preemptor_ptr->details &&
(preemptor_ptr->details->expanding_jobid == preemptee_ptr->job_id))
return 1;
if (acct_policy_is_job_preempt_exempt(preemptee_ptr))
return 1;
return 0;
}
/*
 * Determine whether a job, including every component of a hetjob, is exempt
 * from preemption by the preemptor.
 *
 * IN preemptee_ptr - job that would be preempted
 * IN preemptor_ptr - job that wants the resources
 * RET true if the job is exempt, false if it may be preempted
 */
static bool _is_job_preempt_exempt(job_record_t *preemptee_ptr,
				   job_record_t *preemptor_ptr)
{
	xassert(preemptee_ptr);
	xassert(preemptor_ptr);

	if (preemptee_ptr->het_job_list) {
		/*
		 * A single exempt component makes the whole hetjob exempt,
		 * so look for the first component that is exempt.
		 */
		return (list_find_first(preemptee_ptr->het_job_list,
					_is_job_preempt_exempt_internal,
					preemptor_ptr) != NULL);
	}

	return _is_job_preempt_exempt_internal(preemptee_ptr, preemptor_ptr);
}
/*
 * Return the PreemptMode which should apply to stop this job.
 *
 * The plugin supplies the mode. Jobs flagged KILL_JOB_RESV (--signal=R) may
 * only be requeued or cancelled, so any other mode becomes CANCEL for them.
 */
static uint16_t _job_preempt_mode_internal(job_record_t *job_ptr)
{
	uint16_t plugin_mode = (*(ops.get_mode))(job_ptr);

	if (plugin_mode == PREEMPT_MODE_REQUEUE)
		return plugin_mode;

	if (job_ptr->warn_flags & KILL_JOB_RESV)
		return PREEMPT_MODE_CANCEL;

	return plugin_mode;
}
/*
 * List-find callback matching a job by its effective preempt mode.
 *
 * IN x   - job_record_t * to test
 * IN arg - pointer to the uint16_t preempt mode being searched for
 * RET 1 if the job's preempt mode equals *arg, otherwise 0
 */
static int _find_job_by_preempt_mode(void *x, void *arg)
{
	uint16_t wanted = *(uint16_t *) arg;

	return (_job_preempt_mode_internal((job_record_t *) x) == wanted) ?
		1 : 0;
}
/*
 * list_for_each() callback: append a job to the candidate list when the
 * preemptor is allowed to preempt it.
 *
 * IN x       - job_record_t * being evaluated
 * IN/OUT arg - preempt_candidates_t *; preemptee_job_list is created on the
 *              first match and the job is appended to it
 * RET 0 in every case
 */
static int _add_preemptable_job(void *x, void *arg)
{
	job_record_t *candidate = (job_record_t *) x;
	preempt_candidates_t *candidates = (preempt_candidates_t *) arg;
	job_record_t *preemptor = candidates->preemptor;

	/*
	 * We only want to look at the master component of a hetjob. Since all
	 * components have to be preemptable it should be here at some point.
	 */
	if (candidate->het_job_id && !candidate->het_job_list)
		return 0;

	/*
	 * We have to check the entire bitmap space here before we can check
	 * each part of a hetjob in _is_job_preempt_exempt(). Doing this first
	 * also keeps the plugin and accounting checks away from jobs that are
	 * not running on, or do not overlap, the preemptor's partition.
	 */
	if (!job_overlap_and_running(preemptor->part_ptr->node_bitmap,
				     preemptor->license_list, candidate))
		return 0;

	if (_is_job_preempt_exempt(candidate, preemptor))
		return 0;

	/*
	 * This job is a preemption candidate. The list is created with a NULL
	 * argument (presumably no per-item destructor, so the job records
	 * stay owned by job_list -- confirm).
	 */
	if (!candidates->preemptee_job_list)
		candidates->preemptee_job_list = list_create(NULL);

	list_append(candidates->preemptee_job_list, candidate);

	return 0;
}
static int _sort_by_prio(void *x, void *y)
{
int rc;
uint32_t job_prio1, job_prio2;
job_record_t *j1 = *(job_record_t **)x;
job_record_t *j2 = *(job_record_t **)y;
job_prio1 = (*(ops.get_prio))(j1);
job_prio2 = (*(ops.get_prio))(j2);
if (job_prio1 > job_prio2)
rc = 1;
else if (job_prio1 < job_prio2)
rc = -1;
else
rc = 0;
return rc;
}
static int _sort_by_youngest(void *x, void *y)
{
int rc;
job_record_t *j1 = *(job_record_t **) x;
job_record_t *j2 = *(job_record_t **) y;
if (j1->start_time < j2->start_time)
rc = 1;
else if (j1->start_time > j2->start_time)
rc = -1;
else
rc = 0;
return rc;
}
extern int preempt_g_init(void)
{
int retval = SLURM_SUCCESS;
char *plugin_type = "preempt", *temp_str;
slurm_mutex_lock(&g_context_lock);
if (plugin_inited)
goto done;
if (!slurm_conf.preempt_type) {
plugin_inited = PLUGIN_NOOP;
goto done;
}
g_context = plugin_context_create(
plugin_type, slurm_conf.preempt_type,
(void **)&ops, syms, sizeof(syms));
if (!g_context) {
error("cannot create %s context for %s", plugin_type,
slurm_conf.preempt_type);
retval = SLURM_ERROR;
plugin_inited = PLUGIN_NOT_INITED;
goto done;
}
youngest_order = false;
if (xstrcasestr(slurm_conf.preempt_params, "youngest_first") ||
xstrcasestr(slurm_conf.sched_params, "preempt_youngest_first"))
youngest_order = true;
min_exempt_priority = NO_VAL;
if ((temp_str = xstrcasestr(slurm_conf.preempt_params,
"min_exempt_priority=")))
retval = parse_uint32((temp_str + 20), &min_exempt_priority);
plugin_inited = PLUGIN_INITED;
done:
slurm_mutex_unlock(&g_context_lock);
return retval;
}
extern int preempt_g_fini(void)
{
int rc = SLURM_SUCCESS;
if (g_context) {
rc = plugin_context_destroy(g_context);
g_context = NULL;
}
plugin_inited = PLUGIN_NOT_INITED;
return rc;
}
/*
 * Build the list of jobs that the given pending job could preempt.
 *
 * IN job_ptr - the preemptor; must be pending and belong to a partition that
 *              has a node bitmap
 * RET list of candidate job_record_t pointers, sorted youngest-first or by
 *     plugin priority depending on youngest_order; NULL when the plugin is a
 *     no-op, validation fails or no candidate exists
 */
extern list_t *slurm_find_preemptable_jobs(job_record_t *job_ptr)
{
	preempt_candidates_t candidates = { .preemptor = job_ptr };
	list_t *preemptees;

	xassert(plugin_inited != PLUGIN_NOT_INITED);

	if (plugin_inited == PLUGIN_NOOP)
		return NULL;

	/* Validate the preemptor job */
	if (!job_ptr) {
		error("%s: job_ptr is NULL", __func__);
		return NULL;
	}
	if (!IS_JOB_PENDING(job_ptr)) {
		error("%s: %pJ not pending", __func__, job_ptr);
		return NULL;
	}
	if (!job_ptr->part_ptr) {
		error("%s: %pJ has NULL partition ptr", __func__, job_ptr);
		return NULL;
	}
	if (!job_ptr->part_ptr->node_bitmap) {
		error("%s: partition %s node_bitmap=NULL",
		      __func__, job_ptr->part_ptr->name);
		return NULL;
	}

	/*
	 * Collect candidates only when preemption is enabled or the job uses
	 * a reservation with a max start delay.
	 */
	if (slurm_preemption_enabled() ||
	    job_uses_max_start_delay_resv(job_ptr))
		list_for_each(job_list, _add_preemptable_job, &candidates);

	preemptees = candidates.preemptee_job_list;
	if (preemptees)
		list_sort(preemptees,
			  youngest_order ? _sort_by_youngest : _sort_by_prio);

	return preemptees;
}
/*
 * Return the PreemptMode which should apply to stop this job.
 *
 * For a hetjob without a cached template component, the components are
 * searched for the first one whose mode is SUSPEND, then REQUEUE. The match
 * is stored in job_ptr->job_preempt_comp and its mode is returned. When no
 * component matches, the mode of job_ptr itself is used (CANCEL expected).
 *
 * RET PREEMPT_MODE_OFF when the plugin is a no-op, otherwise the mode
 */
extern uint16_t slurm_job_preempt_mode(job_record_t *job_ptr)
{
	/*
	 * Search order, highest to lowest. CANCEL is not listed since it is
	 * handled as the default.
	 */
	static const uint16_t preempt_modes[] = {
		PREEMPT_MODE_SUSPEND,
		PREEMPT_MODE_REQUEUE
	};
	const int mode_cnt = sizeof(preempt_modes) / sizeof(preempt_modes[0]);
	uint16_t wanted;

	xassert(plugin_inited != PLUGIN_NOT_INITED);

	if (plugin_inited == PLUGIN_NOOP)
		return PREEMPT_MODE_OFF;

	/* Regular job, or hetjob whose template component is already known */
	if (!job_ptr->het_job_list || job_ptr->job_preempt_comp) {
		job_record_t *template_job = job_ptr;

		if (job_ptr->job_preempt_comp)
			template_job = job_ptr->job_preempt_comp;

		return _job_preempt_mode_internal(template_job);
	}

	/* Find the component to use as the template for all components */
	for (int idx = 0; idx < mode_cnt; idx++) {
		wanted = preempt_modes[idx];
		job_ptr->job_preempt_comp =
			list_find_first(job_ptr->het_job_list,
					_find_job_by_preempt_mode, &wanted);
		if (job_ptr->job_preempt_comp)
			return wanted;
	}

	/* if not found look up the mode (CANCEL expected) */
	return _job_preempt_mode_internal(job_ptr);
}
/*
 * Return true if any jobs can be preempted, otherwise false.
 *
 * Preemption is disabled when the plugin is a no-op or when the configured
 * preempt_mode is PREEMPT_MODE_OFF.
 */
extern bool slurm_preemption_enabled(void)
{
	return (plugin_inited != PLUGIN_NOOP) &&
	       (slurm_conf.preempt_mode != PREEMPT_MODE_OFF);
}
/*
 * Check to see if a job is in a grace time, starting the grace period on the
 * first call for that job.
 * If no grace_time active then return 1.
 * If grace_time is currently active then return -1.
 *
 * IN x   - job_record_t * of the job being preempted
 * IN arg - job_record_t * of the preemptor; only used (and asserted) when
 *          the grace period has not been started yet
 *
 * Side effects on the first call for a job: sets preempt_time, caps end_time
 * at preempt_time + grace time, and, when the grace time is non-zero, signals
 * the job and sets GRACE_PREEMPT in bit_flags.
 */
static int _job_check_grace_internal(void *x, void *arg)
{
job_record_t *job_ptr = (job_record_t *)x;
job_record_t *preemptor_ptr = (job_record_t *)arg;
int rc = -1;
uint32_t grace_time = 0;
/*
 * preempt_time already set: a grace period was started earlier. It is over
 * once the current time reaches end_time; until then report it as active.
 */
if (job_ptr->preempt_time) {
if (time(NULL) >= job_ptr->end_time) {
job_ptr->preempt_time = time(NULL);
rc = 1;
}
return rc;
}
xassert(preemptor_ptr);
/*
 * A job borrowing from a reservation uses its own warn_time as grace time;
 * any other job gets the grace time reported by the preempt plugin.
 */
if (job_borrow_from_resv_check(job_ptr, preemptor_ptr))
grace_time = job_ptr->warn_time;
else
grace_time = (*(ops.get_grace_time))(job_ptr);
/* Start the grace period now; never push end_time later than it was. */
job_ptr->preempt_time = time(NULL);
job_ptr->end_time = MIN(job_ptr->end_time,
(job_ptr->preempt_time + (time_t)grace_time));
if (grace_time) {
debug("setting %u sec preemption grace time for %pJ to reclaim resources for %pJ",
grace_time, job_ptr, preemptor_ptr);
/*
 * Send the job's own warning signal when that is enabled, a signal is
 * configured and it has not been sent yet; otherwise send SIGCONT followed
 * by SIGTERM. (send_job_warn_signal() always sends SIGCONT first.)
 */
if (preempt_send_user_signal && job_ptr->warn_signal &&
!(job_ptr->warn_flags & WARN_SENT))
send_job_warn_signal(job_ptr, true);
else {
job_signal(job_ptr, SIGCONT, 0, 0, 0);
job_signal(job_ptr, SIGTERM, 0, 0, 0);
}
job_ptr->bit_flags |= GRACE_PREEMPT;
} else
/* Zero grace time: nothing to wait for. */
rc = 1;
return rc;
}
/*
 * Check to see if a job (or hetjob) is in a grace time.
 * If no grace_time active then return 0.
 * If grace_time is currently active then return 1.
 *
 * For a hetjob every component is visited; the job counts as being in a
 * grace period when list_for_each_nobreak() returns zero or a negative value.
 */
static int _job_check_grace(job_record_t *job_ptr, job_record_t *preemptor_ptr)
{
	int result;

	if (!job_ptr->het_job_list) {
		result = _job_check_grace_internal(job_ptr, preemptor_ptr);
		return (result < 0);
	}

	result = list_for_each_nobreak(job_ptr->het_job_list,
				       _job_check_grace_internal,
				       preemptor_ptr);
	return (result <= 0);
}
/*
 * list_for_each() adapter around send_job_warn_signal().
 *
 * IN x   - job_record_t * to signal
 * IN arg - pointer to the bool ignore_time flag forwarded to the call
 * RET always 0
 */
static int _job_warn_signal_wrapper(void *x, void *arg)
{
	/* Ignore Time is always true */
	send_job_warn_signal((job_record_t *) x, *(bool *) arg);

	return 0;
}
/*
 * Preempt a job (or hetjob) on behalf of a preemptor.
 *
 * IN job_ptr       - job to preempt
 * IN preemptor_ptr - job reclaiming the resources
 * IN mode          - PREEMPT_MODE_CANCEL or PREEMPT_MODE_REQUEUE; any other
 *                    mode goes straight to the SIGKILL fallback
 * IN ignore_time   - forwarded to send_job_warn_signal()
 * RET SLURM_ERROR when the plugin is a no-op or the job is still in its
 *     grace period, otherwise the result of the last cancel/requeue/kill
 */
extern uint32_t slurm_job_preempt(job_record_t *job_ptr,
				  job_record_t *preemptor_ptr,
				  uint16_t mode, bool ignore_time)
{
	int rc = SLURM_ERROR;

	xassert(plugin_inited != PLUGIN_NOT_INITED);

	if (plugin_inited == PLUGIN_NOOP)
		return SLURM_ERROR;

	/* Nothing more to do while the job is in its grace period */
	if (_job_check_grace(job_ptr, preemptor_ptr))
		return SLURM_ERROR;

	if (preempt_send_user_signal) {
		if (!job_ptr->het_job_list)
			send_job_warn_signal(job_ptr, ignore_time);
		else
			(void) list_for_each(job_ptr->het_job_list,
					     _job_warn_signal_wrapper,
					     &ignore_time);
	}

	if (mode == PREEMPT_MODE_CANCEL) {
		rc = job_ptr->het_job_list ?
			het_job_signal(job_ptr, SIGKILL, 0, 0, true) :
			job_signal(job_ptr, SIGKILL, 0, 0, true);
		if (rc == SLURM_SUCCESS)
			info("preempted %pJ has been killed to reclaim resources for %pJ",
			     job_ptr, preemptor_ptr);
	} else if (mode == PREEMPT_MODE_REQUEUE) {
		/* job_requeue already handles het jobs */
		rc = job_requeue(0, job_ptr->job_id, NULL, true, 0);
		if (rc == SLURM_SUCCESS)
			info("preempted %pJ has been requeued to reclaim resources for %pJ",
			     job_ptr, preemptor_ptr);
	}

	if (rc == SLURM_SUCCESS)
		return rc;

	/* The requested action failed or is not handled here: kill the job */
	rc = job_ptr->het_job_list ?
		het_job_signal(job_ptr, SIGKILL, 0, 0, true) :
		job_signal(job_ptr, SIGKILL, 0, 0, true);
	if (rc == SLURM_SUCCESS)
		info("%s: preempted %pJ had to be killed",
		     __func__, job_ptr);
	else
		info("%s: preempted %pJ kill failure %s",
		     __func__, job_ptr, slurm_strerror(rc));

	return rc;
}
/*
 * Return true if the preemptor can preempt the preemptee, otherwise false.
 *
 * The decision is delegated to the preempt plugin; with no plugin loaded
 * (PLUGIN_NOOP) nothing can be preempted.
 */
extern bool preempt_g_job_preempt_check(job_queue_rec_t *preemptor,
					job_queue_rec_t *preemptee)
{
	xassert(plugin_inited != PLUGIN_NOT_INITED);

	if (plugin_inited != PLUGIN_NOOP)
		return ops.job_preempt_check(preemptor, preemptee);

	return false;
}