/*****************************************************************************\
* backfill.c - simple backfill scheduler plugin.
*
* If a partition does not have root only access and nodes are not shared
* then raise the priority of pending jobs if doing so does not adversely
 * affect the expected initiation of any higher priority job. We do not alter
* a job's required or excluded node list, so this is a conservative
* algorithm.
*
* For example, consider a cluster "lx[01-08]" with one job executing on
* nodes "lx[01-04]". The highest priority pending job requires five nodes
* including "lx05". The next highest priority pending job requires any
* three nodes. Without explicitly forcing the second job to use nodes
* "lx[06-08]", we can't start it without possibly delaying the higher
* priority job.
*****************************************************************************
* Copyright (C) 2003-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#if HAVE_SYS_PRCTL_H
# include <sys/prctl.h>
#endif
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include "slurm/slurm.h"
#include "slurm/slurmdb.h"
#include "slurm/slurm_errno.h"
#include "src/common/assoc_mgr.h"
#include "src/common/job_features.h"
#include "src/common/list.h"
#include "src/common/macros.h"
#include "src/common/parse_time.h"
#include "src/common/read_config.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/interfaces/accounting_storage.h"
#include "src/interfaces/burst_buffer.h"
#include "src/interfaces/gres.h"
#include "src/interfaces/node_features.h"
#include "src/interfaces/mcs.h"
#include "src/interfaces/preempt.h"
#include "src/interfaces/select.h"
#include "src/interfaces/topology.h"
#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/fed_mgr.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/slurmctld.h"
#include "src/stepmgr/gres_stepmgr.h"
#include "src/stepmgr/srun_comm.h"
#include "backfill.h"
#include "oracle.h"
#define BACKFILL_INTERVAL 30
#define BACKFILL_RESOLUTION 60
#define BACKFILL_WINDOW (24 * 60 * 60)
#define BF_MAX_JOB_ARRAY_RESV 20
#define YIELD_INTERVAL 2000000 /* time in micro-seconds */
#define YIELD_SLEEP 500000 /* time in micro-seconds */
#define MAX_BACKFILL_INTERVAL 10800 /* 3 hours */
#define MAX_BACKFILL_RESOLUTION 3600 /* 1 hour */
#define MAX_BACKFILL_WINDOW (30 * 24 * 60 * 60) /* 30 days */
#define MAX_BF_JOB_PART_COUNT_RESERVE 100000
#define MAX_BF_MAX_JOB_ARRAY_RESV 1000
#define MAX_BF_MAX_JOB_START 10000
#define DEF_BF_MAX_JOB_TEST 500
#define MAX_BF_MAX_JOB_TEST 1000000
#define MAX_BF_MAX_TIME 3600
#define MAX_BF_MIN_AGE_RESERVE (30 * 24 * 60 * 60) /* 30 days */
#define MAX_BF_MIN_PRIO_RESERVE INFINITE
#define MAX_BF_YIELD_INTERVAL 10000000 /* 10 seconds in usec */
#define MAX_MAX_RPC_CNT 1000
#define MAX_YIELD_RPC_CNT 200
#define MAX_YIELD_SLEEP 10000000 /* 10 seconds in usec */
#define MAX_BF_MAX_JOB_ASSOC MAX_BF_MAX_JOB_TEST
#define MAX_BF_MAX_JOB_USER MAX_BF_MAX_JOB_TEST
#define MAX_BF_MAX_JOB_USER_PART MAX_BF_MAX_JOB_TEST
#define MAX_BF_MAX_JOB_PART MAX_BF_MAX_JOB_TEST
typedef struct {
node_space_map_t *node_space;
int *node_space_recs;
} node_space_handler_t;
/*
* HetJob scheduling structures
* NOTE: An individual hetjob component can be submitted to multiple
* partitions and have different start times in each
*/
typedef struct {
uint32_t job_id;
job_record_t *job_ptr;
time_t latest_start; /* Time when expected to start */
part_record_t *part_ptr;
slurmctld_resv_t *resv_ptr;
} het_job_rec_t;
typedef struct {
uint32_t comp_time_limit; /* Time limit for hetjob */
uint32_t het_job_id;
list_t *het_job_rec_list; /* list of het_job_rec_t */
time_t prev_start; /* Expected start time from last test */
} het_job_map_t;
typedef struct {
uint32_t het_job_id;
time_t start_time;
} deadlock_job_struct_t;
typedef struct {
list_t *deadlock_job_list;
part_record_t *part_ptr;
} deadlock_part_struct_t;
/* Diagnostic statistics */
extern diag_stats_t slurmctld_diag_stats;
uint32_t bf_sleep_usec = 0;
typedef struct {
slurmdb_bf_usage_t bf_usage;
uid_t uid;
} bf_user_usage_t;
typedef struct {
bool allocated; /* A job is running on this node */
time_t last_job_end; /* Last end time of running job on node*/
char *mcs_label;
bool mixed_user; /* multiple users running on node */
	bool needs_sorting;	/* After adding to the mix, re-sort the
				 * related nodes_used_list */
uint32_t node_index;
bool owned; /* Node has exclusive=user job */
uint32_t uid; /* user id of a job running on the node */
} node_used_t;
typedef struct {
bool delay_start;
bool is_exclusive_user;
uint32_t job_user;
time_t *later_start;
char *mcs_label;
uint32_t min_nodes;
bitstr_t *node_bitmap;
int node_cnt;
time_t prev_time;
bitstr_t *req_nodes;
bool set_later_start;
time_t start_time;
} filter_exclusive_args_t;
/*********************** local variables *********************/
static bool stop_backfill = false;
static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t term_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;
static bool config_flag = false;
static int backfill_interval = BACKFILL_INTERVAL;
static int bf_max_time = BACKFILL_INTERVAL;
static int backfill_resolution = BACKFILL_RESOLUTION;
static int backfill_window = BACKFILL_WINDOW;
static int bf_job_part_count_reserve = 0;
static int bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
static int bf_min_age_reserve = 0;
static int bf_node_space_size = 0;
static bool bf_running_job_reserve = false;
static bool bf_licenses = false;
static uint32_t bf_min_prio_reserve = 0;
static list_t *deadlock_global_list = NULL;
static bool bf_hetjob_immediate = false;
static uint16_t bf_hetjob_prio = 0;
static bool bf_one_resv_per_job = false;
static bool bf_allow_magnetic_slot = false;
static bool bf_topopt_enable = false;
static uint32_t job_start_cnt = 0;
static uint32_t job_test_cnt = 0;
static int max_backfill_job_cnt = DEF_BF_MAX_JOB_TEST;
static int max_backfill_job_per_assoc = 0;
static int max_backfill_job_per_part = 0;
static int max_backfill_job_per_user = 0;
static int max_backfill_job_per_user_part = 0;
static int max_backfill_jobs_start = 0;
static bool backfill_continue = false;
static bool assoc_limit_stop = false;
static int max_rpc_cnt = 0;
static int yield_rpc_cnt = 0;
static int yield_interval = YIELD_INTERVAL;
static int yield_sleep = YIELD_SLEEP;
static list_t *het_job_list = NULL;
static xhash_t *user_usage_map = NULL; /* look up user usage when no assoc */
static bitstr_t *planned_bitmap = NULL;
static bool soft_time_limit = false;
/*********************** local functions *********************/
static void _add_reservation(time_t start_time, time_t end_reserve,
bitstr_t *res_bitmap, job_record_t *job_ptr,
node_space_map_t *node_space, int *node_space_recs,
time_t orig_start_time);
static void _adjust_hetjob_prio(uint32_t *prio, uint32_t val);
static void _attempt_backfill(void);
static int _clear_job_estimates(void *x, void *arg);
static int _clear_qos_blocked_times(void *x, void *arg);
static void _do_diag_stats(struct timeval *tv1, struct timeval *tv2,
int node_space_recs);
static uint32_t _get_job_max_tl(job_record_t *job_ptr, time_t now,
node_space_map_t *node_space);
static bool _hetjob_any_resv(job_record_t *het_leader);
static uint32_t _hetjob_calc_prio(job_record_t *het_leader);
static uint32_t _hetjob_calc_prio_tier(job_record_t *het_leader);
static void _het_job_deadlock_fini(void);
static bool _het_job_deadlock_test(job_record_t *job_ptr);
static bool _job_part_valid(job_record_t *job_ptr, part_record_t *part_ptr);
static void _load_config(void);
static bool _many_pending_rpcs(void);
static bool _more_work(time_t last_backfill_time);
static uint32_t _my_sleep(int64_t usec);
static int _num_feature_count(job_record_t *job_ptr, bool *has_xand,
bool *has_mor);
static int _het_job_find_map(void *x, void *key);
static void _het_job_map_del(void *x);
static void _het_job_start_clear(void);
static time_t _het_job_start_find(job_record_t *job_ptr);
static void _het_job_start_set(job_record_t *job_ptr, time_t latest_start,
uint32_t comp_time_limit);
static bool _het_job_start_test_single(node_space_map_t *node_space,
het_job_map_t *map, bool single);
static int _het_job_start_test_list(void *map, void *node_space);
static void _het_job_start_test(node_space_map_t *node_space,
uint32_t het_job_id, node_used_t *nodes_used,
list_t *nodes_used_list);
static void _reset_job_time_limit(job_record_t *job_ptr, time_t now,
node_space_map_t *node_space);
static void _set_bf_exit(bf_exit_t code);
static int _set_hetjob_details(void *x, void *arg);
static int _start_job(job_record_t *job_ptr, bitstr_t *avail_bitmap);
static bool _test_resv_overlap(node_space_map_t *node_space,
bitstr_t *use_bitmap, job_record_t *job_ptr,
uint32_t start_time, uint32_t end_reserve);
static int _try_sched(job_record_t *job_ptr, bitstr_t **avail_bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, resv_exc_t *resv_exc_ptr,
will_run_data_t *will_run);
static int _yield_locks(int64_t usec);
static void _bf_map_key_id(void *item, const char **key, uint32_t *key_len);
static void _bf_map_free(void *item);
/* Log resources to be allocated to a pending job */
static void _dump_job_sched(job_record_t *job_ptr, time_t end_time,
bitstr_t *avail_bitmap)
{
char begin_buf[256], end_buf[256], *node_list;
slurm_make_time_str(&job_ptr->start_time, begin_buf, sizeof(begin_buf));
slurm_make_time_str(&end_time, end_buf, sizeof(end_buf));
node_list = bitmap2node_name(avail_bitmap);
log_flag(BACKFILL, "%pJ to start at %s, end at %s on nodes %s in partition %s",
job_ptr, begin_buf, end_buf, node_list,
job_ptr->part_ptr->name);
xfree(node_list);
}
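/*
 * Log a backfill scheduling test of a pending job: the candidate start time
 * ("NOW" if immediate), any later start time still to be considered, and the
 * nodes available for the test.
 */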
static void _dump_job_test(job_record_t *job_ptr, bitstr_t *avail_bitmap,
time_t start_time, time_t later_start)
{
char begin_buf[256], *node_list;
char end_buf[256];
char later_buf[256];
if (start_time == 0)
strcpy(begin_buf, "NOW");
else
slurm_make_time_str(&start_time, begin_buf, sizeof(begin_buf));
if (later_start == 0)
strcpy(later_buf, "NO");
else
slurm_make_time_str(&later_start, later_buf, sizeof(later_buf));
if (later_start)
later_start += job_ptr->time_limit * 60;
slurm_make_time_str(&later_start, end_buf, sizeof(end_buf));
node_list = bitmap2node_name(avail_bitmap);
log_flag(BACKFILL, "Test %pJ at %s to %s (later_start: %s) on %s",
job_ptr, begin_buf, end_buf, later_buf, node_list);
xfree(node_list);
}
/* Log the resource allocation (node-space) table */
static void _dump_node_space_table(node_space_map_t *node_space_ptr)
{
int i = 0;
char begin_buf[256], end_buf[256], *node_list, *licenses;
log_flag(BACKFILL, "=========================================");
while (1) {
slurm_make_time_str(&node_space_ptr[i].begin_time,
begin_buf, sizeof(begin_buf));
slurm_make_time_str(&node_space_ptr[i].end_time,
end_buf, sizeof(end_buf));
node_list = bitmap2node_name(node_space_ptr[i].avail_bitmap);
licenses = bf_licenses_to_string(node_space_ptr[i].licenses);
log_flag(BACKFILL, "Begin:%s End:%s Nodes:%s Licenses:%s Fragmentation:%u",
begin_buf, end_buf, node_list, licenses,
node_space_ptr[i].fragmentation);
xfree(node_list);
xfree(licenses);
if ((i = node_space_ptr[i].next) == 0)
break;
}
log_flag(BACKFILL, "=========================================");
}
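/*
 * Set (or restore) a job's time limit, clearing the limit_set.time flag
 * when the limit is reset to NO_VAL.
 */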
static void _set_job_time_limit(job_record_t *job_ptr, uint32_t new_limit)
{
job_ptr->time_limit = new_limit;
/* reset flag if we have a NO_VAL time_limit */
if (job_ptr->time_limit == NO_VAL)
job_ptr->limit_set.time = 0;
}
/*
* _many_pending_rpcs - Determine if slurmctld is busy with many active RPCs
* RET - True if slurmctld currently has more than max_rpc_cnt active RPCs
*/
static bool _many_pending_rpcs(void)
{
bool many_pending_rpcs = false;
slurm_mutex_lock(&slurmctld_config.thread_count_lock);
//info("thread_count = %u", slurmctld_config.server_thread_count);
if ((max_rpc_cnt > 0) &&
(slurmctld_config.server_thread_count >= max_rpc_cnt))
many_pending_rpcs = true;
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
return many_pending_rpcs;
}
/*
* Report summary of job's feature specification
* IN job_ptr - job to schedule
* OUT has_xand - true if features are XANDed together
* OUT has_mor - true if features are MORed together
* RET Total count for ALL job features, even counts with XAND separator
*/
static int _num_feature_count(job_record_t *job_ptr, bool *has_xand,
bool *has_mor)
{
job_details_t *detail_ptr = job_ptr->details;
int rc = 0;
list_itr_t *feat_iter;
job_feature_t *feat_ptr;
*has_xand = false;
*has_mor = false;
if (detail_ptr->feature_list_use == NULL) /* no constraints */
return rc;
feat_iter = list_iterator_create(detail_ptr->feature_list_use);
while ((feat_ptr = list_next(feat_iter))) {
if (feat_ptr->count)
rc++;
if (feat_ptr->op_code == FEATURE_OP_XAND)
*has_xand = true;
if (feat_ptr->op_code == FEATURE_OP_MOR)
*has_mor = true;
}
list_iterator_destroy(feat_iter);
return rc;
}
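/* List iteration callback: clear the blocked_until time of a QOS record */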
static int _clear_qos_blocked_times(void *x, void *arg)
{
slurmdb_qos_rec_t *qos_ptr = (slurmdb_qos_rec_t *) x;
qos_ptr->blocked_until = 0;
return 0;
}
/*
* Attempt to schedule a specific job on specific available nodes
* IN job_ptr - job to schedule
* IN/OUT avail_bitmap - nodes available/selected to use
* IN resv_exc_ptr - Various TRES which can not be used
* RET SLURM_SUCCESS on success, otherwise an error code
*/
static int _try_sched(job_record_t *job_ptr, bitstr_t **avail_bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, resv_exc_t *resv_exc_ptr,
will_run_data_t *will_run)
{
bitstr_t *low_bitmap = NULL, *tmp_bitmap = NULL;
int rc = SLURM_SUCCESS;
bool has_xand = false, has_mor = false;
int feat_cnt = _num_feature_count(job_ptr, &has_xand, &has_mor);
job_details_t *detail_ptr = job_ptr->details;
list_t *feature_cache = detail_ptr->feature_list_use;
list_t *preemptee_candidates = NULL;
list_itr_t *feat_iter;
job_feature_t *feat_ptr;
job_feature_t *feature_base;
if (has_xand || feat_cnt) {
/*
* Cache the feature information and test the individual
* features (or sets of features in parenthesis), one at a time
*/
time_t high_start = 0;
uint32_t feat_min_node;
uint32_t feat_node_cnt;
tmp_bitmap = bit_copy(*avail_bitmap);
preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
feat_iter = list_iterator_create(feature_cache);
while ((feat_ptr = list_next(feat_iter)) &&
(rc == SLURM_SUCCESS)) {
detail_ptr->feature_list_use =
list_create(feature_list_delete);
feature_base = xmalloc(sizeof(job_feature_t));
feature_base->name = xstrdup(feat_ptr->name);
feature_base->op_code = feat_ptr->op_code;
list_append(detail_ptr->feature_list_use, feature_base);
feat_min_node = feat_ptr->count;
while ((feat_ptr->paren > 0) &&
((feat_ptr = list_next(feat_iter)))) {
feature_base = xmalloc(sizeof(job_feature_t));
feature_base->name = xstrdup(feat_ptr->name);
feature_base->op_code = feat_ptr->op_code;
feat_min_node = feat_ptr->count;
list_append(detail_ptr->feature_list_use,
feature_base);
}
feature_base->op_code = FEATURE_OP_END;
feat_min_node = MAX(1, feat_min_node);
if ((job_req_node_filter(job_ptr, *avail_bitmap, true)
== SLURM_SUCCESS) &&
(bit_set_count(*avail_bitmap) >= feat_min_node)) {
rc = select_g_job_test(job_ptr, *avail_bitmap,
feat_min_node, max_nodes,
feat_min_node,
SELECT_MODE_WILL_RUN,
preemptee_candidates,
NULL,
resv_exc_ptr,
will_run);
if (rc == SLURM_SUCCESS) {
if ((high_start == 0) ||
(high_start < job_ptr->start_time))
high_start =
job_ptr->start_time;
if (low_bitmap) {
bit_or(low_bitmap,
*avail_bitmap);
} else {
low_bitmap = *avail_bitmap;
*avail_bitmap = NULL;
}
}
} else {
rc = ESLURM_NODES_BUSY;
}
FREE_NULL_BITMAP(*avail_bitmap);
*avail_bitmap = bit_copy(tmp_bitmap);
if (low_bitmap)
bit_and_not(*avail_bitmap, low_bitmap);
FREE_NULL_LIST(detail_ptr->feature_list_use);
}
list_iterator_destroy(feat_iter);
if (low_bitmap)
feat_node_cnt = bit_set_count(low_bitmap);
else
feat_node_cnt = 0;
if (feat_node_cnt < req_nodes) {
detail_ptr->feature_list_use = NULL;
rc = select_g_job_test(job_ptr, *avail_bitmap,
min_nodes - feat_node_cnt,
max_nodes - feat_node_cnt,
req_nodes - feat_node_cnt,
SELECT_MODE_WILL_RUN,
preemptee_candidates,
NULL,
resv_exc_ptr,
will_run);
if (low_bitmap) {
bit_or(low_bitmap, *avail_bitmap);
} else {
low_bitmap = *avail_bitmap;
*avail_bitmap = NULL;
}
}
FREE_NULL_LIST(preemptee_candidates);
FREE_NULL_BITMAP(tmp_bitmap);
if (high_start && rc == SLURM_SUCCESS) {
job_ptr->start_time = high_start;
FREE_NULL_BITMAP(*avail_bitmap);
*avail_bitmap = low_bitmap;
} else {
rc = ESLURM_NODES_BUSY;
job_ptr->start_time = 0;
FREE_NULL_BITMAP(*avail_bitmap);
FREE_NULL_BITMAP(low_bitmap);
}
/* Restore the original feature information */
detail_ptr->feature_list_use = feature_cache;
} else if (has_mor) {
/*
* Cache the feature information and test the individual
* features (or sets of features in parenthesis), one at a time
*/
time_t low_start = 0;
tmp_bitmap = bit_copy(*avail_bitmap);
preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
feat_iter = list_iterator_create(feature_cache);
while ((feat_ptr = list_next(feat_iter))) {
detail_ptr->feature_list_use =
list_create(feature_list_delete);
feature_base = xmalloc(sizeof(job_feature_t));
feature_base->name = xstrdup(feat_ptr->name);
feature_base->op_code = feat_ptr->op_code;
list_append(detail_ptr->feature_list_use, feature_base);
while ((feat_ptr->paren > 0) &&
((feat_ptr = list_next(feat_iter)))) {
feature_base = xmalloc(sizeof(job_feature_t));
feature_base->name = xstrdup(feat_ptr->name);
feature_base->op_code = feat_ptr->op_code;
list_append(detail_ptr->feature_list_use,
feature_base);
}
feature_base->op_code = FEATURE_OP_END;
if ((job_req_node_filter(job_ptr, *avail_bitmap, true)
== SLURM_SUCCESS) &&
(bit_set_count(*avail_bitmap) >= min_nodes)) {
rc = select_g_job_test(job_ptr, *avail_bitmap,
min_nodes, max_nodes,
req_nodes,
SELECT_MODE_WILL_RUN,
preemptee_candidates,
NULL,
resv_exc_ptr,
will_run);
if ((rc == SLURM_SUCCESS) &&
((low_start == 0) ||
(low_start > job_ptr->start_time))) {
low_start = job_ptr->start_time;
low_bitmap = *avail_bitmap;
*avail_bitmap = NULL;
}
}
FREE_NULL_BITMAP(*avail_bitmap);
*avail_bitmap = bit_copy(tmp_bitmap);
FREE_NULL_LIST(detail_ptr->feature_list_use);
}
list_iterator_destroy(feat_iter);
FREE_NULL_LIST(preemptee_candidates);
FREE_NULL_BITMAP(tmp_bitmap);
if (low_start) {
job_ptr->start_time = low_start;
rc = SLURM_SUCCESS;
FREE_NULL_BITMAP(*avail_bitmap);
*avail_bitmap = low_bitmap;
} else {
rc = ESLURM_NODES_BUSY;
FREE_NULL_BITMAP(low_bitmap);
}
/* Restore the original feature information */
detail_ptr->feature_list_use = feature_cache;
} else if (detail_ptr->feature_list_use) {
if ((job_req_node_filter(job_ptr, *avail_bitmap, true) !=
SLURM_SUCCESS) ||
(bit_set_count(*avail_bitmap) < min_nodes)) {
rc = ESLURM_NODES_BUSY;
} else {
preemptee_candidates =
slurm_find_preemptable_jobs(job_ptr);
rc = select_g_job_test(job_ptr, *avail_bitmap,
min_nodes, max_nodes, req_nodes,
SELECT_MODE_WILL_RUN,
preemptee_candidates,
NULL,
resv_exc_ptr,
will_run);
}
} else {
/* Try to schedule the job. First on dedicated nodes
* then on shared nodes (if so configured). */
uint16_t orig_shared;
time_t now = time(NULL);
char str[100];
preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
orig_shared = job_ptr->details->share_res;
job_ptr->details->share_res = 0;
tmp_bitmap = bit_copy(*avail_bitmap);
if (resv_exc_ptr && resv_exc_ptr->core_bitmap) {
bit_fmt(str, (sizeof(str) - 1),
resv_exc_ptr->core_bitmap);
debug2("exclude core bitmap: %s", str);
}
rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes,
max_nodes, req_nodes,
SELECT_MODE_WILL_RUN,
preemptee_candidates,
NULL,
resv_exc_ptr,
will_run);
job_ptr->details->share_res = orig_shared;
if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) &&
(orig_shared != 0)) {
FREE_NULL_BITMAP(*avail_bitmap);
*avail_bitmap = tmp_bitmap;
rc = select_g_job_test(job_ptr, *avail_bitmap,
min_nodes, max_nodes, req_nodes,
SELECT_MODE_WILL_RUN,
preemptee_candidates,
NULL,
resv_exc_ptr,
will_run);
} else
FREE_NULL_BITMAP(tmp_bitmap);
}
FREE_NULL_LIST(preemptee_candidates);
return rc;
}
/* Terminate backfill_agent */
extern void stop_backfill_agent(void)
{
slurm_mutex_lock(&term_lock);
stop_backfill = true;
slurm_cond_signal(&term_cond);
slurm_mutex_unlock(&term_lock);
}
/* Sleep for at least specified time, returns actual sleep time in usec */
static uint32_t _my_sleep(int64_t usec)
{
int64_t nsec;
uint32_t sleep_time = 0;
struct timespec ts = {0, 0};
struct timeval tv1 = {0, 0}, tv2 = {0, 0};
if (gettimeofday(&tv1, NULL)) { /* Some error */
sleep(1);
return 1000000;
}
nsec = tv1.tv_usec + usec;
nsec *= 1000;
ts.tv_sec = tv1.tv_sec + (nsec / 1000000000);
ts.tv_nsec = nsec % 1000000000;
slurm_mutex_lock(&term_lock);
if (!stop_backfill)
slurm_cond_timedwait(&term_cond, &term_lock, &ts);
slurm_mutex_unlock(&term_lock);
if (gettimeofday(&tv2, NULL))
return usec;
sleep_time = (tv2.tv_sec - tv1.tv_sec) * 1000000;
sleep_time += tv2.tv_usec;
sleep_time -= tv1.tv_usec;
return sleep_time;
}
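/*
 * Parse the backfill-related options from SchedulerParameters in slurm.conf,
 * falling back to the defaults above when an option is missing or invalid.
 * Illustrative example (not from any particular site configuration):
 *
 *   SchedulerParameters=bf_interval=30,bf_window=1440,bf_max_job_test=500,bf_continue
 */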
static void _load_config(void)
{
char *sched_params = slurm_conf.sched_params, *tmp_ptr;
long tmp_val = 0;
if ((tmp_ptr = xstrcasestr(sched_params, "bf_interval="))) {
backfill_interval = atoi(tmp_ptr + 12);
if (((backfill_interval != -1) && (backfill_interval < 1)) ||
backfill_interval > MAX_BACKFILL_INTERVAL) {
error("Invalid SchedulerParameters bf_interval: %d",
backfill_interval);
backfill_interval = BACKFILL_INTERVAL;
}
} else {
backfill_interval = BACKFILL_INTERVAL;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_time="))) {
bf_max_time = atoi(tmp_ptr + 12);
if (bf_max_time < 1 || bf_max_time > MAX_BF_MAX_TIME) {
error("Invalid SchedulerParameters bf_max_time:"
" %d", bf_max_time);
bf_max_time = backfill_interval;
}
} else {
bf_max_time = backfill_interval;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_window="))) {
backfill_window = atoi(tmp_ptr + 10) * 60; /* mins to secs */
if (backfill_window < 1 ||
backfill_window > MAX_BACKFILL_WINDOW) {
error("Invalid SchedulerParameters bf_window: %d",
backfill_window);
backfill_window = BACKFILL_WINDOW;
}
} else {
backfill_window = BACKFILL_WINDOW;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_test=")))
max_backfill_job_cnt = atoi(tmp_ptr + 16);
else if ((tmp_ptr = xstrcasestr(sched_params, "max_job_bf="))) {
fatal("Invalid parameter max_job_bf. The option is no longer supported, please use bf_max_job_test instead.");
}
else
max_backfill_job_cnt = DEF_BF_MAX_JOB_TEST;
if (max_backfill_job_cnt < 1 ||
max_backfill_job_cnt > MAX_BF_MAX_JOB_TEST) {
error("Invalid SchedulerParameters bf_max_job_test: %d",
max_backfill_job_cnt);
max_backfill_job_cnt = DEF_BF_MAX_JOB_TEST;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_node_space_size=")))
bf_node_space_size = atoi(tmp_ptr + 19);
else
bf_node_space_size = max_backfill_job_cnt;
if (bf_node_space_size < 2 ||
bf_node_space_size > 2 * MAX_BF_MAX_JOB_TEST) {
error("Invalid SchedulerParameters bf_node_space_size: %d",
bf_node_space_size);
bf_node_space_size = max_backfill_job_cnt;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_resolution="))) {
backfill_resolution = atoi(tmp_ptr + 14);
if (backfill_resolution < 1 ||
backfill_resolution > MAX_BACKFILL_RESOLUTION) {
error("Invalid SchedulerParameters bf_resolution: %d",
backfill_resolution);
backfill_resolution = BACKFILL_RESOLUTION;
}
} else {
backfill_resolution = BACKFILL_RESOLUTION;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_array_resv="))) {
bf_max_job_array_resv = atoi(tmp_ptr + 22);
if (bf_max_job_array_resv < 0 ||
bf_max_job_array_resv > MAX_BF_MAX_JOB_ARRAY_RESV) {
error("Invalid SchedulerParameters bf_max_job_array_resv: %d",
bf_max_job_array_resv);
bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
}
} else {
bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_part="))) {
max_backfill_job_per_part = atoi(tmp_ptr + 16);
if (max_backfill_job_per_part < 0) {
error("Invalid SchedulerParameters bf_max_job_part: %d",
max_backfill_job_per_part);
max_backfill_job_per_part = 0;
}
} else {
max_backfill_job_per_part = 0;
}
if ((max_backfill_job_per_part != 0) &&
(max_backfill_job_per_part >= max_backfill_job_cnt)) {
error("bf_max_job_part >= bf_max_job_test (%u >= %u)",
max_backfill_job_per_part, max_backfill_job_cnt);
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_start="))) {
max_backfill_jobs_start = atoi(tmp_ptr + 17);
if (max_backfill_jobs_start < 0 ||
max_backfill_jobs_start > MAX_BF_MAX_JOB_START) {
error("Invalid SchedulerParameters bf_max_job_start: %d",
max_backfill_jobs_start);
max_backfill_jobs_start = 0;
}
} else {
max_backfill_jobs_start = 0;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_user="))) {
max_backfill_job_per_user = atoi(tmp_ptr + 16);
if (max_backfill_job_per_user < 0) {
error("Invalid SchedulerParameters bf_max_job_user: %d",
max_backfill_job_per_user);
max_backfill_job_per_user = 0;
}
} else {
max_backfill_job_per_user = 0;
}
if ((max_backfill_job_per_user != 0) &&
(max_backfill_job_per_user > max_backfill_job_cnt)) {
warning("bf_max_job_user > bf_max_job_test (%u > %u)",
max_backfill_job_per_user, max_backfill_job_cnt);
}
bf_job_part_count_reserve = 0;
if ((tmp_ptr = xstrcasestr(sched_params, "bf_job_part_count_reserve="))) {
int job_cnt = atoi(tmp_ptr + 26);
if (job_cnt < 0 || job_cnt > MAX_BF_JOB_PART_COUNT_RESERVE) {
error("Invalid SchedulerParameters bf_job_part_count_reserve: %d",
job_cnt);
} else {
bf_job_part_count_reserve = job_cnt;
}
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_user_part="))) {
max_backfill_job_per_user_part = atoi(tmp_ptr + 21);
if (max_backfill_job_per_user_part < 0) {
error("Invalid SchedulerParameters bf_max_job_user_part: %d",
max_backfill_job_per_user_part);
max_backfill_job_per_user_part = 0;
}
} else {
max_backfill_job_per_user_part = 0;
}
if ((max_backfill_job_per_user_part != 0) &&
(max_backfill_job_per_user_part > max_backfill_job_cnt)) {
warning("bf_max_job_user_part > bf_max_job_test (%u > %u)",
max_backfill_job_per_user_part, max_backfill_job_cnt);
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_assoc="))) {
max_backfill_job_per_assoc = atoi(tmp_ptr + 17);
if (max_backfill_job_per_assoc < 0) {
error("Invalid SchedulerParameters bf_max_job_assoc: %d",
max_backfill_job_per_assoc);
max_backfill_job_per_assoc = 0;
}
} else {
max_backfill_job_per_assoc = 0;
}
if ((max_backfill_job_per_assoc != 0) &&
(max_backfill_job_per_assoc > max_backfill_job_cnt)) {
warning("bf_max_job_assoc > bf_max_job_test (%u > %u)",
max_backfill_job_per_assoc, max_backfill_job_cnt);
}
if ((max_backfill_job_per_assoc != 0) &&
(max_backfill_job_per_user != 0)) {
error("Both bf_max_job_user and bf_max_job_assoc are set: "
"bf_max_job_assoc taking precedence.");
max_backfill_job_per_user = 0;
}
bf_min_age_reserve = 0;
if ((tmp_ptr = xstrcasestr(sched_params, "bf_min_age_reserve="))) {
int min_age = atoi(tmp_ptr + 19);
if (min_age < 0 || min_age > MAX_BF_MIN_AGE_RESERVE) {
error("Invalid SchedulerParameters bf_min_age_reserve: %d",
min_age);
} else {
bf_min_age_reserve = min_age;
}
}
bf_min_prio_reserve = 0;
if ((tmp_ptr = xstrcasestr(sched_params, "bf_min_prio_reserve="))) {
unsigned long long int min_prio;
tmp_ptr += 20;
min_prio = strtoull(tmp_ptr, NULL, 10);
if (!min_prio || min_prio > MAX_BF_MIN_PRIO_RESERVE) {
error("Invalid SchedulerParameters bf_min_prio_reserve: %llu",
min_prio);
} else {
bf_min_prio_reserve = (uint32_t) min_prio;
}
}
	/* bf_continue makes backfill resume where it left off if interrupted */
if (xstrcasestr(sched_params, "bf_continue")) {
backfill_continue = true;
} else {
backfill_continue = false;
}
if (xstrcasestr(sched_params, "assoc_limit_stop")) {
assoc_limit_stop = true;
} else {
assoc_limit_stop = false;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_interval="))) {
yield_interval = atoi(tmp_ptr + 18);
if ((yield_interval <= 0) ||
(yield_interval > MAX_BF_YIELD_INTERVAL)) {
error("Invalid backfill scheduler bf_yield_interval: %d",
yield_interval);
yield_interval = YIELD_INTERVAL;
}
} else {
yield_interval = YIELD_INTERVAL;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_sleep="))) {
yield_sleep = (int64_t) atoll(tmp_ptr + 15);
if (yield_sleep <= 0 || yield_sleep > MAX_YIELD_SLEEP) {
error("Invalid backfill scheduler bf_yield_sleep: %d",
yield_sleep);
yield_sleep = YIELD_SLEEP;
}
} else {
yield_sleep = YIELD_SLEEP;
}
bf_hetjob_prio = 0;
if ((tmp_ptr = xstrcasestr(sched_params, "bf_hetjob_prio="))) {
tmp_ptr += 15;
if (!xstrncasecmp(tmp_ptr, "min", 3))
bf_hetjob_prio |= HETJOB_PRIO_MIN;
else if (!xstrncasecmp(tmp_ptr, "max", 3))
bf_hetjob_prio |= HETJOB_PRIO_MAX;
else if (!xstrncasecmp(tmp_ptr, "avg", 3))
bf_hetjob_prio |= HETJOB_PRIO_AVG;
else
error("Invalid SchedulerParameters bf_hetjob_prio: %s",
tmp_ptr);
}
bf_hetjob_immediate = false;
if (xstrcasestr(sched_params, "bf_hetjob_immediate"))
bf_hetjob_immediate = true;
if (bf_hetjob_immediate && !bf_hetjob_prio) {
bf_hetjob_prio |= HETJOB_PRIO_MIN;
info("bf_hetjob_immediate automatically sets bf_hetjob_prio=min");
}
if (xstrcasestr(sched_params, "bf_one_resv_per_job"))
bf_one_resv_per_job = true;
else
bf_one_resv_per_job = false;
if (xstrcasestr(sched_params, "bf_allow_magnetic_slot"))
bf_allow_magnetic_slot = true;
else
bf_allow_magnetic_slot = false;
if (xstrcasestr(sched_params, "bf_running_job_reserve"))
bf_running_job_reserve = true;
else
bf_running_job_reserve = false;
if (xstrcasestr(sched_params, "bf_licenses")) {
bf_licenses = true;
bf_running_job_reserve = true;
} else {
bf_licenses = false;
}
if (xstrcasestr(sched_params, "bf_topopt_enable")) {
bf_topopt_enable = true;
} else {
bf_topopt_enable = false;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_topopt_iterations="))) {
bf_topopt_iterations = atoi(tmp_ptr + 21);
if ((bf_topopt_iterations <= 1) ||
(bf_topopt_iterations > MAX_ORACLE_DEPTH)) {
error("Invalid backfill scheduler bf_topopt_iterations: %d",
bf_topopt_iterations);
bf_topopt_iterations = ORACLE_DEPTH;
}
} else {
bf_topopt_iterations = ORACLE_DEPTH;
}
if ((tmp_ptr = xstrcasestr(sched_params, "max_rpc_cnt=")))
max_rpc_cnt = atoi(tmp_ptr + 12);
else if ((tmp_ptr = xstrcasestr(sched_params, "max_rpc_count=")))
max_rpc_cnt = atoi(tmp_ptr + 14);
else
max_rpc_cnt = 0;
if ((max_rpc_cnt < 0) || (max_rpc_cnt > MAX_MAX_RPC_CNT)) {
error("Invalid SchedulerParameters max_rpc_cnt: %d",
max_rpc_cnt);
max_rpc_cnt = 0;
}
if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_rpc_cnt=")))
tmp_val = strtol(tmp_ptr + 17, NULL, 10);
else if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_rpc_count=")))
tmp_val = strtol(tmp_ptr + 19, NULL, 10);
else
tmp_val = MAX((max_rpc_cnt / 10), 20);
if ((tmp_val < 0) || (tmp_val > MAX_YIELD_RPC_CNT)) {
error("Invalid SchedulerParameters bf_yield_rpc_cnt: %ld",
tmp_val);
yield_rpc_cnt = MAX((max_rpc_cnt / 10), 20);
} else {
yield_rpc_cnt = tmp_val;
}
if (xstrcasestr(sched_params, "time_min_as_soft_limit"))
soft_time_limit = true;
}
/* Note that slurm.conf has changed */
extern void backfill_reconfig(void)
{
slurm_mutex_lock(&config_lock);
config_flag = true;
slurm_mutex_unlock(&config_lock);
}
/* Update backfill scheduling statistics
* IN tv1 - start time
* IN tv2 - end (current) time
* IN node_space_recs - count of records in resources/time table being tested
*/
static void _do_diag_stats(struct timeval *tv1, struct timeval *tv2,
int node_space_recs)
{
uint32_t delta_t, real_time;
delta_t = (tv2->tv_sec - tv1->tv_sec) * 1000000;
delta_t += tv2->tv_usec;
delta_t -= tv1->tv_usec;
real_time = delta_t - bf_sleep_usec;
slurmctld_diag_stats.bf_cycle_counter++;
slurmctld_diag_stats.bf_cycle_sum += real_time;
slurmctld_diag_stats.bf_cycle_last = real_time;
slurmctld_diag_stats.bf_depth_sum += slurmctld_diag_stats.bf_last_depth;
slurmctld_diag_stats.bf_depth_try_sum +=
slurmctld_diag_stats.bf_last_depth_try;
if (slurmctld_diag_stats.bf_cycle_last >
slurmctld_diag_stats.bf_cycle_max) {
slurmctld_diag_stats.bf_cycle_max = slurmctld_diag_stats.
bf_cycle_last;
}
slurmctld_diag_stats.bf_table_size = node_space_recs;
slurmctld_diag_stats.bf_table_size_sum += node_space_recs;
}
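/*
 * Allocate planned_bitmap and seed it with any nodes already marked
 * NODE_STATE_PLANNED (e.g. restored from saved node state).
 */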
static void _init_planned_bitmap(void)
{
slurmctld_lock_t read_node_lock = { .node = READ_LOCK };
node_record_t *node_ptr = NULL;
xassert(!planned_bitmap);
planned_bitmap = bit_alloc(node_record_count);
/* Sync planned_bitmap with NODE_STATE_PLANNED nodes from state save */
lock_slurmctld(read_node_lock);
for (int i = 0; (node_ptr = next_node(&i)); i++)
if (IS_NODE_PLANNED(node_ptr))
bit_set(planned_bitmap, i);
unlock_slurmctld(read_node_lock);
}
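/*
 * Entry point to run a single backfill pass independently of backfill_agent():
 * load the configuration, build the hetjob list and planned bitmap, run
 * _attempt_backfill(), then release that state again.
 */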
extern void __attempt_backfill(void)
{
_load_config();
het_job_list = list_create(_het_job_map_del);
_init_planned_bitmap();
_attempt_backfill();
FREE_NULL_LIST(het_job_list);
FREE_NULL_BITMAP(planned_bitmap);
}
/* backfill_agent - detached thread periodically attempts to backfill jobs */
extern void *backfill_agent(void *args)
{
time_t now;
double wait_time;
static time_t last_backfill_time = 0;
/* Read config and partitions; Write jobs and nodes */
slurmctld_lock_t all_locks = {
READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
bool load_config;
bool short_sleep = false;
int backfill_cnt = 0;
#if HAVE_SYS_PRCTL_H
if (prctl(PR_SET_NAME, "bckfl", NULL, NULL, NULL) < 0) {
error("cannot set my name to %s %m", "backfill");
}
#endif
_load_config();
last_backfill_time = time(NULL);
_init_planned_bitmap();
het_job_list = list_create(_het_job_map_del);
while (!stop_backfill) {
if (short_sleep)
_my_sleep(USEC_IN_SEC);
else if (backfill_interval == -1)
_my_sleep(BACKFILL_INTERVAL * USEC_IN_SEC);
else
_my_sleep((int64_t) backfill_interval * USEC_IN_SEC);
if (stop_backfill)
break;
if (slurmctld_config.scheduling_disabled)
continue;
list_flush(het_job_list);
slurm_mutex_lock(&config_lock);
if (config_flag) {
config_flag = false;
load_config = true;
} else {
load_config = false;
}
slurm_mutex_unlock(&config_lock);
if (load_config)
_load_config();
if (backfill_interval == -1) {
log_flag(BACKFILL, "skipping backfill cycle for %ds",
BACKFILL_INTERVAL);
continue;
}
now = time(NULL);
wait_time = difftime(now, last_backfill_time);
if ((wait_time < backfill_interval) ||
job_is_completing(NULL) || _many_pending_rpcs() ||
!_more_work(last_backfill_time)) {
short_sleep = true;
continue;
}
slurm_mutex_lock(&check_bf_running_lock);
slurmctld_diag_stats.bf_active = 1;
slurm_mutex_unlock(&check_bf_running_lock);
lock_slurmctld(all_locks);
validate_all_reservations(true, false);
if ((backfill_cnt++ % 2) == 0)
_het_job_start_clear();
_attempt_backfill();
last_backfill_time = time(NULL);
(void) bb_g_job_try_stage_in();
unlock_slurmctld(all_locks);
slurm_mutex_lock(&check_bf_running_lock);
slurmctld_diag_stats.bf_active = 0;
slurm_mutex_unlock(&check_bf_running_lock);
short_sleep = false;
}
FREE_NULL_LIST(het_job_list);
xhash_free(user_usage_map); /* May have been init'ed if used */
FREE_NULL_BITMAP(planned_bitmap);
return NULL;
}
/*
* Clear the start_time and sched_nodes for all pending jobs. This is used to
* ensure that a job which can run in multiple partitions has its start_time and
* sched_nodes set to the partition offering the earliest start_time.
*/
static int _clear_job_estimates(void *x, void *arg)
{
job_record_t *job_ptr = (job_record_t *) x;
if (IS_JOB_PENDING(job_ptr)) {
job_ptr->start_time = 0;
xfree(job_ptr->sched_nodes);
}
return SLURM_SUCCESS;
}
/*
 * Return non-zero to break the backfill loop if there has been a change in
 * job, node, reservation or partition state, or if the backfill scheduler
 * needs to be stopped.
*/
static int _yield_locks(int64_t usec)
{
slurmctld_lock_t all_locks = {
READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
time_t job_update, node_update, part_update, config_update, resv_update;
bool load_config = false;
job_update = last_job_update;
node_update = last_node_update;
part_update = last_part_update;
config_update = slurm_conf.last_update;
resv_update = last_resv_update;
unlock_slurmctld(all_locks);
while (!stop_backfill) {
bf_sleep_usec += _my_sleep(usec);
slurm_mutex_lock(&slurmctld_config.thread_count_lock);
if ((max_rpc_cnt == 0) ||
(slurmctld_config.server_thread_count <= yield_rpc_cnt)) {
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
break;
}
verbose("continuing to yield locks, %d RPCs pending",
slurmctld_config.server_thread_count);
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
}
lock_slurmctld(all_locks);
slurm_mutex_lock(&config_lock);
if (config_flag)
load_config = true;
slurm_mutex_unlock(&config_lock);
if (((!backfill_continue) && ((last_job_update != job_update) ||
(last_node_update != node_update))) ||
(last_part_update != part_update) ||
(slurm_conf.last_update != config_update) ||
(validate_resv_cnt != 0) ||
(last_resv_update != resv_update) ||
stop_backfill || load_config)
return 1;
else
return 0;
}
/* Test if this job still has access to the specified partition. The job's
* available partitions may have changed when locks were released */
static bool _job_part_valid(job_record_t *job_ptr, part_record_t *part_ptr)
{
part_record_t *avail_part_ptr;
list_itr_t *part_iterator;
bool rc = false;
if (job_ptr->part_ptr_list) {
part_iterator = list_iterator_create(job_ptr->part_ptr_list);
while ((avail_part_ptr = list_next(part_iterator))) {
if (avail_part_ptr == part_ptr) {
rc = true;
break;
}
}
list_iterator_destroy(part_iterator);
} else if (job_ptr->part_ptr == part_ptr) {
rc = true;
}
return rc;
}
/* Determine if a job in the backfill queue is still runnable.
 * Job state could change while locks are periodically released */
static bool _job_runnable_now(job_record_t *job_ptr)
{
if (IS_JOB_REVOKED(job_ptr)) {
log_flag(BACKFILL, "%pJ revoked during bf yield", job_ptr);
return false;
}
if (!IS_JOB_PENDING(job_ptr)) { /* Started in other partition */
log_flag(BACKFILL, "%pJ started in other partition during bf yield",
job_ptr);
return false;
}
if (job_ptr->priority == 0) { /* Job has been held */
log_flag(BACKFILL, "%pJ job held during bf yield", job_ptr);
return false;
}
if (IS_JOB_COMPLETING(job_ptr)) { /* Started, requeue and completing */
log_flag(BACKFILL, "%pJ job started during bf yield", job_ptr);
return false;
}
/*
* Already reserved resources for either bf_max_job_array_resv or
* max_run_tasks number of jobs in the array. If max_run_tasks is 0, it
* wasn't set, so ignore it.
*/
if (job_ptr->array_recs &&
((job_ptr->array_recs->pend_run_tasks >= bf_max_job_array_resv) ||
(job_ptr->array_recs->max_run_tasks &&
((job_ptr->array_recs->pend_run_tasks +
job_ptr->array_recs->tot_run_tasks) >=
job_ptr->array_recs->max_run_tasks))))
return false;
return true;
}
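/*
 * Restore a job's preemption bookkeeping (preempt_start_time and
 * preempt_in_progress) from the saved copies, unless a new preemption has
 * started in the meantime, then reset the saved copies.
 */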
static void _restore_preempt_state(job_record_t *job_ptr,
time_t *tmp_preempt_start_time,
bool *tmp_preempt_in_progress)
{
if ((*tmp_preempt_start_time != 0)
&& (job_ptr->details->preempt_start_time == 0)) {
job_ptr->details->preempt_start_time =
*tmp_preempt_start_time;
job_ptr->preempt_in_progress = *tmp_preempt_in_progress;
}
*tmp_preempt_start_time = 0;
*tmp_preempt_in_progress = false;
}
/*
* IN/OUT: prio to be adjusted
* IN: value from current component partition
*/
static void _adjust_hetjob_prio(uint32_t *prio, uint32_t val)
{
if (!*prio)
*prio = val;
else if (bf_hetjob_prio & HETJOB_PRIO_MIN)
*prio = MIN(*prio, val);
else if (bf_hetjob_prio & HETJOB_PRIO_MAX)
*prio = MAX(*prio, val);
else if (bf_hetjob_prio & HETJOB_PRIO_AVG)
*prio += val;
}
/*
* IN: job_record pointer of a hetjob leader (caller responsible)
* RET: [min|max|avg] Priority of all components from same hetjob
*/
static uint32_t _hetjob_calc_prio(job_record_t *het_leader)
{
job_record_t *het_comp = NULL;
uint32_t prio = 0, tmp = 0, cnt = 0, i = 0, nparts = 0;
list_itr_t *iter = NULL;
if (bf_hetjob_prio & HETJOB_PRIO_MIN)
prio = INFINITE;
iter = list_iterator_create(het_leader->het_job_list);
while ((het_comp = list_next(iter))) {
if (het_comp->part_ptr_list &&
het_comp->prio_mult &&
het_comp->prio_mult->priority_array &&
(nparts = list_count(het_comp->part_ptr_list))) {
for (i = 0; i < nparts; i++) {
tmp = het_comp->prio_mult->priority_array[i];
if (tmp == 0) { /* job held */
prio = 0;
break;
}
_adjust_hetjob_prio(&prio, tmp);
cnt++;
}
if (prio == 0) /* job held */
break;
} else {
tmp = het_comp->priority;
if (tmp == 0) { /* job held */
prio = 0;
break;
}
_adjust_hetjob_prio(&prio, tmp);
cnt++;
}
if ((bf_hetjob_prio & HETJOB_PRIO_MIN) && (prio == 1))
break; /* Can not get lower */
}
list_iterator_destroy(iter);
if (prio && cnt && (bf_hetjob_prio & HETJOB_PRIO_AVG))
prio /= cnt;
return prio;
}
/*
* IN: job_record pointer of a hetjob leader (caller responsible)
* RET: [min|max|avg] PriorityTier of all components from same hetjob
*/
static uint32_t _hetjob_calc_prio_tier(job_record_t *het_leader)
{
job_record_t *het_comp = NULL;
part_record_t *part_ptr = NULL;
uint32_t prio_tier = 0, tmp = 0, cnt = 0;
list_itr_t *iter = NULL, *iter2 = NULL;
if (bf_hetjob_prio & HETJOB_PRIO_MIN)
prio_tier = NO_VAL16 - 1;
iter = list_iterator_create(het_leader->het_job_list);
while ((het_comp = list_next(iter))) {
if (het_comp->part_ptr_list &&
list_count(het_comp->part_ptr_list)) {
iter2 = list_iterator_create(het_comp->part_ptr_list);
while ((part_ptr = list_next(iter2))) {
tmp = part_ptr->priority_tier;
_adjust_hetjob_prio(&prio_tier, tmp);
cnt++;
}
list_iterator_destroy(iter2);
} else {
tmp = het_comp->part_ptr->priority_tier;
_adjust_hetjob_prio(&prio_tier, tmp);
cnt++;
}
if ((bf_hetjob_prio & HETJOB_PRIO_MIN) && (prio_tier == 0))
break; /* Minimum found. */
if ((bf_hetjob_prio & HETJOB_PRIO_MAX) &&
(prio_tier == (NO_VAL16 - 1)))
break; /* Maximum found. */
}
list_iterator_destroy(iter);
if (prio_tier && cnt && (bf_hetjob_prio & HETJOB_PRIO_AVG))
prio_tier /= cnt;
return prio_tier;
}
/*
* IN: job_record pointer of a hetjob leader (caller responsible)
* RET: true if any component from same hetjob has a reservation
*/
static bool _hetjob_any_resv(job_record_t *het_leader)
{
job_record_t *het_comp = NULL;
list_itr_t *iter = NULL;
bool any_resv = false;
iter = list_iterator_create(het_leader->het_job_list);
while (!any_resv && (het_comp = list_next(iter))) {
if (het_comp->resv_id != 0)
any_resv = true;
}
list_iterator_destroy(iter);
return any_resv;
}
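/*
 * List iteration callback: point a hetjob component at the shared
 * het_job_details_t structure owned by the hetjob leader.
 */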
static int _foreach_het_job_details(void *x, void *arg)
{
job_record_t *job_ptr = (job_record_t *) x;
job_ptr->het_details = (het_job_details_t *)arg;
return SLURM_SUCCESS;
}
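/*
 * List iteration callback: record the licenses held by an advance reservation
 * in the backfill node-space map so backfilled jobs are not scheduled against
 * licenses the reservation has claimed.
 */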
static int _bf_reserve_resv_licenses(void *x, void *arg)
{
slurmctld_resv_t *resv_ptr = x;
node_space_handler_t *ns_h = arg;
node_space_map_t *node_space = ns_h->node_space;
int *ns_recs_ptr = ns_h->node_space_recs;
time_t start_time, end_time;
job_record_t fake_job = {
.license_list = resv_ptr->license_list,
.resv_ptr = resv_ptr,
};
if (!resv_ptr->license_list)
return 0;
if (resv_ptr->end_time < node_space[0].begin_time)
return 0;
/* treat flex reservations as always active */
if (resv_ptr->flags & RESERVE_FLAG_FLEX) {
start_time = 0;
end_time = INFINITE;
} else {
/* align to resolution */
start_time = resv_ptr->start_time / backfill_resolution;
start_time *= backfill_resolution;
end_time = ROUNDUP(resv_ptr->end_time, backfill_resolution);
end_time *= backfill_resolution;
}
_add_reservation(start_time, end_time, NULL, &fake_job, node_space,
ns_recs_ptr, 0);
return 0;
}
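/*
 * List iteration callback: reserve a running job's whole-node allocation
 * and/or licenses in the backfill node-space map until the job's expected
 * end time, rounded to backfill_resolution.
 */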
static int _bf_reserve_running(void *x, void *arg)
{
job_record_t *job_ptr = (job_record_t *) x;
node_space_handler_t *ns_h = (node_space_handler_t *) arg;
node_space_map_t *node_space = ns_h->node_space;
int *ns_recs_ptr = ns_h->node_space_recs;
time_t end_time = job_ptr->end_time;
bool licenses, whole, preemptable;
bitstr_t *tmp_bitmap;
if (!job_ptr || !IS_JOB_RUNNING(job_ptr) || !job_ptr->job_resrcs)
return SLURM_SUCCESS;
whole = (job_ptr->job_resrcs->whole_node & WHOLE_NODE_REQUIRED) ||
(IS_JOB_WHOLE_TOPO(job_ptr));
licenses = (job_ptr->license_list);
if (!whole && !licenses)
return SLURM_SUCCESS;
preemptable = (slurm_job_preempt_mode(job_ptr) != PREEMPT_MODE_OFF);
if (preemptable && !licenses)
return SLURM_SUCCESS;
if (*ns_recs_ptr >= bf_node_space_size)
return SLURM_ERROR;
if (soft_time_limit && job_ptr->time_min) {
time_t now = time(NULL);
time_t soft_end = job_ptr->start_time + job_ptr->time_min * 60;
/*
* If over the soft limit, assume the job will use half of the
* remaining time until the hard limit.
*/
if (soft_end < now)
soft_end = now + (end_time - now) / 2;
end_time = soft_end;
}
end_time = ROUNDUP(end_time, backfill_resolution) * backfill_resolution;
if (preemptable || !whole) {
/* Reservation only needed for licenses. */
tmp_bitmap = bit_alloc(node_record_count);
} else {
tmp_bitmap = bit_copy(job_ptr->node_bitmap);
}
/*
* Ensure reservation start time is aligned to the start of the
* backfill map by sending 0 in instead of the actual start time.
* A long-running backfill cycle could lead to a skew of a few
* seconds - or significantly longer with bf_continue set - which
* would fragment the start of the backfill map.
*/
_add_reservation(0, end_time, tmp_bitmap, job_ptr, node_space,
ns_recs_ptr, 0);
FREE_NULL_BITMAP(tmp_bitmap);
return SLURM_SUCCESS;
}
static int _set_hetjob_details(void *x, void *arg)
{
job_record_t *job_ptr = (job_record_t *) x;
het_job_details_t *details = NULL;
if (IS_JOB_PENDING(job_ptr) && job_ptr->het_job_id &&
!job_ptr->het_job_offset && job_ptr->het_job_list) {
/*
* Pending hetjob leader component. Do calculations only once
* for whole hetjob. xmalloc memory for 1 het_details struct,
* but make the pointer accessible in all hetjob components.
*/
if (!job_ptr->het_details)
job_ptr->het_details =
xmalloc(sizeof(het_job_details_t));
details = job_ptr->het_details;
details->any_resv = _hetjob_any_resv(job_ptr);
details->priority_tier = _hetjob_calc_prio_tier(job_ptr);
details->priority = _hetjob_calc_prio(job_ptr);
list_for_each(job_ptr->het_job_list,
_foreach_het_job_details, details);
}
return SLURM_SUCCESS;
}
/* Fetch key from xhash_t item. Called from function ptr */
static void _bf_map_key_id(void *item, const char **key, uint32_t *key_len)
{
bf_user_usage_t *user = (bf_user_usage_t *)item;
xassert(user);
*key = (char *)&user->uid;
*key_len = sizeof(uid_t);
}
/* Free item from xhash_t. Called from function ptr */
static void _bf_map_free(void *item)
{
bf_user_usage_t *user = (bf_user_usage_t *)item;
if (!user)
return;
slurmdb_destroy_bf_usage_members(&user->bf_usage);
xfree(user);
}
/* Allocate new user and add to xhash_t map */
static bf_user_usage_t *_bf_map_add_user(xhash_t *map, uid_t uid)
{
bf_user_usage_t *user = xmalloc(sizeof(bf_user_usage_t));
user->uid = uid;
xhash_add(map, user);
return user;
}
/* Find user usage from uid. Add new empty entry to map if not found */
static slurmdb_bf_usage_t *_bf_map_find_add(xhash_t* map, uid_t uid)
{
bf_user_usage_t *user;
xassert(map != NULL);
if (!(user = xhash_get(map, (char *)&uid, sizeof(uid_t))))
user = _bf_map_add_user(map, uid);
return &user->bf_usage;
}
/*
* Check if limit exceeded. Reset usage if usage time is before current
* scheduling iteration time
*/
static bool _check_bf_usage(
slurmdb_bf_usage_t *usage, int limit, time_t sched_time)
{
if (usage->last_sched < sched_time) {
usage->last_sched = sched_time;
usage->count = 0;
return false;
}
return usage->count >= limit;
}
/*
* Check if job exceeds configured count limits
* returns true if count exceeded
*/
static bool _job_exceeds_max_bf_param(job_record_t *job_ptr,
time_t sched_start)
{
slurmdb_bf_usage_t *part_usage = NULL, *user_usage = NULL,
*assoc_usage = NULL, *user_part_usage = NULL;
slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
part_record_t *part_ptr = job_ptr->part_ptr;
if (max_backfill_job_per_user_part) {
xassert(part_ptr->bf_data);
user_part_usage = _bf_map_find_add(
part_ptr->bf_data->user_usage,
job_ptr->user_id);
if (_check_bf_usage(user_part_usage,
max_backfill_job_per_user_part,
sched_start)) {
log_flag(BACKFILL, "have already checked %u jobs for user %u on partition %s; skipping job %u, %pJ",
max_backfill_job_per_user_part,
job_ptr->user_id, job_ptr->part_ptr->name,
job_ptr->job_id, job_ptr);
return true;
}
}
if (max_backfill_job_per_part) {
xassert(part_ptr->bf_data);
part_usage = part_ptr->bf_data->job_usage;
if (_check_bf_usage(part_usage, max_backfill_job_per_part,
sched_start)) {
log_flag(BACKFILL, "have already checked %u jobs for partition %s; skipping %pJ",
max_backfill_job_per_part,
job_ptr->part_ptr->name, job_ptr);
return true;
}
}
if (max_backfill_job_per_assoc) {
if (assoc_ptr) {
if (!assoc_ptr->bf_usage)
assoc_ptr->bf_usage =
xmalloc(sizeof(slurmdb_bf_usage_t));
assoc_usage = assoc_ptr->bf_usage;
if (_check_bf_usage(assoc_usage,
max_backfill_job_per_assoc,
sched_start)) {
log_flag(BACKFILL, "have already checked %u jobs for user %u, assoc %u; skipping %pJ",
max_backfill_job_per_assoc,
job_ptr->user_id, job_ptr->assoc_id,
job_ptr);
return true;
}
} else {
/* Null assoc_ptr indicates no database */
log_flag(BACKFILL, "no assoc for job %u, required for parameter bf_max_job_per_assoc",
job_ptr->job_id);
assoc_usage = NULL;
}
}
if (max_backfill_job_per_user) {
if (assoc_ptr && assoc_ptr->user_rec) {
if (!assoc_ptr->user_rec->bf_usage)
assoc_ptr->user_rec->bf_usage =
xmalloc(sizeof(slurmdb_bf_usage_t));
user_usage = assoc_ptr->user_rec->bf_usage;
} else {
/* No database, or user rec missing from assoc */
if (!user_usage_map)
user_usage_map = xhash_init(_bf_map_key_id,
_bf_map_free);
user_usage = _bf_map_find_add(user_usage_map,
job_ptr->user_id);
}
if (_check_bf_usage(user_usage, max_backfill_job_per_user,
sched_start)) {
log_flag(BACKFILL, "have already checked %u jobs for user %u; skipping %pJ",
max_backfill_job_per_user, job_ptr->user_id,
job_ptr);
return true;
}
}
/*
* Don't count queue records for magnetic reservation against
* backfill limits.
*/
if ((job_ptr->bit_flags & JOB_MAGNETIC) && !bf_allow_magnetic_slot)
return false;
/* Increment our user/partition limit counters as needed */
if (user_part_usage)
user_part_usage->count++;
if (part_usage)
part_usage->count++;
if (user_usage)
user_usage->count++;
if (assoc_usage)
assoc_usage->count++;
return false;
}
/*
* Handle the planned list.
* set - If true we are setting states, else we clear them.
*/
static void _handle_planned(bool set)
{
node_record_t *node_ptr;
bool node_update = false, select_synced = false;
if (!planned_bitmap)
return;
for (int n = 0; (n = bit_ffs_from_bit(planned_bitmap, n)) >= 0; n++) {
if (!(node_ptr = node_record_table_ptr[n])) {
/* Node could have been deleted while planned */
bit_clear(planned_bitmap, n);
continue;
}
if (set) {
/*
* If the node is fully allocated ignore this flag.
* This only really matters for IDLE and MIXED.
*/
if (IS_NODE_ALLOCATED(node_ptr)) {
uint16_t idle_cpus = 0;
if (!select_synced) {
select_g_select_nodeinfo_set_all();
select_synced = true;
}
idle_cpus = node_ptr->cpus_efctv -
node_ptr->alloc_cpus;
if (idle_cpus &&
(idle_cpus < node_ptr->cpus_efctv))
/* Mixed node as planned */
goto mixed;
/*
* Node fully allocated. Remove from planned.
* This is happening when a mixed node gets
* fully allocated while looping in
* _attempt_backfill (BF sched loop)
*/
node_ptr->node_state &= ~NODE_STATE_PLANNED;
node_update = true;
bit_clear(planned_bitmap, n);
} else {
/* Idle node as planned */
mixed:
node_ptr->node_state |= NODE_STATE_PLANNED;
node_update = true;
}
} else {
/* Reset planned state for all nodes */
node_ptr->node_state &= ~NODE_STATE_PLANNED;
node_update = true;
bit_clear(planned_bitmap, n);
}
log_flag(BACKFILL, "%s: %s state is %s",
set ? "set" : "cleared",
node_ptr->name,
node_state_string(node_ptr->node_state));
}
if (node_update)
last_node_update = time(NULL);
}
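/*
 * Compute the start and end of a node-space slot for a job: the end covers
 * node boot time plus the job's time limit, and both values are aligned to
 * backfill_resolution boundaries.
 */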
static void _set_slot_time(job_record_t *job_ptr, uint32_t time_limit,
uint32_t boot_time, uint32_t *start, uint32_t *end)
{
*start = job_ptr->start_time;
*end = *start + boot_time + (time_limit * 60) + backfill_resolution - 1;
*start = (*start / backfill_resolution) * backfill_resolution;
*end = (*end / backfill_resolution) * backfill_resolution;
}
/*
 * Mark each node's user status and last job end time for one running job.
 * Return positive if a node's last_job_end was updated, else return 0
*/
static int _mark_nodes_usage(void *x, void *arg)
{
job_record_t *job_ptr = x;
node_used_t *nodes_used = arg;
bool last_job_end_updated = false;
bool owned;
int i;
xassert(job_ptr);
xassert(nodes_used);
if (IS_JOB_PENDING(job_ptr) || IS_JOB_COMPLETED(job_ptr) ||
!job_ptr->node_bitmap)
return last_job_end_updated;
owned = ((job_ptr->details->whole_node & WHOLE_NODE_USER) ||
(job_ptr->part_ptr &&
(job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)));
for (i = 0; (i = bit_ffs_from_bit(job_ptr->node_bitmap, i)) >= 0; i++) {
if (!nodes_used[i].allocated) {
nodes_used[i].allocated = true;
nodes_used[i].uid = job_ptr->user_id;
nodes_used[i].node_index = i;
nodes_used[i].owned = owned;
} else if (!nodes_used[i].owned && !nodes_used[i].mixed_user) {
nodes_used[i].mixed_user =
nodes_used[i].uid != job_ptr->user_id;
nodes_used[i].owned = owned;
}
if (!nodes_used[i].mcs_label && job_ptr->mcs_label &&
slurm_mcs_get_select(job_ptr) == 1) {
/*
* We do not need to copy mcs_label, jobs are not purged
* during backfill, so this memory should always be
* valid.
*/
nodes_used[i].mcs_label = job_ptr->mcs_label;
}
if (nodes_used[i].last_job_end < job_ptr->end_time) {
nodes_used[i].last_job_end = job_ptr->end_time;
last_job_end_updated = true;
}
}
return last_job_end_updated;
}
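/* list_sort() comparator: order node_used_t entries by descending last_job_end */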
static int _cmp_last_job_end(void *x, void *y)
{
node_used_t *node1 = *(node_used_t **) x;
node_used_t *node2 = *(node_used_t **) y;
if (node1->last_job_end < node2->last_job_end)
return 1;
else if (node1->last_job_end > node2->last_job_end)
return -1;
return 0;
}
/* For each node, determine whether multiple users are running on it and find
 * its latest job end time */
static void _init_node_used_array_and_list(node_used_t **nodes_used,
list_t **nodes_used_list)
{
xassert(nodes_used && !*nodes_used);
xassert(nodes_used_list && !*nodes_used_list);
*nodes_used = xcalloc(node_record_count, sizeof(**nodes_used));
*nodes_used_list = list_create(NULL); /* NULL to avoid double free */
list_for_each(job_list, _mark_nodes_usage, *nodes_used);
for (int i = 0; i < node_record_count; i++)
list_append(*nodes_used_list, &(*nodes_used)[i]);
/* Sort list in descending order of last_job_end */
list_sort(*nodes_used_list, _cmp_last_job_end);
}
static bool _user_conflicts(bool is_exclusive_user, bool job_user_on_node,
node_used_t *node)
{
if (is_exclusive_user && !node->mixed_user && job_user_on_node)
return false; /* user alone on node */
if (!is_exclusive_user && (!node->owned || job_user_on_node))
return false; /* node not owned or the user owns the node */
return true; /* can't use node due to user conflict */
}
static bool _mcs_label_conflicts(char *job_mcs_label, char *node_mcs_label)
{
if (job_mcs_label && !xstrcmp(node_mcs_label, job_mcs_label))
return false; /* node already has required mcs_label */
if (!job_mcs_label && !node_mcs_label)
return false; /* job requires no mcs_label and node has none */
return true; /* can't use node due to mcs_label conflict */
}
/*
* Check if a node can be used; if not, remove it. If the node can't be
* removed, delay the job's start time.
* Return true to stop scanning the list (the remaining nodes are usable
* or the start was delayed).
*/
static int _rm_node_or_delay_start(void *x, void *arg)
{
node_used_t *node = x;
filter_exclusive_args_t *args = arg;
bool job_user_on_node = node->uid == args->job_user;
if (!node->allocated)
return true; /* following nodes are idle */
if (node->last_job_end <= args->start_time)
return true; /* following nodes will be idle by start_time */
if (!bit_test(args->node_bitmap, node->node_index))
return false; /* not available to start with */
if (!_user_conflicts(args->is_exclusive_user, job_user_on_node, node) &&
!_mcs_label_conflicts(args->mcs_label, node->mcs_label))
return false; /* job user and mcs don't conflict with node's */
/* can't use this node */
*(args->later_start) = node->last_job_end;
if ((args->node_cnt > args->min_nodes) &&
(!args->req_nodes ||
!bit_test(args->req_nodes, node->node_index))) {
/* able to remove the node */
bit_clear(args->node_bitmap, node->node_index);
args->node_cnt--;
return false;
}
/* can't remove the node, delay job start */
args->delay_start = true;
return true;
}
/* Return true if start_time was delayed */
static bool _filter_exclusive_user_mcs_nodes(job_record_t *job_ptr,
int mcs_select,
uint32_t min_nodes,
list_t *nodes_used_list,
time_t start_time,
time_t *later_filter_start,
bitstr_t *node_bitmap)
{
*later_filter_start = 0;
filter_exclusive_args_t args = {
.min_nodes = min_nodes,
.job_user = job_ptr->user_id,
.node_bitmap = node_bitmap,
.req_nodes = job_ptr->details->req_node_bitmap,
.node_cnt = bit_set_count(node_bitmap),
.later_start = later_filter_start,
.start_time = start_time,
};
/*
* Filter out any nodes used by other users (is_exclusive_user = true),
* or nodes owned by other users (is_exclusive_user = false).
*/
if ((job_ptr->details->whole_node & WHOLE_NODE_USER) ||
(job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER))
args.is_exclusive_user = true;
/* Need to filter out any nodes allocated with other mcs */
args.mcs_label = (mcs_select == 1) ? job_ptr->mcs_label : NULL;
/* Note that nodes_used_list is sorted in descending order of job end */
list_find_first(nodes_used_list, _rm_node_or_delay_start, &args);
return args.delay_start;
}
/* This is for use in _attempt_backfill() only */
#define SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, later_start, \
orig_time_limit, orig_start_time) \
{ \
_set_job_time_limit(job_ptr, orig_time_limit); \
if (later_start && !job_no_reserve) { \
log_flag(BACKFILL, "Try later %pJ later_start %ld", \
job_ptr, later_start); \
job_ptr->start_time = 0; \
goto TRY_LATER; \
} \
/* \
* Job cannot start until too far in the future. \
* Use orig_start_time; if the job can't start in a \
* different partition it will be 0. \
*/ \
log_flag(BACKFILL, "Can't schedule %pJ in partition %s", \
job_ptr, job_ptr->part_ptr->name); \
job_ptr->start_time = orig_start_time; \
continue; /* not runnable in this partition */ \
}
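/*
 * Core of a backfill cycle: build a priority-ordered job queue and a
 * node_space map of expected node/license availability over time, then
 * for each pending job either start it immediately or, when permitted,
 * record a future reservation so lower priority work cannot delay it.
 */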
static void _attempt_backfill(void)
{
DEF_TIMERS;
list_t *job_queue = NULL;
job_queue_rec_t *job_queue_rec = NULL;
int bb, i, j, node_space_recs, mcs_select = 0;
slurmdb_qos_rec_t *qos_ptr = NULL;
job_record_t *job_ptr = NULL;
part_record_t *part_ptr;
uint32_t end_time, end_reserve, deadline_time_limit, boot_time;
uint32_t orig_end_time;
uint32_t time_limit, comp_time_limit, orig_time_limit = 0, part_time_limit;
uint32_t min_nodes, max_nodes, req_nodes;
bitstr_t *active_bitmap = NULL, *avail_bitmap = NULL;
bitstr_t *resv_bitmap = NULL, *excluded_topo_bitmap = NULL;
time_t now, sched_start, later_start, start_res, resv_end, window_end;
time_t het_job_time, orig_sched_start, orig_start_time = (time_t) 0;
time_t later_filter_start;
node_space_map_t *node_space;
node_used_t *nodes_used = NULL;
list_t *nodes_used_list = NULL;
struct timeval bf_time1, bf_time2;
int error_code;
int job_test_count = 0, test_time_count = 0, pend_time;
bool already_counted, many_rpcs = false;
job_record_t *reject_array_job = NULL;
part_record_t *reject_array_part = NULL;
slurmdb_qos_rec_t *reject_array_qos = NULL;
slurmctld_resv_t *reject_array_resv = NULL;
bool reject_array_use_prefer = false;
uint32_t start_time, array_start_time = 0;
struct timeval start_tv;
uint32_t test_array_job_id = 0;
uint32_t test_array_count = 0;
uint32_t job_no_reserve;
bool is_job_array_head, resv_overlap = false;
uint8_t save_share_res = 0, save_whole_node = 0;
int test_fini;
uint32_t qos_flags = 0;
time_t qos_blocked_until = 0, qos_part_blocked_until = 0;
time_t tmp_preempt_start_time = 0;
bool tmp_preempt_in_progress = false;
bitstr_t *tmp_bitmap = NULL;
bool state_changed_break = false, nodes_planned = false;
bitstr_t *next_bitmap = NULL, *current_bitmap = NULL;
resv_exc_t resv_exc = { 0 };
will_run_data_t will_run_data = { 0 };
bool overlap_tested = false;
/* QOS Read lock */
assoc_mgr_lock_t qos_read_lock = {
.qos = READ_LOCK,
};
bf_sleep_usec = 0;
job_start_cnt = 0;
job_test_cnt = 0;
if (!fed_mgr_sibs_synced()) {
info("returning, federation siblings not synced yet");
return;
}
(void) bb_g_load_state(false);
START_TIMER;
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL)
info("beginning");
else
debug("beginning");
sched_start = orig_sched_start = now = time(NULL);
gettimeofday(&start_tv, NULL);
_handle_planned(nodes_planned);
job_queue = build_job_queue(true, true);
job_test_count = list_count(job_queue);
if (job_test_count == 0) {
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL)
info("no jobs to backfill");
else
debug("no jobs to backfill");
FREE_NULL_LIST(job_queue);
return;
} else
debug("%u jobs to backfill", job_test_count);
list_for_each(job_list, _clear_job_estimates, NULL);
if (bf_hetjob_prio)
list_for_each(job_list, _set_hetjob_details, NULL);
gettimeofday(&bf_time1, NULL);
slurmctld_diag_stats.bf_queue_len = job_test_count;
slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
bf_queue_len;
job_test_count = 0;
slurmctld_diag_stats.bf_last_depth = 0;
slurmctld_diag_stats.bf_last_depth_try = 0;
slurmctld_diag_stats.bf_when_last_cycle = now;
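/*
 * node_space is a time-ordered map of availability: a linked list
 * threaded through the array via the "next" index (starting at element
 * 0), each record giving the nodes and licenses expected to be free
 * between begin_time and end_time.
 */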
node_space = xcalloc((bf_node_space_size + 1),
sizeof(node_space_map_t));
node_space[0].begin_time = sched_start / backfill_resolution;
node_space[0].begin_time *= backfill_resolution;
window_end = (sched_start + backfill_window) / backfill_resolution;
window_end *= backfill_resolution;
node_space[0].end_time = window_end;
node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
/* Make "resuming" nodes available to be scheduled in backfill */
bit_or(node_space[0].avail_bitmap, rs_node_bitmap);
if (bf_licenses)
node_space[0].licenses =
bf_licenses_initial(bf_running_job_reserve);
if (bf_topopt_enable) {
node_space[0].fragmentation = topology_g_get_fragmentation(
node_space[0].avail_bitmap);
}
node_space[0].next = 0;
node_space_recs = 1;
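/*
 * With bf_running_job_reserve set, seed the map with the resources of
 * currently running jobs (and with reservation licenses when license
 * tracking is enabled) so they are not treated as free before those
 * jobs are expected to end.
 */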
if (bf_running_job_reserve) {
node_space_handler_t node_space_handler;
node_space_handler.node_space = node_space;
node_space_handler.node_space_recs = &node_space_recs;
if (bf_licenses)
list_for_each(resv_list, _bf_reserve_resv_licenses,
&node_space_handler);
list_for_each(job_list, _bf_reserve_running,
&node_space_handler);
}
_init_node_used_array_and_list(&nodes_used, &nodes_used_list);
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL_MAP)
_dump_node_space_table(node_space);
if (assoc_limit_stop) {
assoc_mgr_lock(&qos_read_lock);
list_for_each(assoc_mgr_qos_list,
_clear_qos_blocked_times, NULL);
assoc_mgr_unlock(&qos_read_lock);
}
sort_job_queue(job_queue);
/* Ignore nodes that have been set as available during this cycle. */
bit_clear_all(bf_ignore_node_bitmap);
if (bf_topopt_enable)
init_oracle();
while (1) {
uint32_t bf_job_priority, prio_reserve;
bool get_boot_time = false;
bool licenses_unavail;
bool use_prefer = false;
slurmctld_resv_t *resv_ptr = NULL;
/* Run some final guaranteed logic after each job iteration */
if (job_ptr) {
job_resv_clear_magnetic_flag(job_ptr);
fill_array_reasons(job_ptr, reject_array_job);
/* Restore preemption state if needed. */
_restore_preempt_state(job_ptr, &tmp_preempt_start_time,
&tmp_preempt_in_progress);
/*
* Restore the original time limit in any corner case not yet
* handled, such as when we are looping through array tasks.
*/
if ((qos_flags & QOS_FLAG_NO_RESERVE) &&
slurm_conf.preempt_mode && orig_time_limit &&
(orig_time_limit != job_ptr->time_limit))
job_ptr->time_limit = orig_time_limit;
/*
* An array job with pending tasks should take on the
* start_time of the earliest pending task in the
* array.
*/
if (job_ptr->array_recs && array_start_time)
job_ptr->start_time = array_start_time;
}
array_start_time = 0;
xfree(job_queue_rec);
job_queue_rec = list_pop(job_queue);
if (!job_queue_rec) {
log_flag(BACKFILL, "reached end of job queue");
_set_bf_exit(BF_EXIT_END);
break;
}
if (job_test_cnt >=
max_backfill_job_cnt) {
log_flag(BACKFILL, "bf_max_job_test: limit of %d reached",
max_backfill_job_cnt);
_set_bf_exit(BF_EXIT_MAX_JOB_TEST);
break;
}
if (window_end < now) {
log_flag(BACKFILL, "Now after current backfill window");
_set_bf_exit(BF_EXIT_TIMEOUT);
break;
}
job_ptr = job_queue_rec->job_ptr;
part_ptr = job_queue_rec->part_ptr;
bf_job_priority = job_queue_rec->priority;
qos_ptr = job_queue_rec->qos_ptr;
use_prefer = job_queue_rec->use_prefer;
if (job_ptr->array_recs &&
(job_queue_rec->array_task_id == NO_VAL))
is_job_array_head = true;
else
is_job_array_head = false;
if (slurmctld_config.shutdown_time ||
(difftime(time(NULL), orig_sched_start) >= bf_max_time)) {
_set_bf_exit(BF_EXIT_TIMEOUT);
break;
}
many_rpcs = false;
slurm_mutex_lock(&slurmctld_config.thread_count_lock);
if ((max_rpc_cnt > 0) &&
(slurmctld_config.server_thread_count >= max_rpc_cnt))
many_rpcs = true;
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
if (many_rpcs || (slurm_delta_tv(&start_tv) >= yield_interval)) {
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) {
END_TIMER;
info("yielding locks after testing %u(%d) jobs, %s",
slurmctld_diag_stats.bf_last_depth,
job_test_count, TIME_STR);
}
/* Sync planned nodes before yielding locks */
nodes_planned = true;
_handle_planned(nodes_planned);
if (_yield_locks(yield_sleep)) {
log_flag(BACKFILL, "system state changed, breaking out after testing %u(%d) jobs",
slurmctld_diag_stats.bf_last_depth,
job_test_count);
state_changed_break = true;
_set_bf_exit(BF_EXIT_STATE_CHANGED);
break;
}
/* Reset backfill scheduling timers, resume testing */
sched_start = time(NULL);
gettimeofday(&start_tv, NULL);
job_test_count = 0;
test_time_count = 0;
nodes_planned = false;
START_TIMER;
}
if (is_job_array_head &&
(job_ptr->array_task_id != NO_VAL)) {
/* Job array element started in other partition,
* reset pointer to "master" job array record */
log_flag(BACKFILL, "%pJ array scheduled during bf yield, try master",
job_ptr);
job_ptr = find_job_record(job_ptr->array_job_id);
if (!job_ptr) /* All task array elements started */
continue;
job_queue_rec->job_ptr = job_ptr;
}
/*
* Establish baseline (worst case) start time for hetjob
* Update time once start time estimate established
*/
_het_job_start_set(job_ptr, (now + YEAR_SECONDS), NO_VAL);
if (job_ptr->het_job_id &&
(job_ptr->state_reason == WAIT_NO_REASON)) {
xfree(job_ptr->state_desc);
job_ptr->state_reason = WAIT_RESOURCES;
}
if (!_job_runnable_now(job_ptr))
continue;
if (!part_ptr)
continue;
if (!_job_part_valid(job_ptr, part_ptr))
continue; /* Partition change during lock yield */
if (job_ptr->resv_list)
job_queue_rec_resv_list(job_queue_rec);
else
job_queue_rec_magnetic_resv(job_queue_rec);
resv_ptr = job_ptr->resv_ptr;
xfree(job_queue_rec);
job_ptr->bit_flags |= BACKFILL_SCHED;
job_ptr->last_sched_eval = now;
job_ptr->part_ptr = part_ptr;
job_ptr->priority = bf_job_priority;
job_ptr->qos_ptr = qos_ptr;
mcs_select = slurm_mcs_get_select(job_ptr);
het_job_time = _het_job_start_find(job_ptr);
if (het_job_time > (now + backfill_window))
continue;
if (job_ptr->qos_ptr) {
assoc_mgr_lock_t locks = {
.assoc = READ_LOCK,
.qos = READ_LOCK,
};
assoc_mgr_lock(&locks);
if (job_ptr->assoc_ptr
&& (accounting_enforce & ACCOUNTING_ENFORCE_QOS)
&& ((job_ptr->qos_ptr->id >= g_qos_count) ||
!job_ptr->assoc_ptr->usage ||
!job_ptr->assoc_ptr->usage->valid_qos ||
!bit_test(job_ptr->assoc_ptr->usage->valid_qos,
job_ptr->qos_ptr->id))
&& !job_ptr->limit_set.qos) {
debug("%pJ has invalid QOS",
job_ptr);
assoc_mgr_unlock(&locks);
job_fail_qos(job_ptr, __func__, false);
last_job_update = now;
continue;
} else if (job_ptr->state_reason == FAIL_QOS) {
xfree(job_ptr->state_desc);
job_ptr->state_reason = WAIT_NO_REASON;
last_job_update = now;
}
assoc_mgr_unlock(&locks);
}
assoc_mgr_lock(&qos_read_lock);
if (job_ptr->qos_ptr) {
qos_flags = job_ptr->qos_ptr->flags;
qos_blocked_until = job_ptr->qos_ptr->blocked_until;
} else {
qos_flags = 0;
qos_blocked_until = 0;
}
if (job_ptr->part_ptr->qos_ptr)
qos_part_blocked_until =
job_ptr->part_ptr->qos_ptr->blocked_until;
else
qos_part_blocked_until = 0;
if (part_policy_valid_qos(job_ptr->part_ptr, job_ptr->qos_ptr,
job_ptr->user_id, job_ptr) !=
SLURM_SUCCESS) {
assoc_mgr_unlock(&qos_read_lock);
continue;
}
assoc_mgr_unlock(&qos_read_lock);
if (!assoc_limit_stop &&
!acct_policy_job_runnable_pre_select(job_ptr, false)) {
continue;
}
if (!(prio_reserve = acct_policy_get_prio_thresh(
job_ptr, false)))
prio_reserve = bf_min_prio_reserve;
if (prio_reserve)
log_flag(BACKFILL, "%pJ has a prio_reserve of %u",
job_ptr, prio_reserve);
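/*
 * TEST_NOW_ONLY: only attempt an immediate start for this job; do not
 * create a future backfill reservation on its behalf.
 */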
job_no_reserve = 0;
if (prio_reserve &&
(job_ptr->priority < prio_reserve)) {
job_no_reserve = TEST_NOW_ONLY;
} else if (bf_min_age_reserve && job_ptr->details->begin_time) {
pend_time = difftime(time(NULL),
job_ptr->details->begin_time);
if (pend_time < bf_min_age_reserve)
job_no_reserve = TEST_NOW_ONLY;
}
if (bf_one_resv_per_job && job_ptr->start_time) {
log_flag(BACKFILL, "%pJ already added a backfill reservation. Test immediate start only for partition %s",
job_ptr, job_ptr->part_ptr->name);
job_no_reserve = TEST_NOW_ONLY;
}
/*
* If we are trying to schedule preferred features, don't
* reserve.
*/
if (use_prefer)
job_no_reserve = TEST_NOW_ONLY;
/* If partition data is needed and not yet initialized, do so */
if (!job_ptr->part_ptr->bf_data &&
(bf_job_part_count_reserve ||
max_backfill_job_per_user_part ||
max_backfill_job_per_part)) {
bf_part_data_t *part_data =
xmalloc(sizeof(bf_part_data_t));
part_data->job_usage =
xmalloc(sizeof(slurmdb_bf_usage_t));
part_data->resv_usage =
xmalloc(sizeof(slurmdb_bf_usage_t));
part_data->user_usage = xhash_init(_bf_map_key_id,
_bf_map_free);
job_ptr->part_ptr->bf_data = part_data;
}
if ((job_no_reserve == 0) && bf_job_part_count_reserve) {
if (_check_bf_usage(
job_ptr->part_ptr->bf_data->resv_usage,
bf_job_part_count_reserve,
orig_sched_start))
job_no_reserve = TEST_NOW_ONLY;
}
if (job_ptr->preempt_in_progress)
continue; /* scheduled in another partition */
orig_start_time = job_ptr->start_time;
orig_time_limit = job_ptr->time_limit;
next_task:
/*
* Restore time_limit for array tasks, just in case it has been
* overridden. This is a no-op in all other cases.
*/
job_ptr->time_limit = orig_time_limit;
/*
* Save the current preemption state. Reset preemption state
* in the job_ptr so a job array can preempt multiple jobs.
*/
if (job_ptr->preempt_in_progress) {
tmp_preempt_in_progress = job_ptr->preempt_in_progress;
tmp_preempt_start_time = job_ptr->details->preempt_start_time;
job_ptr->details->preempt_start_time = 0;
job_ptr->preempt_in_progress = false;
}
/*
* Don't count queue records for magnetic reservation against
* backfill limits.
*/
if ((job_ptr->bit_flags & JOB_MAGNETIC) &&
!bf_allow_magnetic_slot) {
already_counted = true;
} else {
job_test_count++;
slurmctld_diag_stats.bf_last_depth++;
already_counted = false;
}
if (!IS_JOB_PENDING(job_ptr) || /* Started in other partition */
(job_ptr->priority == 0)) /* Job has been held */
continue;
if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
if (reject_array_job &&
(reject_array_job->array_job_id ==
job_ptr->array_job_id) &&
(reject_array_part == part_ptr) &&
(reject_array_qos == qos_ptr) &&
(reject_array_resv == resv_ptr) &&
(reject_array_use_prefer == use_prefer))
continue; /* already rejected array element */
/* assume reject whole array for now, clear if OK */
reject_array_job = job_ptr;
reject_array_part = part_ptr;
reject_array_qos = qos_ptr;
reject_array_resv = resv_ptr;
reject_array_use_prefer = use_prefer;
if (!job_array_start_test(job_ptr))
continue;
}
/*
* If we are on a different task (see goto next_task) set it up
* the same way as we did it before.
*/
job_ptr->part_ptr = part_ptr;
job_ptr->qos_ptr = qos_ptr;
job_ptr->resv_ptr = resv_ptr;
if (resv_ptr)
job_ptr->resv_id = resv_ptr->resv_id;
if (job_limits_check(&job_ptr, true) != WAIT_NO_REASON) {
/* should never happen */
continue;
}
log_flag(BACKFILL, "test for %pJ Prio=%u Partition=%s Reservation=%s",
job_ptr, job_ptr->priority, job_ptr->part_ptr->name,
job_ptr->resv_ptr ? job_ptr->resv_ptr->name : "NONE");
/* Test to see if we've exceeded any per user/partition limit */
if (_job_exceeds_max_bf_param(job_ptr, orig_sched_start))
continue;
if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
(part_ptr->node_bitmap == NULL)) {
log_flag(BACKFILL, "partition %s not usable",
job_ptr->part_ptr->name);
continue;
}
if (!bf_licenses &&
license_job_test(job_ptr, time(NULL), true)) {
log_flag(BACKFILL, "%pJ not runable now due to licenses",
job_ptr);
continue;
}
if (!job_independent(job_ptr)) {
log_flag(BACKFILL, "%pJ not runable now",
job_ptr);
continue;
}
/* Determine minimum and maximum node counts */
error_code = get_node_cnts(job_ptr, qos_flags, part_ptr,
&min_nodes, &req_nodes, &max_nodes);
if (error_code == ESLURM_ACCOUNTING_POLICY) {
log_flag(BACKFILL, "%pJ acct policy node limit",
job_ptr);
continue;
} else if (error_code ==
ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
log_flag(BACKFILL, "%pJ node count too high",
job_ptr);
continue;
} else if (error_code != SLURM_SUCCESS) {
log_flag(BACKFILL, "error setting nodes for %pJ: %s",
job_ptr, slurm_strerror(error_code));
continue;
}
/* test of deadline */
now = time(NULL);
deadline_time_limit = 0;
if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) {
if (!deadline_ok(job_ptr, __func__))
continue;
deadline_time_limit = (job_ptr->deadline - now) / 60;
}
/* Determine job's expected completion time */
if (part_ptr->max_time == INFINITE)
part_time_limit = YEAR_MINUTES;
else
part_time_limit = part_ptr->max_time;
if ((job_ptr->time_limit == NO_VAL) ||
(job_ptr->time_limit == INFINITE)) {
time_limit = part_time_limit;
job_ptr->limit_set.time = 1;
} else {
if (part_ptr->max_time == INFINITE)
time_limit = job_ptr->time_limit;
else
time_limit = MIN(job_ptr->time_limit,
part_time_limit);
}
if (deadline_time_limit)
comp_time_limit = MIN(time_limit, deadline_time_limit);
else if (job_ptr->time_min &&
(job_ptr->time_min < time_limit)) {
comp_time_limit = time_limit;
time_limit = job_ptr->time_limit = job_ptr->time_min;
} else
comp_time_limit = time_limit;
if ((qos_flags & QOS_FLAG_NO_RESERVE) &&
slurm_conf.preempt_mode)
time_limit = job_ptr->time_limit = 1;
later_start = now;
used_slots = 0;
if (assoc_limit_stop) {
if (qos_blocked_until > later_start) {
later_start = qos_blocked_until;
log_flag(BACKFILL, "QOS blocked_until move start_res to %ld",
later_start);
}
if (qos_part_blocked_until > later_start) {
later_start = qos_part_blocked_until;
log_flag(BACKFILL, "Part QOS blocked_until move start_res to %ld",
later_start);
}
}
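/*
 * TRY_LATER: re-test this job with start_res advanced to later_start,
 * typically the time at which some conflicting job or reservation ends.
 */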
TRY_LATER:
if (slurmctld_config.shutdown_time ||
(difftime(time(NULL), orig_sched_start) >=
bf_max_time)) {
_set_job_time_limit(job_ptr, orig_time_limit);
_set_bf_exit(BF_EXIT_TIMEOUT);
break;
}
test_time_count++;
many_rpcs = false;
slurm_mutex_lock(&slurmctld_config.thread_count_lock);
if ((max_rpc_cnt > 0) &&
(slurmctld_config.server_thread_count >= max_rpc_cnt))
many_rpcs = true;
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
if (many_rpcs || (slurm_delta_tv(&start_tv) >= yield_interval)) {
uint32_t save_time_limit = job_ptr->time_limit;
_set_job_time_limit(job_ptr, orig_time_limit);
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) {
END_TIMER;
log_flag(BACKFILL, "yielding locks after testing %u(%d) jobs tested, %u time slots, %s",
slurmctld_diag_stats.bf_last_depth,
job_test_count, test_time_count,
TIME_STR);
}
/* Sync planned nodes before yielding locks */
nodes_planned = true;
_handle_planned(nodes_planned);
if (_yield_locks(yield_sleep)) {
log_flag(BACKFILL, "system state changed, breaking out after testing %u(%d) jobs",
slurmctld_diag_stats.bf_last_depth,
job_test_count);
state_changed_break = true;
_set_bf_exit(BF_EXIT_STATE_CHANGED);
break;
}
/* Reset backfill scheduling timers, resume testing */
sched_start = time(NULL);
gettimeofday(&start_tv, NULL);
job_test_count = 1;
test_time_count = 0;
nodes_planned = false;
START_TIMER;
if (is_job_array_head &&
(job_ptr->array_task_id != NO_VAL)) {
/*
* Job array element started in other partition,
* reset pointer to "master" job array record
*/
log_flag(BACKFILL, "%pJ array scheduled during bf yield, try master",
job_ptr);
job_ptr = find_job_record(
job_ptr->array_job_id);
if (!job_ptr)
/* All task array elements started */
continue;
}
/*
* With bf_continue configured, the original job could
* have been scheduled. Revalidate the job record here.
*/
if (!_job_runnable_now(job_ptr))
continue;
/*
* If the job wasn't scheduled while we didn't have the locks,
* restore the pointers we were last using, in case the main
* scheduler changed them.
*/
job_ptr->resv_ptr = resv_ptr;
if (resv_ptr)
job_ptr->resv_id = resv_ptr->resv_id;
if (!_job_part_valid(job_ptr, part_ptr))
continue; /* Partition change during lock yield */
if (!job_independent(job_ptr)) {
log_flag(BACKFILL, "%pJ no longer independent after bf yield",
job_ptr);
/* No longer independent
* (e.g. another singleton started) */
continue;
}
job_ptr->time_limit = save_time_limit;
job_ptr->part_ptr = part_ptr;
job_ptr->qos_ptr = qos_ptr;
}
/*
* feature_list_use is a temporary variable and should
* be reset before each use.
* Do this after bf_yield to ensure the pointers are valid even
* if the job was updated during the bf_yield.
*/
if (use_prefer) {
/*
* Prefer was removed from the job since the
* job_queue_rec was created (during bf_yield).
* This is a separate queue record for prefer. Skip it.
*/
if (!job_ptr->details->prefer)
continue;
job_ptr->details->features_use =
job_ptr->details->prefer;
job_ptr->details->feature_list_use =
job_ptr->details->prefer_list;
} else {
job_ptr->details->features_use =
job_ptr->details->features;
job_ptr->details->feature_list_use =
job_ptr->details->feature_list;
}
FREE_NULL_BITMAP(avail_bitmap);
reservation_delete_resv_exc_parts(&resv_exc);
start_res = MAX(later_start, het_job_time);
resv_end = 0;
later_start = 0;
licenses_unavail = false;
/*
* Restore the original time limit before checking against
* reservations, and revert it after.
*/
if ((qos_flags & QOS_FLAG_NO_RESERVE) &&
slurm_conf.preempt_mode)
job_ptr->time_limit = orig_time_limit;
/* Determine impact of any advance reservations */
j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
&resv_exc, &resv_overlap, false);
if (j != SLURM_SUCCESS) {
log_flag(BACKFILL, "%pJ reservation defer",
job_ptr);
_set_job_time_limit(job_ptr, orig_time_limit);
continue;
} else if ((qos_flags & QOS_FLAG_NO_RESERVE) &&
slurm_conf.preempt_mode)
job_ptr->time_limit = time_limit;
if (window_end < start_res) {
log_flag(BACKFILL, "%pJ start_res after current backfill window",
job_ptr);
_set_job_time_limit(job_ptr, orig_time_limit);
continue;
}
if (start_res > now)
end_time = (time_limit * 60) + start_res;
else
end_time = (time_limit * 60) + now;
if (end_time < now) /* Overflow 32-bits */
end_time = INFINITE;
if (resv_overlap)
resv_end = find_resv_end(start_res,
backfill_resolution);
/* Identify usable nodes for this job */
bit_and(avail_bitmap, part_ptr->node_bitmap);
bit_and(avail_bitmap, up_node_bitmap);
bit_and_not(avail_bitmap, bf_ignore_node_bitmap);
if (job_ptr->details->exc_node_bitmap) {
bit_and_not(avail_bitmap,
job_ptr->details->exc_node_bitmap);
}
if (_filter_exclusive_user_mcs_nodes(job_ptr, mcs_select,
min_nodes, nodes_used_list,
start_res,
&later_filter_start,
avail_bitmap)) {
/* start_res was delayed, must check resv times again */
later_start = later_filter_start;
SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve,
later_start, orig_time_limit,
orig_start_time);
}
if (IS_JOB_WHOLE_TOPO(job_ptr)) {
if (excluded_topo_bitmap)
bit_clear_all(excluded_topo_bitmap);
else
excluded_topo_bitmap =
bit_alloc(node_record_count);
}
COPY_BITMAP(tmp_bitmap, avail_bitmap);
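/*
 * Scan the node_space records overlapping the job's prospective run
 * window [start_res, end_time] and AND each record's availability into
 * avail_bitmap, recording a later_start candidate when re-testing after
 * a record ends could expose different nodes or licenses.
 */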
for (j = 0; ; ) {
if ((node_space[j].end_time > start_res) &&
node_space[j].next && (later_start == 0)) {
int tmp = node_space[j].next;
if (job_ptr->license_list &&
!bf_licenses_equal(node_space[tmp].licenses,
node_space[j]
.licenses)) {
later_start = node_space[j].end_time;
goto later_start_set;
}
COPY_BITMAP(next_bitmap, tmp_bitmap);
COPY_BITMAP(current_bitmap, avail_bitmap);
bit_and(next_bitmap,
node_space[tmp].avail_bitmap);
bit_and(current_bitmap,
node_space[j].avail_bitmap);
/*
* Normally later_start is set at the end of the
* first backfill reservation when the select
* plugin predicts start time after later_start.
* Then it goes to TRY_LATER and tries again on
* a new set of nodes to check if the job can
* start earlier. But if the next set of nodes
* is a subset of the currently tested ones then
* calling _try_sched (expensive function) would
* be useless and would impact performance.
*/
if (!bit_super_set(next_bitmap, current_bitmap))
later_start = node_space[j].end_time;
}
later_start_set:
if (node_space[j].end_time <= start_res)
;
else if (node_space[j].begin_time <= end_time) {
bit_and(avail_bitmap,
node_space[j].avail_bitmap);
bf_hres_filter(job_ptr, avail_bitmap,
node_space[j].licenses);
if (!bf_licenses_avail(node_space[j].licenses,
job_ptr, NULL)) {
licenses_unavail = true;
later_start = node_space[j].end_time;
xfree(job_ptr->state_desc);
job_ptr->state_reason = WAIT_LICENSES;
break;
}
if (IS_JOB_WHOLE_TOPO(job_ptr)) {
bit_or_not(excluded_topo_bitmap,
node_space[j].avail_bitmap);
}
} else {
int next = node_space[j].next;
if ((later_start == 0) && next &&
node_space[next].next)
later_start = node_space[next].end_time;
break;
}
if ((j = node_space[j].next) == 0)
break;
}
if (resv_end && (++resv_end < window_end) &&
((later_start == 0) || (resv_end < later_start))) {
later_start = resv_end;
}
if (IS_JOB_WHOLE_TOPO(job_ptr)) {
bit_and(excluded_topo_bitmap,
node_space[0].avail_bitmap);
topology_g_whole_topo(excluded_topo_bitmap,
job_ptr->part_ptr->topology_idx);
bit_and_not(avail_bitmap, excluded_topo_bitmap);
}
/* Test if licenses are unavailable OR
* required nodes missing OR
* nodes lack features OR
* no change since previously tested nodes (only changes
* in other partition nodes) */
if (licenses_unavail ||
((job_ptr->details->req_node_bitmap) &&
(!bit_super_set(job_ptr->details->req_node_bitmap,
avail_bitmap))) ||
(job_req_node_filter(job_ptr, avail_bitmap, true))) {
SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve,
later_start, orig_time_limit,
orig_start_time);
}
if (!later_start && later_filter_start)
later_start = later_filter_start; /* filter out fewer */
/* Test if insufficient nodes remain */
if (bit_set_count(avail_bitmap) < min_nodes) {
SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve,
later_start, orig_time_limit,
orig_start_time);
}
/* Identify nodes which are definitely off limits */
FREE_NULL_BITMAP(resv_bitmap);
resv_bitmap = bit_copy(avail_bitmap);
bit_not(resv_bitmap);
/* this is the time consuming operation */
debug2("entering _try_sched for %pJ.",
job_ptr);
if (!already_counted) {
slurmctld_diag_stats.bf_last_depth_try++;
job_test_cnt++;
already_counted = true;
}
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL_MAP)
_dump_job_test(job_ptr, avail_bitmap, start_res,
later_start);
test_fini = -1;
build_active_feature_bitmap(job_ptr, avail_bitmap,
&active_bitmap);
job_ptr->bit_flags |= BACKFILL_TEST;
job_ptr->bit_flags |= job_no_reserve; /* 0 or TEST_NOW_ONLY */
if (active_bitmap) {
will_run_data.start = start_res;
will_run_data.end = later_start;
j = _try_sched(job_ptr, &active_bitmap, min_nodes,
max_nodes, req_nodes, &resv_exc,
&will_run_data);
if (j == SLURM_SUCCESS) {
FREE_NULL_BITMAP(avail_bitmap);
avail_bitmap = active_bitmap;
active_bitmap = NULL;
test_fini = 1;
} else {
if (node_features_g_overlap(active_bitmap))
get_boot_time = true;
FREE_NULL_BITMAP(active_bitmap);
save_share_res = job_ptr->details->share_res;
save_whole_node = job_ptr->details->whole_node;
job_ptr->details->share_res = 0;
job_ptr->details->whole_node |=
WHOLE_NODE_REQUIRED;
if (!save_whole_node)
job_ptr->bit_flags |= BF_WHOLE_NODE_TEST;
test_fini = 0;
}
}
boot_time = 0;
if (test_fini == 0) {
/* Unable to start job using currently active features,
* need to try using features which can be made
* available after node reboot */
resv_exc_t tmp_resv_exc = { 0 };
bitstr_t *tmp_node_bitmap = NULL;
debug2("entering _try_sched for %pJ. Need to use features which can be made available after node reboot",
job_ptr);
/*
* Restore the original time limit before checking against
* reservations, and revert it after.
*/
if ((qos_flags & QOS_FLAG_NO_RESERVE) &&
slurm_conf.preempt_mode)
job_ptr->time_limit = orig_time_limit;
/* Determine impact of any advance reservations */
resv_end = 0;
j = job_test_resv(job_ptr, &start_res, false,
&tmp_node_bitmap, &tmp_resv_exc,
&resv_overlap, true);
if ((qos_flags & QOS_FLAG_NO_RESERVE) &&
slurm_conf.preempt_mode)
job_ptr->time_limit = time_limit;
if (resv_overlap)
resv_end = find_resv_end(start_res,
backfill_resolution);
if (resv_end && (++resv_end < window_end) &&
((later_start == 0) || (resv_end < later_start))) {
later_start = resv_end;
}
if (j == SLURM_SUCCESS) {
reservation_delete_resv_exc_parts(&resv_exc);
memcpy(&resv_exc, &tmp_resv_exc,
sizeof(resv_exc));
bit_and(avail_bitmap, tmp_node_bitmap);
FREE_NULL_BITMAP(tmp_node_bitmap);
}
if (get_boot_time)
boot_time = node_features_g_boot_time();
orig_end_time = end_time;
end_time += boot_time;
for (j = 0; ; ) {
if (node_space[j].end_time <= start_res)
;
else if (node_space[j].begin_time <= end_time) {
if (node_space[j].begin_time >
orig_end_time)
bit_and(avail_bitmap,
node_space[j].avail_bitmap);
} else
break;
if ((j = node_space[j].next) == 0)
break;
}
}
if (test_fini != 1) {
/* Either active_bitmap was NULL or not usable by the
* job. Test using avail_bitmap instead */
will_run_data.start = start_res;
will_run_data.end = later_start;
j = _try_sched(job_ptr, &avail_bitmap, min_nodes,
max_nodes, req_nodes, &resv_exc,
&will_run_data);
if (test_fini == 0) {
job_ptr->details->share_res = save_share_res;
job_ptr->details->whole_node = save_whole_node;
}
}
job_ptr->bit_flags &= ~BACKFILL_TEST;
job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST;
job_ptr->bit_flags &= ~TEST_NOW_ONLY;
now = time(NULL);
if (j != SLURM_SUCCESS) {
SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve,
later_start, orig_time_limit,
orig_start_time);
}
if (start_res > job_ptr->start_time) {
job_ptr->start_time = start_res;
last_job_update = now;
}
if (job_ptr->start_time > now) {
_set_slot_time(job_ptr, time_limit, boot_time,
&start_time, &end_reserve);
if (_test_resv_overlap(node_space, avail_bitmap,
job_ptr, start_time,
end_reserve)) {
later_start = job_ptr->start_time;
if (start_res == job_ptr->start_time) {
later_start += backfill_resolution;
log_flag(BACKFILL, "%pJ inf loop detect", job_ptr);
}
job_ptr->start_time = 0;
log_flag(BACKFILL, "%pJ overlaps with existing reservation start_time=%u end_reserve=%u boot_time=%u later_start %ld",
job_ptr, start_time, end_reserve,
boot_time, later_start);
goto TRY_LATER;
}
overlap_tested = true;
} else
overlap_tested = false;
if (!job_no_reserve && bf_topopt_enable) {
if (oracle(job_ptr, avail_bitmap, later_start,
&time_limit, &boot_time, node_space)) {
log_flag(BACKFILL, "%pJ used_slots:%u later_start %ld",
job_ptr, used_slots, later_start);
goto TRY_LATER;
}
_set_slot_time(job_ptr, time_limit, boot_time,
&start_time, &end_reserve);
}
/*
* avail_bitmap at this point contains a bitmap of nodes
* selected for this job to be allocated
*/
if ((job_ptr->start_time <= now) &&
(bit_overlap_any(avail_bitmap, cg_node_bitmap) ||
bit_overlap_any(avail_bitmap, rs_node_bitmap))) {
/* Need to wait for in-progress completion/epilog */
job_ptr->start_time = now + 1;
later_start = 0;
}
if ((job_ptr->start_time <= now) &&
((bb = bb_g_job_test_stage_in(job_ptr, true)) != 1)) {
if (job_ptr->state_reason != WAIT_NO_REASON) {
/*
* Don't change state_reason if it was already
* set.
*/
;
} else if (bb == -1) {
/*
* Set the reason now, rather than in the (bb == -1)
* block below, so sched_debug3() can report it.
*/
xfree(job_ptr->state_desc);
job_ptr->state_reason =
WAIT_BURST_BUFFER_RESOURCE;
} else { /* bb == 0 */
xfree(job_ptr->state_desc);
job_ptr->state_reason=WAIT_BURST_BUFFER_STAGING;
/*
* Cannot start now, set start time in the
* future.
*/
job_ptr->start_time = now + 1;
}
sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u.",
job_ptr,
job_state_string(job_ptr->job_state),
job_state_reason_string(
job_ptr->state_reason),
job_ptr->priority);
last_job_update = now;
_set_job_time_limit(job_ptr, orig_time_limit);
later_start = 0;
if (bb == -1) {
/*
* bb == -1 means that burst buffer stage-in
* hasn't started yet. Set an estimated start
* time so stage-in can start.
*
* Clear reject_array_job; otherwise we'll skip
* looking at other jobs in this array (if this
* is a job array), therefore we won't set
* estimated start times, therefore we won't be
* able to start stage-in for any other jobs in
* this array.
*/
job_ptr->start_time =
bb_g_job_get_est_start(job_ptr);
reject_array_job = NULL;
reject_array_part = NULL;
reject_array_qos = NULL;
reject_array_resv = NULL;
continue;
}
} else if ((job_ptr->het_job_id == 0) &&
(job_ptr->start_time <= now)) { /* Can start now */
uint32_t save_time_limit = job_ptr->time_limit;
uint32_t hard_limit;
bool reset_time = false;
int rc;
/* get fed job lock from origin cluster */
if (fed_mgr_job_lock(job_ptr)) {
log_flag(BACKFILL, "%pJ can't get fed job lock from origin cluster to backfill job",
job_ptr);
rc = ESLURM_FED_JOB_LOCK;
goto skip_start;
}
rc = _start_job(job_ptr, resv_bitmap);
if (rc == SLURM_SUCCESS) {
/*
* If the following fails because of network
* connectivity, the origin cluster should ask
* when it comes back up if the cluster_lock
* cluster actually started the job
*/
fed_mgr_job_start(job_ptr, job_ptr->start_time);
} else {
fed_mgr_job_unlock(job_ptr);
}
skip_start:
if (qos_flags & QOS_FLAG_NO_RESERVE) {
if (orig_time_limit == NO_VAL) {
acct_policy_alter_job(
job_ptr, comp_time_limit);
job_ptr->time_limit = comp_time_limit;
job_ptr->limit_set.time = 1;
} else {
acct_policy_alter_job(
job_ptr, orig_time_limit);
_set_job_time_limit(job_ptr,
orig_time_limit);
}
} else if ((rc == SLURM_SUCCESS) && soft_time_limit &&
job_ptr->time_min) {
acct_policy_alter_job(job_ptr, orig_time_limit);
job_ptr->time_limit = orig_time_limit;
} else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
/* Set time limit as high as possible */
acct_policy_alter_job(job_ptr, comp_time_limit);
job_ptr->time_limit = comp_time_limit;
reset_time = true;
} else if (orig_time_limit == NO_VAL) {
acct_policy_alter_job(job_ptr, comp_time_limit);
job_ptr->time_limit = comp_time_limit;
job_ptr->limit_set.time = 1;
} else if (deadline_time_limit &&
(rc == SLURM_SUCCESS)) {
acct_policy_alter_job(job_ptr, comp_time_limit);
job_ptr->time_limit = comp_time_limit;
reset_time = true;
} else {
acct_policy_alter_job(job_ptr, orig_time_limit);
_set_job_time_limit(job_ptr, orig_time_limit);
}
/*
* Only set end_time if start_time is set,
* or else end_time will be small (i.e. 1969).
*/
if (IS_JOB_FINISHED(job_ptr)) {
/* Zero size or killed on startup */
} else if (job_ptr->start_time) {
node_space_handler_t ns_handler = {
.node_space = node_space,
.node_space_recs = &node_space_recs,
};
if (job_ptr->time_limit == INFINITE)
hard_limit = YEAR_SECONDS;
else
hard_limit = job_ptr->time_limit * 60;
job_ptr->end_time = job_ptr->start_time +
hard_limit;
/*
* Only reset the time limit if start_time is set; end_time must
* already be set before calling _reset_job_time_limit().
*/
if (reset_time) {
_reset_job_time_limit(job_ptr, now,
node_space);
time_limit = job_ptr->time_limit;
}
_bf_reserve_running(job_ptr, &ns_handler);
} else if (rc == SLURM_SUCCESS) {
error("start_time of 0 on successful backfill. This shouldn't happen. :)");
}
if ((rc == ESLURM_RESERVATION_BUSY) ||
(rc == ESLURM_ACCOUNTING_POLICY &&
!assoc_limit_stop) ||
((rc == ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) &&
job_ptr->extra_constraints)) {
/* Unknown future start time, just skip job */
job_ptr->start_time = orig_start_time;
_set_job_time_limit(job_ptr, orig_time_limit);
continue;
} else if (rc == ESLURM_ACCOUNTING_POLICY) {
/* Unknown future start time. Determining when it
 * can start with certainty requires knowing when
 * every running and pending job starts and ends,
 * and tracking all of their resources. That would
 * add very high overhead, which we don't want.
 * Estimate that it can start after the next job
 * ends (or 500 seconds from now if we don't have
 * that information yet). */
if (later_start)
job_ptr->start_time = later_start;
else
job_ptr->start_time = now + 500;
if (job_ptr->qos_blocking_ptr &&
job_state_reason_check(
job_ptr->state_reason,
JSR_QOS_GRP)) {
assoc_mgr_lock(&qos_read_lock);
qos_ptr = job_ptr->qos_blocking_ptr;
if (qos_ptr->blocked_until <
job_ptr->start_time) {
qos_ptr->blocked_until =
job_ptr->start_time;
}
assoc_mgr_unlock(&qos_read_lock);
}
} else if (rc != SLURM_SUCCESS) {
log_flag(BACKFILL, "planned start of %pJ failed: %s",
job_ptr, slurm_strerror(rc));
/* Drop through and reserve these resources.
* Likely due to state changes during sleep.
* Make best-effort based upon original state */
_set_job_time_limit(job_ptr, orig_time_limit);
later_start = 0;
} else {
/* Started this job, move to next one */
/* Clear assumed rejected array status */
reject_array_job = NULL;
reject_array_part = NULL;
reject_array_qos = NULL;
reject_array_resv = NULL;
/* Update the database if job time limit
* changed and move to next job */
if (save_time_limit != job_ptr->time_limit)
jobacct_storage_g_job_start(
acct_db_conn, job_ptr);
job_start_cnt++;
if (max_backfill_jobs_start &&
(job_start_cnt >= max_backfill_jobs_start)){
log_flag(BACKFILL, "bf_max_job_start limit of %d reached",
max_backfill_jobs_start);
_set_bf_exit(BF_EXIT_MAX_JOB_START);
break;
}
if (job_test_cnt >= max_backfill_job_cnt) {
log_flag(BACKFILL, "bf_max_job_test: limit of %d reached",
max_backfill_job_cnt);
_set_bf_exit(BF_EXIT_MAX_JOB_TEST);
break;
}
if (_mark_nodes_usage(job_ptr, nodes_used))
list_sort(nodes_used_list,
_cmp_last_job_end);
if (is_job_array_head &&
(job_ptr->array_task_id != NO_VAL)) {
/* Try starting next task of job array */
job_record_t *tmp = job_ptr;
job_ptr = find_job_record(job_ptr->
array_job_id);
if (job_ptr && (job_ptr != tmp) &&
IS_JOB_PENDING(job_ptr) &&
(bb_g_job_test_stage_in(
job_ptr, false) == 1))
goto next_task;
}
continue;
}
} else if (job_ptr->het_job_id != 0) {
uint32_t max_time_limit;
max_time_limit = _get_job_max_tl(job_ptr, now,
node_space);
comp_time_limit = MIN(comp_time_limit, max_time_limit);
job_ptr->node_cnt_wag =
MAX(bit_set_count(avail_bitmap), 1);
_het_job_start_set(job_ptr, job_ptr->start_time,
comp_time_limit);
_set_job_time_limit(job_ptr, orig_time_limit);
if (bf_hetjob_immediate &&
(!max_backfill_jobs_start ||
(job_start_cnt < max_backfill_jobs_start)))
_het_job_start_test(node_space,
job_ptr->het_job_id,
nodes_used,
nodes_used_list);
}
if ((job_ptr->start_time > now) && (job_no_reserve != 0)) {
if ((orig_start_time != 0) &&
(orig_start_time < job_ptr->start_time)) {
/* Can start earlier in different partition */
job_ptr->start_time = orig_start_time;
} else {
log_flag(BACKFILL, "%pJ StartTime set but no backfill reservation created.",
job_ptr);
}
_set_job_time_limit(job_ptr, orig_time_limit);
continue;
}
if (later_start && (job_ptr->start_time > later_start)) {
/* Try later when some nodes currently reserved for
* pending jobs are free */
log_flag(BACKFILL, "Try later %pJ later_start %ld",
job_ptr, later_start);
job_ptr->start_time = 0;
goto TRY_LATER;
}
if (!overlap_tested) {
/* Job start deferred from now */
_set_slot_time(job_ptr, time_limit, boot_time,
&start_time, &end_reserve);
}
if (job_ptr->start_time > (sched_start + backfill_window)) {
/* Starts too far in the future to worry about */
end_reserve = job_ptr->start_time + boot_time +
(time_limit * 60);
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL)
_dump_job_sched(job_ptr, end_reserve,
avail_bitmap);
if ((orig_start_time != 0) &&
(orig_start_time < job_ptr->start_time)) {
/* Can start earlier in different partition */
job_ptr->start_time = orig_start_time;
} else {
log_flag(BACKFILL, "%pJ StartTime set to time after current backfill window. No reservation created",
job_ptr);
}
_set_job_time_limit(job_ptr, orig_time_limit);
continue;
}
if (!overlap_tested &&
(job_ptr->state_reason != WAIT_BURST_BUFFER_RESOURCE) &&
(job_ptr->state_reason != WAIT_BURST_BUFFER_STAGING) &&
_test_resv_overlap(node_space, avail_bitmap, job_ptr,
start_time, end_reserve)) {
/* This job overlaps with an existing reservation for
 * a job to be backfill scheduled, which the sched
* plugin does not know about. Try again later. */
later_start = job_ptr->start_time;
job_ptr->start_time = 0;
log_flag(BACKFILL, "%pJ after defer overlaps with existing reservation start_time=%u end_reserve=%u boot_time=%u later_start %ld",
job_ptr, start_time, end_reserve, boot_time,
later_start);
goto TRY_LATER;
}
if (_het_job_deadlock_test(job_ptr)) {
_set_job_time_limit(job_ptr, orig_time_limit);
continue;
}
/*
* Add reservation to scheduling table if appropriate
*/
if (!assoc_limit_stop) {
uint32_t selected_node_cnt;
uint64_t tres_req_cnt[slurmctld_tres_cnt];
uint16_t sockets_per_node;
assoc_mgr_lock_t locks = {
.assoc = READ_LOCK,
.qos = WRITE_LOCK,
.tres = READ_LOCK,
};
selected_node_cnt = bit_set_count(avail_bitmap);
memcpy(tres_req_cnt, job_ptr->tres_req_cnt,
sizeof(tres_req_cnt));
tres_req_cnt[TRES_ARRAY_CPU] =
(uint64_t)(job_ptr->total_cpus ?
job_ptr->total_cpus :
job_ptr->details->min_cpus);
sockets_per_node = job_get_sockets_per_node(job_ptr);
tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(
job_ptr->job_resrcs,
job_ptr->details->pn_min_memory,
tres_req_cnt[TRES_ARRAY_CPU],
selected_node_cnt,
job_ptr->part_ptr,
job_ptr->gres_list_req,
(job_ptr->bit_flags &
JOB_MEM_SET), sockets_per_node,
job_ptr->details->num_tasks);
tres_req_cnt[TRES_ARRAY_NODE] =
(uint64_t)selected_node_cnt;
assoc_mgr_lock(&locks);
gres_stepmgr_set_job_tres_cnt(job_ptr->gres_list_req,
selected_node_cnt,
tres_req_cnt,
true);
tres_req_cnt[TRES_ARRAY_BILLING] =
assoc_mgr_tres_weighted(
tres_req_cnt,
job_ptr->part_ptr->billing_weights,
slurm_conf.priority_flags, true);
if (!acct_policy_job_runnable_post_select(job_ptr,
tres_req_cnt, true)) {
assoc_mgr_unlock(&locks);
log_flag(BACKFILL, "adding reservation for %pJ blocked by acct_policy_job_runnable_post_select",
job_ptr);
_set_job_time_limit(job_ptr, orig_time_limit);
continue;
}
assoc_mgr_unlock(&locks);
}
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL)
_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
if (qos_flags & QOS_FLAG_NO_RESERVE) {
_set_job_time_limit(job_ptr, orig_time_limit);
continue;
}
if (bf_job_part_count_reserve) {
if (_check_bf_usage(
job_ptr->part_ptr->bf_data->resv_usage,
bf_job_part_count_reserve,
orig_sched_start)) {
_set_job_time_limit(job_ptr, orig_time_limit);
continue;
}
job_ptr->part_ptr->bf_data->resv_usage->count++;
}
/* Clear assumed rejected array status */
reject_array_job = NULL;
reject_array_part = NULL;
reject_array_qos = NULL;
reject_array_resv = NULL;
if ((!bf_one_resv_per_job || !orig_start_time) &&
(!(job_ptr->bit_flags & JOB_MAGNETIC) ||
bf_allow_magnetic_slot)) {
if (node_space_recs >= bf_node_space_size) {
log_flag(BACKFILL, "table size limit of %u reached",
bf_node_space_size);
if ((max_backfill_job_per_part != 0) &&
(max_backfill_job_per_part >=
(bf_node_space_size / 2))) {
error("bf_max_job_part >= bf_node_space_size / 2 (%u >= %u)",
max_backfill_job_per_part,
(bf_node_space_size / 2));
} else if ((max_backfill_job_per_user != 0) &&
(max_backfill_job_per_user >
(bf_node_space_size / 2))) {
warning("bf_max_job_user > bf_node_space_size / 2 (%u > %u)",
max_backfill_job_per_user,
(bf_node_space_size / 2));
} else if ((max_backfill_job_per_assoc != 0) &&
(max_backfill_job_per_assoc >
(bf_node_space_size / 2))) {
warning("bf_max_job_assoc > bf_node_space_size / 2 (%u > %u)",
max_backfill_job_per_assoc,
(bf_node_space_size / 2));
}
_set_job_time_limit(job_ptr, orig_time_limit);
_set_bf_exit(BF_EXIT_TABLE_LIMIT);
break;
}
_add_reservation(start_time, end_reserve, avail_bitmap,
job_ptr, node_space, &node_space_recs,
orig_start_time);
}
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL_MAP)
_dump_node_space_table(node_space);
if ((orig_start_time != 0) &&
(orig_start_time < job_ptr->start_time)) {
/* Can start earlier in different partition */
job_ptr->start_time = orig_start_time;
}
_set_job_time_limit(job_ptr, orig_time_limit);
if (job_ptr->array_recs) {
/* Try making reservation for next task of job array */
if (test_array_job_id != job_ptr->array_job_id) {
test_array_job_id = job_ptr->array_job_id;
test_array_count = 1;
array_start_time = job_ptr->start_time;
} else {
test_array_count++;
array_start_time = MIN(array_start_time,
job_ptr->start_time);
}
/*
* Don't consider the next task if it would exceed the
* maximum number of runnable tasks. If max_run_tasks is
* 0, then it wasn't set, so ignore it.
*/
if ((test_array_count < bf_max_job_array_resv) &&
(test_array_count <
job_ptr->array_recs->task_cnt) &&
(!job_ptr->array_recs->max_run_tasks ||
((MAX(job_ptr->array_recs->pend_run_tasks,
test_array_count) +
job_ptr->array_recs->tot_run_tasks) <
job_ptr->array_recs->max_run_tasks)))
goto next_task;
}
}
if (!nodes_planned)
_handle_planned(true);
xfree(job_queue_rec);
if (job_ptr) {
/* Restore preemption state if needed. */
_restore_preempt_state(job_ptr, &tmp_preempt_start_time,
&tmp_preempt_in_progress);
job_resv_clear_magnetic_flag(job_ptr);
if (job_ptr->array_recs && array_start_time)
job_ptr->start_time = array_start_time;
}
_het_job_deadlock_fini();
if (!bf_hetjob_immediate && !state_changed_break &&
(!max_backfill_jobs_start ||
(job_start_cnt < max_backfill_jobs_start)))
_het_job_start_test(node_space, 0, NULL, NULL);
FREE_NULL_BITMAP(avail_bitmap);
FREE_NULL_BITMAP(excluded_topo_bitmap);
reservation_delete_resv_exc_parts(&resv_exc);
FREE_NULL_BITMAP(resv_bitmap);
FREE_NULL_BITMAP(tmp_bitmap);
FREE_NULL_BITMAP(next_bitmap);
FREE_NULL_BITMAP(current_bitmap);
for (i = 0; ; ) {
FREE_NULL_BITMAP(node_space[i].avail_bitmap);
FREE_NULL_BF_LICENSES(node_space[i].licenses);
if ((i = node_space[i].next) == 0)
break;
}
for (i = node_space_recs; i <= bf_node_space_size; i++) {
if (!node_space[i].avail_bitmap)
break;
FREE_NULL_BITMAP(node_space[i].avail_bitmap);
}
xfree(node_space);
FREE_NULL_LIST(job_queue);
FREE_NULL_LIST(nodes_used_list);
xfree(nodes_used);
if (bf_topopt_enable)
fini_oracle();
gettimeofday(&bf_time2, NULL);
_do_diag_stats(&bf_time1, &bf_time2, node_space_recs);
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) {
END_TIMER;
info("completed testing %u(%d) jobs, %s",
slurmctld_diag_stats.bf_last_depth,
job_test_count, TIME_STR);
}
slurm_mutex_lock(&slurmctld_config.thread_count_lock);
if (slurmctld_config.server_thread_count >= 150) {
info("%d pending RPCs at cycle end, consider "
"configuring max_rpc_cnt",
slurmctld_config.server_thread_count);
}
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
return;
}
/* Try to start the job on any non-reserved nodes */
static int _start_job(job_record_t *job_ptr, bitstr_t *resv_bitmap)
{
int rc;
bitstr_t *orig_exc_nodes = NULL;
bool is_job_array_head = false;
static uint32_t fail_jobid = 0;
job_node_select_t job_node_select = {
.job_ptr = job_ptr,
};
if (job_ptr->details->exc_node_bitmap) {
orig_exc_nodes = bit_copy(job_ptr->details->exc_node_bitmap);
bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap);
} else
job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap);
if (job_ptr->array_recs)
is_job_array_head = true;
rc = select_nodes(&job_node_select, false, false,
SLURMDB_JOB_FLAG_BACKFILL);
if (is_job_array_head && job_ptr->details) {
job_record_t *base_job_ptr;
base_job_ptr = find_job_record(job_ptr->array_job_id);
if (base_job_ptr && base_job_ptr != job_ptr
&& base_job_ptr->array_recs) {
FREE_NULL_BITMAP(
base_job_ptr->details->exc_node_bitmap);
if (orig_exc_nodes)
base_job_ptr->details->exc_node_bitmap =
bit_copy(orig_exc_nodes);
}
}
if (job_ptr->details) { /* select_nodes() might reset exc_node_bitmap */
FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
job_ptr->details->exc_node_bitmap = orig_exc_nodes;
} else
FREE_NULL_BITMAP(orig_exc_nodes);
if (rc == SLURM_SUCCESS) {
/* job initiated */
last_job_update = time(NULL);
info("Started %pJ in %s on %s",
job_ptr, job_ptr->part_ptr->name, job_ptr->nodes);
if (job_ptr->batch_flag == 0)
srun_allocate(job_ptr);
else if (!IS_JOB_CONFIGURING(job_ptr))
launch_job(job_ptr);
slurmctld_diag_stats.backfilled_jobs++;
slurmctld_diag_stats.last_backfilled_jobs++;
if (job_ptr->het_job_id)
slurmctld_diag_stats.backfilled_het_jobs++;
log_flag(BACKFILL, "Jobs backfilled since boot: %u",
slurmctld_diag_stats.backfilled_jobs);
} else if ((job_ptr->job_id != fail_jobid) &&
(rc != ESLURM_ACCOUNTING_POLICY)) {
char *node_list;
bit_not(resv_bitmap);
node_list = bitmap2node_name(resv_bitmap);
/* This happens when a job has sharing disabled and
* a selected node is still completing some job,
* which should be a temporary situation. */
verbose("Failed to start %pJ with %s avail: %s",
job_ptr, node_list, slurm_strerror(rc));
xfree(node_list);
fail_jobid = job_ptr->job_id;
} else {
debug3("Failed to start %pJ: %s",
job_ptr, slurm_strerror(rc));
}
return rc;
}
/*
* Compute a job's maximum time based upon conflicts in resources
* planned for use by other jobs and that job's min/max time limit
* Return NO_VAL if no restriction
*/
static uint32_t _get_job_max_tl(job_record_t *job_ptr, time_t now,
node_space_map_t *node_space)
{
int32_t j;
time_t comp_time = 0;
uint32_t max_tl = NO_VAL;
if (job_ptr->time_min == 0)
return max_tl;
for (j = 0; ; ) {
if ((node_space[j].begin_time != now) && // No current conflicts
(node_space[j].begin_time < job_ptr->end_time) &&
(!bit_super_set(job_ptr->node_bitmap,
node_space[j].avail_bitmap) ||
!bf_licenses_avail(node_space[j].licenses, job_ptr,
job_ptr->node_bitmap))) {
/* Job overlaps pending job's resource reservation */
if ((comp_time == 0) ||
(comp_time > node_space[j].begin_time))
comp_time = node_space[j].begin_time;
}
if ((j = node_space[j].next) == 0)
break;
}
if (comp_time != 0)
max_tl = (comp_time - now + 59) / 60;
return max_tl;
}
/*
* Reset a job's time limit (and end_time) as high as possible
* within the range job_ptr->time_min and job_ptr->time_limit.
* Avoid using resources reserved for pending jobs or in resource
* reservations
*/
static void _reset_job_time_limit(job_record_t *job_ptr, time_t now,
node_space_map_t *node_space)
{
int32_t j, resv_delay;
uint32_t orig_time_limit = job_ptr->time_limit;
uint32_t new_time_limit;
for (j = 0; ; ) {
if ((node_space[j].begin_time != now) && // No current conflicts
(node_space[j].begin_time < job_ptr->end_time) &&
(!bit_super_set(job_ptr->node_bitmap,
node_space[j].avail_bitmap))) {
/* Job overlaps pending job's resource reservation */
resv_delay = difftime(node_space[j].begin_time, now);
resv_delay /= 60; /* seconds to minutes */
if (resv_delay < job_ptr->time_limit)
job_ptr->time_limit = resv_delay;
}
if ((j = node_space[j].next) == 0)
break;
}
new_time_limit = MAX(job_ptr->time_min, job_ptr->time_limit);
acct_policy_alter_job(job_ptr, new_time_limit);
job_ptr->time_limit = new_time_limit;
job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60);
job_time_adj_resv(job_ptr);
if (orig_time_limit != job_ptr->time_limit) {
info("%pJ time limit changed from %u to %u",
job_ptr, orig_time_limit, job_ptr->time_limit);
}
}
/*
* Report if any changes occurred to job, node, reservation
* or partition information
*/
static bool _more_work(time_t last_backfill_time)
{
bool rc = false;
if ((last_job_update >= last_backfill_time) ||
(last_node_update >= last_backfill_time) ||
(last_part_update >= last_backfill_time) ||
(last_resv_update >= last_backfill_time)) {
rc = true;
}
return rc;
}
/* Create a reservation for a job in the future */
static void _add_reservation(time_t start_time, time_t end_reserve,
bitstr_t *res_bitmap, job_record_t *job_ptr,
node_space_map_t *node_space, int *node_space_recs,
time_t orig_start_time)
{
bool placed = false;
int i, j, one_before = 0, one_after = -1;
bitstr_t *res_bitmap_orig = res_bitmap;
bitstr_t *res_bitmap_efctv = NULL;
#if 0
info("add job start:%u end:%u", start_time, end_reserve);
for (j = 0; ; ) {
info("node start:%u end:%u",
(uint32_t) node_space[j].begin_time,
(uint32_t) node_space[j].end_time);
if ((j = node_space[j].next) == 0)
break;
}
#endif
if (res_bitmap) {
if (IS_JOB_WHOLE_TOPO(job_ptr)) {
res_bitmap_efctv = bit_copy(res_bitmap);
topology_g_whole_topo(res_bitmap_efctv,
job_ptr->part_ptr->topology_idx);
res_bitmap = res_bitmap_efctv;
}
if (!IS_JOB_RUNNING(job_ptr) &&
((orig_start_time == 0) ||
(job_ptr->start_time < orig_start_time))) {
/* Can't start earlier in different partition. */
xfree(job_ptr->sched_nodes);
job_ptr->sched_nodes = bitmap2node_name(res_bitmap);
/*
* These nodes are planned. We will set the state
* afterwards.
*/
bit_or(planned_bitmap, res_bitmap);
}
}
start_time = MAX(start_time, node_space[0].begin_time);
/*
* Ensure that the job always occupies at least one bf_resolution
* slot within the map. This also fixes potential issues when
* running with bf_running_job_reserve if jobs have run past
* their timelimit but have not yet been terminated.
*/
if (end_reserve < (start_time + backfill_resolution))
end_reserve = start_time + backfill_resolution;
for (j = 0; ; ) {
if (node_space[j].end_time > start_time) {
/* insert start entry record */
i = *node_space_recs;
node_space[i].begin_time = start_time;
node_space[i].end_time = node_space[j].end_time;
node_space[j].end_time = start_time;
COPY_BITMAP(node_space[i].avail_bitmap,
node_space[j].avail_bitmap);
node_space[i].licenses =
bf_licenses_copy(node_space[j].licenses);
node_space[i].fragmentation =
node_space[j].fragmentation;
node_space[i].next = node_space[j].next;
node_space[j].next = i;
(*node_space_recs)++;
placed = true;
break;
}
if (node_space[j].end_time == start_time) {
/* no need to insert new start entry record */
placed = true;
break;
}
one_before = j;
if ((j = node_space[j].next) == 0)
break;
}
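/*
 * Remove the reserved nodes (and deduct licenses) from every node_space
 * record covering [start_time, end_reserve), inserting an end-boundary
 * record first if the reservation ends in the middle of a record.
 */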
while (placed && (j = node_space[j].next)) {
if (end_reserve < node_space[j].end_time) {
/* insert end entry record */
i = *node_space_recs;
node_space[i].begin_time = end_reserve;
node_space[i].end_time = node_space[j].end_time;
node_space[j].end_time = end_reserve;
COPY_BITMAP(node_space[i].avail_bitmap,
node_space[j].avail_bitmap);
node_space[i].licenses =
bf_licenses_copy(node_space[j].licenses);
node_space[i].fragmentation =
node_space[j].fragmentation;
node_space[i].next = node_space[j].next;
node_space[j].next = i;
(*node_space_recs)++;
}
/* merge in new usage with this record */
if (res_bitmap) {
bitstr_t *node_bitmap_orig = job_ptr->node_bitmap;
bit_and_not(node_space[j].avail_bitmap, res_bitmap);
if (!IS_JOB_RUNNING(job_ptr))
job_ptr->node_bitmap = res_bitmap_orig;
bf_licenses_deduct(node_space[j].licenses, job_ptr);
if (!IS_JOB_RUNNING(job_ptr))
job_ptr->node_bitmap = node_bitmap_orig;
if (bf_topopt_enable) {
node_space[j].fragmentation =
topology_g_get_fragmentation(
node_space[j].avail_bitmap);
}
} else {
/* setting up reservation licenses */
bf_licenses_transfer(node_space[j].licenses, job_ptr);
}
if (end_reserve == node_space[j].end_time) {
if (node_space[j].next)
one_after = node_space[j].next;
break;
}
}
/* Drop records with identical bitmaps (up to one record).
* This can significantly improve performance of the backfill tests. */
for (i = one_before; i != one_after; ) {
if ((j = node_space[i].next) == 0)
break;
if (!bf_licenses_equal(node_space[i].licenses,
node_space[j].licenses)) {
i = j;
continue;
}
if (!bit_equal(node_space[i].avail_bitmap,
node_space[j].avail_bitmap)) {
i = j;
continue;
}
node_space[i].end_time = node_space[j].end_time;
node_space[i].next = node_space[j].next;
if (node_space[j].avail_bitmap) {
for (i = *node_space_recs;
i <= bf_node_space_size; i++) {
if (!node_space[i].avail_bitmap) {
node_space[i].avail_bitmap =
node_space[j].avail_bitmap;
node_space[j].avail_bitmap = NULL;
break;
}
}
}
FREE_NULL_BITMAP(node_space[j].avail_bitmap);
FREE_NULL_BF_LICENSES(node_space[j].licenses);
break;
}
FREE_NULL_BITMAP(res_bitmap_efctv);
}
/*
* Determine if the resource specification for a new job overlaps with a
* reservation that the backfill scheduler has made for a job to be
* started in the future.
* IN use_bitmap - nodes to be allocated
* IN job_ptr - used for license and reservation info
* IN start_time - start time of job
* IN end_reserve - end time of job
*/
static bool _test_resv_overlap(node_space_map_t *node_space,
bitstr_t *use_bitmap, job_record_t *job_ptr,
uint32_t start_time, uint32_t end_reserve)
{
bool overlap = false;
int j = 0;
bitstr_t *use_bitmap_efctv = NULL;
bitstr_t *use_bitmap_orig = use_bitmap;
if (IS_JOB_WHOLE_TOPO(job_ptr)) {
use_bitmap_efctv = bit_copy(use_bitmap);
topology_g_whole_topo(use_bitmap_efctv,
job_ptr->part_ptr->topology_idx);
use_bitmap = use_bitmap_efctv;
}
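	/*
	 * Walk the node_space records; any record whose time window overlaps
	 * [start_time, end_reserve) must still have all of the requested nodes
	 * available and sufficient licenses, otherwise this request conflicts
	 * with an existing backfill reservation.
	 */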
while (true) {
if ((node_space[j].end_time > start_time) &&
(node_space[j].begin_time < end_reserve)) {
/*
* Jobs will run concurrently.
* Do they conflict for resources?
*/
if (!bit_super_set(use_bitmap,
node_space[j].avail_bitmap)) {
overlap = true;
break;
}
if (!bf_licenses_avail(node_space[j].licenses, job_ptr,
use_bitmap_orig)) {
overlap = true;
break;
}
}
if ((j = node_space[j].next) == 0)
break;
}
FREE_NULL_BITMAP(use_bitmap_efctv);
return overlap;
}
/*
* Delete het_job_map_t record from het_job_list
*/
static void _het_job_map_del(void *x)
{
het_job_map_t *map = (het_job_map_t *) x;
FREE_NULL_LIST(map->het_job_rec_list);
xfree(map);
}
/*
* Return 1 if a het_job_map_t record with a specific het_job_id is found.
 * Always return 1 if "key" is NULL.
*/
static int _het_job_find_map(void *x, void *key)
{
het_job_map_t *map = (het_job_map_t *) x;
uint32_t *het_job_id = (uint32_t *) key;
if ((het_job_id == NULL) ||
(map->het_job_id == *het_job_id))
return 1;
return 0;
}
/*
* Return 1 if a het_job_rec_t record with a specific job_id is found.
*/
static int _het_job_find_rec(void *x, void *key)
{
het_job_rec_t *rec = (het_job_rec_t *) x;
uint32_t *job_id = (uint32_t *) key;
if (rec->job_id == *job_id)
return 1;
return 0;
}
/*
 * Remove vestigial elements from het_job_list. For each still-active element,
 * clear the previously computed start time. This is used to periodically clear
 * history so that heterogeneous jobs do not keep getting deferred based
 * upon old system state.
*/
static void _het_job_start_clear(void)
{
het_job_map_t *map;
list_itr_t *iter;
iter = list_iterator_create(het_job_list);
while ((map = list_next(iter))) {
if (map->prev_start == 0) {
list_delete_item(iter);
} else {
map->prev_start = 0;
list_flush(map->het_job_rec_list);
}
}
list_iterator_destroy(iter);
}
/*
* For a given het_job_map_t record, determine the earliest that it can start,
 * which is the time at which its latest-starting component begins. The
* "exclude_job_id" is used to exclude a hetjob component currently being
* tested to start, presumably in a different partition.
*/
static time_t _het_job_start_compute(het_job_map_t *map,
uint32_t exclude_job_id)
{
list_itr_t *iter;
het_job_rec_t *rec;
time_t latest_start = map->prev_start;
iter = list_iterator_create(map->het_job_rec_list);
while ((rec = list_next(iter))) {
if (rec->job_id == exclude_job_id)
continue;
latest_start = MAX(latest_start, rec->latest_start);
}
list_iterator_destroy(iter);
return latest_start;
}
/*
* Return the earliest that a job can start based upon _other_ components of
* that same heterogeneous job. Return 0 if no limitation.
*
* If the job's state reason is BeginTime (the way all hetjobs start) and that
 * time has passed, then clear the reason field.
*/
static time_t _het_job_start_find(job_record_t *job_ptr)
{
het_job_map_t *map;
time_t latest_start = (time_t) 0;
if (job_ptr->het_job_id) {
map = list_find_first(het_job_list, _het_job_find_map,
&job_ptr->het_job_id);
if (map) {
latest_start = _het_job_start_compute(map,
job_ptr->job_id);
}
log_flag(HETJOB, "%pJ in partition %s expected to start in %ld secs",
job_ptr, job_ptr->part_ptr->name,
MAX(0, latest_start - time(NULL)));
}
return latest_start;
}
/*
* Record the earliest that a hetjob component can start. If it can be
 * started in multiple partitions, we record only the earliest start time
 * across all partitions and reservations.
*/
static void _het_job_start_set(job_record_t *job_ptr, time_t latest_start,
uint32_t comp_time_limit)
{
het_job_map_t *map;
het_job_rec_t *rec;
if (comp_time_limit == NO_VAL)
comp_time_limit = job_ptr->time_limit;
if (job_ptr->het_job_id) {
map = list_find_first(het_job_list, _het_job_find_map,
&job_ptr->het_job_id);
if (map) {
if (!map->comp_time_limit) {
map->comp_time_limit = comp_time_limit;
} else {
map->comp_time_limit = MIN(map->comp_time_limit,
comp_time_limit);
}
rec = list_find_first(map->het_job_rec_list,
_het_job_find_rec,
&job_ptr->job_id);
if (rec && (rec->latest_start <= latest_start)) {
/*
				 * This job can start at an earlier time in
				 * some other partition, so ignore the new info
*/
} else if (rec) {
rec->latest_start = latest_start;
rec->part_ptr = job_ptr->part_ptr;
rec->resv_ptr = job_ptr->resv_ptr;
} else {
rec = xmalloc(sizeof(het_job_rec_t));
rec->job_id = job_ptr->job_id;
rec->job_ptr = job_ptr;
rec->latest_start = latest_start;
rec->part_ptr = job_ptr->part_ptr;
rec->resv_ptr = job_ptr->resv_ptr;
list_append(map->het_job_rec_list, rec);
}
} else {
rec = xmalloc(sizeof(het_job_rec_t));
rec->job_id = job_ptr->job_id;
rec->job_ptr = job_ptr;
rec->latest_start = latest_start;
rec->part_ptr = job_ptr->part_ptr;
rec->resv_ptr = job_ptr->resv_ptr;
map = xmalloc(sizeof(het_job_map_t));
map->comp_time_limit = comp_time_limit;
map->het_job_id = job_ptr->het_job_id;
map->het_job_rec_list = list_create(xfree_ptr);
list_append(map->het_job_rec_list, rec);
list_append(het_job_list, map);
}
log_flag(HETJOB, "%pJ in partition %s set to start in %ld secs",
job_ptr, job_ptr->part_ptr->name,
MAX(0, _het_job_start_compute(map, 0) - time(NULL)));
}
}
/*
* Return TRUE if we have expected start times for all components of a hetjob
 * and all components are valid and runnable.
*
* NOTE: This should never happen, but we will also start the job if all of the
 * other components are already running.
*/
static bool _het_job_full(het_job_map_t *map)
{
job_record_t *het_job_ptr, *job_ptr;
list_itr_t *iter;
bool rc = true;
het_job_ptr = find_job_record(map->het_job_id);
if (!het_job_ptr || !het_job_ptr->het_job_list ||
(!IS_JOB_RUNNING(het_job_ptr) &&
!_job_runnable_now(het_job_ptr))) {
return false;
}
iter = list_iterator_create(het_job_ptr->het_job_list);
while ((job_ptr = list_next(iter))) {
if ((job_ptr->magic != JOB_MAGIC) ||
(job_ptr->het_job_id != map->het_job_id)) {
rc = false; /* bad job pointer */
break;
}
if (IS_JOB_RUNNING(job_ptr))
continue;
if (!list_find_first(map->het_job_rec_list, _het_job_find_rec,
&job_ptr->job_id) ||
!_job_runnable_now(job_ptr)) {
rc = false;
break;
}
}
list_iterator_destroy(iter);
return rc;
}
/*
* Determine if all components of a hetjob can be started now or are
* prevented from doing so because of association or QOS limits.
* Return true if they can all start.
*
 * NOTE: A hetjob passing this test is not guaranteed to run. For example,
 * this test assumes resource allocation at the CPU level. If each task is
 * allocated one core with two CPUs, then the CPU limit test would not be
 * accurate.
*/
static bool _het_job_limit_check(het_job_map_t *map, time_t now)
{
job_record_t *job_ptr;
het_job_rec_t *rec;
list_itr_t *iter;
int begun_jobs = 0, fini_jobs = 0, slurmctld_tres_size;
bool runnable = true;
uint32_t selected_node_cnt;
uint64_t tres_req_cnt[slurmctld_tres_cnt];
uint64_t **tres_alloc_save = NULL;
tres_alloc_save = xcalloc(list_count(map->het_job_rec_list),
sizeof(uint64_t *));
slurmctld_tres_size = sizeof(uint64_t) * slurmctld_tres_cnt;
iter = list_iterator_create(map->het_job_rec_list);
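	/*
	 * First pass: compute each component's TRES request and simulate its
	 * start with acct_policy_job_begin(), so that later components are
	 * tested against the limits that would then be in effect.
	 */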
while ((rec = list_next(iter))) {
uint16_t sockets_per_node;
assoc_mgr_lock_t locks = {
.assoc = READ_LOCK,
.qos = WRITE_LOCK,
.tres = READ_LOCK,
};
job_ptr = rec->job_ptr;
job_ptr->part_ptr = rec->part_ptr;
if (rec->resv_ptr) {
job_ptr->resv_ptr = rec->resv_ptr;
job_ptr->resv_id = job_ptr->resv_ptr->resv_id;
}
selected_node_cnt = job_ptr->node_cnt_wag;
memcpy(tres_req_cnt, job_ptr->tres_req_cnt,
slurmctld_tres_size);
tres_req_cnt[TRES_ARRAY_CPU] = (uint64_t)(job_ptr->total_cpus ?
job_ptr->total_cpus :
job_ptr->details->min_cpus);
sockets_per_node = job_get_sockets_per_node(job_ptr);
tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(
job_ptr->job_resrcs,
job_ptr->details->pn_min_memory,
tres_req_cnt[TRES_ARRAY_CPU],
selected_node_cnt,
job_ptr->part_ptr,
job_ptr->gres_list_req,
(job_ptr->bit_flags &
JOB_MEM_SET), sockets_per_node,
job_ptr->details->num_tasks);
tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)selected_node_cnt;
assoc_mgr_lock(&locks);
gres_stepmgr_set_job_tres_cnt(job_ptr->gres_list_req,
selected_node_cnt,
tres_req_cnt, true);
tres_req_cnt[TRES_ARRAY_BILLING] =
assoc_mgr_tres_weighted(
tres_req_cnt,
job_ptr->part_ptr->billing_weights,
slurm_conf.priority_flags, true);
if (acct_policy_job_runnable_pre_select(job_ptr, true) &&
acct_policy_job_runnable_post_select(job_ptr,
tres_req_cnt, true)) {
assoc_mgr_unlock(&locks);
tres_alloc_save[begun_jobs++] = job_ptr->tres_alloc_cnt;
job_ptr->tres_alloc_cnt = xmalloc(slurmctld_tres_size);
memcpy(job_ptr->tres_alloc_cnt, tres_req_cnt,
slurmctld_tres_size);
acct_policy_job_begin(job_ptr, false);
} else {
assoc_mgr_unlock(&locks);
runnable = false;
break;
}
}
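	/*
	 * Second pass: roll back the simulated starts above by simulating each
	 * begun component's completion and restoring its saved TRES
	 * allocation counters.
	 */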
list_iterator_reset(iter);
while ((rec = list_next(iter))) {
job_ptr = rec->job_ptr;
if (begun_jobs > fini_jobs) {
time_t end_time_exp = job_ptr->end_time_exp;
time_t end_time = job_ptr->end_time;
uint32_t job_state = job_ptr->job_state;
/* Simulate normal job completion */
job_ptr->end_time_exp = now;
job_ptr->end_time = job_ptr->start_time;
job_state_set(job_ptr, (JOB_COMPLETE | JOB_COMPLETING));
acct_policy_job_fini(job_ptr, false);
job_ptr->end_time_exp = end_time_exp;
job_ptr->end_time = end_time;
job_state_set(job_ptr, job_state);
xfree(job_ptr->tres_alloc_cnt);
job_ptr->tres_alloc_cnt = tres_alloc_save[fini_jobs++];
}
}
list_iterator_destroy(iter);
xfree(tres_alloc_save);
return runnable;
}
/*
* Start all components of a hetjob now
*/
static int _het_job_start_now(het_job_map_t *map, node_space_map_t *node_space)
{
job_record_t *job_ptr;
bitstr_t *avail_bitmap = NULL;
bitstr_t *resv_bitmap = NULL, *used_bitmap = NULL;
het_job_rec_t *rec;
list_itr_t *iter;
int rc = SLURM_SUCCESS;
bool resv_overlap = false;
time_t now = time(NULL), start_res;
uint32_t hard_limit;
resv_exc_t resv_exc = { 0 };
iter = list_iterator_create(map->het_job_rec_list);
while ((rec = list_next(iter))) {
bool reset_time = false;
job_ptr = rec->job_ptr;
job_ptr->part_ptr = rec->part_ptr;
if (rec->resv_ptr) {
job_ptr->resv_ptr = rec->resv_ptr;
job_ptr->resv_id = job_ptr->resv_ptr->resv_id;
}
/*
* Identify the nodes which this job can use
*/
start_res = now;
rc = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
&resv_exc, &resv_overlap, false);
reservation_delete_resv_exc_parts(&resv_exc);
if (rc != SLURM_SUCCESS) {
error("%pJ failed to start due to reservation",
job_ptr);
FREE_NULL_BITMAP(avail_bitmap);
break;
}
bit_and(avail_bitmap, job_ptr->part_ptr->node_bitmap);
bit_and(avail_bitmap, up_node_bitmap);
if (used_bitmap)
bit_and_not(avail_bitmap, used_bitmap);
if (job_ptr->details->exc_node_bitmap) {
bit_and_not(avail_bitmap,
job_ptr->details->exc_node_bitmap);
}
if (fed_mgr_job_lock(job_ptr)) {
error("%pJ failed to start due to fed job lock",
job_ptr);
FREE_NULL_BITMAP(avail_bitmap);
continue;
}
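		/*
		 * Build the set of nodes this component must avoid by
		 * inverting the available-node bitmap; _start_job() excludes
		 * these nodes when allocating resources.
		 */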
resv_bitmap = avail_bitmap;
avail_bitmap = NULL;
bit_not(resv_bitmap);
rc = _start_job(job_ptr, resv_bitmap);
FREE_NULL_BITMAP(resv_bitmap);
if (rc == SLURM_SUCCESS) {
/*
* If the following fails because of network
* connectivity, the origin cluster should ask
* when it comes back up if the cluster_lock
* cluster actually started the job
*/
fed_mgr_job_start(job_ptr, job_ptr->start_time);
log_flag(HETJOB, "%pJ started", job_ptr);
if (!used_bitmap && job_ptr->node_bitmap)
used_bitmap = bit_copy(job_ptr->node_bitmap);
else if (job_ptr->node_bitmap)
bit_or(used_bitmap, job_ptr->node_bitmap);
} else {
fed_mgr_job_unlock(job_ptr);
break;
}
if (job_ptr->time_min) {
/* Set time limit as high as possible */
acct_policy_alter_job(job_ptr, map->comp_time_limit);
job_ptr->time_limit = map->comp_time_limit;
reset_time = true;
}
if (job_ptr->start_time) {
if (job_ptr->time_limit == INFINITE)
hard_limit = YEAR_SECONDS;
else
hard_limit = job_ptr->time_limit * 60;
job_ptr->end_time = job_ptr->start_time + hard_limit;
/*
			 * Only reset the job's time limit once start_time is
			 * known; end_time must be set beforehand for
			 * _reset_job_time_limit().
*/
if (reset_time)
_reset_job_time_limit(job_ptr, now, node_space);
}
if (reset_time)
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
}
list_iterator_destroy(iter);
FREE_NULL_BITMAP(used_bitmap);
return rc;
}
/*
* Deallocate all components if failed hetjob start
*/
static void _het_job_kill_now(het_job_map_t *map)
{
job_record_t *job_ptr;
het_job_rec_t *rec;
list_itr_t *iter;
time_t now = time(NULL);
int cred_lifetime = 1200;
uint32_t save_bitflags;
cred_lifetime = cred_expiration();
iter = list_iterator_create(map->het_job_rec_list);
while ((rec = list_next(iter))) {
job_ptr = rec->job_ptr;
if (IS_JOB_PENDING(job_ptr))
continue;
info("Deallocate %pJ due to hetjob start failure",
job_ptr);
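		/*
		 * Push begin_time past the credential lifetime so the requeued
		 * job is not eligible to start again until previously issued
		 * credentials have expired.
		 */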
job_ptr->details->begin_time = now + cred_lifetime + 1;
job_ptr->end_time = now;
job_state_set(job_ptr, (JOB_PENDING | JOB_COMPLETING));
last_job_update = now;
build_cg_bitmap(job_ptr);
job_completion_logger(job_ptr, false);
deallocate_nodes(job_ptr, false, false, false);
/*
		 * Since job_completion_logger() removes the job's submit
		 * accounting, we need to add it again, but don't stage-out the
		 * burst buffer.
*/
save_bitflags = job_ptr->bit_flags;
job_ptr->bit_flags |= JOB_KILL_HURRY;
acct_policy_add_job_submit(job_ptr, false);
job_ptr->bit_flags = save_bitflags;
if (!job_ptr->node_bitmap_cg ||
(bit_set_count(job_ptr->node_bitmap_cg) == 0))
batch_requeue_fini(job_ptr);
}
list_iterator_destroy(iter);
}
/*
* If all components of a heterogeneous job can start now, then do so
* node_space IN - map of available resources through time
* map IN - info about this heterogeneous job
 * single IN - true if testing a single heterogeneous job
 * Return true if the heterogeneous job can start now
*/
static bool _het_job_start_test_single(node_space_map_t *node_space,
het_job_map_t *map, bool single)
{
time_t now = time(NULL);
int rc;
if (!map)
return false;
if (!_het_job_full(map)) {
log_flag(HETJOB, "Hetjob %u has indefinite start time",
map->het_job_id);
if (!single)
map->prev_start = now + YEAR_SECONDS;
return false;
}
map->prev_start = _het_job_start_compute(map, 0);
if (map->prev_start > now) {
log_flag(HETJOB, "Hetjob %u should be able to start in %u seconds",
map->het_job_id, (uint32_t) (map->prev_start - now));
return false;
}
if (!_het_job_limit_check(map, now)) {
log_flag(HETJOB, "Hetjob %u prevented from starting by account/QOS limit",
map->het_job_id);
map->prev_start = now + YEAR_SECONDS;
return false;
}
log_flag(HETJOB, "Attempting to start hetjob %u", map->het_job_id);
rc = _het_job_start_now(map, node_space);
if (rc != SLURM_SUCCESS) {
log_flag(HETJOB, "Failed to start hetjob %u", map->het_job_id);
_het_job_kill_now(map);
} else {
job_start_cnt += list_count(map->het_job_rec_list);
if (max_backfill_jobs_start &&
(job_start_cnt >= max_backfill_jobs_start)) {
log_flag(BACKFILL, "bf_max_job_start limit of %d reached",
max_backfill_jobs_start);
}
return true;
}
return false;
}
static int _het_job_start_test_list(void *map, void *node_space)
{
if (!max_backfill_jobs_start ||
(job_start_cnt < max_backfill_jobs_start))
_het_job_start_test_single(node_space, map, false);
return SLURM_SUCCESS;
}
static int _foreach_add_job_to_nodes_used(void *x, void *arg)
{
het_job_rec_t *het_rec = x;
node_used_t *nodes_used = arg;
if (_mark_nodes_usage(het_rec->job_ptr, nodes_used))
nodes_used->needs_sorting = true;
return 0;
}
/*
* If all components of a heterogeneous job can start now, then do so
* node_space IN - map of available resources through time
* het_job_id IN - the ID of the heterogeneous job to evaluate,
* if zero then evaluate all heterogeneous jobs and
 *                 nodes_used/nodes_used_list are not updated
* nodes_used IN/OUT - array of node usage used for exclusive filtering
* nodes_used_list IN/OUT - list of node usage used for exclusive filtering
*/
static void _het_job_start_test(node_space_map_t *node_space,
uint32_t het_job_id, node_used_t *nodes_used,
list_t *nodes_used_list)
{
het_job_map_t *map = NULL;
if (!het_job_id) {
/* Test all maps. */
(void)list_for_each(het_job_list,
_het_job_start_test_list, node_space);
} else {
/* Test single map. */
map = list_find_first(het_job_list, _het_job_find_map,
&het_job_id);
if (_het_job_start_test_single(node_space, map, true)) {
nodes_used->needs_sorting = false;
(void) list_for_each(map->het_job_rec_list,
_foreach_add_job_to_nodes_used,
nodes_used);
if (nodes_used->needs_sorting) {
nodes_used->needs_sorting = false;
list_sort(nodes_used_list, _cmp_last_job_end);
}
}
}
}
static void _deadlock_global_list_del(void *x)
{
deadlock_part_struct_t *dl_part_ptr = (deadlock_part_struct_t *) x;
FREE_NULL_LIST(dl_part_ptr->deadlock_job_list);
xfree(dl_part_ptr);
}
static int _deadlock_part_list_srch(void *x, void *key)
{
deadlock_job_struct_t *dl_job = (deadlock_job_struct_t *) x;
job_record_t *job_ptr = (job_record_t *) key;
if (dl_job->het_job_id == job_ptr->het_job_id)
return 1;
return 0;
}
static int _deadlock_part_list_srch2(void *x, void *key)
{
deadlock_job_struct_t *dl_job = (deadlock_job_struct_t *) x;
deadlock_job_struct_t *dl_job2 = (deadlock_job_struct_t *) key;
if (dl_job->het_job_id == dl_job2->het_job_id)
return 1;
return 0;
}
static int _deadlock_global_list_srch(void *x, void *key)
{
deadlock_part_struct_t *dl_part = (deadlock_part_struct_t *) x;
if (dl_part->part_ptr == (part_record_t *) key)
return 1;
return 0;
}
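/* Sort deadlock job records in order of descending expected start time */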
static int _deadlock_job_list_sort(void *x, void *y)
{
deadlock_job_struct_t *dl_job_ptr1 = *(deadlock_job_struct_t **) x;
deadlock_job_struct_t *dl_job_ptr2 = *(deadlock_job_struct_t **) y;
if (dl_job_ptr1->start_time > dl_job_ptr2->start_time)
return -1;
else if (dl_job_ptr1->start_time < dl_job_ptr2->start_time)
return 1;
return 0;
}
/*
 * Call at end of backfill execution to release memory allocated by
 * _het_job_deadlock_test()
*/
static void _het_job_deadlock_fini(void)
{
FREE_NULL_LIST(deadlock_global_list);
}
/*
 * Determine if the job can run at its "start_time" or later.
 * job_ptr IN - job to test, set reason to "HET_JOB_DEADLOCK" if it will deadlock
 * RET true if the job cannot run due to a possible deadlock with another hetjob
 *
 * NOTE: If there is a large number of hetjobs this will be painfully slow
 * since the algorithm is O(n^2)
*/
static bool _het_job_deadlock_test(job_record_t *job_ptr)
{
deadlock_job_struct_t *dl_job_ptr = NULL, *dl_job_ptr2 = NULL;
deadlock_job_struct_t *dl_job_ptr3 = NULL;
deadlock_part_struct_t *dl_part_ptr = NULL, *dl_part_ptr2 = NULL;
list_itr_t *job_iter, *part_iter;
bool have_deadlock = false;
if (!job_ptr->het_job_id || !job_ptr->part_ptr)
return false;
/*
* Find the list representing the ordering of jobs in this specific
* partition and add this job in the list, sorted by job start time
*/
if (!deadlock_global_list) {
deadlock_global_list = list_create(_deadlock_global_list_del);
} else {
dl_part_ptr = list_find_first(deadlock_global_list,
_deadlock_global_list_srch,
job_ptr->part_ptr);
}
if (!dl_part_ptr) {
dl_part_ptr = xmalloc(sizeof(deadlock_part_struct_t));
dl_part_ptr->deadlock_job_list = list_create(xfree_ptr);
dl_part_ptr->part_ptr = job_ptr->part_ptr;
list_append(deadlock_global_list, dl_part_ptr);
} else {
dl_job_ptr = list_find_first(dl_part_ptr->deadlock_job_list,
_deadlock_part_list_srch,
job_ptr);
}
if (!dl_job_ptr) {
dl_job_ptr = xmalloc(sizeof(deadlock_job_struct_t));
dl_job_ptr->het_job_id = job_ptr->het_job_id;
dl_job_ptr->start_time = job_ptr->start_time;
list_append(dl_part_ptr->deadlock_job_list, dl_job_ptr);
} else if (dl_job_ptr->start_time < job_ptr->start_time) {
dl_job_ptr->start_time = job_ptr->start_time;
}
list_sort(dl_part_ptr->deadlock_job_list, _deadlock_job_list_sort);
/*
* Log current table of hetjob start times by partition
*/
if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) {
part_iter = list_iterator_create(deadlock_global_list);
while ((dl_part_ptr2 = list_next(part_iter))){
info("Partition %s Hetjobs:",
dl_part_ptr2->part_ptr->name);
job_iter = list_iterator_create(dl_part_ptr2->
deadlock_job_list);
while ((dl_job_ptr2 = list_next(job_iter))) {
info(" Hetjob %u to start at %"PRIu64,
dl_job_ptr2->het_job_id,
(uint64_t) dl_job_ptr2->start_time);
}
list_iterator_destroy(job_iter);
}
list_iterator_destroy(part_iter);
}
/*
* Determine if any hetjobs scheduled to start earlier than this job
* in this partition are scheduled to start after it in some other
* partition
*/
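	/*
	 * For example: hetjob 101 is expected to start before hetjob 102 in
	 * this partition, but after hetjob 102 in some other partition. Since
	 * all components of a hetjob must start at the same time, neither
	 * ordering can be satisfied and both hetjobs would be deferred
	 * indefinitely.
	 */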
part_iter = list_iterator_create(deadlock_global_list);
while ((dl_part_ptr2 = list_next(part_iter))){
if (dl_part_ptr2 == dl_part_ptr) /* Current partition, skip it */
continue;
dl_job_ptr2 = list_find_first(dl_part_ptr2->deadlock_job_list,
_deadlock_part_list_srch,
job_ptr);
if (!dl_job_ptr2) /* Hetjob not in this partition, no check */
continue;
job_iter = list_iterator_create(dl_part_ptr->deadlock_job_list);
while ((dl_job_ptr2 = list_next(job_iter))) {
if (dl_job_ptr2->het_job_id == dl_job_ptr->het_job_id)
break; /* Self */
dl_job_ptr3 = list_find_first(
dl_part_ptr2->deadlock_job_list,
_deadlock_part_list_srch2,
dl_job_ptr2);
if (dl_job_ptr3 &&
(dl_job_ptr3->start_time < dl_job_ptr->start_time)){
have_deadlock = true;
break;
}
}
list_iterator_destroy(job_iter);
if (have_deadlock)
log_flag(HETJOB, "Hetjob %u in partition %s would deadlock with hetjob %u in partition %s, skipping it",
dl_job_ptr->het_job_id,
dl_part_ptr->part_ptr->name,
dl_job_ptr3->het_job_id,
dl_part_ptr2->part_ptr->name);
if (have_deadlock)
break;
}
list_iterator_destroy(part_iter);
return have_deadlock;
}
static void _set_bf_exit(bf_exit_t code)
{
xassert(code < BF_EXIT_COUNT);
slurmctld_diag_stats.bf_exit[code]++;
}