/*****************************************************************************\
* node_scheduler.c - select and allocate nodes to jobs
* Note: there is a global node table (node_record_table_ptr)
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Copyright (C) SchedMD LLC.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <unistd.h>
#include "slurm/slurm_errno.h"
#include "src/common/assoc_mgr.h"
#include "src/common/group_cache.h"
#include "src/common/hostlist.h"
#include "src/common/id_util.h"
#include "src/common/job_features.h"
#include "src/common/list.h"
#include "src/common/node_features.h"
#include "src/common/port_mgr.h"
#include "src/common/slurm_protocol_pack.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/interfaces/accounting_storage.h"
#include "src/interfaces/burst_buffer.h"
#include "src/interfaces/gres.h"
#include "src/interfaces/jobcomp.h"
#include "src/interfaces/mcs.h"
#include "src/interfaces/node_features.h"
#include "src/interfaces/preempt.h"
#include "src/interfaces/priority.h"
#include "src/interfaces/select.h"
#include "src/interfaces/switch.h"
#include "src/interfaces/topology.h"
#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/gang.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/power_save.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/slurmctld.h"
#include "src/stepmgr/gres_stepmgr.h"
#include "src/stepmgr/stepmgr.h"
#define _DEBUG 0
#define MAX_FEATURES 64 /* max exclusive features "[fs1|fs2]"=2 */
struct node_set { /* set of nodes with same configuration */
uint16_t cpus_per_node; /* NOTE: This is the minimum count */
char *features; /* Node features */
bitstr_t *feature_bits; /* MORed feature's position */
uint32_t flags; /* See NODE_SET_* below */
bitstr_t *my_bitmap; /* Node bitmap */
uint32_t node_cnt; /* Node count */
uint32_t node_weight; /* Node weight */
uint64_t real_memory; /* Real memory on node */
uint64_t sched_weight; /* Scheduling weight, based upon
* node_weight and flags */
};
#define NODE_SET_NOFLAG SLURM_BIT(0)
#define NODE_SET_REBOOT SLURM_BIT(1)
#define NODE_SET_OUTSIDE_FLEX SLURM_BIT(2)
#define NODE_SET_POWER_DN SLURM_BIT(3)
#define NODE_SET_POWERING_UP SLURM_BIT(4)
enum {
IN_FL, /* Inside flex reservation */
OUT_FL, /* Outside flex reservation */
IN_FL_RE, /* Inside flex reservation + needs reboot */
OUT_FL_NO_RE, /* Outside flex reservation + no reboot needed */
OUT_FL_RE, /* Outside flex reservation + needs reboot */
REBOOT, /* Needs reboot */
NM_TYPES /* Number of node types */
};
static int _build_node_list(job_record_t *job_ptr,
struct node_set **node_set_pptr,
int *node_set_size, char **err_msg,
bool test_only, bool can_reboot);
static bitstr_t *_find_grp_node_bitmap(job_record_t *job_ptr);
static bool _first_array_task(job_record_t *job_ptr);
static void _log_node_set(job_record_t *job_ptr,
struct node_set *node_set_ptr,
int node_set_size);
static int _match_feature(list_t *feature_list, bitstr_t **inactive_bitmap);
static int _nodes_in_sets(bitstr_t *req_bitmap,
struct node_set * node_set_ptr,
int node_set_size);
static int _pick_best_nodes(struct node_set *node_set_ptr,
int node_set_size, bitstr_t ** select_bitmap,
job_record_t *job_ptr, part_record_t *part_ptr,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, bool test_only,
list_t *preemptee_candidates,
list_t **preemptee_job_list, bool has_xand,
resv_exc_t *resv_exc_ptr, bool resv_overlap);
static void _set_err_msg(bool cpus_ok, bool mem_ok, bool disk_ok,
bool job_mc_ok, char **err_msg);
static void _set_sched_weight(struct node_set *node_set_ptr);
static int _sort_node_set(const void *x, const void *y);
static bitstr_t *_valid_features(job_record_t *job_ptr,
config_record_t *config_ptr,
bool can_reboot, bitstr_t *reboot_bitmap);
/*
* _get_ntasks_per_core - Retrieve the value of ntasks_per_core from
* the given job_details record. If it wasn't set, return INFINITE16.
* Intended for use with the adjust_cpus_nppcu function.
*/
static uint16_t _get_ntasks_per_core(job_details_t *details)
{
if (details->mc_ptr)
return details->mc_ptr->ntasks_per_core;
else
return INFINITE16;
}
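/*
 * Illustrative use (a sketch only; the argument list shown here is assumed,
 * see adjust_cpus_nppcu() for the authoritative prototype):
 *
 *   avail_cpus = adjust_cpus_nppcu(_get_ntasks_per_core(job_ptr->details),
 *                                  cpus_per_task, total_cores, avail_cpus);
 */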
/*
* allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED;
* also claim required licenses and resources reserved by accounting
* policy association
* IN job_ptr - job being allocated resources
*/
extern void allocate_nodes(job_record_t *job_ptr)
{
node_record_t *node_ptr;
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
make_node_alloc(node_ptr, job_ptr);
}
node_mgr_make_node_blocked(job_ptr, true);
last_node_update = time(NULL);
license_job_get(job_ptr, false);
set_initial_job_alias_list(job_ptr);
}
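/*
 * set_initial_job_alias_list - initialize the job's alias_list based upon
 * the cloud/dynamic state of its allocated nodes. Cloud nodes that are
 * still powered down or powering up get a "TBD" alias_list (and
 * wait_all_nodes set) so node addresses can be resolved once they are up;
 * jobs on non-cloud nodes only get node addresses set when they originate
 * from a different cluster.
 */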
extern void set_initial_job_alias_list(job_record_t *job_ptr)
{
node_record_t *node_ptr;
bool has_cloud = false, has_cloud_power_save = false;
bool has_dynamic_norm = false;
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
if (IS_NODE_DYNAMIC_FUTURE(node_ptr))
has_cloud = true;
if (IS_NODE_DYNAMIC_NORM(node_ptr)) {
/* Must set alias list as nodes won't exist in conf */
has_cloud = true;
has_dynamic_norm = true;
}
if (IS_NODE_CLOUD(node_ptr)) {
has_cloud = true;
if (IS_NODE_POWERED_DOWN(node_ptr) ||
IS_NODE_POWERING_UP(node_ptr))
has_cloud_power_save = true;
}
}
if (has_cloud) {
if (has_cloud_power_save &&
job_ptr->origin_cluster &&
xstrcmp(slurm_conf.cluster_name, job_ptr->origin_cluster)) {
/* Set TBD so remote srun will update node_addrs */
job_ptr->alias_list = xstrdup("TBD");
job_ptr->wait_all_nodes = 1;
} else if (cloud_dns && !has_dynamic_norm) {
job_ptr->wait_all_nodes = 1;
} else if (has_cloud_power_save) {
job_ptr->alias_list = xstrdup("TBD");
job_ptr->wait_all_nodes = 1;
} else
set_job_alias_list(job_ptr);
} else {
/* set addrs if the job is coming from a different cluster */
set_job_node_addrs(job_ptr, job_ptr->origin_cluster);
}
}
/*
* Set addrs if:
* 1. There is an alias_list (cloud/dynamic nodes) and it isn't TBD (nodes are
* powering up).
* 2. No alias_list but job/request is from a different cluster.
*/
extern void set_job_node_addrs(job_record_t *job_ptr,
const char *origin_cluster)
{
if (!job_ptr->node_addrs &&
job_ptr->node_bitmap &&
bit_set_count(job_ptr->node_bitmap) &&
((!job_ptr->alias_list && /* remote job */
origin_cluster &&
xstrcmp(origin_cluster, slurm_conf.cluster_name)) ||
(job_ptr->alias_list && xstrcmp(job_ptr->alias_list, "TBD")))) {
node_record_t *node_ptr;
job_ptr->node_addrs =
xcalloc(bit_set_count(job_ptr->node_bitmap),
sizeof(slurm_addr_t));
for (int i = 0, addr_index = 0;
(node_ptr = next_node_bitmap(job_ptr->node_bitmap,
&i));
i++) {
slurm_conf_get_addr(node_ptr->name,
&job_ptr->node_addrs[addr_index++],
0);
}
}
}
/* Set a job's alias_list string */
extern void set_job_alias_list(job_record_t *job_ptr)
{
node_record_t *node_ptr;
xfree(job_ptr->alias_list);
if (cloud_dns && bit_super_set(job_ptr->node_bitmap, cloud_node_bitmap))
return;
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
if (IS_NODE_DYNAMIC_FUTURE(node_ptr) ||
IS_NODE_DYNAMIC_NORM(node_ptr) ||
(!cloud_dns && IS_NODE_CLOUD(node_ptr))) {
if (job_ptr->alias_list)
xstrcat(job_ptr->alias_list, ",");
xstrfmtcat(job_ptr->alias_list, "%s:[%s]:%s",
node_ptr->name, node_ptr->comm_name,
node_ptr->node_hostname);
}
}
set_job_node_addrs(job_ptr, job_ptr->origin_cluster);
}
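/*
 * The alias_list built by set_job_alias_list() is a comma separated list of
 * "<node_name>:[<comm_name>]:<node_hostname>" entries, for example
 * (hypothetical names and addresses):
 *   "cloud1:[10.1.1.1]:host1,cloud2:[10.1.1.2]:host2"
 */
/*
 * set_job_features_use - select which feature expression the scheduler
 * evaluates: the job's "prefer" expression when one was given, otherwise
 * its "constraint" feature expression.
 */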
extern void set_job_features_use(job_details_t *details_ptr)
{
if (!details_ptr)
return;
if (details_ptr->prefer) {
details_ptr->features_use = details_ptr->prefer;
details_ptr->feature_list_use = details_ptr->prefer_list;
} else {
details_ptr->features_use = details_ptr->features;
details_ptr->feature_list_use = details_ptr->feature_list;
}
}
/*
* deallocate_nodes - for a given job, deallocate its nodes and make
* their state NODE_STATE_COMPLETING; also release the job's licenses
* and resources reserved by accounting policy association
* IN job_ptr - pointer to terminating job (already in some COMPLETING state)
* IN timeout - true if job exhausted time limit, send REQUEST_KILL_TIMELIMIT
* RPC instead of REQUEST_TERMINATE_JOB
* IN suspended - true if job was already suspended (node's run_job_cnt
* already decremented);
* IN preempted - true if job is being preempted
*/
extern void deallocate_nodes(job_record_t *job_ptr, bool timeout,
bool suspended, bool preempted)
{
kill_job_msg_t *kill_job = NULL;
agent_arg_t *agent_args = NULL;
node_record_t *node_ptr;
hostlist_t *hostlist = NULL;
uint16_t use_protocol_version = 0;
uint16_t msg_flags = 0;
xassert(job_ptr);
xassert(job_ptr->details);
log_flag(TRACE_JOBS, "%s: %pJ", __func__, job_ptr);
acct_policy_job_fini(job_ptr, false);
node_mgr_make_node_blocked(job_ptr, false);
if (select_g_job_fini(job_ptr) != SLURM_SUCCESS)
error("select_g_job_fini(%pJ): %m", job_ptr);
/* Release any job-related switch data */
switch_g_job_complete(job_ptr);
epilog_slurmctld(job_ptr);
if (!job_ptr->details->prolog_running)
hostlist = hostlist_create(NULL);
if (!job_ptr->node_bitmap_cg)
build_cg_bitmap(job_ptr);
use_protocol_version = SLURM_PROTOCOL_VERSION;
for (int i = 0;
(node_ptr = next_node_bitmap(job_ptr->node_bitmap_cg, &i)); i++) {
/* Sync up conditionals with make_node_comp() */
if (IS_NODE_DOWN(node_ptr) ||
IS_NODE_POWERED_DOWN(node_ptr) ||
IS_NODE_POWERING_UP(node_ptr)) {
/* Issue the KILL RPC, but don't verify response */
bit_clear(job_ptr->node_bitmap_cg, i);
job_update_tres_cnt(job_ptr, i);
/*
* node_cnt indicates how many nodes we are waiting
* to get epilog complete messages from, so do not
* count these nodes. NOTE: The job's node_cnt will not
* match the number of entries in the node string
* during its completion.
*/
job_ptr->node_cnt--;
}
make_node_comp(node_ptr, job_ptr, suspended);
if (hostlist &&
!IS_NODE_POWERED_DOWN(node_ptr) &&
!IS_NODE_POWERING_UP(node_ptr)) {
hostlist_push_host(hostlist, node_ptr->name);
if (use_protocol_version > node_ptr->protocol_version) {
use_protocol_version =
node_ptr->protocol_version;
debug3("%s: protocol version downgraded to %u from node %s",
__func__, use_protocol_version,
node_ptr->name);
}
if (PACK_FANOUT_ADDRS(node_ptr))
msg_flags |= SLURM_PACK_ADDRS;
}
}
if (job_ptr->details->prolog_running) {
/*
* Job was configuring when it was cancelled and epilog wasn't
* run on the nodes, so cleanup the nodes now. Final cleanup
* will happen after EpilogSlurmctld is done.
*/
if (job_ptr->node_bitmap_cg) {
/*
* Call cleanup_completing before job_epilog_complete or
* we will end up requeuing there before this is called.
*/
cleanup_completing(job_ptr, false);
/*
* job_epilog_complete() can free
* job_ptr->node_bitmap_cg
*/
for (int i = 0;
job_ptr->node_bitmap_cg &&
(node_ptr = next_node_bitmap(
job_ptr->node_bitmap_cg, &i));
i++) {
job_epilog_complete(job_ptr->job_id,
node_ptr->name, 0);
}
}
return;
}
/* Can not wait for epilog complete to release licenses and
* update gang scheduling table */
cleanup_completing(job_ptr, false);
resv_replace_update(job_ptr);
if (!hostlist || !hostlist_count(hostlist)) {
hostlist_destroy(hostlist);
return;
}
if (job_ptr->bit_flags & EXTERNAL_JOB) {
debug("%s: %pJ is external, no need to wait to complete",
__func__, job_ptr);
for (int i = 0;
(node_ptr = next_node_bitmap(job_ptr->node_bitmap_cg, &i));
i++) {
make_node_idle(node_ptr, job_ptr);
}
hostlist_destroy(hostlist);
return;
}
agent_args = xmalloc(sizeof(agent_arg_t));
if (timeout)
agent_args->msg_type = REQUEST_KILL_TIMELIMIT;
else if (preempted)
agent_args->msg_type = REQUEST_KILL_PREEMPTED;
else
agent_args->msg_type = REQUEST_TERMINATE_JOB;
agent_args->retry = 0; /* re_kill_job() resends as needed */
agent_args->protocol_version = use_protocol_version;
agent_args->hostlist = hostlist;
agent_args->node_count = hostlist_count(hostlist);
agent_args->msg_flags = msg_flags;
last_node_update = time(NULL);
kill_job = create_kill_job_msg(job_ptr, use_protocol_version);
kill_job->nodes = xstrdup(job_ptr->nodes);
agent_args->msg_args = kill_job;
set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
agent_queue_request(agent_args);
}
static void _log_feature_nodes(job_feature_t *job_feat_ptr)
{
char *tmp1, *tmp2, *tmp3, *tmp4 = NULL;
if (!(slurm_conf.debug_flags & DEBUG_FLAG_NODE_FEATURES))
return;
if (job_feat_ptr->op_code == FEATURE_OP_OR)
tmp3 = "OR";
else if (job_feat_ptr->op_code == FEATURE_OP_AND)
tmp3 = "AND";
else if (job_feat_ptr->op_code == FEATURE_OP_MOR)
tmp3 = "MOR";
else if (job_feat_ptr->op_code == FEATURE_OP_XAND)
tmp3 = "XAND";
else if (job_feat_ptr->op_code == FEATURE_OP_END)
tmp3 = "END";
else {
xstrfmtcat(tmp4, "UNKNOWN:%u", job_feat_ptr->op_code);
tmp3 = tmp4;
}
tmp1 = bitmap2node_name(job_feat_ptr->node_bitmap_active);
tmp2 = bitmap2node_name(job_feat_ptr->node_bitmap_avail);
log_flag(NODE_FEATURES, "%s: FEAT:%s COUNT:%u BRACKET:%u PAREN:%d OP:%s ACTIVE:%s AVAIL:%s",
__func__, job_feat_ptr->name, job_feat_ptr->count,
job_feat_ptr->bracket, job_feat_ptr->paren, tmp3, tmp1, tmp2);
xfree(tmp1);
xfree(tmp2);
xfree(tmp4);
}
/*
* For every element in the feature_list, identify the nodes with that feature
* either active or available and set the feature_list's node_bitmap_active and
* node_bitmap_avail fields accordingly.
*/
extern void find_feature_nodes(list_t *feature_list, bool can_reboot)
{
list_itr_t *feat_iter;
job_feature_t *job_feat_ptr;
node_feature_t *node_feat_ptr;
if (!feature_list)
return;
feat_iter = list_iterator_create(feature_list);
while ((job_feat_ptr = list_next(feat_iter))) {
FREE_NULL_BITMAP(job_feat_ptr->node_bitmap_active);
FREE_NULL_BITMAP(job_feat_ptr->node_bitmap_avail);
node_feat_ptr = list_find_first(active_feature_list,
list_find_feature,
job_feat_ptr->name);
if (node_feat_ptr && node_feat_ptr->node_bitmap) {
job_feat_ptr->node_bitmap_active =
bit_copy(node_feat_ptr->node_bitmap);
} else { /* This feature not active */
job_feat_ptr->node_bitmap_active =
bit_alloc(node_record_count);
}
if (can_reboot && job_feat_ptr->changeable) {
node_feat_ptr = list_find_first(avail_feature_list,
list_find_feature,
job_feat_ptr->name);
if (node_feat_ptr && node_feat_ptr->node_bitmap) {
job_feat_ptr->node_bitmap_avail =
bit_copy(node_feat_ptr->node_bitmap);
} else { /* This feature not available */
job_feat_ptr->node_bitmap_avail =
bit_alloc(node_record_count);
}
} else if (job_feat_ptr->node_bitmap_active) {
job_feat_ptr->node_bitmap_avail =
bit_copy(job_feat_ptr->node_bitmap_active);
}
_log_feature_nodes(job_feat_ptr);
}
list_iterator_destroy(feat_iter);
}
/*
* _match_feature - determine which of the job features are now inactive
* IN feature_list - Job's feature request list
* OUT inactive_bitmap - Nodes on which a required feature is currently inactive
* RET 1 if some nodes have a required feature inactive, 0 otherwise
* NOTE: Currently fully supports only AND/OR of features, not XAND/MOR
*/
static int _match_feature(list_t *feature_list, bitstr_t **inactive_bitmap)
{
list_itr_t *job_feat_iter;
job_feature_t *job_feat_ptr;
int last_op = FEATURE_OP_AND, last_paren_op = FEATURE_OP_AND;
int i, last_paren_cnt = 0;
bitstr_t *feature_bitmap, *paren_bitmap = NULL, *work_bitmap;
xassert(inactive_bitmap);
if (!feature_list || /* nothing to look for */
(node_features_g_count() == 0)) /* No inactive features */
return 0;
feature_bitmap = node_conf_get_active_bitmap();
work_bitmap = feature_bitmap;
job_feat_iter = list_iterator_create(feature_list);
while ((job_feat_ptr = list_next(job_feat_iter))) {
if (last_paren_cnt < job_feat_ptr->paren) {
/* Start of expression in parenthesis */
last_paren_op = last_op;
last_op = FEATURE_OP_AND;
FREE_NULL_BITMAP(paren_bitmap);
paren_bitmap = node_conf_get_active_bitmap();
work_bitmap = paren_bitmap;
}
if (job_feat_ptr->node_bitmap_avail) {
if (last_op == FEATURE_OP_AND) {
bit_and(work_bitmap,
job_feat_ptr->node_bitmap_active);
} else if (last_op == FEATURE_OP_OR) {
bit_or(work_bitmap,
job_feat_ptr->node_bitmap_active);
} else { /* FEATURE_OP_MOR or FEATURE_OP_XAND */
bit_and(work_bitmap,
job_feat_ptr->node_bitmap_active);
}
} else { /* feature not found */
if (last_op == FEATURE_OP_AND) {
bit_clear_all(work_bitmap);
}
}
if (last_paren_cnt > job_feat_ptr->paren) {
/* End of expression in parenthesis */
if (last_paren_op == FEATURE_OP_AND) {
bit_and(feature_bitmap, work_bitmap);
} else if (last_paren_op == FEATURE_OP_OR) {
bit_or(feature_bitmap, work_bitmap);
} else { /* FEATURE_OP_MOR or FEATURE_OP_XAND */
bit_and(feature_bitmap, work_bitmap);
}
work_bitmap = feature_bitmap;
}
last_op = job_feat_ptr->op_code;
last_paren_cnt = job_feat_ptr->paren;
}
list_iterator_destroy(job_feat_iter);
#if 0
{
char tmp[32];
bit_fmt(tmp, sizeof(tmp), work_bitmap);
info("%s: NODE_BITMAP:%s", __func__, tmp);
}
#endif
FREE_NULL_BITMAP(paren_bitmap);
i = bit_ffc(feature_bitmap);
if (i == -1) { /* No required node features inactive */
FREE_NULL_BITMAP(feature_bitmap);
return 0;
}
bit_not(feature_bitmap);
*inactive_bitmap = feature_bitmap;
return 1;
}
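/*
 * Example (hypothetical feature names): for a job requesting "knl&snc4"
 * where "snc4" is a changeable feature, nodes on which "snc4" is available
 * but not currently active are set in *inactive_bitmap, indicating that a
 * node_features reboot would be needed before those nodes satisfy the
 * request.
 */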
/*
* For a given job, if the available nodes differ from those with currently
* active features, return a bitmap of nodes with the job's required
* features currently active
* IN job_ptr - job requesting resource allocation
* IN avail_bitmap - nodes currently available for this job
* OUT active_bitmap - nodes with job's features currently active, NULL if
* identical to avail_bitmap
* NOTE: Currently fully supports only AND/OR of features, not XAND/MOR
*/
extern void build_active_feature_bitmap(job_record_t *job_ptr,
bitstr_t *avail_bitmap,
bitstr_t **active_bitmap)
{
job_details_t *details_ptr = job_ptr->details;
bitstr_t *tmp_bitmap = NULL;
bool can_reboot;
*active_bitmap = NULL;
if (!details_ptr->feature_list_use || /* nothing to look for */
(node_features_g_count() == 0)) /* No inactive features */
return;
can_reboot = node_features_g_user_update(job_ptr->user_id);
find_feature_nodes(details_ptr->feature_list_use, can_reboot);
if (_match_feature(details_ptr->feature_list_use, &tmp_bitmap) == 0)
return; /* No inactive features */
bit_not(tmp_bitmap);
if (bit_super_set(avail_bitmap, tmp_bitmap)) {
FREE_NULL_BITMAP(tmp_bitmap);
return;
}
bit_and(tmp_bitmap, avail_bitmap);
*active_bitmap = tmp_bitmap;
}
/* Return bitmap of nodes with all specified features currently active */
extern bitstr_t *build_active_feature_bitmap2(char *reboot_features)
{
const char *delim = ",";
char *tmp, *tok, *save_ptr = NULL;
bitstr_t *active_node_bitmap = NULL;
node_feature_t *node_feat_ptr;
if (!reboot_features || (reboot_features[0] == '\0')) {
active_node_bitmap = node_conf_get_active_bitmap();
return active_node_bitmap;
}
tmp = xstrdup(reboot_features);
tok = strtok_r(tmp, delim, &save_ptr);
while (tok) {
node_feat_ptr = list_find_first(active_feature_list,
list_find_feature, tok);
if (node_feat_ptr && node_feat_ptr->node_bitmap) {
/*
* Found feature, add nodes with this feature and
* remove nodes without this feature (bit_and)
*/
if (!active_node_bitmap)
active_node_bitmap =
bit_copy(node_feat_ptr->node_bitmap);
else
bit_and(active_node_bitmap,
node_feat_ptr->node_bitmap);
} else {
/*
* Feature not found in any nodes, so we definitely
* need to reboot all of the nodes
*/
if (!active_node_bitmap)
active_node_bitmap =
bit_alloc(node_record_count);
else
bit_clear_all(active_node_bitmap);
break;
}
tok = strtok_r(NULL, delim, &save_ptr);
}
xfree(tmp);
return active_node_bitmap;
}
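/*
 * Example (hypothetical feature names): build_active_feature_bitmap2("knl,quad")
 * returns the set of nodes with both "knl" and "quad" currently active; if
 * either feature is active on no node, an empty bitmap is returned and every
 * candidate node would require a reboot.
 */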
/*
* Decide if a job can share nodes with other jobs based on the
* following input parameters:
*
* IN job_ptr - job being scheduled; its details->share_res and
* details->whole_node fields encode the user's share/exclusive request
* IN part_max_share - current partition's node sharing policy
*
* The following table details the node SHARED state for the various scenarios
*
*                        part=   part=   part=    part=
* cons_tres user_request EXCLUS  NO      YES      FORCE
* --------- ------------ ------  ------  -------  -------
* no        default      whole   whole   whole    whole/O
* no        exclusive    whole   whole   whole    whole/O
* no        share=yes    whole   whole   whole/O  whole/O
* yes       default      whole   share   share    share/O
* yes       exclusive    whole   whole   whole    whole/O
* yes       share=yes    whole   share   share/O  share/O
*
* whole = entire node is allocated to the job
* share = less than entire node may be allocated to the job
* -/O = resources can be over-committed (e.g. gang scheduled)
*
* part->max_share:
* &SHARED_FORCE = FORCE
* 0 = EXCLUSIVE
* 1 = NO
* > 1 = YES
*
* job_ptr->details->share_res:
* 0 = default or share=no
* 1 = share=yes
*
* job_ptr->details->whole_node:
* 0 = default
* WHOLE_NODE_REQUIRED = 1 = exclusive
* WHOLE_NODE_USER = 2 = user
* WHOLE_NODE_MCS = 3 = mcs
*
* Return values:
* 0 = requires idle nodes
* 1 = can use non-idle nodes
*/
static int _resolve_shared_status(job_record_t *job_ptr,
uint16_t part_max_share)
{
if (job_ptr->reboot)
return 0;
/* no sharing if partition OverSubscribe=EXCLUSIVE */
if (part_max_share == 0) {
job_ptr->details->whole_node |= WHOLE_NODE_REQUIRED;
job_ptr->details->share_res = 0;
return 0;
}
/* sharing if partition OverSubscribe=FORCE with count > 1 */
if ((part_max_share & SHARED_FORCE) &&
((part_max_share & (~SHARED_FORCE)) > 1)) {
job_ptr->details->share_res = 1;
return 1;
}
if (running_cons_tres()) {
if ((job_ptr->details->share_res == 0) ||
(job_ptr->details->whole_node & WHOLE_NODE_REQUIRED)) {
job_ptr->details->share_res = 0;
return 0;
}
return 1;
} else {
job_ptr->details->whole_node |= WHOLE_NODE_REQUIRED;
if (part_max_share == 1) { /* partition is OverSubscribe=NO */
job_ptr->details->share_res = 0;
return 0;
}
/* share if the user requested it */
if (job_ptr->details->share_res == 1)
return 1;
job_ptr->details->share_res = 0;
return 0;
}
}
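/*
 * Worked example for the table above: in a partition configured with
 * OverSubscribe=FORCE:4 (part_max_share has SHARED_FORCE set with a share
 * count of 4), a job using the default share request hits the FORCE test
 * first, so share_res is set to 1 and the function returns 1, meaning the
 * job may be placed on non-idle nodes and its resources may be
 * over-committed.
 */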
typedef struct {
job_record_t *job_ptr;
bitstr_t *usable_node_mask;
} foreach_filter_by_node_t;
static int _foreach_filter_by_node_owner(void *x, void *arg)
{
job_record_t *job_ptr2 = x;
foreach_filter_by_node_t *argstruct = arg;
job_record_t *job_ptr = argstruct->job_ptr;
bitstr_t *usable_node_mask = argstruct->usable_node_mask;
if (IS_JOB_PENDING(job_ptr2) || IS_JOB_COMPLETED(job_ptr2) ||
(job_ptr->user_id == job_ptr2->user_id) || !job_ptr2->node_bitmap)
return 0;
bit_and_not(usable_node_mask, job_ptr2->node_bitmap);
return 0;
}
/*
* Remove nodes from consideration for allocation based upon "ownership" by
* other users
* job_ptr IN - Job to be scheduled
* usable_node_mask IN/OUT - Nodes available for use by this job's user
*/
extern void filter_by_node_owner(job_record_t *job_ptr,
bitstr_t *usable_node_mask)
{
node_record_t *node_ptr;
int i;
foreach_filter_by_node_t argstruct = { .job_ptr = job_ptr,
.usable_node_mask =
usable_node_mask };
if ((job_ptr->details->whole_node & WHOLE_NODE_USER) ||
(job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)) {
/* Need to remove all nodes allocated to any active job from
* any other user */
list_for_each(job_list, _foreach_filter_by_node_owner,
&argstruct);
return;
}
/* Need to filter out any nodes exclusively allocated to other users */
for (i = 0; (node_ptr = next_node(&i)); i++) {
if ((node_ptr->owner != NO_VAL) &&
(node_ptr->owner != job_ptr->user_id))
bit_clear(usable_node_mask, node_ptr->index);
}
}
/*
* Remove nodes from consideration for allocation based upon "mcs" by
* other users
* job_ptr IN - Job to be scheduled
* usable_node_mask IN/OUT - Nodes available for use by this job's mcs
*/
extern void filter_by_node_mcs(job_record_t *job_ptr, int mcs_select,
bitstr_t *usable_node_mask)
{
node_record_t *node_ptr;
int i;
/* Need to filter out any nodes allocated with other mcs */
if (job_ptr->mcs_label && (mcs_select == 1)) {
for (i = 0; (node_ptr = next_node(&i)); i++) {
/* if there is a mcs_label -> OK if it's the same */
if ((node_ptr->mcs_label != NULL) &&
xstrcmp(node_ptr->mcs_label,job_ptr->mcs_label)) {
bit_clear(usable_node_mask, node_ptr->index);
}
/* if no mcs_label -> OK if no jobs running */
if ((node_ptr->mcs_label == NULL) &&
(node_ptr->run_job_cnt != 0)) {
bit_clear(usable_node_mask, node_ptr->index);
}
}
} else {
for (i = 0; (node_ptr = next_node(&i)); i++) {
if (node_ptr->mcs_label != NULL) {
bit_clear(usable_node_mask, node_ptr->index);
}
}
}
}
/*
* Remove nodes from the "avail_node_bitmap" which need to be rebooted in order
* to be used if the job's "delay_boot" time has not yet been reached.
*/
static void _filter_by_node_feature(job_record_t *job_ptr,
struct node_set *node_set_ptr,
int node_set_size)
{
int i;
if ((job_ptr->details == NULL) ||
((job_ptr->details->begin_time != 0) &&
((job_ptr->details->begin_time + job_ptr->delay_boot) <=
time(NULL))))
return;
for (i = 0; i < node_set_size; i++) {
if (node_set_ptr[i].flags & NODE_SET_REBOOT) {
bit_and_not(avail_node_bitmap,
node_set_ptr[i].my_bitmap);
}
}
}
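/*
 * Accumulate into *grp_node_bitmap the nodes already counted against this
 * QOS's group, per-user, or per-account node limits so that the caller can
 * prefer placing the job on those nodes rather than consuming additional
 * nodes against the limits.
 */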
static void _find_qos_grp_node_bitmap(job_record_t *job_ptr,
slurmdb_qos_rec_t *qos_ptr,
bitstr_t **grp_node_bitmap,
bool *per_grp_limit,
bool *per_user_limit,
bool *per_acct_limit)
{
slurmdb_used_limits_t *used_limits = NULL;
if (!qos_ptr || !qos_ptr->usage)
return;
if (!*per_grp_limit &&
qos_ptr->usage->grp_node_bitmap &&
(qos_ptr->grp_tres_ctld[TRES_ARRAY_NODE] != INFINITE64)) {
*per_grp_limit = true;
*grp_node_bitmap = bit_copy(qos_ptr->usage->grp_node_bitmap);
}
if (!*per_user_limit &&
(qos_ptr->max_tres_pu_ctld[TRES_ARRAY_NODE] != INFINITE64)) {
*per_user_limit = true;
used_limits = acct_policy_get_user_used_limits(
&qos_ptr->usage->user_limit_list,
job_ptr->user_id);
if (used_limits && used_limits->node_bitmap) {
if (*grp_node_bitmap)
bit_or(*grp_node_bitmap,
used_limits->node_bitmap);
else
*grp_node_bitmap =
bit_copy(used_limits->node_bitmap);
}
}
if (!*per_acct_limit &&
job_ptr->assoc_ptr &&
(qos_ptr->max_tres_pa_ctld[TRES_ARRAY_NODE] != INFINITE64)) {
*per_acct_limit = true;
used_limits = acct_policy_get_acct_used_limits(
&qos_ptr->usage->acct_limit_list,
job_ptr->assoc_ptr->acct);
if (used_limits && used_limits->node_bitmap) {
if (*grp_node_bitmap)
bit_or(*grp_node_bitmap,
used_limits->node_bitmap);
else
*grp_node_bitmap =
bit_copy(used_limits->node_bitmap);
}
}
}
/*
* For a given job, return a bitmap of nodes to be preferred in its allocation
*/
static bitstr_t *_find_grp_node_bitmap(job_record_t *job_ptr)
{
bitstr_t *grp_node_bitmap = NULL;
slurmdb_qos_rec_t *qos_ptr1 = NULL, *qos_ptr2 = NULL;
bool per_acct_limit = false, per_user_limit = false,
per_grp_limit = false;
assoc_mgr_lock_t qos_read_locks =
{ .assoc = READ_LOCK, .qos = READ_LOCK };
slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
/* check to see if we are enforcing associations */
if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
return NULL;
assoc_mgr_lock(&qos_read_locks);
acct_policy_set_qos_order(job_ptr, &qos_ptr1, &qos_ptr2);
_find_qos_grp_node_bitmap(job_ptr, qos_ptr1, &grp_node_bitmap,
&per_grp_limit,
&per_user_limit,
&per_acct_limit);
_find_qos_grp_node_bitmap(job_ptr, qos_ptr2, &grp_node_bitmap,
&per_grp_limit,
&per_user_limit,
&per_acct_limit);
while (assoc_ptr && assoc_ptr->usage && !per_grp_limit) {
if (assoc_ptr->usage->grp_node_bitmap &&
(assoc_ptr->grp_tres_ctld[TRES_ARRAY_NODE] != INFINITE64)) {
per_grp_limit = true;
if (grp_node_bitmap)
bit_or(grp_node_bitmap,
assoc_ptr->usage->grp_node_bitmap);
else
grp_node_bitmap = bit_copy(assoc_ptr->usage->
grp_node_bitmap);
break;
}
assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
}
assoc_mgr_unlock(&qos_read_locks);
return grp_node_bitmap;
}
/*
* If the job has required feature counts, then accumulate those
* required resources using multiple calls to _pick_best_nodes()
* and adding those selected nodes to the job's required node list.
* Upon completion, restore the job's requirements to the values that
* were in effect when this function was called.
* Input and output are the same as _pick_best_nodes().
*/
static int _get_req_features(struct node_set *node_set_ptr, int node_set_size,
bitstr_t **select_bitmap, job_record_t *job_ptr,
part_record_t *part_ptr, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
bool test_only, list_t **preemptee_job_list,
bool can_reboot, bool submission)
{
uint32_t saved_min_nodes, saved_job_min_nodes, saved_job_num_tasks;
bitstr_t *saved_req_node_bitmap = NULL;
bitstr_t *inactive_bitmap = NULL;
uint32_t saved_min_cpus, saved_req_nodes;
int resv_rc = SLURM_SUCCESS, tmp_node_set_size;
int mcs_select = 0;
struct node_set *tmp_node_set_ptr, *prev_node_set_ptr;
int error_code = SLURM_SUCCESS, i;
bitstr_t *feature_bitmap, *accumulate_bitmap = NULL;
bitstr_t *save_avail_node_bitmap = NULL, *resv_bitmap = NULL;
bitstr_t *save_share_node_bitmap = NULL;
list_t *preemptee_candidates = NULL;
bool old_feat_change = false;
bool has_xand = false;
bool resv_overlap = false;
resv_exc_t resv_exc = { 0 };
/*
* Mark nodes reserved for other jobs as off limits for this job.
* If the job has a reservation, we've already limited the contents
* of select_bitmap to those nodes. Assume node reboot required
* since we have not selected the compute nodes yet.
*/
if (job_ptr->resv_name == NULL) {
time_t start_res = time(NULL);
resv_rc = job_test_resv(job_ptr, &start_res, false,
&resv_bitmap, &resv_exc,
&resv_overlap, true);
if ((resv_rc == ESLURM_NODES_BUSY) ||
(resv_rc == ESLURM_RESERVATION_MAINT)) {
save_avail_node_bitmap = avail_node_bitmap;
avail_node_bitmap = bit_alloc(node_record_count);
FREE_NULL_BITMAP(resv_bitmap);
/*
* Continue executing through _pick_best_nodes() below
* in order to reject the job if it can never run
*/
} else if (resv_rc != SLURM_SUCCESS) {
FREE_NULL_BITMAP(resv_bitmap);
reservation_delete_resv_exc_parts(&resv_exc);
return ESLURM_NODES_BUSY; /* reserved */
} else if (resv_bitmap &&
(!bit_equal(resv_bitmap, avail_node_bitmap))) {
bit_and(resv_bitmap, avail_node_bitmap);
save_avail_node_bitmap = avail_node_bitmap;
if (slurm_conf.debug_flags & DEBUG_FLAG_RESERVATION &&
!bit_equal(avail_node_bitmap, resv_bitmap)) {
bitstr_t *removed_nodes =
bit_copy(save_avail_node_bitmap);
bit_and_not(removed_nodes, resv_bitmap);
log_flag(RESERVATION, "Advanced reservation removed nodes:%s from consideration for %pJ",
bitmap2node_name(removed_nodes),
job_ptr);
FREE_NULL_BITMAP(removed_nodes);
}
avail_node_bitmap = resv_bitmap;
resv_bitmap = NULL;
} else {
FREE_NULL_BITMAP(resv_bitmap);
}
} else {
time_t start_res = time(NULL);
/*
* We do not care about return value.
* We are just interested in resv_exc being filled in
*/
(void) job_test_resv(job_ptr, &start_res, false, &resv_bitmap,
&resv_exc, &resv_overlap, true);
FREE_NULL_BITMAP(resv_bitmap);
}
if (submission)
resv_overlap = false;
if (!save_avail_node_bitmap)
save_avail_node_bitmap = bit_copy(avail_node_bitmap);
save_share_node_bitmap = bit_copy(share_node_bitmap);
filter_by_node_owner(job_ptr, share_node_bitmap);
if (can_reboot && !test_only)
_filter_by_node_feature(job_ptr, node_set_ptr, node_set_size);
if (!test_only) {
mcs_select = slurm_mcs_get_select(job_ptr);
filter_by_node_mcs(job_ptr, mcs_select, share_node_bitmap);
}
if (!test_only) {
hres_filter(job_ptr, avail_node_bitmap);
}
/* save job and request state */
saved_min_nodes = min_nodes;
saved_req_nodes = req_nodes;
saved_job_min_nodes = job_ptr->details->min_nodes;
if (job_ptr->details->req_node_bitmap) {
accumulate_bitmap = job_ptr->details->req_node_bitmap;
saved_req_node_bitmap = bit_copy(accumulate_bitmap);
job_ptr->details->req_node_bitmap = NULL;
}
saved_min_cpus = job_ptr->details->min_cpus;
/*
* Don't mess with max_cpus here since it is only set to be a limit
* and not user configurable.
*/
job_ptr->details->min_cpus = 1;
tmp_node_set_ptr = xcalloc((node_set_size * 2), sizeof(struct node_set));
/* Accumulate nodes with required feature counts. */
preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
if (job_ptr->details->feature_list_use) {
list_itr_t *feat_iter;
job_feature_t *feat_ptr;
int last_paren_cnt = 0, last_paren_opt = FEATURE_OP_AND;
bitstr_t *paren_bitmap = NULL, *work_bitmap;
uint64_t smallest_min_mem = INFINITE64;
uint64_t orig_req_mem = job_ptr->details->pn_min_memory;
bool feat_change = false;
feat_iter = list_iterator_create(
job_ptr->details->feature_list_use);
while ((feat_ptr = list_next(feat_iter))) {
bool sort_again = false;
if (last_paren_cnt < feat_ptr->paren) {
/* Start of expression in parenthesis */
if (paren_bitmap) {
error("%s@%d: %pJ has bad feature expression: %s",
__func__, __LINE__, job_ptr,
job_ptr->details->features_use);
FREE_NULL_BITMAP(paren_bitmap);
}
feat_change |= feat_ptr->changeable;
paren_bitmap =
bit_copy(feat_ptr->node_bitmap_avail);
last_paren_opt = feat_ptr->op_code;
last_paren_cnt = feat_ptr->paren;
continue;
} else if (last_paren_cnt > 0) {
feat_change |= feat_ptr->changeable;
if (last_paren_opt == FEATURE_OP_AND) {
bit_and(paren_bitmap,
feat_ptr->node_bitmap_avail);
} else {
bit_or(paren_bitmap,
feat_ptr->node_bitmap_avail);
}
last_paren_opt = feat_ptr->op_code;
last_paren_cnt = feat_ptr->paren;
if (last_paren_cnt)
continue;
work_bitmap = paren_bitmap;
} else {
/* Outside of parenthesis */
feat_change = feat_ptr->changeable;
work_bitmap = feat_ptr->node_bitmap_avail;
}
if (feat_ptr->count == 0) {
FREE_NULL_BITMAP(paren_bitmap);
continue;
}
tmp_node_set_size = 0;
/*
* _pick_best_nodes() is destructive of the node_set
* data structure, so we need to make a copy and then
* purge it
*/
for (i = 0; i < node_set_size; i++) {
if (!bit_overlap_any(node_set_ptr[i].my_bitmap,
work_bitmap))
continue;
tmp_node_set_ptr[tmp_node_set_size].
cpus_per_node =
node_set_ptr[i].cpus_per_node;
tmp_node_set_ptr[tmp_node_set_size].
real_memory =
node_set_ptr[i].real_memory;
tmp_node_set_ptr[tmp_node_set_size].node_weight =
node_set_ptr[i].node_weight;
tmp_node_set_ptr[tmp_node_set_size].sched_weight =
node_set_ptr[i].sched_weight;
tmp_node_set_ptr[tmp_node_set_size].flags =
node_set_ptr[i].flags;
tmp_node_set_ptr[tmp_node_set_size].features =
xstrdup(node_set_ptr[i].features);
tmp_node_set_ptr[tmp_node_set_size].
feature_bits =
bit_copy(node_set_ptr[i].feature_bits);
tmp_node_set_ptr[tmp_node_set_size].my_bitmap =
bit_copy(node_set_ptr[i].my_bitmap);
bit_and(tmp_node_set_ptr[tmp_node_set_size].
my_bitmap, work_bitmap);
if (accumulate_bitmap && has_xand) {
bit_and_not(tmp_node_set_ptr[
tmp_node_set_size].my_bitmap,
accumulate_bitmap);
}
tmp_node_set_ptr[tmp_node_set_size].node_cnt =
bit_set_count(tmp_node_set_ptr
[tmp_node_set_size].my_bitmap);
prev_node_set_ptr = tmp_node_set_ptr +
tmp_node_set_size;
tmp_node_set_size++;
if (test_only || !can_reboot ||
(prev_node_set_ptr->flags &
NODE_SET_REBOOT))
continue;
inactive_bitmap =
bit_copy(node_set_ptr[i].my_bitmap);
bit_and_not(inactive_bitmap,
feat_ptr->node_bitmap_active);
if (bit_ffs(inactive_bitmap) == -1) {
/* No inactive nodes (require reboot) */
FREE_NULL_BITMAP(inactive_bitmap);
continue;
}
sort_again = true;
if (bit_equal(prev_node_set_ptr->my_bitmap,
inactive_bitmap)) {
prev_node_set_ptr->flags |=
NODE_SET_REBOOT;
FREE_NULL_BITMAP(inactive_bitmap);
continue;
}
tmp_node_set_ptr[tmp_node_set_size].
cpus_per_node =
node_set_ptr[i].cpus_per_node;
tmp_node_set_ptr[tmp_node_set_size].
real_memory =
node_set_ptr[i].real_memory;
tmp_node_set_ptr[tmp_node_set_size].flags |=
NODE_SET_REBOOT;
tmp_node_set_ptr[tmp_node_set_size].features =
xstrdup(node_set_ptr[i].features);
tmp_node_set_ptr[tmp_node_set_size].
feature_bits =
bit_copy(node_set_ptr[i].feature_bits);
tmp_node_set_ptr[tmp_node_set_size].my_bitmap =
bit_copy(tmp_node_set_ptr
[tmp_node_set_size-1].my_bitmap);
bit_and(tmp_node_set_ptr[tmp_node_set_size].
my_bitmap, inactive_bitmap);
tmp_node_set_ptr[tmp_node_set_size].node_cnt =
bit_set_count(tmp_node_set_ptr
[tmp_node_set_size].my_bitmap);
bit_and_not(tmp_node_set_ptr[tmp_node_set_size-1].
my_bitmap, inactive_bitmap);
tmp_node_set_ptr[tmp_node_set_size-1].node_cnt =
bit_set_count(tmp_node_set_ptr
[tmp_node_set_size-1].my_bitmap);
tmp_node_set_size++;
FREE_NULL_BITMAP(inactive_bitmap);
}
FREE_NULL_BITMAP(paren_bitmap);
feature_bitmap = NULL;
min_nodes = feat_ptr->count;
req_nodes = feat_ptr->count;
saved_job_num_tasks = job_ptr->details->num_tasks;
job_ptr->details->min_nodes = feat_ptr->count;
job_ptr->details->min_cpus = feat_ptr->count;
/*
* Ensure that num_tasks is accurate if ntasks_per_node
* is set
*/
if (job_ptr->details->ntasks_per_node)
job_ptr->details->num_tasks = min_nodes *
job_ptr->details->ntasks_per_node;
FREE_NULL_LIST(*preemptee_job_list);
job_ptr->details->pn_min_memory = orig_req_mem;
if (sort_again) {
for (i = 0; i < tmp_node_set_size; i++)
_set_sched_weight(tmp_node_set_ptr + i);
qsort(tmp_node_set_ptr, tmp_node_set_size,
sizeof(struct node_set), _sort_node_set);
}
error_code = _pick_best_nodes(tmp_node_set_ptr,
tmp_node_set_size, &feature_bitmap,
job_ptr, part_ptr, min_nodes,
max_nodes, req_nodes, test_only,
preemptee_candidates,
preemptee_job_list, false,
&resv_exc, resv_overlap);
job_ptr->details->num_tasks = saved_job_num_tasks;
if (job_ptr->details->pn_min_memory) {
if (job_ptr->details->pn_min_memory <
smallest_min_mem)
smallest_min_mem =
job_ptr->details->pn_min_memory;
else
job_ptr->details->pn_min_memory =
smallest_min_mem;
}
#if _DEBUG
{
char *tmp_str = bitmap2node_name(feature_bitmap);
info("%pJ needs %u nodes with feature %s, using %s, error_code=%d",
job_ptr, feat_ptr->count, feat_ptr->name,
tmp_str, error_code);
xfree(tmp_str);
}
#endif
for (i = 0; i < tmp_node_set_size; i++) {
xfree(tmp_node_set_ptr[i].features);
FREE_NULL_BITMAP(tmp_node_set_ptr[i].
feature_bits);
FREE_NULL_BITMAP(tmp_node_set_ptr[i].
my_bitmap);
}
if (error_code != SLURM_SUCCESS) {
FREE_NULL_BITMAP(feature_bitmap);
break;
}
if (feature_bitmap) {
if (feat_ptr->op_code == FEATURE_OP_XAND)
has_xand = true;
if (has_xand) {
if (old_feat_change && feat_change) {
error_code =
ESLURM_MULTI_KNL_CONSTRAINT;
break;
}
old_feat_change |= feat_change;
/*
* Don't make nodes required since we
* check value on each call to
* _pick_best_nodes()
*/
} else if (job_ptr->details->req_node_bitmap) {
bit_or(job_ptr->details->
req_node_bitmap,
feature_bitmap);
} else {
job_ptr->details->req_node_bitmap =
bit_copy(feature_bitmap);
}
if (accumulate_bitmap) {
bit_or(accumulate_bitmap,
feature_bitmap);
FREE_NULL_BITMAP(feature_bitmap);
} else
accumulate_bitmap = feature_bitmap;
}
}
list_iterator_destroy(feat_iter);
if (paren_bitmap) {
error("%s@%d: %pJ has bad feature expression: %s",
__func__, __LINE__, job_ptr,
job_ptr->details->features_use);
FREE_NULL_BITMAP(paren_bitmap);
}
}
/* restore most of job state and accumulate remaining resources */
if (saved_req_node_bitmap) {
FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
job_ptr->details->req_node_bitmap =
bit_copy(saved_req_node_bitmap);
}
if (accumulate_bitmap) {
uint32_t node_cnt;
if (job_ptr->details->req_node_bitmap) {
bit_or(job_ptr->details->req_node_bitmap,
accumulate_bitmap);
FREE_NULL_BITMAP(accumulate_bitmap);
} else
job_ptr->details->req_node_bitmap = accumulate_bitmap;
node_cnt = bit_set_count(job_ptr->details->req_node_bitmap);
job_ptr->details->min_cpus = MAX(saved_min_cpus, node_cnt);
min_nodes = MAX(saved_min_nodes, node_cnt);
job_ptr->details->min_nodes = min_nodes;
req_nodes = MAX(min_nodes, req_nodes);
if (req_nodes > max_nodes)
error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
} else {
min_nodes = saved_min_nodes;
req_nodes = saved_req_nodes;
job_ptr->details->min_cpus = saved_min_cpus;
job_ptr->details->min_nodes = saved_job_min_nodes;
}
#if _DEBUG
{
char *tmp_str = bitmap2node_name(job_ptr->details->req_node_bitmap);
info("%pJ requires %d:%d:%d req_nodes:%s err:%u",
job_ptr, min_nodes, req_nodes, max_nodes, tmp_str, error_code);
xfree(tmp_str);
}
#endif
xfree(tmp_node_set_ptr);
if (error_code == SLURM_SUCCESS) {
FREE_NULL_LIST(*preemptee_job_list);
error_code = _pick_best_nodes(node_set_ptr, node_set_size,
select_bitmap, job_ptr, part_ptr, min_nodes,
max_nodes, req_nodes, test_only,
preemptee_candidates, preemptee_job_list,
has_xand, &resv_exc, resv_overlap);
}
if ((resv_rc == ESLURM_RESERVATION_MAINT) &&
(error_code == ESLURM_NODE_NOT_AVAIL))
error_code = ESLURM_RESERVATION_MAINT;
#if _DEBUG
{
char *tmp_str = bitmap2node_name(*select_bitmap);
info("%pJ allocated nodes:%s err:%u", job_ptr, tmp_str, error_code);
xfree(tmp_str);
}
#endif
FREE_NULL_LIST(preemptee_candidates);
/* restore job's initial required node bitmap */
FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
job_ptr->details->req_node_bitmap = saved_req_node_bitmap;
job_ptr->details->min_cpus = saved_min_cpus;
job_ptr->details->min_nodes = saved_job_min_nodes;
/* Restore available node bitmap, ignoring reservations */
if (save_avail_node_bitmap) {
FREE_NULL_BITMAP(avail_node_bitmap);
avail_node_bitmap = save_avail_node_bitmap;
}
if (save_share_node_bitmap) {
FREE_NULL_BITMAP(share_node_bitmap);
share_node_bitmap = save_share_node_bitmap;
}
reservation_delete_resv_exc_parts(&resv_exc);
return error_code;
}
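/*
 * Propagate each node set's scheduling weight to the underlying
 * node_record_t entries so the cons_tres selection logic can order nodes by
 * weight in a single select_g_job_test() pass.
 */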
static void _sync_node_weight(struct node_set *node_set_ptr, int node_set_size)
{
node_record_t *node_ptr;
for (int s = 0; s < node_set_size; s++) {
if (!node_set_ptr[s].my_bitmap)
continue; /* No nodes in this set */
for (int i = 0;
(node_ptr = next_node_bitmap(node_set_ptr[s].my_bitmap,
&i));
i++) {
node_ptr->sched_weight = node_set_ptr[s].sched_weight;
}
}
}
static int _bit_or_cond_internal(void *x, void *arg)
{
job_record_t *job_ptr = (job_record_t *)x;
bitstr_t *bitmap = (bitstr_t *)arg;
if (!IS_JOB_RUNNING(job_ptr) || job_ptr->details->share_res ||
!job_ptr->job_resrcs)
return 0;
bit_or(bitmap, job_ptr->job_resrcs->node_bitmap);
return 0;
}
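/*
 * OR the allocated node bitmap of the given preemptable job (or of every
 * component of a heterogeneous job) into "bitmap", skipping jobs that are
 * not running, that share resources, or that have no job_resrcs.
 */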
static void _bit_or_cond(job_record_t *job_ptr, bitstr_t *bitmap)
{
if (!job_ptr->het_job_list)
_bit_or_cond_internal(job_ptr, bitmap);
else
list_for_each_nobreak(job_ptr->het_job_list,
_bit_or_cond_internal, bitmap);
}
/*
* _pick_best_nodes - from a weight order list of all nodes satisfying a
* job's specifications, select the "best" for use
* IN node_set_ptr - pointer to node specification information
* IN node_set_size - number of entries in records pointed to by node_set_ptr
* OUT select_bitmap - returns bitmap of selected nodes, must FREE_NULL_BITMAP
* IN job_ptr - pointer to job being scheduled
* IN part_ptr - pointer to the partition in which the job is being scheduled
* IN min_nodes - minimum count of nodes required by the job
* IN max_nodes - maximum count of nodes required by the job (0==no limit)
* IN req_nodes - requested (or desired) count of nodes
* IN test_only - do not actually allocate resources
* IN/OUT preemptee_job_list - list of pointers to jobs to be preempted
* IN resv_exc_ptr - Various TRES which can not be used
* NULL on first entry
* IN has_xand - set if the constraint list includes XAND operators *and*
* we have already satisfied them all
* IN resv_overlap - designated reservation overlaps another reservation
* RET SLURM_SUCCESS on success,
* ESLURM_NODES_BUSY if request can not be satisfied now,
* ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE if request can never
* be satisfied,
* ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE if the job can not be
* initiated until the partition's configuration changes or
* ESLURM_NODE_NOT_AVAIL if required nodes are DOWN or DRAINED
* ESLURM_RESERVATION_BUSY if requested reservation overlaps another
* NOTE: the caller must FREE_NULL_BITMAP memory pointed to by select_bitmap
* Notes: The algorithm is
* 1) If required node list is specified, determine implicitly required
* processor and node count
* 2) Determine how many disjoint required "features" are represented
* (e.g. "FS1|FS2|FS3")
* 3) For each feature: find matching node table entries, identify nodes
* that are up and available (idle or shared) and add them to a bit
* map
* 4) Select_g_job_test() to select the "best" of those based upon
* topology and/or workload
* 5) If request can't be satisfied now, execute select_g_job_test()
* against the list of nodes that exist in any state (perhaps DOWN,
* DRAINED, or ALLOCATED) to determine if the request can
* ever be satisfied.
*/
static int _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
bitstr_t **select_bitmap, job_record_t *job_ptr,
part_record_t *part_ptr, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
bool test_only, list_t *preemptee_candidates,
list_t **preemptee_job_list, bool has_xand,
resv_exc_t *resv_exc_ptr, bool resv_overlap)
{
int error_code = SLURM_SUCCESS, i, j, pick_code = SLURM_SUCCESS;
int total_nodes = 0, avail_nodes = 0;
bitstr_t *avail_bitmap = NULL, *total_bitmap = NULL;
bitstr_t *backup_bitmap = NULL;
bitstr_t *possible_bitmap = NULL;
bitstr_t *node_set_map;
int max_feature, min_feature;
bool runable_ever = false; /* Job can ever run */
bool runable_avail = false; /* Job can run with available nodes */
bool tried_sched = false; /* Tried to schedule with avail nodes */
bool preempt_flag = false;
bool nodes_busy = false;
bool licenses_unavailable = false;
int shared = 0, select_mode;
list_t *preemptee_cand = NULL;
/*
* Since you could potentially have multiple features and the
* job might not request memory we need to keep track of a minimum
* from the selected features. This is to fulfill commit
* 700e7b1d4e9.
* If no memory is requested but we are running with
* SELECT_*_MEMORY and the request is for
* nodes of different memory sizes we need to reset the
* pn_min_memory as select_g_job_test can
* alter that making it so the order of constraints
* matter since the first pass through this will set the
* pn_min_memory based on that first constraint and if
* it isn't smaller than all the other requests they
* will fail. We have to keep track of the
* memory for accounting, these next 2 variables do this for us.
*/
uint64_t smallest_min_mem = INFINITE64;
uint64_t orig_req_mem = job_ptr->details->pn_min_memory;
if (test_only)
select_mode = SELECT_MODE_TEST_ONLY;
else
select_mode = SELECT_MODE_RUN_NOW;
if ((job_ptr->details->min_nodes == 0) &&
(job_ptr->details->max_nodes == 0)) {
/* Zero compute node job (burst buffer use only) */
avail_bitmap = bit_alloc(node_record_count);
pick_code = select_g_job_test(job_ptr,
avail_bitmap,
0, 0, 0,
select_mode,
preemptee_candidates,
preemptee_job_list,
resv_exc_ptr,
NULL);
if (pick_code == SLURM_SUCCESS) {
*select_bitmap = avail_bitmap;
return SLURM_SUCCESS;
} else {
FREE_NULL_BITMAP(avail_bitmap);
if (pick_code == ESLURM_LICENSES_UNAVAILABLE)
return ESLURM_LICENSES_UNAVAILABLE;
else
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
}
} else if (node_set_size == 0) {
info("%s: empty node set for selection", __func__);
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
}
shared = _resolve_shared_status(job_ptr, part_ptr->max_share);
/*
* If job preemption is enabled, then do NOT limit the set of available
* nodes by their current 'sharable' or 'idle' setting
*/
preempt_flag = slurm_preemption_enabled();
if (job_ptr->details->req_node_bitmap) { /* specific nodes required */
/*
* We have already confirmed that all of these nodes have a
* usable configuration and are in the proper partition.
* Check that these nodes can be used by this job.
*/
if (min_nodes != 0) {
total_nodes = bit_set_count(
job_ptr->details->req_node_bitmap);
}
if (total_nodes > max_nodes) { /* exceeds node limit */
return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
}
if ((job_ptr->details->core_spec != NO_VAL16) &&
((job_ptr->details->core_spec & CORE_SPEC_THREAD) == 0)) {
i = bit_ffs(job_ptr->details->req_node_bitmap);
if (i >= 0) {
j = node_record_table_ptr[i]->tot_cores;
}
if ((i >= 0) && (job_ptr->details->core_spec >= j)) {
if (part_ptr->name) {
info("%s: %pJ never runnable in partition %s",
__func__, job_ptr,
part_ptr->name);
} else {
info("%s: %pJ never runnable",
__func__, job_ptr);
}
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
}
}
/*
* Check the availability of these nodes.
* Should we check memory availability on these nodes?
*/
if (!bit_super_set(job_ptr->details->req_node_bitmap,
avail_node_bitmap)) {
return ESLURM_NODE_NOT_AVAIL;
}
/*
* Still must go through select_g_job_test() to determine the
* validity of request and/or perform set-up before job launch
*/
total_nodes = 0; /* reinitialize */
}
/* identify the min and max feature values for possible exclusive OR */
max_feature = -1;
min_feature = MAX_FEATURES;
for (i = 0; i < node_set_size; i++) {
j = bit_ffs(node_set_ptr[i].feature_bits);
if ((j >= 0) && (j < min_feature))
min_feature = j;
j = bit_fls(node_set_ptr[i].feature_bits);
if ((j >= 0) && (j > max_feature))
max_feature = j;
}
debug3("%s: %pJ idle_nodes %u share_nodes %u",
__func__, job_ptr, bit_set_count(idle_node_bitmap),
bit_set_count(share_node_bitmap));
if (running_cons_tres())
_sync_node_weight(node_set_ptr, node_set_size);
/*
* Accumulate resources for this job based upon its required
* features (possibly with node counts).
*/
for (j = min_feature; j <= max_feature; j++) {
if (job_ptr->details->req_node_bitmap) {
bool missing_required_nodes = false;
bool feature_found = false;
for (i = 0; i < node_set_size; i++) {
if (!bit_test(node_set_ptr[i].feature_bits, j))
continue;
feature_found = true;
node_set_map =
bit_copy(node_set_ptr[i].my_bitmap);
if ((node_set_ptr[i].flags & NODE_SET_REBOOT)) {
/* Node reboot required */
bit_and(node_set_map,
idle_node_bitmap);
/*
* Powered up cloud nodes can't be
* rebooted to get new features. Must be
* powered down first.
*/
bit_and_not(node_set_map,
cloud_node_bitmap);
}
if (avail_bitmap) {
bit_or(avail_bitmap, node_set_map);
FREE_NULL_BITMAP(node_set_map);
} else {
avail_bitmap = node_set_map;
}
}
if (!feature_found)
continue;
if (!bit_super_set(job_ptr->details->req_node_bitmap,
avail_bitmap))
missing_required_nodes = true;
if (missing_required_nodes)
continue;
FREE_NULL_BITMAP(avail_bitmap);
avail_bitmap = bit_copy(job_ptr->details->
req_node_bitmap);
bit_and_not(avail_bitmap, rs_node_bitmap);
}
for (i = 0; i < node_set_size; i++) {
int count1 = 0, count2 = 0;
if (!has_xand &&
!bit_test(node_set_ptr[i].feature_bits, j)) {
if ((i+1) < node_set_size || !avail_bitmap)
continue;
else
goto try_sched;
}
if (total_bitmap) {
bit_or(total_bitmap,
node_set_ptr[i].my_bitmap);
} else {
total_bitmap = bit_copy(
node_set_ptr[i].my_bitmap);
}
if ((node_set_ptr[i].flags & NODE_SET_REBOOT)) {
/* Node reboot required */
count1 = bit_set_count(node_set_ptr[i].
my_bitmap);
bit_and(node_set_ptr[i].my_bitmap,
idle_node_bitmap);
/*
* Powered up cloud nodes can't be rebooted to
* get new features. Must be powered down first.
*/
bit_and_not(node_set_ptr[i].my_bitmap,
cloud_node_bitmap);
count2 = bit_set_count(node_set_ptr[i].
my_bitmap);
if (count1 != count2)
nodes_busy = true;
}
bit_and(node_set_ptr[i].my_bitmap, avail_node_bitmap);
if (!nodes_busy) {
count1 = bit_set_count(node_set_ptr[i].
my_bitmap);
}
if (!preempt_flag) {
if (shared) {
bit_and(node_set_ptr[i].my_bitmap,
share_node_bitmap);
bit_and_not(node_set_ptr[i].my_bitmap,
cg_node_bitmap);
} else {
bit_and(node_set_ptr[i].my_bitmap,
idle_node_bitmap);
/* IDLE nodes are not COMPLETING */
}
} else {
bit_and_not(node_set_ptr[i].my_bitmap,
cg_node_bitmap);
}
/*
* We must skip the node *only* in the case it is
* rebooted with ASAP flag.
*/
bit_and_not(node_set_ptr[i].my_bitmap,
asap_node_bitmap);
if (!nodes_busy) {
count2 = bit_set_count(node_set_ptr[i].
my_bitmap);
if (count1 != count2)
nodes_busy = true;
}
if (avail_bitmap) {
bit_or(avail_bitmap,
node_set_ptr[i].my_bitmap);
} else {
avail_bitmap = bit_copy(node_set_ptr[i].
my_bitmap);
}
tried_sched = false; /* need to test these nodes */
if (running_cons_tres() && ((i + 1) < node_set_size)) {
/*
* Execute select_g_job_test() _once_ using
* sched_weight in node_record_t as set
* by _sync_node_weight()
*/
continue;
}
try_sched:
/* NOTE: select_g_job_test() is destructive of
* avail_bitmap, so save a backup copy */
backup_bitmap = bit_copy(avail_bitmap);
FREE_NULL_LIST(*preemptee_job_list);
if (job_ptr->details->req_node_bitmap == NULL)
bit_and(avail_bitmap, avail_node_bitmap);
bit_and(avail_bitmap, share_node_bitmap);
avail_nodes = bit_set_count(avail_bitmap);
if (((avail_nodes < min_nodes) ||
((avail_nodes >= min_nodes) &&
(avail_nodes < req_nodes))) &&
((i+1) < node_set_size)) {
FREE_NULL_BITMAP(avail_bitmap);
avail_bitmap = backup_bitmap;
continue; /* Keep accumulating nodes */
}
/* Only preempt jobs when all possible nodes are being
* considered for use, otherwise we would preempt jobs
* to use the lowest weight nodes. */
if ((i+1) < node_set_size || !preemptee_candidates)
preemptee_cand = NULL;
else if (preempt_flag) {
job_record_t *tmp_job_ptr = NULL;
list_itr_t *job_iterator;
job_iterator = list_iterator_create(preemptee_candidates);
while ((tmp_job_ptr = list_next(job_iterator)))
_bit_or_cond(tmp_job_ptr, avail_bitmap);
list_iterator_destroy(job_iterator);
bit_and(avail_bitmap, avail_node_bitmap);
bit_and(avail_bitmap, total_bitmap);
preemptee_cand = preemptee_candidates;
} else
preemptee_cand = preemptee_candidates;
job_ptr->details->pn_min_memory = orig_req_mem;
pick_code = select_g_job_test(job_ptr,
avail_bitmap,
min_nodes,
max_nodes,
req_nodes,
select_mode,
preemptee_cand,
preemptee_job_list,
resv_exc_ptr,
NULL);
if (job_ptr->details->pn_min_memory) {
if (job_ptr->details->pn_min_memory <
smallest_min_mem)
smallest_min_mem =
job_ptr->details->pn_min_memory;
else
job_ptr->details->pn_min_memory =
smallest_min_mem;
}
#if _DEBUG
{
char *tmp_str1 = bitmap2node_name(avail_bitmap);
char *tmp_str2 = bitmap2node_name(backup_bitmap);
info("%s: %pJ err:%d nodes:%u:%u:%u mode:%u select %s from %s",
__func__, job_ptr, pick_code, min_nodes, req_nodes,
max_nodes, select_mode, tmp_str1, tmp_str2);
xfree(tmp_str1);
xfree(tmp_str2);
}
#endif
if (pick_code == SLURM_SUCCESS) {
FREE_NULL_BITMAP(backup_bitmap);
if (bit_set_count(avail_bitmap) > max_nodes) {
/* end of tests for this feature */
avail_nodes = 0;
break;
}
FREE_NULL_BITMAP(total_bitmap);
FREE_NULL_BITMAP(possible_bitmap);
*select_bitmap = avail_bitmap;
return SLURM_SUCCESS;
} else {
tried_sched = true; /* test failed */
FREE_NULL_BITMAP(avail_bitmap);
avail_bitmap = backup_bitmap;
}
} /* for (i = 0; i < node_set_size; i++) */
/* try to get req_nodes now for this feature */
if (avail_bitmap && (!tried_sched) &&
(avail_nodes >= min_nodes) &&
((job_ptr->details->req_node_bitmap == NULL) ||
bit_super_set(job_ptr->details->req_node_bitmap,
avail_bitmap))) {
FREE_NULL_LIST(*preemptee_job_list);
job_ptr->details->pn_min_memory = orig_req_mem;
pick_code = select_g_job_test(job_ptr, avail_bitmap,
min_nodes, max_nodes,
req_nodes,
select_mode,
preemptee_candidates,
preemptee_job_list,
resv_exc_ptr,
NULL);
if (job_ptr->details->pn_min_memory) {
if (job_ptr->details->pn_min_memory <
smallest_min_mem)
smallest_min_mem =
job_ptr->details->pn_min_memory;
else
job_ptr->details->pn_min_memory =
smallest_min_mem;
}
if ((pick_code == SLURM_SUCCESS) &&
(bit_set_count(avail_bitmap) <= max_nodes)) {
FREE_NULL_BITMAP(total_bitmap);
FREE_NULL_BITMAP(possible_bitmap);
*select_bitmap = avail_bitmap;
return SLURM_SUCCESS;
}
}
if (pick_code == ESLURM_LICENSES_UNAVAILABLE)
licenses_unavailable = true;
/* determine if job could possibly run (if all configured
* nodes available) */
if (total_bitmap)
total_nodes = bit_set_count(total_bitmap);
if (total_bitmap &&
(!runable_ever || !runable_avail) &&
(total_nodes >= min_nodes) &&
((job_ptr->details->req_node_bitmap == NULL) ||
(bit_super_set(job_ptr->details->req_node_bitmap,
total_bitmap)))) {
avail_nodes = bit_set_count(avail_bitmap);
if (!runable_avail && (avail_nodes >= min_nodes)) {
FREE_NULL_BITMAP(avail_bitmap);
avail_bitmap = bit_copy(total_bitmap);
bit_and(avail_bitmap, avail_node_bitmap);
job_ptr->details->pn_min_memory = orig_req_mem;
pick_code = select_g_job_test(job_ptr,
avail_bitmap,
min_nodes,
max_nodes,
req_nodes,
SELECT_MODE_TEST_ONLY,
preemptee_candidates, NULL,
resv_exc_ptr,
NULL);
if (job_ptr->details->pn_min_memory) {
if (job_ptr->details->pn_min_memory <
smallest_min_mem)
smallest_min_mem =
job_ptr->details->
pn_min_memory;
else
job_ptr->details->
pn_min_memory =
smallest_min_mem;
}
if (pick_code == SLURM_SUCCESS) {
runable_ever = true;
if (bit_set_count(avail_bitmap) <=
max_nodes)
runable_avail = true;
FREE_NULL_BITMAP(possible_bitmap);
possible_bitmap = avail_bitmap;
avail_bitmap = NULL;
}
}
if (!runable_ever) {
job_ptr->details->pn_min_memory = orig_req_mem;
pick_code = select_g_job_test(job_ptr,
total_bitmap,
min_nodes,
max_nodes,
req_nodes,
SELECT_MODE_TEST_ONLY,
preemptee_candidates, NULL,
resv_exc_ptr,
NULL);
if (job_ptr->details->pn_min_memory) {
if (job_ptr->details->pn_min_memory <
smallest_min_mem)
smallest_min_mem =
job_ptr->details->
pn_min_memory;
else
job_ptr->details->
pn_min_memory =
smallest_min_mem;
}
if (pick_code == SLURM_SUCCESS) {
FREE_NULL_BITMAP(possible_bitmap);
possible_bitmap = total_bitmap;
total_bitmap = NULL;
runable_ever = true;
}
}
}
FREE_NULL_BITMAP(avail_bitmap);
FREE_NULL_BITMAP(total_bitmap);
if (error_code != SLURM_SUCCESS)
break;
}
FREE_NULL_BITMAP(avail_bitmap);
FREE_NULL_BITMAP(total_bitmap);
/* The job is not able to start right now. Set an error code
 * indicating why and, if the job could ever run, return the set
 * of possible nodes in *select_bitmap for later use. */
if (!runable_ever && resv_overlap &&
(pick_code != ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE)) {
error_code = ESLURM_RESERVATION_BUSY;
return error_code;
}
if (licenses_unavailable) {
error_code = ESLURM_LICENSES_UNAVAILABLE;
} else if (!runable_ever) {
char *tmp;
/*
* If a job requested extra_constraints, then assume
* that the job might be runnable at some point in the
* future. FIXME: This is a kludge and this assumption
* may be wrong.
*/
tmp = job_ptr->extra_constraints ?
"currently not runnable" : "never runnable";
if (part_ptr->name) {
info("%s: %pJ %s in partition %s",
__func__, job_ptr, tmp, part_ptr->name);
} else {
info("%s: job %pJ %s",
__func__, job_ptr, tmp);
}
if (pick_code == ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE)
error_code = pick_code;
else
error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
} else if (!runable_avail && !nodes_busy) {
error_code = ESLURM_NODE_NOT_AVAIL;
} else if (job_ptr->details->req_node_bitmap &&
bit_overlap_any(job_ptr->details->req_node_bitmap,
rs_node_bitmap)) {
error_code = ESLURM_NODES_BUSY;
} else if (!preempt_flag && job_ptr->details->req_node_bitmap) {
/* specific nodes required */
if (shared) {
if (!bit_super_set(job_ptr->details->req_node_bitmap,
share_node_bitmap)) {
error_code = ESLURM_NODES_BUSY;
}
if (bit_overlap_any(job_ptr->details->req_node_bitmap,
cg_node_bitmap)) {
error_code = ESLURM_NODES_BUSY;
}
} else if (!bit_super_set(job_ptr->details->req_node_bitmap,
idle_node_bitmap)) {
error_code = ESLURM_NODES_BUSY;
/* Note: IDLE nodes are not COMPLETING */
}
} else if (job_ptr->details->req_node_bitmap &&
bit_overlap_any(job_ptr->details->req_node_bitmap,
cg_node_bitmap)) {
error_code = ESLURM_NODES_BUSY;
}
if (error_code == SLURM_SUCCESS) {
error_code = ESLURM_NODES_BUSY;
}
if (possible_bitmap && runable_ever) {
*select_bitmap = possible_bitmap;
} else {
FREE_NULL_BITMAP(possible_bitmap);
}
return error_code;
}
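/*
 * _preempt_jobs - preempt the jobs previously identified as candidates so
 *	that the preemptor can obtain their resources
 * IN preemptee_job_list - jobs selected for preemption
 * IN kill_pending - if true, signal/requeue the preemptees now; if false,
 *	only count them (preemption may already be in progress)
 * OUT error_code - set to ESLURM_NODES_BUSY if any preemptee still has to
 *	release resources
 * IN preemptor_ptr - job on whose behalf the preemption is performed
 */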
static void _preempt_jobs(list_t *preemptee_job_list, bool kill_pending,
int *error_code, job_record_t *preemptor_ptr)
{
list_itr_t *iter;
job_record_t *job_ptr;
uint16_t mode;
int job_cnt = 0;
static time_t sched_update = 0;
if (sched_update != slurm_conf.last_update) {
preempt_send_user_signal = false;
if (xstrcasestr(slurm_conf.preempt_params,
"send_user_signal") ||
xstrcasestr(slurm_conf.slurmctld_params,
"preempt_send_user_signal"))
preempt_send_user_signal = true;
sched_update = slurm_conf.last_update;
}
iter = list_iterator_create(preemptee_job_list);
while ((job_ptr = list_next(iter))) {
mode = slurm_job_preempt_mode(job_ptr);
if (mode == PREEMPT_MODE_OFF) {
error("%s: Invalid preempt_mode %u for %pJ",
__func__, mode, job_ptr);
continue;
}
if ((mode == PREEMPT_MODE_SUSPEND) &&
(slurm_conf.preempt_mode & PREEMPT_MODE_GANG)) {
debug("preempted %pJ suspended by gang scheduler to reclaim resources for %pJ",
job_ptr, preemptor_ptr);
job_ptr->preempt_time = time(NULL);
continue;
}
job_cnt++;
if (!kill_pending)
continue;
if (slurm_job_preempt(job_ptr, preemptor_ptr, mode, true) !=
SLURM_SUCCESS)
continue;
}
list_iterator_destroy(iter);
if (job_cnt > 0)
*error_code = ESLURM_NODES_BUSY;
}
/* Return true if this job record is
* 1) not a job array OR
* 2) the first task of a job array to begin execution */
static bool _first_array_task(job_record_t *job_ptr)
{
job_record_t *meta_job_ptr;
if (job_ptr->array_task_id == NO_VAL)
return true;
meta_job_ptr = find_job_record(job_ptr->array_job_id);
if (!meta_job_ptr || !meta_job_ptr->array_recs) {
error("%s: Could not find meta job record for %pJ",
__func__, job_ptr);
return true;
}
if ((meta_job_ptr->array_recs->tot_run_tasks == 1) && /* This task */
(meta_job_ptr->array_recs->tot_comp_tasks == 0))
return true;
return false;
}
/*
* This job has zero node count. It is only designed to create or destroy
* persistent burst buffer resources. Terminate it now.
*/
static void _end_null_job(job_record_t *job_ptr)
{
time_t now = time(NULL);
job_ptr->exit_code = 0;
gres_stepmgr_job_clear_alloc(job_ptr->gres_list_req);
gres_stepmgr_job_clear_alloc(job_ptr->gres_list_req_accum);
FREE_NULL_LIST(job_ptr->gres_list_alloc);
job_state_set(job_ptr, JOB_RUNNING);
job_ptr->bit_flags |= JOB_WAS_RUNNING;
FREE_NULL_BITMAP(job_ptr->node_bitmap);
xfree(job_ptr->nodes);
xfree(job_ptr->sched_nodes);
job_ptr->start_time = now;
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
job_ptr->time_last_active = now;
if (!job_ptr->step_list)
job_ptr->step_list = list_create(free_step_record);
(void) job_array_post_sched(job_ptr, true);
(void) bb_g_job_begin(job_ptr);
job_array_start(job_ptr);
rebuild_job_part_list(job_ptr);
if ((job_ptr->mail_type & MAIL_JOB_BEGIN) &&
((job_ptr->mail_type & MAIL_ARRAY_TASKS) ||
_first_array_task(job_ptr)))
mail_job_info(job_ptr, MAIL_JOB_BEGIN);
slurmctld_diag_stats.jobs_started++;
/* Call job_set_alloc_tres() before acct_policy_job_begin() */
job_set_alloc_tres(job_ptr, false);
acct_policy_job_begin(job_ptr, false);
/*
 * If run with slurmdbd, this is handled out of band in the job if
 * happening right away. If the job has already become eligible and
 * been registered in the db, then the start message needs to be sent
 * here.
 */
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
jobcomp_g_record_job_start(job_ptr);
prolog_slurmctld(job_ptr);
job_ptr->end_time = now;
job_state_set(job_ptr, JOB_COMPLETE);
job_completion_logger(job_ptr, false);
acct_policy_job_fini(job_ptr, false);
if (select_g_job_fini(job_ptr) != SLURM_SUCCESS)
error("select_g_job_fini(%pJ): %m", job_ptr);
epilog_slurmctld(job_ptr);
}
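/*
 * List iteration helper: copy a job GRES request into *ret_gres_list if it
 * is flagged as explicit (GRES_CONF_EXPLICIT) or is a shared GRES type,
 * creating the list on first use. All other GRES requests are skipped.
 */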
static void _handle_explicit_req(void *x, void *arg)
{
gres_state_t *gres_state_job = x;
list_t **ret_gres_list = arg;
/* Copy over the explicit gres, skip others */
if (!(gres_state_job->config_flags & GRES_CONF_EXPLICIT) &&
!gres_id_shared(gres_state_job->config_flags))
return;
if (!*ret_gres_list)
*ret_gres_list = list_create(gres_job_list_delete);
list_append(*ret_gres_list,
gres_create_state(
gres_state_job,
GRES_STATE_SRC_STATE_PTR,
GRES_STATE_TYPE_JOB,
gres_job_state_dup(gres_state_job->gres_data)));
}
static void _gres_select_explicit(
list_t *req_gres_list, list_t **ret_gres_list)
{
if (!req_gres_list)
return;
(void) list_for_each(req_gres_list,
(ListForF) _handle_explicit_req,
ret_gres_list);
}
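/*
 * _handle_exclusive_gres - for a whole-node (exclusive) allocation, build a
 *	GRES request list from the job's explicit/shared GRES requests plus
 *	the whole-node GRES selection on every node in select_bitmap
 * RET new GRES list (caller must free) or NULL if not applicable
 *	(test_only, no GRES configured, or the job is not whole-node)
 */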
static list_t *_handle_exclusive_gres(job_record_t *job_ptr,
bitstr_t *select_bitmap, bool test_only)
{
list_t *post_list = NULL;
node_record_t *node_ptr;
if (test_only || !gres_get_gres_cnt())
return NULL;
xassert(job_ptr);
xassert(select_bitmap);
if (!job_ptr->details ||
!(job_ptr->details->whole_node & WHOLE_NODE_REQUIRED))
return NULL;
if (job_ptr->gres_list_req)
_gres_select_explicit(job_ptr->gres_list_req, &post_list);
for (int i = 0; (node_ptr = next_node_bitmap(select_bitmap, &i)); i++) {
gres_stepmgr_job_select_whole_node(
&post_list,
node_ptr->gres_list,
job_ptr->job_id,
node_ptr->name);
}
return post_list;
}
typedef struct {
uint64_t gpu_cnt;
int node_inx;
} foreach_node_gpu_args_t;
static int _get_node_gpu_sum(void *x, void *arg)
{
foreach_node_gpu_args_t *args = arg;
gres_state_t *gres_job_state = x;
gres_job_state_t *gres_js;
if (gres_job_state->plugin_id != gres_get_gpu_plugin_id())
return SLURM_SUCCESS;
gres_js = gres_job_state->gres_data;
args->gpu_cnt += gres_js->gres_cnt_node_select[args->node_inx];
return SLURM_SUCCESS;
}
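/*
 * _get_max_node_gpu_cnt - return the largest number of GPUs selected on any
 *	single node in node_bitmap, summing all GPU types per node from the
 *	job's GRES request list
 */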
static uint64_t _get_max_node_gpu_cnt(bitstr_t *node_bitmap, list_t* gres_list)
{
foreach_node_gpu_args_t args;
uint64_t max_node_gpu_cnt = 0;
xassert(node_bitmap);
xassert(gres_list);
for (int i = 0; (i = bit_ffs_from_bit(node_bitmap, i)) >= 0; i++) {
args.gpu_cnt = 0;
args.node_inx = i;
/* Get the sum of all gpu types on the node */
list_for_each(gres_list, _get_node_gpu_sum, &args);
max_node_gpu_cnt = MAX(max_node_gpu_cnt, args.gpu_cnt);
}
return max_node_gpu_cnt;
}
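/*
 * _get_resv_mpi_ports - reserve MPI ports for a job with STEPMGR_ENABLED
 *	when MpiParams contains "ports=". If the user did not request a
 *	port count, compute one from the job's task/CPU layout, then call
 *	resv_port_job_alloc(). On failure the job start is deferred and its
 *	state reason is updated.
 * RET SLURM_SUCCESS or an ESLURM error code
 */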
static int _get_resv_mpi_ports(job_record_t *job_ptr,
uint16_t *orig_resv_port_cnt,
uint32_t node_cnt,
time_t now)
{
int error_code = SLURM_SUCCESS;
bool resv_ports_present = false;
if (!(job_ptr->bit_flags & STEPMGR_ENABLED))
return SLURM_SUCCESS;
if (slurm_conf.mpi_params && xstrstr(slurm_conf.mpi_params, "ports="))
resv_ports_present = true;
if (resv_ports_present &&
(job_ptr->resv_port_cnt == NO_VAL16)) {
if (!job_ptr->job_resrcs) {
error("Select plugin failed to set job resources");
/*
* Do not attempt to allocate the select_bitmap nodes
* since select plugin failed to set job resources
*/
error_code = ESLURM_NODES_BUSY;
job_ptr->start_time = 0;
job_ptr->time_last_active = 0;
job_ptr->end_time = 0;
job_ptr->state_reason = WAIT_RESOURCES;
last_job_update = now;
xfree(job_ptr->state_desc);
return error_code;
}
*orig_resv_port_cnt = job_ptr->resv_port_cnt;
job_ptr->resv_port_cnt = 0;
/*
 * The reserved port count is set to the maximum task count on
 * any node plus one, or, if the job is exclusive, give it all
 * of the reserved ports.
 */
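/*
 * Illustrative example (hypothetical values): a non-exclusive,
 * non-overcommit job with tasks_per_node of {8, 6} takes the branch
 * below that scans tasks_per_node, giving MAX(8 * 2, 6 * 2) = 16;
 * the final increment then yields resv_port_cnt = 17.
 */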
if ((job_ptr->job_resrcs->node_req == NODE_CR_RESERVED) ||
(job_ptr->details->whole_node & WHOLE_NODE_REQUIRED)) {
job_ptr->resv_port_cnt =
resv_port_get_resv_port_cnt() - 1;
} else if (!job_ptr->details->overcommit &&
(job_ptr->details->num_tasks ||
job_ptr->details->ntasks_per_node ||
job_ptr->details->ntasks_per_tres)) {
for (int i = 0; i < node_cnt; i++) {
uint16_t tasks =
job_ptr->job_resrcs->tasks_per_node[i];
job_ptr->resv_port_cnt =
MAX(job_ptr->resv_port_cnt, tasks * 2);
}
} else if (!job_ptr->details->overcommit) {
uint16_t max_node_cpus = 0;
for (int i = 0; i < node_cnt; i++) {
max_node_cpus =
MAX(max_node_cpus,
job_ptr->job_resrcs->cpus[i] * 2);
}
job_ptr->resv_port_cnt = max_node_cpus;
} else if (job_ptr->details->ntasks_per_node) {
job_ptr->resv_port_cnt =
job_ptr->details->ntasks_per_node;
} else if (job_ptr->details->ntasks_per_tres &&
job_ptr->gres_list_req ) {
uint64_t max_gpu_per_node =
_get_max_node_gpu_cnt(
job_ptr->node_bitmap,
job_ptr->gres_list_req);
if (max_gpu_per_node > slurm_conf.max_tasks_per_node)
max_gpu_per_node =
slurm_conf.max_tasks_per_node;
job_ptr->resv_port_cnt =
(uint16_t) max_gpu_per_node *
job_ptr->details->ntasks_per_tres;
} else if (job_ptr->details->num_tasks) {
job_ptr->resv_port_cnt = ROUNDUP(
job_ptr->details->num_tasks, node_cnt);
} else {
job_ptr->resv_port_cnt = ROUNDUP(
job_ptr->job_resrcs->ncpus, node_cnt);
}
job_ptr->resv_port_cnt++;
}
if ((job_ptr->resv_port_cnt != NO_VAL16) &&
(job_ptr->resv_port_cnt != 0)) {
error_code = resv_port_job_alloc(job_ptr);
if (error_code) {
job_ptr->start_time = 0;
job_ptr->time_last_active = 0;
job_ptr->end_time = 0;
job_ptr->state_reason = WAIT_MPI_PORTS_BUSY;
last_job_update = now;
xfree(job_ptr->state_desc);
}
}
return error_code;
}
/*
* select_nodes - select and allocate nodes to a specific job
* IN job_node_select - pointer with at least a pointer to the job record
* IN test_only - if set do not allocate nodes, just confirm they
* could be allocated now
* IN select_node_bitmap - bitmap of nodes to be used for the
* job's resource allocation (not returned if NULL), caller
* must free
* IN submission - if set ignore reservations
* IN scheduler_type - which scheduler is calling this
* (i.e. SLURMDB_JOB_FLAG_BACKFILL, SLURMDB_JOB_FLAG_SCHED, etc)
* RET 0 on success, ESLURM code from slurm_errno.h otherwise
* globals: list_part - global list of partition info
* default_part_loc - pointer to default partition
* config_list - global list of node configuration info
* Notes: The algorithm is
* 1) Build a table (node_set_ptr) of nodes with the requisite
* configuration. Each table entry includes their weight,
* node_list, features, etc.
* 2) Call _pick_best_nodes() to select those nodes best satisfying
* the request, (e.g. best-fit or other criterion)
* 3) Call allocate_nodes() to perform the actual allocation
*/
extern int select_nodes(job_node_select_t *job_node_select,
bool test_only, bool submission,
uint32_t scheduler_type)
{
int bb, error_code = SLURM_SUCCESS, i, node_set_size = 0;
bitstr_t *select_bitmap = NULL;
struct node_set *node_set_ptr = NULL;
part_record_t *part_ptr = NULL;
uint8_t orig_whole_node, orig_share_res;
uint16_t orig_resv_port_cnt = 0;
uint32_t min_nodes = 0, max_nodes = 0, req_nodes = 0;
time_t now = time(NULL);
bool configuring = false;
list_t *preemptee_job_list = NULL;
uint32_t selected_node_cnt = NO_VAL;
uint64_t tres_req_cnt[slurmctld_tres_cnt];
bool can_reboot;
uint32_t qos_flags = 0;
assoc_mgr_lock_t qos_read_lock =
{ .assoc = READ_LOCK, .qos = READ_LOCK };
assoc_mgr_lock_t job_read_locks =
{ .assoc = READ_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
list_t *gres_list_pre = NULL;
bool gres_list_pre_set = false;
job_record_t *tmp_job, *job_ptr = job_node_select->job_ptr;
xassert(job_ptr);
xassert(job_ptr->magic == JOB_MAGIC);
/*
* The call path from _get_req_features() (called later in this
* function) can eventually call _resolve_shared_status(). This latter
* function can alter the job_ptr->details->{whole_node,share_res}.
*
* Saving the original values here and restoring them at cleanup time
* at the bottom of this function if needed.
*/
orig_whole_node = job_ptr->details->whole_node;
orig_share_res = job_ptr->details->share_res;
if (!acct_policy_job_runnable_pre_select(job_ptr, false))
return ESLURM_ACCOUNTING_POLICY;
part_ptr = job_ptr->part_ptr;
/* identify partition */
if (part_ptr == NULL) {
part_ptr = find_part_record(job_ptr->partition);
xassert(part_ptr);
job_ptr->part_ptr = part_ptr;
error("partition pointer reset for %pJ, part %s",
job_ptr, job_ptr->partition);
}
/* Quick check to see if this QOS is allowed on this partition. */
assoc_mgr_lock(&qos_read_lock);
if (job_ptr->qos_ptr)
qos_flags = job_ptr->qos_ptr->flags;
if ((error_code = part_policy_valid_qos(job_ptr->part_ptr,
job_ptr->qos_ptr,
job_ptr->user_id, job_ptr)) !=
SLURM_SUCCESS) {
assoc_mgr_unlock(&qos_read_lock);
return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
}
/* Quick check to see if this account is allowed on this partition. */
if ((error_code = part_policy_valid_acct(
job_ptr->part_ptr,
job_ptr->assoc_ptr ? job_ptr->assoc_ptr->acct : NULL,
job_ptr))
!= SLURM_SUCCESS) {
assoc_mgr_unlock(&qos_read_lock);
return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
}
assoc_mgr_unlock(&qos_read_lock);
/* Quick check to see if this group is allowed on this partition. */
if (!validate_group(job_ptr->part_ptr, job_ptr->user_id)) {
xfree(job_ptr->state_desc);
xstrfmtcat(job_ptr->state_desc,
"uid %u not in group permitted to use this partition (%s). groups allowed: %s",
job_ptr->user_id, job_ptr->part_ptr->name,
part_ptr->allow_groups);
debug2("%s: %s", __func__, job_ptr->state_desc);
job_ptr->state_reason = WAIT_ACCOUNT;
last_job_update = now;
return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
}
if (job_ptr->priority == 0) { /* user/admin hold */
if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS
&& (job_ptr->state_reason != FAIL_BURST_BUFFER_OP)
&& (job_ptr->state_reason != WAIT_HELD)
&& (job_ptr->state_reason != WAIT_HELD_USER)
&& (job_ptr->state_reason != WAIT_MAX_REQUEUE)) {
job_ptr->state_reason = WAIT_HELD;
}
return ESLURM_JOB_HELD;
}
bb = bb_g_job_test_stage_in(job_ptr, test_only);
if (bb != 1) {
if ((bb == -1) &&
(job_ptr->state_reason == FAIL_BURST_BUFFER_OP))
return ESLURM_BURST_BUFFER_WAIT; /* Fatal BB event */
xfree(job_ptr->state_desc);
last_job_update = now;
if (bb == 0)
job_ptr->state_reason = WAIT_BURST_BUFFER_STAGING;
else
job_ptr->state_reason = WAIT_BURST_BUFFER_RESOURCE;
return ESLURM_BURST_BUFFER_WAIT;
}
if ((job_ptr->details->min_nodes == 0) &&
(job_ptr->details->max_nodes == 0)) {
if (!job_ptr->burst_buffer)
return ESLURM_INVALID_NODE_COUNT;
if (!test_only)
_end_null_job(job_ptr);
return SLURM_SUCCESS;
}
/* build sets of usable nodes based upon their configuration */
can_reboot = node_features_g_user_update(job_ptr->user_id);
error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size,
job_node_select->err_msg,
test_only, can_reboot);
if (error_code)
return error_code;
if (node_set_ptr == NULL) /* Should never be true */
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
for (i = 0; i < node_set_size; i++)
_set_sched_weight(node_set_ptr + i);
qsort(node_set_ptr, node_set_size, sizeof(struct node_set),
_sort_node_set);
_log_node_set(job_ptr, node_set_ptr, node_set_size);
/* ensure that selected nodes are in these node sets */
if (job_ptr->details->req_node_bitmap) {
error_code = _nodes_in_sets(job_ptr->details->req_node_bitmap,
node_set_ptr, node_set_size);
if (error_code) {
info("No nodes satisfy requirements for %pJ in partition %s",
job_ptr, job_ptr->part_ptr->name);
goto cleanup;
}
}
/* enforce both user's and partition's node limits if the qos
* isn't set to override them */
/* info("req: %u-%u, %u", job_ptr->details->min_nodes, */
/* job_ptr->details->max_nodes, part_ptr->max_nodes); */
error_code = get_node_cnts(job_ptr, qos_flags, part_ptr,
&min_nodes, &req_nodes, &max_nodes);
if ((error_code == ESLURM_ACCOUNTING_POLICY) ||
(error_code == ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE))
goto cleanup;
else if ((error_code != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) &&
(error_code != ESLURM_RESERVATION_MAINT)) {
/* Select resources for the job here */
job_array_pre_sched(job_ptr);
if (job_ptr->job_resrcs)
debug2("%s: calling _get_req_features() for %pJ with not NULL job resources",
__func__, job_ptr);
error_code = _get_req_features(node_set_ptr, node_set_size,
&select_bitmap, job_ptr,
part_ptr, min_nodes, max_nodes,
req_nodes, test_only,
&preemptee_job_list, can_reboot,
submission);
}
/* Set this guess here to give the user tools an idea
 * of how many nodes Slurm is planning on giving the job.
 * This needs to be done whether the selection succeeded or not;
 * it reflects the nodes the job could run on.
 */
if (select_bitmap) {
list_t *gres_list_whole_node = _handle_exclusive_gres(
job_ptr, select_bitmap, test_only);
selected_node_cnt = bit_set_count(select_bitmap);
job_ptr->node_cnt_wag = selected_node_cnt;
if (gres_list_whole_node) {
gres_list_pre_set = true;
gres_list_pre = job_ptr->gres_list_req;
job_ptr->gres_list_req = gres_list_whole_node;
}
} else
selected_node_cnt = req_nodes;
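/*
 * If MaxPoweredUpNodes is configured, verify that powering up the
 * powered-down nodes in this selection would not push the number of
 * powered-up nodes past the limit.
 */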
if (!test_only && select_bitmap && (max_powered_nodes != NO_VAL)) {
bitstr_t *tmp = bit_copy(select_bitmap);
hostlist_t *select = NULL, *need = NULL;
char *select_str = NULL, *need_str = NULL;
int32_t count, powerup_count, before_count = 0;
/* selected and powered down */
bit_and(tmp, power_down_node_bitmap);
powerup_count = bit_set_count(tmp);
if (slurm_conf.debug_flags & DEBUG_FLAG_POWER) {
select = bitmap2hostlist(select_bitmap);
select_str = slurm_hostlist_ranged_string_xmalloc(
select);
need = bitmap2hostlist(tmp);
need_str = slurm_hostlist_ranged_string_xmalloc(need);
before_count = bit_set_count(power_up_node_bitmap);
}
bit_or(tmp, power_up_node_bitmap);
count = bit_set_count(tmp);
log_flag(POWER, "Need to power up %d nodes (%s) from (%s). powered up count before: %d after: %d",
powerup_count, need_str, select_str, before_count,
count);
if ((powerup_count > 0) && (count > max_powered_nodes)) {
error_code = ESLURM_MAX_POWERED_NODES;
log_flag(POWER, "%s: Cannot power up more nodes for %pJ due to MaxPoweredUpNodes limit",
__func__, job_ptr);
}
FREE_NULL_BITMAP(tmp);
FREE_NULL_HOSTLIST(need);
FREE_NULL_HOSTLIST(select);
xfree(select_str);
xfree(need_str);
}
memcpy(tres_req_cnt, job_ptr->tres_req_cnt, sizeof(tres_req_cnt));
tres_req_cnt[TRES_ARRAY_CPU] =
(uint64_t)(job_ptr->total_cpus ?
job_ptr->total_cpus : job_ptr->details->min_cpus);
tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(
job_ptr->job_resrcs,
job_ptr->details->pn_min_memory,
tres_req_cnt[TRES_ARRAY_CPU],
selected_node_cnt, job_ptr->part_ptr,
job_ptr->gres_list_req,
job_ptr->bit_flags & JOB_MEM_SET,
job_get_sockets_per_node(job_ptr),
job_ptr->details->num_tasks);
tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)selected_node_cnt;
assoc_mgr_lock(&job_read_locks);
gres_stepmgr_set_job_tres_cnt(
job_ptr->gres_list_req,
selected_node_cnt,
tres_req_cnt,
true);
tres_req_cnt[TRES_ARRAY_BILLING] =
assoc_mgr_tres_weighted(tres_req_cnt,
job_ptr->part_ptr->billing_weights,
slurm_conf.priority_flags, true);
if (!test_only && (selected_node_cnt != NO_VAL) &&
!acct_policy_job_runnable_post_select(job_ptr, tres_req_cnt, true)) {
assoc_mgr_unlock(&job_read_locks);
/* If there was a reason we couldn't schedule beforehand, we
 * want to check if an accounting limit was also breached. If
 * it was, we want to override the other reason so that when
 * backfilling we don't reserve resources unless we have to.
 */
free_job_resources(&job_ptr->job_resrcs);
if (error_code != SLURM_SUCCESS)
debug2("Replacing scheduling error code for %pJ from '%s' to 'Accounting policy'",
job_ptr, slurm_strerror(error_code));
error_code = ESLURM_ACCOUNTING_POLICY;
goto cleanup;
}
assoc_mgr_unlock(&job_read_locks);
/* set up the cpu_cnt here so we can decrement it as nodes
* free up. total_cpus is set within _get_req_features */
job_ptr->cpu_cnt = job_ptr->total_cpus;
if (!test_only && preemptee_job_list
&& (error_code == SLURM_SUCCESS)) {
job_details_t *detail_ptr = job_ptr->details;
time_t now = time(NULL);
bool kill_pending = true;
if ((detail_ptr->preempt_start_time != 0) &&
(detail_ptr->preempt_start_time >
(now - slurm_conf.kill_wait - slurm_conf.msg_timeout))) {
/* Job preemption may still be in progress,
* do not cancel or requeue any more jobs yet */
kill_pending = false;
}
_preempt_jobs(preemptee_job_list, kill_pending, &error_code,
job_ptr);
if ((error_code == ESLURM_NODES_BUSY) && kill_pending) {
detail_ptr->preempt_start_time = now;
job_ptr->preempt_in_progress = true;
if (job_ptr->array_recs)
job_ptr->array_recs->pend_run_tasks++;
}
}
if (error_code) {
/* Fatal errors for job here */
if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
/* Too many nodes requested */
debug3("%s: %pJ not runnable with present config",
__func__, job_ptr);
job_ptr->state_reason = WAIT_PART_NODE_LIMIT;
xfree(job_ptr->state_desc);
last_job_update = now;
/* Non-fatal errors for job below */
} else if (error_code == ESLURM_NODE_NOT_AVAIL) {
/* Required nodes are down or drained */
char *node_str = NULL, *unavail_node = NULL;
bitstr_t *unavail_bitmap;
debug3("%s: %pJ required nodes not avail",
__func__, job_ptr);
job_ptr->state_reason = WAIT_NODE_NOT_AVAIL;
xfree(job_ptr->state_desc);
unavail_bitmap = bit_copy(avail_node_bitmap);
filter_by_node_owner(job_ptr, unavail_bitmap);
bit_not(unavail_bitmap);
bit_and_not(unavail_bitmap, future_node_bitmap);
bit_and(unavail_bitmap, part_ptr->node_bitmap);
bit_and_not(unavail_bitmap, up_node_bitmap);
if (job_ptr->details->req_node_bitmap) {
bit_and(unavail_bitmap,
job_ptr->details->req_node_bitmap);
}
if (bit_ffs(unavail_bitmap) != -1) {
unavail_node = bitmap2node_name(unavail_bitmap);
node_str = unavail_node;
}
FREE_NULL_BITMAP(unavail_bitmap);
if (node_str) {
xstrfmtcat(job_ptr->state_desc,
"ReqNodeNotAvail, "
"UnavailableNodes:%s",
node_str);
} else {
xstrfmtcat(job_ptr->state_desc,
"ReqNodeNotAvail, May be reserved "
"for other job");
}
xfree(unavail_node);
last_job_update = now;
} else if (error_code == ESLURM_RESERVATION_MAINT) {
error_code = ESLURM_RESERVATION_BUSY; /* All reserved */
job_ptr->state_reason = WAIT_NODE_NOT_AVAIL;
xfree(job_ptr->state_desc);
xstrfmtcat(job_ptr->state_desc,
"ReqNodeNotAvail, Reserved for maintenance");
} else if ((error_code == ESLURM_RESERVATION_NOT_USABLE) ||
(error_code == ESLURM_RESERVATION_BUSY)) {
job_ptr->state_reason = WAIT_RESERVATION;
xfree(job_ptr->state_desc);
} else if (error_code == ESLURM_LICENSES_UNAVAILABLE) {
job_ptr->state_reason = WAIT_LICENSES;
xfree(job_ptr->state_desc);
} else if ((job_ptr->state_reason == WAIT_HELD) &&
(job_ptr->priority == 0)) {
/* Held by select plugin due to some failure */
} else if ((error_code ==
ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) &&
job_ptr->extra_constraints) {
/*
* If a job requested extra_constraints, then assume
* that the job might be runnable at some point in the
* future. FIXME: This is a kludge and this assumption
* may be wrong.
*/
job_ptr->state_reason = FAIL_CONSTRAINTS;
xfree(job_ptr->state_desc);
} else if (error_code == ESLURM_MAX_POWERED_NODES) {
job_ptr->state_reason = WAIT_MAX_POWERED_NODES;
xfree(job_ptr->state_desc);
} else {
job_ptr->state_reason = WAIT_RESOURCES;
xfree(job_ptr->state_desc);
}
goto cleanup;
}
if (test_only) { /* set if job not highest priority */
error_code = SLURM_SUCCESS;
goto cleanup;
}
/*
* This job may be getting requeued, clear vestigial state information
* before over-writing and leaking memory or referencing old GRES or
* step data.
*/
job_ptr->bit_flags &= ~JOB_KILL_HURRY;
job_state_unset_flag(job_ptr, JOB_POWER_UP_NODE);
FREE_NULL_BITMAP(job_ptr->node_bitmap);
xfree(job_ptr->nodes);
xfree(job_ptr->sched_nodes);
job_ptr->exit_code = 0;
gres_stepmgr_job_clear_alloc(job_ptr->gres_list_req);
gres_stepmgr_job_clear_alloc(job_ptr->gres_list_req_accum);
FREE_NULL_LIST(job_ptr->gres_list_alloc);
if (!job_ptr->step_list)
job_ptr->step_list = list_create(free_step_record);
job_ptr->node_bitmap = select_bitmap;
select_bitmap = NULL; /* nothing left to free */
if ((error_code = _get_resv_mpi_ports(job_ptr, &orig_resv_port_cnt,
selected_node_cnt, now)))
goto cleanup;
/*
 * We need these times set so that we know what the job's end time
 * will be when we place it.
 */
job_ptr->start_time = job_ptr->time_last_active = now;
if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT) &&
((job_ptr->time_limit == NO_VAL) ||
((job_ptr->time_limit > part_ptr->max_time) &&
!(qos_flags & QOS_FLAG_PART_TIME_LIMIT)))) {
if (part_ptr->default_time != NO_VAL)
job_ptr->time_limit = part_ptr->default_time;
else
job_ptr->time_limit = part_ptr->max_time;
job_ptr->limit_set.time = 1;
}
job_end_time_reset(job_ptr);
/*
* job_array_post_sched() must happen before allocate_nodes() because
* we need the pending job array state to be copied. For example,
* allocate_nodes() calls license_job_get() which can modify the job's
* license_list if the job requested OR'd licenses.
*/
tmp_job = job_array_post_sched(job_ptr, true);
if (tmp_job && (tmp_job != job_ptr) && (orig_resv_port_cnt == NO_VAL16))
tmp_job->resv_port_cnt = orig_resv_port_cnt;
if (bb_g_job_begin(job_ptr) != SLURM_SUCCESS) {
/* Leave job queued, something is hosed */
error_code = ESLURM_INVALID_BURST_BUFFER_REQUEST;
error("bb_g_job_begin(%pJ): %s",
job_ptr, slurm_strerror(error_code));
job_ptr->start_time = 0;
job_ptr->time_last_active = 0;
job_ptr->end_time = 0;
job_ptr->priority = 0;
job_ptr->state_reason = WAIT_HELD;
last_job_update = now;
goto cleanup;
}
if (select_g_job_begin(job_ptr) != SLURM_SUCCESS) {
/* Leave job queued, something is hosed */
error("select_g_job_begin(%pJ): %m", job_ptr);
/* Cancel previously started job */
(void) bb_g_job_revoke_alloc(job_ptr);
error_code = ESLURM_NODES_BUSY;
job_ptr->start_time = 0;
job_ptr->time_last_active = 0;
job_ptr->end_time = 0;
job_ptr->state_reason = WAIT_RESOURCES;
last_job_update = now;
goto cleanup;
}
/* assign the nodes and stage_in the job */
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
if (job_ptr->job_resrcs && job_ptr->job_resrcs->nodes) {
job_ptr->nodes = xstrdup(job_ptr->job_resrcs->nodes);
} else {
error("Select plugin failed to set job resources, nodes");
/* Do not attempt to allocate the select_bitmap nodes since
* select plugin failed to set job resources */
/* Cancel previously started job */
(void) bb_g_job_revoke_alloc(job_ptr);
error_code = ESLURM_NODES_BUSY;
job_ptr->start_time = 0;
job_ptr->time_last_active = 0;
job_ptr->end_time = 0;
job_ptr->state_reason = WAIT_RESOURCES;
last_job_update = now;
goto cleanup;
}
job_ptr->db_flags &= ~SLURMDB_JOB_CLEAR_SCHED;
job_ptr->db_flags |= scheduler_type;
/* This could be set in the select plugin so we want to keep the flag */
configuring = IS_JOB_CONFIGURING(job_ptr);
job_state_set(job_ptr, JOB_RUNNING);
job_ptr->bit_flags |= JOB_WAS_RUNNING;
if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) {
error("select_g_select_nodeinfo_set(%pJ): %m", job_ptr);
if (!job_ptr->job_resrcs) {
/* If we don't exit earlier the empty job_resrcs might
* be dereferenced later */
/* Cancel previously started job */
(void) bb_g_job_revoke_alloc(job_ptr);
error_code = ESLURM_NODES_BUSY;
job_ptr->start_time = 0;
job_ptr->time_last_active = 0;
job_ptr->end_time = 0;
job_ptr->state_reason = WAIT_RESOURCES;
job_state_set(job_ptr, JOB_PENDING);
last_job_update = now;
goto cleanup;
}
}
allocate_nodes(job_ptr);
job_array_start(job_ptr);
build_node_details(job_ptr, true);
rebuild_job_part_list(job_ptr);
if ((job_ptr->mail_type & MAIL_JOB_BEGIN) &&
((job_ptr->mail_type & MAIL_ARRAY_TASKS) ||
_first_array_task(job_ptr)))
mail_job_info(job_ptr, MAIL_JOB_BEGIN);
slurmctld_diag_stats.jobs_started++;
/* job_set_alloc_tres has to be done before acct_policy_job_begin */
job_set_alloc_tres(job_ptr, false);
acct_policy_job_begin(job_ptr, false);
resv_replace_update(job_ptr);
/*
 * If run with slurmdbd this is handled out of band in the
 * job if happening right away. If the job has already
 * become eligible and been registered in the db, then the start
 * message needs to be sent here.
 */
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
jobcomp_g_record_job_start(job_ptr);
switch_g_job_start(job_ptr);
prolog_slurmctld(job_ptr);
reboot_job_nodes(job_ptr);
gs_job_start(job_ptr);
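/*
 * If any allocated node is currently powered down, flag the job so that
 * the power management logic resumes those nodes before launch.
 */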
if (bit_overlap_any(job_ptr->node_bitmap, power_down_node_bitmap)) {
job_state_set_flag(job_ptr, JOB_POWER_UP_NODE);
if (resume_job_list) {
uint32_t *tmp = xmalloc(sizeof(uint32_t));
*tmp = job_ptr->job_id;
list_append(resume_job_list, tmp);
}
}
if (configuring || IS_JOB_POWER_UP_NODE(job_ptr) ||
!bit_super_set(job_ptr->node_bitmap, avail_node_bitmap)) {
/* This handles nodes explicitly requesting node reboot */
job_state_set_flag(job_ptr, JOB_CONFIGURING);
}
/*
* Request asynchronous launch of a prolog for a
 * non-batch job as long as the job is not configuring (e.g. for
 * a node reboot) first. Job state could be changed above so we need to
* recheck its state to see if it's currently configuring.
* PROLOG_FLAG_CONTAIN also turns on PROLOG_FLAG_ALLOC.
*/
if (!IS_JOB_CONFIGURING(job_ptr)) {
if (slurm_conf.prolog_flags & PROLOG_FLAG_ALLOC)
launch_prolog(job_ptr);
}
cleanup:
if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap &&
!IS_JOB_STARTED(job_ptr) &&
(bit_ffs(job_ptr->array_recs->task_id_bitmap) != -1)) {
job_ptr->array_task_id = NO_VAL;
}
FREE_NULL_LIST(preemptee_job_list);
FREE_NULL_BITMAP(select_bitmap);
if (node_set_ptr) {
for (i = 0; i < node_set_size; i++) {
xfree(node_set_ptr[i].features);
FREE_NULL_BITMAP(node_set_ptr[i].my_bitmap);
FREE_NULL_BITMAP(node_set_ptr[i].feature_bits);
}
xfree(node_set_ptr);
}
if (error_code != SLURM_SUCCESS) {
if (gres_list_pre_set &&
(job_ptr->gres_list_req != gres_list_pre)) {
FREE_NULL_LIST(job_ptr->gres_list_req);
job_ptr->gres_list_req = gres_list_pre;
}
if (orig_resv_port_cnt == NO_VAL16)
job_ptr->resv_port_cnt = orig_resv_port_cnt;
if (job_ptr->resv_ports) {
resv_port_job_free(job_ptr);
xfree(job_ptr->resv_ports);
}
FREE_NULL_BITMAP(job_ptr->node_bitmap);
} else
FREE_NULL_LIST(gres_list_pre);
/*
* Unless the job is allocated resources now, we need to restore the
* original whole_node/share_res values since _resolve_shared_status()
* might have altered them during evaluation, and we don't want to
* propagate the changes for potential subsequent evaluations for the
* same job in a different partition with different configuration.
*
* NOTE: If we ever add an early return between the call to
* _get_req_features() and the last return below we should ensure to
* amend the restore logic consequently (probably copy this snippet
* before such early return).
*
* NOTE: We could have moved this snippet right after the call to
* _get_req_features(), but we need it here since after the call the
* error_code might change.
*
* NOTE: select_nodes() is the first common caller ancestor of the
* different call tree ramifications ending in _resolve_shared_status(),
* thus considered the appropriate spot for the save/restore logic.
*/
if (test_only || (error_code != SLURM_SUCCESS)) {
job_ptr->details->whole_node = orig_whole_node;
job_ptr->details->share_res = orig_share_res;
}
return error_code;
}
/*
* get_node_cnts - determine the number of nodes for the requested job.
* IN job_ptr - pointer to the job record.
* IN qos_flags - Flags of the job_ptr's qos. This is so we don't have to send
* in a pointer or lock the qos read lock before calling.
* IN part_ptr - pointer to the job's partition.
* OUT min_nodes - The minimum number of nodes for the job.
 * OUT req_nodes - The number of nodes the select plugin should target.
* OUT max_nodes - The max number of nodes for the job.
* RET SLURM_SUCCESS on success, ESLURM code from slurm_errno.h otherwise.
*/
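/*
 * Illustrative example (hypothetical limits): a job requesting 2-10 nodes
 * in a partition with MinNodes=1 and MaxNodes=8, under an association
 * limit of 6 nodes, yields *min_nodes=2 and *max_nodes=6; absent the
 * USE_MIN_NODES flag or a node TRES limit on the job, *req_nodes=6.
 */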
extern int get_node_cnts(job_record_t *job_ptr, uint32_t qos_flags,
part_record_t *part_ptr, uint32_t *min_nodes,
uint32_t *req_nodes, uint32_t *max_nodes)
{
int error_code = SLURM_SUCCESS, i;
uint32_t acct_max_nodes;
uint32_t wait_reason = 0;
xassert(job_ptr);
xassert(part_ptr);
/* On BlueGene systems don't adjust the min/max node limits
* here. We are working on midplane values. */
if (qos_flags & QOS_FLAG_PART_MIN_NODE)
*min_nodes = job_ptr->details->min_nodes;
else
*min_nodes = MAX(job_ptr->details->min_nodes,
part_ptr->min_nodes);
if (!job_ptr->details->max_nodes)
*max_nodes = part_ptr->max_nodes;
else if (qos_flags & QOS_FLAG_PART_MAX_NODE)
*max_nodes = job_ptr->details->max_nodes;
else
*max_nodes = MIN(job_ptr->details->max_nodes,
part_ptr->max_nodes);
if (job_ptr->details->req_node_bitmap && job_ptr->details->max_nodes) {
i = bit_set_count(job_ptr->details->req_node_bitmap);
if (i > job_ptr->details->max_nodes) {
info("%pJ required node list has more nodes than the job can use (%d > %u)",
job_ptr, i, job_ptr->details->max_nodes);
error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
goto end_it;
}
}
/* Don't call functions inside the MIN/MAX macros; the argument would
 * be evaluated, and the function called, multiple times. */
acct_max_nodes = acct_policy_get_max_nodes(job_ptr, &wait_reason);
*max_nodes = MIN(*max_nodes, acct_max_nodes);
*max_nodes = MIN(*max_nodes, 500000); /* prevent overflows */
if (!job_ptr->limit_set.tres[TRES_ARRAY_NODE] &&
job_ptr->details->max_nodes &&
!(job_ptr->bit_flags & USE_MIN_NODES))
*req_nodes = *max_nodes;
else
*req_nodes = *min_nodes;
if (acct_max_nodes < *min_nodes) {
error_code = ESLURM_ACCOUNTING_POLICY;
xfree(job_ptr->state_desc);
job_ptr->state_reason = wait_reason;
goto end_it;
} else if (*max_nodes < *min_nodes) {
error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
goto end_it;
}
end_it:
return error_code;
}
/*
 * Launch the prolog via RPC to slurmd. This is useful when we need to run
 * the prolog at allocation stage. We ask slurmd to launch the prolog
 * asynchronously and wait for the REQUEST_COMPLETE_PROLOG message from
 * slurmd.
 */
extern void launch_prolog(job_record_t *job_ptr)
{
prolog_launch_msg_t *prolog_msg_ptr;
uint16_t protocol_version = job_ptr->start_protocol_ver;
uint16_t msg_flags = 0;
agent_arg_t *agent_arg_ptr;
job_resources_t *job_resrcs_ptr;
slurm_cred_arg_t cred_arg;
node_record_t *node_ptr;
xassert(job_ptr);
if (job_ptr->bit_flags & EXTERNAL_JOB)
return;
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
if (protocol_version > node_ptr->protocol_version)
protocol_version = node_ptr->protocol_version;
if (PACK_FANOUT_ADDRS(node_ptr))
msg_flags |= SLURM_PACK_ADDRS;
}
prolog_msg_ptr = xmalloc(sizeof(prolog_launch_msg_t));
/* Locks: Write job */
if ((slurm_conf.prolog_flags & PROLOG_FLAG_ALLOC) &&
!(slurm_conf.prolog_flags & PROLOG_FLAG_NOHOLD)) {
job_ptr->state_reason = WAIT_PROLOG;
FREE_NULL_BITMAP(job_ptr->node_bitmap_pr);
job_ptr->node_bitmap_pr = bit_copy(job_ptr->node_bitmap);
}
prolog_msg_ptr->alloc_tls_cert = xstrdup(job_ptr->alloc_tls_cert);
prolog_msg_ptr->job_gres_prep =
gres_g_prep_build_env(job_ptr->gres_list_alloc,
job_ptr->nodes);
prolog_msg_ptr->job_id = job_ptr->job_id;
prolog_msg_ptr->het_job_id = job_ptr->het_job_id;
prolog_msg_ptr->uid = job_ptr->user_id;
prolog_msg_ptr->gid = job_ptr->group_id;
if (!job_ptr->user_name)
job_ptr->user_name = user_from_job(job_ptr);
prolog_msg_ptr->nodes = xstrdup(job_ptr->nodes);
prolog_msg_ptr->work_dir = xstrdup(job_ptr->details->work_dir);
prolog_msg_ptr->x11 = job_ptr->details->x11;
if (prolog_msg_ptr->x11) {
char *x11_alloc_host = NULL;
prolog_msg_ptr->x11_magic_cookie =
xstrdup(job_ptr->details->x11_magic_cookie);
/*
* If resp_host is localhost, send slurmctld's hostname instead.
* This gives the compute node a better chance of getting the
* connection set up - otherwise it'd try to connect back to
* itself by mistake.
*/
if (!xstrncmp(job_ptr->resp_host, "127.", 4) ||
!xstrcmp(job_ptr->resp_host, "::1")) {
char hostname[HOST_NAME_MAX];
if (!gethostname(hostname, sizeof(hostname)))
x11_alloc_host = xstrdup(hostname);
}
if (!x11_alloc_host)
x11_alloc_host = xstrdup(job_ptr->resp_host);
prolog_msg_ptr->x11_alloc_host = x11_alloc_host;
prolog_msg_ptr->x11_alloc_port = job_ptr->other_port;
prolog_msg_ptr->x11_target = xstrdup(job_ptr->details->x11_target);
prolog_msg_ptr->x11_target_port = job_ptr->details->x11_target_port;
}
prolog_msg_ptr->spank_job_env_size = job_ptr->spank_job_env_size;
prolog_msg_ptr->spank_job_env = xduparray(job_ptr->spank_job_env_size,
job_ptr->spank_job_env);
if (job_ptr->bit_flags & STEPMGR_ENABLED) {
node_record_t *bit_node;
/* Only keep pointers to nodes */
list_t *job_node_array = list_create(NULL);
for (int i = 0;
(bit_node = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
list_append(job_node_array, bit_node);
}
/*
 * Pack while we are in locks so that we don't need to make
 * copies of job_ptr and job_node_array, since the agent queue
 * doesn't pack until sending.
 */
prolog_msg_ptr->job_ptr_buf = init_buf(BUF_SIZE);
job_record_pack(job_ptr, slurmctld_tres_cnt,
prolog_msg_ptr->job_ptr_buf, protocol_version);
prolog_msg_ptr->job_node_array_buf = init_buf(BUF_SIZE);
slurm_pack_list(job_node_array, node_record_pack,
prolog_msg_ptr->job_node_array_buf,
protocol_version);
prolog_msg_ptr->part_ptr_buf = init_buf(BUF_SIZE);
part_record_pack(job_ptr->part_ptr,
prolog_msg_ptr->part_ptr_buf,
protocol_version);
FREE_NULL_LIST(job_node_array);
}
xassert(job_ptr->job_resrcs);
job_resrcs_ptr = job_ptr->job_resrcs;
setup_cred_arg(&cred_arg, job_ptr);
cred_arg.step_id.job_id = job_ptr->job_id;
cred_arg.step_id.step_id = SLURM_EXTERN_CONT;
cred_arg.step_id.step_het_comp = NO_VAL;
if (job_resrcs_ptr->memory_allocated) {
slurm_array64_to_value_reps(job_resrcs_ptr->memory_allocated,
job_resrcs_ptr->nhosts,
&cred_arg.job_mem_alloc,
&cred_arg.job_mem_alloc_rep_count,
&cred_arg.job_mem_alloc_size);
}
cred_arg.step_core_bitmap = job_resrcs_ptr->core_bitmap;
cred_arg.step_hostlist = job_ptr->job_resrcs->nodes;
switch_g_extern_stepinfo(&cred_arg.switch_step, job_ptr);
prolog_msg_ptr->cred = slurm_cred_create(&cred_arg, false,
protocol_version);
switch_g_free_stepinfo(cred_arg.switch_step);
xfree(cred_arg.job_mem_alloc);
xfree(cred_arg.job_mem_alloc_rep_count);
if (!prolog_msg_ptr->cred) {
error("%s: slurm_cred_create failure for %pJ, holding job",
__func__, job_ptr);
slurm_free_prolog_launch_msg(prolog_msg_ptr);
job_mgr_handle_cred_failure(job_ptr);
return;
}
agent_arg_ptr = xmalloc(sizeof(agent_arg_t));
agent_arg_ptr->retry = 0;
agent_arg_ptr->protocol_version = protocol_version;
agent_arg_ptr->hostlist = hostlist_create(job_ptr->nodes);
agent_arg_ptr->node_count = job_ptr->node_cnt;
agent_arg_ptr->msg_type = REQUEST_LAUNCH_PROLOG;
agent_arg_ptr->msg_args = (void *) prolog_msg_ptr;
agent_arg_ptr->msg_flags = msg_flags;
/* At least on a Cray we have to treat this as a real step, so
* this is where to do it.
*/
if (slurm_conf.prolog_flags & PROLOG_FLAG_CONTAIN) {
step_record_t *step_ptr = build_extern_step(job_ptr);
if (!step_ptr)
error("%s: build_extern_step failure for %pJ",
__func__, job_ptr);
}
job_ptr->prolog_launch_time = time(NULL);
/* Launch the RPC via agent */
set_agent_arg_r_uid(agent_arg_ptr, SLURM_AUTH_UID_ANY);
agent_queue_request(agent_arg_ptr);
}
/*
* valid_feature_counts - validate a job's features can be satisfied
* by the selected nodes (NOTE: does not process MOR or XAND operators)
* IN job_ptr - job to operate on
* IN use_active - if set, then only consider nodes with the identified features
* active, otherwise use available features
* IN/OUT node_bitmap - nodes available for use, clear if unusable
* OUT has_mor - set if MOR/XAND found in feature expression
* RET SLURM_SUCCESS or error
*/
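/*
 * Illustrative examples (hypothetical feature names):
 *   "fast&gpu" - node_bitmap is reduced to nodes having both features;
 *	*has_mor stays false.
 *   "fast|gpu" - node_bitmap is reduced to nodes having either feature.
 *   "[rack1*2&rack2*2]" - the bracketed XAND expression with counts sets
 *	*has_mor and node_bitmap is not reduced here.
 */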
extern int valid_feature_counts(job_record_t *job_ptr, bool use_active,
bitstr_t *node_bitmap, bool *has_mor)
{
job_details_t *detail_ptr = job_ptr->details;
list_itr_t *job_feat_iter;
job_feature_t *job_feat_ptr;
int last_op = FEATURE_OP_AND, last_paren_op = FEATURE_OP_AND;
int last_paren_cnt = 0;
bitstr_t *feature_bitmap, *paren_bitmap = NULL;
bitstr_t *tmp_bitmap, *work_bitmap;
bool have_count = false, user_update;
int rc = SLURM_SUCCESS;
list_t *feature_list = NULL;
char *features;
xassert(detail_ptr);
xassert(node_bitmap);
xassert(has_mor);
/*
 * This is used in two different ways: (1) to pick nodes when
 * features_use is set, and (2) to set the predicted start time when
 * it isn't.
 */
if (detail_ptr->features_use) {
feature_list = detail_ptr->feature_list_use;
features = detail_ptr->features_use;
} else {
feature_list = detail_ptr->feature_list;
features = detail_ptr->features;
}
*has_mor = false;
if (!feature_list) /* no constraints */
return rc;
user_update = node_features_g_user_update(job_ptr->user_id);
find_feature_nodes(feature_list, user_update);
feature_bitmap = bit_copy(node_bitmap);
work_bitmap = feature_bitmap;
job_feat_iter = list_iterator_create(feature_list);
while ((job_feat_ptr = list_next(job_feat_iter))) {
if (last_paren_cnt < job_feat_ptr->paren) {
/* Start of expression in parenthesis */
/*
* If this pair of parentheses is inside of brackets,
* then this is XAND or MOR. Set last_paren_op to
* avoid incorrectly doing bit_and() or bit_or() at the
* end of parentheses. This only matters if the
* parentheses are the first thing inside of brackets,
* in which case last_op is AND or OR depending on what
* (if anything) came before the brackets. If the
* parentheses are not the first thing inside of
* brackets then last_op is XAND or MOR.
*/
if (job_feat_ptr->bracket &&
(last_op != FEATURE_OP_XAND) &&
(last_op != FEATURE_OP_MOR))
last_paren_op = FEATURE_OP_XAND;
else
last_paren_op = last_op;
last_op = FEATURE_OP_AND;
if (paren_bitmap) {
if (job_ptr->job_id) {
error("%s: %pJ has bad feature expression: %s",
__func__, job_ptr,
features);
} else {
error("%s: Reservation has bad feature expression: %s",
__func__, features);
}
FREE_NULL_BITMAP(paren_bitmap);
}
paren_bitmap = bit_copy(node_bitmap);
work_bitmap = paren_bitmap;
}
if (use_active)
tmp_bitmap = job_feat_ptr->node_bitmap_active;
else
tmp_bitmap = job_feat_ptr->node_bitmap_avail;
if (tmp_bitmap) {
/*
 * Here we need to use the current feature's operator for MOR/XAND,
 * not last_op. For instance, fastio&[xeon|nehalem] should ignore
 * xeon (in valid_feature_counts), but if this were based on last_op
 * it would see an AND operation. This should only be done for the
 * middle options, not for the end, which is handled in the
 * last_paren check below.
 */
if ((job_feat_ptr->op_code == FEATURE_OP_MOR) ||
(job_feat_ptr->op_code == FEATURE_OP_XAND)) {
*has_mor = true;
} else if (last_op == FEATURE_OP_AND) {
bit_and(work_bitmap, tmp_bitmap);
} else if (last_op == FEATURE_OP_OR) {
bit_or(work_bitmap, tmp_bitmap);
}
} else { /* feature not found */
if (last_op == FEATURE_OP_AND)
bit_clear_all(work_bitmap);
}
if (job_feat_ptr->count)
have_count = true;
if (last_paren_cnt > job_feat_ptr->paren) {
/* End of expression in parenthesis */
if (last_paren_op == FEATURE_OP_AND) {
bit_and(feature_bitmap, work_bitmap);
} else if (last_paren_op == FEATURE_OP_OR) {
bit_or(feature_bitmap, work_bitmap);
} else { /* FEATURE_OP_MOR or FEATURE_OP_XAND */
*has_mor = true;
}
FREE_NULL_BITMAP(paren_bitmap);
work_bitmap = feature_bitmap;
}
last_op = job_feat_ptr->op_code;
last_paren_cnt = job_feat_ptr->paren;
if (slurm_conf.debug_flags & DEBUG_FLAG_NODE_FEATURES) {
char *tmp_f, *tmp_w, *tmp_t;
tmp_f = bitmap2node_name(feature_bitmap);
tmp_w = bitmap2node_name(work_bitmap);
tmp_t = bitmap2node_name(tmp_bitmap);
log_flag(NODE_FEATURES, "%s: feature:%s feature_bitmap:%s work_bitmap:%s tmp_bitmap:%s count:%u",
__func__, job_feat_ptr->name, tmp_f, tmp_w,
tmp_t, job_feat_ptr->count);
xfree(tmp_f);
xfree(tmp_w);
xfree(tmp_t);
}
}
list_iterator_destroy(job_feat_iter);
if (!have_count)
bit_and(node_bitmap, work_bitmap);
FREE_NULL_BITMAP(feature_bitmap);
FREE_NULL_BITMAP(paren_bitmap);
if (slurm_conf.debug_flags & DEBUG_FLAG_NODE_FEATURES) {
char *tmp = bitmap2node_name(node_bitmap);
log_flag(NODE_FEATURES, "%s: NODES:%s HAS_MOR:%c status:%s",
__func__, tmp, (*has_mor ? 'T' : 'F'),
slurm_strerror(rc));
xfree(tmp);
}
return rc;
}
/*
* job_req_node_filter - job request node filter.
* clear from a bitmap the nodes which can not be used for a job
* test memory size, required features, processor count, etc.
* NOTE: Does not support exclusive OR of features.
* It just matches first element of MOR and ignores count.
 * IN job_ptr - pointer to the job to be scheduled
 * IN/OUT avail_bitmap - set of nodes being considered for use
* RET SLURM_SUCCESS or EINVAL if can't filter (exclusive OR of features)
*/
extern int job_req_node_filter(job_record_t *job_ptr,
bitstr_t *avail_bitmap, bool test_only)
{
job_details_t *detail_ptr = job_ptr->details;
multi_core_data_t *mc_ptr;
node_record_t *node_ptr;
bool has_mor = false;
if (detail_ptr == NULL) {
error("%s: %pJ has no details",
__func__, job_ptr);
return EINVAL;
}
mc_ptr = detail_ptr->mc_ptr;
for (int i = 0; (node_ptr = next_node_bitmap(avail_bitmap, &i)); i++) {
if ((detail_ptr->pn_min_cpus > node_ptr->cpus) ||
((detail_ptr->pn_min_memory & (~MEM_PER_CPU)) >
node_ptr->real_memory) ||
((detail_ptr->pn_min_memory & (MEM_PER_CPU)) &&
((detail_ptr->pn_min_memory & (~MEM_PER_CPU)) *
detail_ptr->pn_min_cpus) >
node_ptr->real_memory) ||
(detail_ptr->pn_min_tmp_disk >
node_ptr->tmp_disk)) {
bit_clear(avail_bitmap, i);
continue;
}
if (mc_ptr &&
(((mc_ptr->sockets_per_node > node_ptr->tot_sockets) &&
(mc_ptr->sockets_per_node != NO_VAL16)) ||
((mc_ptr->cores_per_socket > node_ptr->cores) &&
(mc_ptr->cores_per_socket != NO_VAL16)) ||
((mc_ptr->threads_per_core > node_ptr->threads) &&
(mc_ptr->threads_per_core != NO_VAL16)))) {
bit_clear(avail_bitmap, i);
continue;
}
}
return valid_feature_counts(job_ptr, false, avail_bitmap, &has_mor);
}
/*
* Split the node set record in two
 * IN nset - array of node_set records
* IN config_ptr - configuration info for the nodes being added to a node set
* IN nset_inx_base - index of original/base node_set to split
* IN nset_inx - index of the new node_set record
* IN nset_feature_bits - feature bitmap for the new node_set record
* IN nset_node_bitmap - bitmap of nodes for the new node_set record
* IN nset_flags - flags of nodes for the new node_set record
*/
static void _split_node_set(struct node_set *nset, config_record_t *config_ptr,
int nset_inx_base, int nset_inx,
bitstr_t *nset_feature_bits,
bitstr_t *nset_node_bitmap, uint32_t nset_flags)
{
nset[nset_inx].cpus_per_node = config_ptr->cpus;
nset[nset_inx].features = xstrdup(config_ptr->feature);
nset[nset_inx].feature_bits = bit_copy(nset_feature_bits);
nset[nset_inx].flags = nset_flags;
nset[nset_inx].real_memory = config_ptr->real_memory;
nset[nset_inx].node_weight = nset[nset_inx_base].node_weight;
/*
* The bitmap of this new nodeset will contain only the nodes that
* are present both in the original bitmap AND in the new bitmap.
*/
nset[nset_inx].my_bitmap = bit_copy(nset[nset_inx_base].my_bitmap);
bit_and(nset[nset_inx].my_bitmap, nset_node_bitmap);
nset[nset_inx].node_cnt = bit_set_count(nset[nset_inx].my_bitmap);
/* Now we remove these nodes from the original bitmap */
bit_and_not(nset[nset_inx_base].my_bitmap, nset_node_bitmap);
nset[nset_inx_base].node_cnt -= nset[nset_inx].node_cnt;
}
/* Split from an existing node_set */
static void _split_node_set2(struct node_set *nset, int idx, int *last_inx,
int cnt, bitstr_t *nset_bitmap,
uint32_t nset_flags)
{
nset[*last_inx].cpus_per_node = nset[idx].cpus_per_node;
nset[*last_inx].features = xstrdup(nset[idx].features);
nset[*last_inx].feature_bits = bit_copy(nset[idx].feature_bits);
nset[*last_inx].flags = nset_flags;
nset[*last_inx].real_memory = nset[idx].real_memory;
nset[*last_inx].node_weight = nset[idx].node_weight;
nset[*last_inx].my_bitmap = bit_copy(nset[idx].my_bitmap);
bit_and(nset[*last_inx].my_bitmap, nset_bitmap);
nset[*last_inx].node_cnt = cnt;
/* Remove the bits and count from the original set */
bit_and_not(nset[idx].my_bitmap, nset_bitmap);
nset[idx].node_cnt -= cnt;
(*last_inx)++;
}
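/*
 * _apply_extra_constraints - clear from usable_node_mask every node that
 *	has no extra_data or whose extra_data does not satisfy the job's
 *	extra_constraints expression
 */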
static void _apply_extra_constraints(job_record_t *job_ptr,
bitstr_t *usable_node_mask)
{
node_record_t *node_ptr = NULL;
xassert(job_ptr->extra);
xassert(job_ptr->extra_constraints);
for (int i = 0; (node_ptr = next_node_bitmap(usable_node_mask, &i));
i++) {
if (!node_ptr->extra_data) {
bit_clear(usable_node_mask, i);
continue;
}
if (!extra_constraints_test(job_ptr->extra_constraints,
node_ptr->extra_data)) {
bit_clear(usable_node_mask, i);
continue;
}
}
}
/*
* _build_node_list - identify which nodes could be allocated to a job
* based upon node features, memory, processors, etc. Note that a
 * bitmap is set to indicate which of the job's features the
* nodes satisfy.
 * IN job_ptr - pointer to the job to be scheduled
* OUT node_set_pptr - list of node sets which could be used for the job
* OUT node_set_size - number of node_set entries
* OUT err_msg - error message for job, caller must xfree
* IN test_only - true if only testing if job can be started at some point
* IN can_reboot - if true node can use any available feature,
* else job can use only active features
* RET error code
*/
static int _build_node_list(job_record_t *job_ptr,
struct node_set **node_set_pptr,
int *node_set_size, char **err_msg, bool test_only,
bool can_reboot)
{
int adj_cpus, i, node_set_inx, node_set_len, node_set_inx_base;
int rc, qos_cnt;
struct node_set *node_set_ptr, *prev_node_set_ptr;
config_record_t *config_ptr;
part_record_t *part_ptr = job_ptr->part_ptr;
list_itr_t *config_iterator;
int total_cores;
job_details_t *detail_ptr = job_ptr->details;
bitstr_t *usable_node_mask = NULL;
multi_core_data_t *mc_ptr = detail_ptr->mc_ptr;
bitstr_t *tmp_feature;
bitstr_t *grp_node_bitmap;
bool has_mor = false;
bool resv_overlap = false;
bitstr_t *node_maps[NM_TYPES] = { NULL, NULL, NULL, NULL, NULL, NULL };
bitstr_t *reboot_bitmap = NULL;
if (job_ptr->resv_name) {
/*
* Limit node selection to those in selected reservation.
* Assume node reboot required since we have not selected the
* compute nodes yet.
*/
time_t start_res = time(NULL);
rc = job_test_resv(job_ptr, &start_res, false,
&usable_node_mask, NULL, &resv_overlap,
true);
if (rc != SLURM_SUCCESS) {
job_ptr->state_reason = WAIT_RESERVATION;
xfree(job_ptr->state_desc);
if (rc == ESLURM_INVALID_TIME_VALUE)
return ESLURM_RESERVATION_NOT_USABLE;
if (rc == ESLURM_NODES_BUSY)
return ESLURM_NODES_BUSY;
if (err_msg) {
xfree(*err_msg);
*err_msg = xstrdup("Problem using reservation");
}
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
}
if ((detail_ptr->req_node_bitmap) &&
(!bit_super_set(detail_ptr->req_node_bitmap,
usable_node_mask))) {
job_ptr->state_reason = WAIT_RESERVATION;
xfree(job_ptr->state_desc);
FREE_NULL_BITMAP(usable_node_mask);
if (err_msg) {
xfree(*err_msg);
*err_msg = xstrdup("Required nodes outside of "
"the reservation");
}
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
}
if (resv_overlap && bit_ffs(usable_node_mask) < 0) {
job_ptr->state_reason = WAIT_NODE_NOT_AVAIL;
xfree(job_ptr->state_desc);
xstrfmtcat(job_ptr->state_desc,
"ReqNodeNotAvail, Reserved for maintenance");
FREE_NULL_BITMAP(usable_node_mask);
return ESLURM_RESERVATION_BUSY; /* All reserved */
}
}
if (detail_ptr->exc_node_bitmap) {
if (usable_node_mask) {
bit_and_not(usable_node_mask, detail_ptr->exc_node_bitmap);
} else {
usable_node_mask =
bit_copy(detail_ptr->exc_node_bitmap);
bit_not(usable_node_mask);
}
} else if (usable_node_mask == NULL) {
usable_node_mask = node_conf_get_active_bitmap();
}
if (!(job_ptr->bit_flags & EXTERNAL_JOB)) {
bit_and_not(usable_node_mask, external_node_bitmap);
}
if (!test_only && job_ptr->extra_constraints) {
_apply_extra_constraints(job_ptr, usable_node_mask);
if (!bit_set_count(usable_node_mask)) {
rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
debug("%s: No nodes satisfy %pJ extra constraints in partition %s",
__func__, job_ptr, job_ptr->part_ptr->name);
xfree(job_ptr->state_desc);
job_ptr->state_reason = FAIL_CONSTRAINTS;
debug2("%s: setting %pJ to \"%s\" (%s)",
__func__, job_ptr,
job_state_reason_string(job_ptr->state_reason),
slurm_strerror(rc));
FREE_NULL_BITMAP(usable_node_mask);
return rc;
}
}
if ((rc = valid_feature_counts(job_ptr, false, usable_node_mask,
&has_mor))) {
info("%pJ feature requirements can not be satisfied: %s",
job_ptr, slurm_strerror(rc));
FREE_NULL_BITMAP(usable_node_mask);
if (err_msg) {
xfree(*err_msg);
*err_msg = xstrdup("Node feature requirements can not "
"be satisfied");
}
return rc;
}
if (can_reboot)
reboot_bitmap = bit_alloc(node_record_count);
node_set_inx = 0;
node_set_len = list_count(config_list) * 32 + 1;
node_set_ptr = xcalloc(node_set_len, sizeof(struct node_set));
config_iterator = list_iterator_create(config_list);
while ((config_ptr = list_next(config_iterator))) {
bool cpus_ok = false, mem_ok = false, disk_ok = false;
bool job_mc_ok = false, config_filter = false;
total_cores = config_ptr->tot_sockets * config_ptr->cores;
adj_cpus = adjust_cpus_nppcu(_get_ntasks_per_core(detail_ptr),
detail_ptr->cpus_per_task,
total_cores, config_ptr->cpus);
if (detail_ptr->pn_min_cpus <= adj_cpus)
cpus_ok = true;
if ((detail_ptr->pn_min_memory & (~MEM_PER_CPU)) <=
config_ptr->real_memory)
mem_ok = true;
if (detail_ptr->pn_min_tmp_disk <= config_ptr->tmp_disk)
disk_ok = true;
if (!mc_ptr)
job_mc_ok = true;
if (mc_ptr &&
(((mc_ptr->sockets_per_node <= config_ptr->tot_sockets) ||
(mc_ptr->sockets_per_node == NO_VAL16)) &&
((mc_ptr->cores_per_socket <= config_ptr->cores) ||
(mc_ptr->cores_per_socket == NO_VAL16)) &&
((mc_ptr->threads_per_core <= config_ptr->threads) ||
(mc_ptr->threads_per_core == NO_VAL16))))
job_mc_ok = true;
config_filter = !(cpus_ok && mem_ok && disk_ok && job_mc_ok);
/*
* since nodes can register with more resources than defined
* in the configuration, we want to use those higher values
* for scheduling, but only as needed (slower)
*/
node_set_ptr[node_set_inx].my_bitmap =
bit_copy(config_ptr->node_bitmap);
bit_and(node_set_ptr[node_set_inx].my_bitmap,
part_ptr->node_bitmap);
if (usable_node_mask) {
bit_and(node_set_ptr[node_set_inx].my_bitmap,
usable_node_mask);
}
node_set_ptr[node_set_inx].node_cnt =
bit_set_count(node_set_ptr[node_set_inx].my_bitmap);
if (node_set_ptr[node_set_inx].node_cnt == 0) {
debug2("%s: JobId=%u matched 0 nodes (%s) due to job partition or features",
__func__, job_ptr->job_id, config_ptr->nodes);
FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap);
continue;
}
if (config_filter) {
_set_err_msg(cpus_ok, mem_ok, disk_ok, job_mc_ok,
err_msg);
debug2("%s: JobId=%u filtered all nodes (%s): %s",
__func__, job_ptr->job_id, config_ptr->nodes,
err_msg ? *err_msg : NULL);
FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap);
continue;
}
if (has_mor) {
tmp_feature = _valid_features(job_ptr, config_ptr,
can_reboot, reboot_bitmap);
if (tmp_feature == NULL) {
debug2("%s: JobId=%u matched 0 nodes (%s) due to MOR job features",
__func__, job_ptr->job_id,
config_ptr->nodes);
FREE_NULL_BITMAP(node_set_ptr[node_set_inx].
my_bitmap);
continue;
}
} else {
/* We've already filtered for AND/OR features */
tmp_feature = bit_alloc(MAX_FEATURES);
bit_set(tmp_feature, 0);
}
/* NOTE: FREE_NULL_BITMAP(tmp_feature) to avoid memory leak */
node_set_ptr[node_set_inx].cpus_per_node =
config_ptr->cpus;
node_set_ptr[node_set_inx].real_memory =
config_ptr->real_memory;
node_set_ptr[node_set_inx].node_weight = config_ptr->weight;
node_set_ptr[node_set_inx].features =
xstrdup(config_ptr->feature);
node_set_ptr[node_set_inx].feature_bits = tmp_feature;
debug2("found %u usable nodes from config containing %s",
node_set_ptr[node_set_inx].node_cnt, config_ptr->nodes);
prev_node_set_ptr = node_set_ptr + node_set_inx;
node_set_inx++;
if (node_set_inx >= node_set_len) {
error("%s: node_set buffer filled", __func__);
break;
}
/*
* If we have a FLEX reservation we will want a nodeset for
* those nodes outside the reservation.
*/
if (job_ptr->resv_ptr &&
(job_ptr->resv_ptr->flags & RESERVE_FLAG_FLEX) &&
job_ptr->resv_ptr->node_bitmap &&
!bit_super_set(prev_node_set_ptr->my_bitmap,
job_ptr->resv_ptr->node_bitmap)) {
node_maps[IN_FL] =
bit_copy(job_ptr->resv_ptr->node_bitmap);
node_maps[OUT_FL] =
bit_copy(prev_node_set_ptr->my_bitmap);
bit_and_not(node_maps[OUT_FL], node_maps[IN_FL]);
}
/* Identify the nodes that need reboot for use */
if (!test_only && can_reboot) {
if (has_mor) {
node_maps[REBOOT] = bit_copy(reboot_bitmap);
} else {
(void) _match_feature(
job_ptr->details->feature_list_use,
&node_maps[REBOOT]);
}
/* No nodes in set require reboot */
if (node_maps[REBOOT] &&
!bit_overlap_any(prev_node_set_ptr->my_bitmap,
node_maps[REBOOT]))
FREE_NULL_BITMAP(node_maps[REBOOT]);
}
/* No nodes to split from this node set */
if (!node_maps[OUT_FL] && !node_maps[REBOOT])
continue;
/* Just need to split these nodes that need reboot */
if (!node_maps[OUT_FL] && node_maps[REBOOT]) {
if (bit_super_set(prev_node_set_ptr->my_bitmap,
node_maps[REBOOT])) {
/* All nodes in set require reboot */
prev_node_set_ptr->flags = NODE_SET_REBOOT;
goto end_node_set;
}
node_set_inx_base = node_set_inx - 1;
_split_node_set(node_set_ptr, config_ptr,
node_set_inx_base, node_set_inx,
tmp_feature, node_maps[REBOOT],
NODE_SET_REBOOT);
node_set_inx++;
goto end_node_set;
}
/* Just need to split for these nodes that are outside FLEX */
if (node_maps[OUT_FL] && !node_maps[REBOOT]) {
if (bit_super_set(prev_node_set_ptr->my_bitmap,
node_maps[OUT_FL])) {
/* All nodes outside of flex reservation */
				prev_node_set_ptr->flags = NODE_SET_OUTSIDE_FLEX;
goto end_node_set;
}
node_set_inx_base = node_set_inx - 1;
_split_node_set(node_set_ptr, config_ptr,
node_set_inx_base, node_set_inx,
tmp_feature, node_maps[OUT_FL],
NODE_SET_OUTSIDE_FLEX);
node_set_inx++;
goto end_node_set;
}
/* We may have to split in several subsets */
if (node_maps[OUT_FL] && node_maps[REBOOT]) {
node_maps[IN_FL_RE] = bit_copy(node_maps[IN_FL]);
bit_and(node_maps[IN_FL_RE], node_maps[REBOOT]);
node_maps[OUT_FL_RE] = bit_copy(node_maps[OUT_FL]);
bit_and(node_maps[OUT_FL_RE], node_maps[REBOOT]);
node_maps[OUT_FL_NO_RE] = bit_copy(node_maps[OUT_FL]);
bit_and_not(node_maps[OUT_FL_NO_RE],
node_maps[REBOOT]);
}
/*
* All nodes in this set should be avoided. No need to split.
* Just set the FLAGS and the Weight.
*/
if (bit_super_set(prev_node_set_ptr->my_bitmap,
node_maps[IN_FL_RE])) {
prev_node_set_ptr->flags = NODE_SET_REBOOT;
goto end_node_set;
}
if (bit_super_set(prev_node_set_ptr->my_bitmap,
node_maps[OUT_FL_NO_RE])) {
prev_node_set_ptr->flags = NODE_SET_OUTSIDE_FLEX;
goto end_node_set;
}
if (bit_super_set(prev_node_set_ptr->my_bitmap,
node_maps[OUT_FL_RE])) {
prev_node_set_ptr->flags = (NODE_SET_OUTSIDE_FLEX |
NODE_SET_REBOOT);
goto end_node_set;
}
/*
	 * At this point we split the node set record into as many as four,
* in this order of priority:
*
* 1. Inside flex reservation and need to reboot
* 2. Outside flex reservation and NO need to reboot
* 3. Outside flex reservation and need to reboot
* 4. Available now, inside the flex reservation and NO need
* to reboot
*
* If there are no such reservations or need to reboot,
* additional nodesets will not be created.
*/
node_set_inx_base = node_set_inx - 1;
if (node_maps[IN_FL_RE]) {
_split_node_set(node_set_ptr, config_ptr,
node_set_inx_base, node_set_inx,
tmp_feature, node_maps[IN_FL_RE],
NODE_SET_REBOOT);
FREE_NULL_BITMAP(node_maps[IN_FL_RE]);
node_set_inx++;
if (node_set_inx >= node_set_len) {
error("%s: node_set buffer filled", __func__);
break;
}
}
if (node_maps[OUT_FL_NO_RE]) {
_split_node_set(node_set_ptr, config_ptr,
node_set_inx_base, node_set_inx,
tmp_feature, node_maps[OUT_FL_NO_RE],
(NODE_SET_OUTSIDE_FLEX));
FREE_NULL_BITMAP(node_maps[OUT_FL_NO_RE]);
node_set_inx++;
if (node_set_inx >= node_set_len) {
error("%s: node_set buffer filled", __func__);
break;
}
}
if (node_maps[OUT_FL_RE]) {
_split_node_set(node_set_ptr, config_ptr,
node_set_inx_base, node_set_inx,
tmp_feature, node_maps[OUT_FL_RE],
(NODE_SET_OUTSIDE_FLEX |
NODE_SET_REBOOT));
FREE_NULL_BITMAP(node_maps[OUT_FL_RE]);
node_set_inx++;
if (node_set_inx >= node_set_len) {
error("%s: node_set buffer filled", __func__);
break;
}
}
end_node_set:
for (i = 0; i < NM_TYPES; i++)
FREE_NULL_BITMAP(node_maps[i]);
if (node_set_inx >= node_set_len) {
error("%s: node_set buffer filled", __func__);
break;
}
}
list_iterator_destroy(config_iterator);
/* eliminate any incomplete node_set record */
xfree(node_set_ptr[node_set_inx].features);
FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap);
FREE_NULL_BITMAP(node_set_ptr[node_set_inx].feature_bits);
FREE_NULL_BITMAP(usable_node_mask);
if (node_set_inx == 0) {
rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
info("%s: No nodes satisfy %pJ requirements in partition %s",
__func__, job_ptr, job_ptr->part_ptr->name);
xfree(node_set_ptr);
xfree(job_ptr->state_desc);
job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
debug2("%s: setting %pJ to \"%s\" (%s)",
__func__, job_ptr,
job_state_reason_string(job_ptr->state_reason),
slurm_strerror(rc));
FREE_NULL_BITMAP(reboot_bitmap);
return rc;
}
/*
	 * Clear any message about nodes which fail to satisfy specific
	 * job requirements, since some usable nodes remain
*/
if (err_msg)
xfree(*err_msg);
/*
* If any nodes are powered down or powering up, put them into a
* new node_sets record with a higher scheduling weight. This means
* we avoid scheduling jobs on powered down and powering up nodes where
* possible. If those are required we prefer powering up nodes over
* powered down nodes.
*/
for (i = (node_set_inx - 1); i >= 0; i--) {
int booting_cnt = bit_overlap(node_set_ptr[i].my_bitmap,
booting_node_bitmap);
if (booting_cnt == 0)
continue; /* no nodes powering up */
if (booting_cnt == node_set_ptr[i].node_cnt) {
node_set_ptr[i].flags = NODE_SET_POWERING_UP;
continue; /* all nodes powering up */
}
/* Some nodes powering up, split record */
_split_node_set2(node_set_ptr, i, &node_set_inx, booting_cnt,
booting_node_bitmap, NODE_SET_POWERING_UP);
if (node_set_inx >= node_set_len) {
error("%s: node_set buffer filled", __func__);
break;
}
}
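	/*
	 * Repeat the split for powered down nodes. _set_sched_weight() gives
	 * these an even higher weight than powering up nodes, so they are used
	 * only as a last resort.
	 */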
for (i = (node_set_inx-1); i >= 0; i--) {
int power_cnt = bit_overlap(node_set_ptr[i].my_bitmap,
power_down_node_bitmap);
if (power_cnt == 0)
continue; /* no nodes powered down */
if (power_cnt == node_set_ptr[i].node_cnt) {
node_set_ptr[i].flags = NODE_SET_POWER_DN;
continue; /* all nodes powered down */
}
/* Some nodes powered down, others up, split record */
_split_node_set2(node_set_ptr, i, &node_set_inx, power_cnt,
power_down_node_bitmap, NODE_SET_POWER_DN);
if (node_set_inx >= node_set_len) {
error("%s: node_set buffer filled", __func__);
break;
}
}
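	/*
	 * Prefer nodes returned by _find_grp_node_bitmap(): node_set records
	 * with no overlap get their weight bumped, and partially overlapping
	 * records are split so the overlapping portion keeps the lower
	 * (preferred) weight.
	 */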
grp_node_bitmap = _find_grp_node_bitmap(job_ptr);
if (grp_node_bitmap) {
#if _DEBUG
char node_bitstr[64];
bit_fmt(node_bitstr, sizeof(node_bitstr), grp_node_bitmap);
info("%s: _find_grp_node_bitmap() grp_node_bitmap:%s", __func__, node_bitstr);
#endif
for (i = (node_set_inx-1); i >= 0; i--) {
qos_cnt = bit_overlap(node_set_ptr[i].my_bitmap,
grp_node_bitmap);
if (qos_cnt == 0) {
				node_set_ptr[i].node_weight += 1;
continue; /* no nodes overlap */
}
if (qos_cnt == node_set_ptr[i].node_cnt) {
continue; /* all nodes overlap */
}
/* Some nodes overlap, split record */
_split_node_set2(node_set_ptr, i, &node_set_inx,
qos_cnt, grp_node_bitmap,
node_set_ptr[i].flags);
node_set_ptr[i].node_weight++;
if (node_set_inx >= node_set_len) {
error("%s: node_set buffer filled", __func__);
break;
}
}
FREE_NULL_BITMAP(grp_node_bitmap);
}
FREE_NULL_BITMAP(reboot_bitmap);
*node_set_size = node_set_inx;
*node_set_pptr = node_set_ptr;
return SLURM_SUCCESS;
}
/*
* For a given node_set, set a scheduling weight based upon a combination of
* node_weight and flags (e.g. try to avoid reboot).
 * 0x30000000000 - Requires boot (reboot needed or node powered down)
 * 0x20000000000 - Node powering up
 * 0x10000000000 - Outside of flex reservation
* 0x0########00 - Node weight
* 0x000000000## - Reserved for cons_tres, favor nodes with co-located CPU/GPU
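 *
 * For example, a node_set with node_weight=1 and no flags gets sched_weight
 * 0x1ff, while the same set flagged NODE_SET_OUTSIDE_FLEX gets 0x100000001ff.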
*/
static void _set_sched_weight(struct node_set *node_set_ptr)
{
xassert(node_set_ptr);
node_set_ptr->sched_weight = node_set_ptr->node_weight << 8;
node_set_ptr->sched_weight |= 0xff;
if ((node_set_ptr->flags & NODE_SET_REBOOT) ||
(node_set_ptr->flags & NODE_SET_POWER_DN)) /* Boot required */
node_set_ptr->sched_weight |= 0x30000000000;
else if ((node_set_ptr->flags & NODE_SET_POWERING_UP))
node_set_ptr->sched_weight |= 0x20000000000;
	else if (node_set_ptr->flags & NODE_SET_OUTSIDE_FLEX)
		node_set_ptr->sched_weight |= 0x10000000000;
}
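/* Sort node_set records into ascending order by scheduling weight */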
static int _sort_node_set(const void *x, const void *y)
{
struct node_set *node_set_ptr1 = (struct node_set *) x;
struct node_set *node_set_ptr2 = (struct node_set *) y;
xassert(node_set_ptr1);
xassert(node_set_ptr2);
if (node_set_ptr1->sched_weight < node_set_ptr2->sched_weight)
return -1;
if (node_set_ptr1->sched_weight > node_set_ptr2->sched_weight)
return 1;
return 0;
}
static void _log_node_set(job_record_t *job_ptr,
struct node_set *node_set_ptr,
int node_set_size)
{
char *node_list, feature_bits[64];
int i;
if (get_log_level() < LOG_LEVEL_DEBUG2)
return;
debug2("NodeSet for %pJ", job_ptr);
for (i = 0; i < node_set_size; i++) {
node_list = bitmap2node_name(node_set_ptr[i].my_bitmap);
if (node_set_ptr[i].feature_bits) {
bit_fmt(feature_bits, sizeof(feature_bits),
node_set_ptr[i].feature_bits);
} else
feature_bits[0] = '\0';
debug2("NodeSet[%d] Nodes:%s NodeWeight:%u Flags:%u FeatureBits:%s SchedWeight:%"PRIu64,
i, node_list, node_set_ptr[i].node_weight,
node_set_ptr[i].flags, feature_bits,
node_set_ptr[i].sched_weight);
xfree(node_list);
}
}
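/*
 * Record in err_msg the first per-node resource requirement (CPUs, memory,
 * temporary disk or socket/core/thread layout) which could not be satisfied.
 */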
static void _set_err_msg(bool cpus_ok, bool mem_ok, bool disk_ok,
bool job_mc_ok, char **err_msg)
{
if (!err_msg)
return;
if (!cpus_ok) {
xfree(*err_msg);
*err_msg = xstrdup("CPU count per node can not be satisfied");
return;
}
if (!mem_ok) {
xfree(*err_msg);
*err_msg = xstrdup("Memory specification can not be satisfied");
return;
}
if (!disk_ok) {
xfree(*err_msg);
*err_msg = xstrdup("Temporary disk specification can not be "
"satisfied");
return;
}
if (!job_mc_ok) {
xfree(*err_msg);
*err_msg = xstrdup("Socket, core and/or thread specification "
"can not be satisfied");
return;
}
}
/*
* _nodes_in_sets - Determine if required nodes are included in node_set(s)
* IN req_bitmap - nodes specifically required by the job
* IN node_set_ptr - sets of valid nodes
* IN node_set_size - count of node_set entries
* RET 0 if in set, otherwise an error code
*/
static int _nodes_in_sets(bitstr_t *req_bitmap,
struct node_set * node_set_ptr,
int node_set_size)
{
bitstr_t *scratch_bitmap = NULL;
int error_code = SLURM_SUCCESS, i;
for (i=0; i<node_set_size; i++) {
if (scratch_bitmap)
bit_or(scratch_bitmap,
node_set_ptr[i].my_bitmap);
else {
scratch_bitmap =
bit_copy(node_set_ptr[i].my_bitmap);
}
}
if ((scratch_bitmap == NULL)
|| (bit_super_set(req_bitmap, scratch_bitmap) != 1))
error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
FREE_NULL_BITMAP(scratch_bitmap);
return error_code;
}
/*
* build_node_details - sets addresses for allocated nodes
* IN job_ptr - pointer to a job record
* IN new_alloc - set if new job allocation, cleared if state recovery
*/
extern void build_node_details(job_record_t *job_ptr, bool new_alloc)
{
hostlist_t *host_list = NULL;
node_record_t *node_ptr;
char *this_node_name;
int node_inx = 0;
if ((job_ptr->node_bitmap == NULL) || (job_ptr->nodes == NULL)) {
/* No nodes allocated, we're done... */
job_ptr->node_cnt = 0;
return;
}
/* Use hostlist here to ensure ordering of info matches that of srun */
if ((host_list = hostlist_create(job_ptr->nodes)) == NULL)
fatal("hostlist_create error for %s: %m", job_ptr->nodes);
job_ptr->total_nodes = job_ptr->node_cnt = hostlist_count(host_list);
xfree(job_ptr->batch_host);
while ((this_node_name = hostlist_shift(host_list))) {
if ((node_ptr = find_node_record(this_node_name))) {
node_inx++;
} else {
error("Invalid node %s in %pJ",
this_node_name, job_ptr);
}
if (!job_ptr->batch_host && !job_ptr->batch_features) {
/*
* Do not select until launch_job() as node features
* might be changed by node_features plugin between
* allocation time (now) and launch.
*/
job_ptr->batch_host = xstrdup(this_node_name);
}
free(this_node_name);
}
hostlist_destroy(host_list);
if (job_ptr->node_cnt != node_inx) {
error("Node count mismatch for %pJ (%u,%u)",
job_ptr, job_ptr->node_cnt, node_inx);
}
}
/*
* Set "batch_host" for this job based upon it's "batch_features" and
* "node_bitmap". Selection is performed on a best-effort basis (i.e. if no
* node satisfies the batch_features specification then pick first node).
* Execute this AFTER any node feature changes are made by the node_features
* plugin.
*
* If changes are made here, see if changes need to be made in
* test_job_nodes_ready().
*
* Return SLURM_SUCCESS or error code
*/
extern int pick_batch_host(job_record_t *job_ptr)
{
int i, i_first;
node_record_t *node_ptr;
char *tmp, *tok, sep, last_sep = '&';
node_feature_t *feature_ptr;
list_itr_t *feature_iter;
bitstr_t *feature_bitmap;
if (job_ptr->batch_host)
return SLURM_SUCCESS;
if (!job_ptr->node_bitmap) {
error("%s: %pJ lacks a node_bitmap", __func__, job_ptr);
return SLURM_ERROR;
}
i_first = bit_ffs(job_ptr->node_bitmap);
if (i_first < 0) {
error("%s: %pJ allocated no nodes", __func__, job_ptr);
return SLURM_ERROR;
}
if (!job_ptr->batch_features) {
/* Run batch script on first node of job allocation */
node_ptr = node_record_table_ptr[i_first];
job_ptr->batch_host = xstrdup(node_ptr->name);
return SLURM_SUCCESS;
}
feature_bitmap = bit_copy(job_ptr->node_bitmap);
tmp = xstrdup(job_ptr->batch_features);
tok = tmp;
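	/*
	 * Walk the batch_features expression, splitting it into tokens on '&'
	 * and '|'. Each token names a feature whose node bitmap is ANDed or
	 * ORed into feature_bitmap according to the separator preceding the
	 * token; the first token is ANDed with the allocation's nodes.
	 */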
for (i = 0; ; i++) {
if (tmp[i] == '&')
sep = '&';
else if (tmp[i] == '|')
sep = '|';
else if (tmp[i] == '\0')
sep = '\0';
else
continue;
tmp[i] = '\0';
feature_iter = list_iterator_create(active_feature_list);
while ((feature_ptr = list_next(feature_iter))) {
if (xstrcmp(feature_ptr->name, tok))
continue;
if (last_sep == '&') {
bit_and(feature_bitmap,
feature_ptr->node_bitmap);
} else {
bit_or(feature_bitmap,
feature_ptr->node_bitmap);
}
break;
}
list_iterator_destroy(feature_iter);
if (!feature_ptr) /* No match */
bit_clear_all(feature_bitmap);
if (sep == '\0')
break;
tok = tmp + i + 1;
last_sep = sep;
}
xfree(tmp);
bit_and(feature_bitmap, job_ptr->node_bitmap);
if ((i = bit_ffs(feature_bitmap)) >= 0)
node_ptr = node_record_table_ptr[i];
else
node_ptr = node_record_table_ptr[i_first];
job_ptr->batch_host = xstrdup(node_ptr->name);
FREE_NULL_BITMAP(feature_bitmap);
return SLURM_SUCCESS;
}
/*
* _valid_features - Determine if the requested features are satisfied by
* the available nodes. This is only used for MOR operators.
* IN job_ptr - job being scheduled
* IN config_ptr - node's configuration record
* IN can_reboot - if true node can use any available feature,
* else job can use only active features
* IN reboot_bitmap - bitmap of nodes requiring reboot for use (updated)
* RET NULL if request is not satisfied, otherwise a bitmap indicating
* which mutually exclusive features are satisfied. For example
* _valid_features("[fs1|fs2|fs3|fs4]", "fs3") returns a bitmap with
* the third bit set. For another example
* _valid_features("[fs1|fs2|fs3|fs4]", "fs1,fs3") returns a bitmap
* with the first and third bits set. The function returns a bitmap
* with the first bit set if requirements are satisfied without a
* mutually exclusive feature list.
*/
static bitstr_t *_valid_features(job_record_t *job_ptr,
config_record_t *config_ptr,
bool can_reboot, bitstr_t *reboot_bitmap)
{
job_details_t *details_ptr = job_ptr->details;
bitstr_t *result_node_bitmap = NULL, *paren_node_bitmap = NULL;
bitstr_t *working_node_bitmap, *active_node_bitmap = NULL;
bitstr_t *tmp_node_bitmap = NULL;
list_itr_t *feat_iter;
job_feature_t *job_feat_ptr;
int last_op = FEATURE_OP_AND, paren_op = FEATURE_OP_AND;
int last_paren = 0, position = 0;
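	/*
	 * "position" indexes the mutually exclusive (XAND/MOR) alternatives in
	 * the feature expression; bit "position" of result_node_bitmap is set
	 * when that alternative can be satisfied by nodes in this config
	 * record.
	 */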
if (details_ptr->feature_list_use == NULL) { /* no constraints */
result_node_bitmap = bit_alloc(MAX_FEATURES);
bit_set(result_node_bitmap, 0);
return result_node_bitmap;
}
feat_iter = list_iterator_create(details_ptr->feature_list_use);
while ((job_feat_ptr = list_next(feat_iter))) {
if (job_feat_ptr->paren > last_paren) {
/* Combine features within parenthesis */
paren_node_bitmap =
bit_copy(job_feat_ptr->node_bitmap_avail);
if (can_reboot)
active_node_bitmap = bit_copy(paren_node_bitmap);
last_paren = job_feat_ptr->paren;
paren_op = job_feat_ptr->op_code;
/*
* If this pair of parentheses is inside of brackets,
* then this is XAND or MOR. Set last_op so that the
* features in parentheses are considered as XAND or
* MOR and are evaluated in the if at the bottom of this
* loop. This only matters if the parentheses are the
* first thing inside of brackets because last_op is
* initialized to AND.
*/
if (job_feat_ptr->bracket &&
(last_op != FEATURE_OP_XAND) &&
(last_op != FEATURE_OP_MOR))
last_op = FEATURE_OP_XAND;
while ((job_feat_ptr = list_next(feat_iter))) {
if ((paren_op == FEATURE_OP_AND) &&
can_reboot) {
bit_and(paren_node_bitmap,
job_feat_ptr->node_bitmap_avail);
bit_and(active_node_bitmap,
job_feat_ptr->node_bitmap_active);
} else if (paren_op == FEATURE_OP_AND) {
bit_and(paren_node_bitmap,
job_feat_ptr->node_bitmap_active);
} else if ((paren_op == FEATURE_OP_OR) &&
can_reboot) {
bit_or(paren_node_bitmap,
job_feat_ptr->node_bitmap_avail);
bit_or(active_node_bitmap,
job_feat_ptr->node_bitmap_active);
} else if (paren_op == FEATURE_OP_OR) {
bit_or(paren_node_bitmap,
job_feat_ptr->node_bitmap_active);
} else {
error("%s: Bad feature expression for %pJ: %s",
__func__, job_ptr,
details_ptr->features_use);
break;
}
paren_op = job_feat_ptr->op_code;
if (job_feat_ptr->paren < last_paren) {
last_paren = job_feat_ptr->paren;
break;
}
}
working_node_bitmap = paren_node_bitmap;
} else {
working_node_bitmap = job_feat_ptr->node_bitmap_avail;
}
		if (!job_feat_ptr) {
			error("%s: Bad feature expression for %pJ: %s",
			      __func__, job_ptr, details_ptr->features_use);
			FREE_NULL_BITMAP(active_node_bitmap);
			FREE_NULL_BITMAP(paren_node_bitmap);
			break;	/* avoid NULL dereference below */
		}
if ((job_feat_ptr->op_code == FEATURE_OP_XAND) ||
(job_feat_ptr->op_code == FEATURE_OP_MOR) ||
((job_feat_ptr->op_code != FEATURE_OP_XAND) &&
(job_feat_ptr->op_code != FEATURE_OP_MOR) &&
((last_op == FEATURE_OP_XAND) ||
(last_op == FEATURE_OP_MOR)))) {
if (bit_overlap_any(config_ptr->node_bitmap,
working_node_bitmap)) {
if (!result_node_bitmap)
result_node_bitmap =
bit_alloc(MAX_FEATURES);
bit_set(result_node_bitmap, position);
if (can_reboot && reboot_bitmap &&
active_node_bitmap) {
tmp_node_bitmap = bit_copy(config_ptr->
node_bitmap);
bit_and_not(tmp_node_bitmap,
active_node_bitmap);
bit_or(reboot_bitmap, tmp_node_bitmap);
FREE_NULL_BITMAP(tmp_node_bitmap);
}
}
position++;
last_op = job_feat_ptr->op_code;
}
FREE_NULL_BITMAP(active_node_bitmap);
FREE_NULL_BITMAP(paren_node_bitmap);
}
list_iterator_destroy(feat_iter);
#if _DEBUG
{
char tmp[64];
if (result_node_bitmap)
bit_fmt(tmp, sizeof(tmp), result_node_bitmap);
else
snprintf(tmp, sizeof(tmp), "NONE");
info("CONFIG_FEATURE:%s FEATURE_MOR_BITS:%s", config_ptr->feature, tmp);
if (reboot_bitmap && (bit_ffs(reboot_bitmap) >= 0)) {
char *reboot_node_str = bitmap2node_name(reboot_bitmap);
info("REBOOT_NODES:%s", reboot_node_str);
xfree(reboot_node_str);
}
}
#endif
return result_node_bitmap;
}
/*
* re_kill_job - for a given job, deallocate its nodes for a second time,
* basically a cleanup for failed deallocate() calls
* IN job_ptr - pointer to terminating job (already in some COMPLETING state)
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
*/
extern void re_kill_job(job_record_t *job_ptr)
{
agent_arg_t *agent_args;
hostlist_t *kill_hostlist;
char *host_str = NULL;
static uint32_t last_job_id = 0;
node_record_t *node_ptr;
xassert(job_ptr);
xassert(job_ptr->details);
kill_hostlist = hostlist_create(NULL);
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_TERMINATE_JOB;
agent_args->hostlist = hostlist_create(NULL);
agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
agent_args->retry = 0;
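	/*
	 * Scan nodes still in the completing bitmap: DOWN nodes are treated as
	 * already cleaned up, while responsive nodes are queued for another
	 * TERMINATE_JOB request.
	 */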
if (job_ptr->node_bitmap_cg) {
for (int i = 0;
(node_ptr = next_node_bitmap(job_ptr->node_bitmap_cg, &i));
i++) {
if (IS_NODE_DOWN(node_ptr)) {
/* Consider job already completed */
bit_clear(job_ptr->node_bitmap_cg,
node_ptr->index);
job_update_tres_cnt(job_ptr, node_ptr->index);
if (node_ptr->comp_job_cnt)
(node_ptr->comp_job_cnt)--;
if ((job_ptr->node_cnt > 0) &&
((--job_ptr->node_cnt) == 0)) {
cleanup_completing(job_ptr, true);
last_node_update = time(NULL);
}
} else if (!IS_NODE_NO_RESPOND(node_ptr)) {
(void)hostlist_push_host(kill_hostlist,
node_ptr->name);
if (agent_args->protocol_version >
node_ptr->protocol_version)
agent_args->protocol_version =
node_ptr->protocol_version;
hostlist_push_host(agent_args->hostlist,
node_ptr->name);
agent_args->node_count++;
}
if (PACK_FANOUT_ADDRS(node_ptr))
agent_args->msg_flags |= SLURM_PACK_ADDRS;
}
}
if (agent_args->node_count == 0) {
FREE_NULL_HOSTLIST(agent_args->hostlist);
xfree(agent_args);
hostlist_destroy(kill_hostlist);
return;
}
hostlist_uniq(kill_hostlist);
host_str = hostlist_ranged_string_xmalloc(kill_hostlist);
if (job_ptr->job_id != last_job_id) {
info("Resending TERMINATE_JOB request %pJ Nodelist=%s",
job_ptr, host_str);
} else {
debug("Resending TERMINATE_JOB request %pJ Nodelist=%s",
job_ptr, host_str);
}
xfree(host_str);
last_job_id = job_ptr->job_id;
hostlist_destroy(kill_hostlist);
agent_args->msg_args =
create_kill_job_msg(job_ptr, agent_args->protocol_version);
set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
agent_queue_request(agent_args);
}