/*****************************************************************************\
 *  node_scheduler.c - select and allocate nodes to jobs
* Note: there is a global node table (node_record_table_ptr)
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2009 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of SLURM, a resource management program.
* For details, see <https://computing.llnl.gov/linux/slurm/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#ifdef HAVE_SYS_SYSLOG_H
# include <sys/syslog.h>
#endif
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <unistd.h>
#include <slurm/slurm_errno.h>
#include "src/common/hostlist.h"
#include "src/common/list.h"
#include "src/common/node_select.h"
#include "src/common/slurm_accounting_storage.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/basil_interface.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/preempt.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/slurmctld.h"
#define MAX_FEATURES 32 /* max exclusive features "[fs1|fs2]"=2 */
#define MAX_RETRIES 10
struct node_set { /* set of nodes with same configuration */
uint16_t cpus_per_node; /* NOTE: This is the minimum count,
* if FastSchedule==0 then individual
* nodes within the same configuration
* line (in slurm.conf) can actually
* have different CPU counts */
uint32_t real_memory;
uint32_t nodes;
uint32_t weight;
char *features;
bitstr_t *feature_bits; /* XORed feature's position */
bitstr_t *my_bitmap; /* node bitmap */
};
static int _build_node_list(struct job_record *job_ptr,
struct node_set **node_set_pptr,
int *node_set_size);
static void _filter_nodes_in_set(struct node_set *node_set_ptr,
struct job_details *detail_ptr);
static int _list_find_feature(void *feature_entry, void *key);
static int _match_feature(char *seek, struct node_set *node_set_ptr);
static int _nodes_in_sets(bitstr_t *req_bitmap,
struct node_set * node_set_ptr,
int node_set_size);
static int _pick_best_nodes(struct node_set *node_set_ptr,
int node_set_size, bitstr_t ** select_bitmap,
struct job_record *job_ptr,
struct part_record *part_ptr,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, bool test_only,
List preemptee_candidates,
List *preemptee_job_list);
static bool _valid_feature_counts(struct job_details *detail_ptr,
bitstr_t *node_bitmap, bool *has_xor);
static bitstr_t *_valid_features(struct job_details *detail_ptr,
struct config_record *config_ptr);
/*
 * allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED;
* also claim required licenses and resources reserved by accounting
* policy association
* IN job_ptr - job being allocated resources
*/
extern void allocate_nodes(struct job_record *job_ptr)
{
int i;
last_node_update = time(NULL);
for (i = 0; i < node_record_count; i++) {
if (bit_test(job_ptr->node_bitmap, i))
make_node_alloc(&node_record_table_ptr[i], job_ptr);
}
license_job_get(job_ptr);
return;
}
/*
* deallocate_nodes - for a given job, deallocate its nodes and make
 *	their state NODE_STATE_COMPLETING; also release the job's licenses
* and resources reserved by accounting policy association
* IN job_ptr - pointer to terminating job (already in some COMPLETING state)
* IN timeout - true if job exhausted time limit, send REQUEST_KILL_TIMELIMIT
* RPC instead of REQUEST_TERMINATE_JOB
* IN suspended - true if job was already suspended (node's job_run_cnt
* already decremented);
*/
extern void deallocate_nodes(struct job_record *job_ptr, bool timeout,
bool suspended)
{
int i;
kill_job_msg_t *kill_job = NULL;
agent_arg_t *agent_args = NULL;
int down_node_cnt = 0;
struct node_record *node_ptr;
xassert(job_ptr);
xassert(job_ptr->details);
license_job_return(job_ptr);
acct_policy_job_fini(job_ptr);
if (slurm_sched_freealloc(job_ptr) != SLURM_SUCCESS)
error("slurm_sched_freealloc(%u): %m", job_ptr->job_id);
if (select_g_job_fini(job_ptr) != SLURM_SUCCESS)
error("select_g_job_fini(%u): %m", job_ptr->job_id);
(void) epilog_slurmctld(job_ptr);
#ifdef HAVE_CRAY_XT
basil_release(job_ptr);
#endif /* HAVE_CRAY_XT */
agent_args = xmalloc(sizeof(agent_arg_t));
if (timeout)
agent_args->msg_type = REQUEST_KILL_TIMELIMIT;
else
agent_args->msg_type = REQUEST_TERMINATE_JOB;
agent_args->retry = 0; /* re_kill_job() resends as needed */
agent_args->hostlist = hostlist_create("");
kill_job = xmalloc(sizeof(kill_job_msg_t));
last_node_update = time(NULL);
kill_job->job_id = job_ptr->job_id;
kill_job->step_id = NO_VAL;
kill_job->job_state = job_ptr->job_state;
kill_job->job_uid = job_ptr->user_id;
kill_job->nodes = xstrdup(job_ptr->nodes);
kill_job->time = time(NULL);
kill_job->select_jobinfo = select_g_select_jobinfo_copy(
job_ptr->select_jobinfo);
kill_job->spank_job_env = xduparray(job_ptr->spank_job_env_size,
job_ptr->spank_job_env);
kill_job->spank_job_env_size = job_ptr->spank_job_env_size;
for (i=0, node_ptr=node_record_table_ptr;
i < node_record_count; i++, node_ptr++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
if (IS_NODE_DOWN(node_ptr)) {
/* Issue the KILL RPC, but don't verify response */
down_node_cnt++;
bit_clear(job_ptr->node_bitmap, i);
job_update_cpu_cnt(job_ptr, i);
job_ptr->node_cnt--;
}
make_node_comp(node_ptr, job_ptr, suspended);
#ifdef HAVE_FRONT_END /* Operate only on front-end */
if (agent_args->node_count > 0)
continue;
#endif
hostlist_push(agent_args->hostlist, node_ptr->name);
agent_args->node_count++;
}
if ((agent_args->node_count - down_node_cnt) == 0) {
job_ptr->job_state &= (~JOB_COMPLETING);
delete_step_records(job_ptr, 0);
slurm_sched_schedule();
}
if (agent_args->node_count == 0) {
error("Job %u allocated no nodes to be killed on",
job_ptr->job_id);
xfree(kill_job->nodes);
select_g_select_jobinfo_free(kill_job->select_jobinfo);
xfree(kill_job);
hostlist_destroy(agent_args->hostlist);
xfree(agent_args);
return;
}
agent_args->msg_args = kill_job;
agent_queue_request(agent_args);
return;
}
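#if 0
/* Hedged usage sketch (illustrative only, not compiled): release the
 * nodes of a job that exhausted its time limit and was not suspended;
 * the slurmds then receive REQUEST_KILL_TIMELIMIT rather than
 * REQUEST_TERMINATE_JOB. */
deallocate_nodes(job_ptr, true, false);
#endif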
/*
* _match_feature - determine if the desired feature is one of those available
* IN seek - desired feature
* IN node_set_ptr - Pointer to node_set being searched
* RET 1 if found, 0 otherwise
*/
static int _match_feature(char *seek, struct node_set *node_set_ptr)
{
struct features_record *feat_ptr;
if (seek == NULL)
return 1; /* nothing to look for */
feat_ptr = list_find_first(feature_list, _list_find_feature,
(void *) seek);
if (feat_ptr == NULL)
return 0; /* no such feature */
if (bit_super_set(node_set_ptr->my_bitmap, feat_ptr->node_bitmap))
return 1; /* nodes have this feature */
return 0;
}
/*
* Decide if a job can share nodes with other jobs based on the
* following three input parameters:
*
* IN user_flag - may be 0 (do not share nodes), 1 (node sharing allowed),
 *		 or any other value meaning "don't care"
* IN part_max_share - current partition's node sharing policy
 * IN cons_res_flag - 1 if the consumable resources flag is enabled, 0 otherwise
*
*
 * The following table details the node SHARED state for the various scenarios
*
* part= part= part= part=
* cons_res user_request EXCLUS NO YES FORCE
* -------- ------------ ------ ----- ----- -----
* no default/exclus whole whole whole share/O
* no share=yes whole whole share/O share/O
* yes default whole share share/O share/O
* yes exclusive whole whole whole share/O
* yes share=yes whole share share/O share/O
*
* whole = whole node is allocated exclusively to the user
* share = nodes may be shared but the resources are not overcommitted
* share/O = nodes are shared and the resources can be overcommitted
*
* part->max_share:
* &SHARED_FORCE = FORCE
* 0 = EXCLUSIVE
* 1 = NO
* > 1 = YES
*
* job_ptr->details->shared:
* (uint16_t)NO_VAL = default
* 0 = exclusive
* 1 = share=yes
*
* Return values:
* 0 = no sharing
* 1 = user requested sharing
* 2 = sharing enforced (either by partition or cons_res)
* (cons_res plugin needs to distinguish between "enforced" and
* "requested" sharing)
*/
static int
_resolve_shared_status(uint16_t user_flag, uint16_t part_max_share,
int cons_res_flag)
{
/* no sharing if part=EXCLUSIVE */
if (part_max_share == 0)
return 0;
/* sharing if part=FORCE with count > 1 */
if ((part_max_share & SHARED_FORCE) &&
((part_max_share & (~SHARED_FORCE)) > 1))
return 2;
if (cons_res_flag) {
/* sharing unless user requested exclusive */
if (user_flag == 0)
return 0;
if (user_flag == 1)
return 1;
return 2;
} else {
/* no sharing if part=NO */
if (part_max_share == 1)
return 0;
/* share if the user requested it */
if (user_flag == 1)
return 1;
}
return 0;
}
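#if 0
/* Hedged sketch of the encodings above (illustrative only, not
 * compiled). "force4" is a hypothetical partition configured with
 * Shared=FORCE:4, i.e. sharing forced with up to four jobs per
 * resource. */
{
	uint16_t force4 = SHARED_FORCE | 4;
	xassert(_resolve_shared_status(0, 0, 0) == 0);	/* part=EXCLUSIVE */
	xassert(_resolve_shared_status(1, 4, 0) == 1);	/* user share=yes */
	xassert(_resolve_shared_status(0, force4, 1) == 2); /* FORCE wins */
}
#endif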
/*
* If the job has required feature counts, then accumulate those
* required resources using multiple calls to _pick_best_nodes()
* and adding those selected nodes to the job's required node list.
* Upon completion, return job's requirements to match the values
* which were in effect upon calling this function.
* Input and output are the same as _pick_best_nodes().
*/
static int
_get_req_features(struct node_set *node_set_ptr, int node_set_size,
bitstr_t ** select_bitmap, struct job_record *job_ptr,
struct part_record *part_ptr,
uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes,
bool test_only, List *preemptee_job_list)
{
uint32_t saved_min_nodes, saved_job_min_nodes;
bitstr_t *saved_req_node_bitmap = NULL;
uint32_t saved_num_procs, saved_req_nodes;
int rc, tmp_node_set_size;
struct node_set *tmp_node_set_ptr;
int error_code = SLURM_SUCCESS, i;
bitstr_t *feature_bitmap, *accumulate_bitmap = NULL;
bitstr_t *save_avail_node_bitmap = NULL, *resv_bitmap;
time_t start_res = time(NULL);
List preemptee_candidates = NULL;
/* Mark nodes reserved for other jobs as off limit for this job */
rc = job_test_resv(job_ptr, &start_res, false, &resv_bitmap);
if ((rc != SLURM_SUCCESS) ||
(bit_set_count(resv_bitmap) < min_nodes) ||
(job_ptr->details->req_node_bitmap &&
(!bit_super_set(job_ptr->details->req_node_bitmap,
resv_bitmap)))) {
FREE_NULL_BITMAP(resv_bitmap);
return ESLURM_NODES_BUSY; /* reserved */
}
if (resv_bitmap &&
(!bit_equal(resv_bitmap, avail_node_bitmap))) {
bit_and(resv_bitmap, avail_node_bitmap);
save_avail_node_bitmap = avail_node_bitmap;
avail_node_bitmap = resv_bitmap;
} else
FREE_NULL_BITMAP(resv_bitmap);
preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
/* save job and request state */
saved_min_nodes = min_nodes;
saved_req_nodes = req_nodes;
saved_job_min_nodes = job_ptr->details->min_nodes;
if (job_ptr->details->req_node_bitmap) {
accumulate_bitmap = job_ptr->details->req_node_bitmap;
saved_req_node_bitmap = bit_copy(accumulate_bitmap);
job_ptr->details->req_node_bitmap = NULL;
}
saved_num_procs = job_ptr->num_procs;
job_ptr->num_procs = 1;
tmp_node_set_ptr = xmalloc(sizeof(struct node_set) * node_set_size);
/* Accumulate nodes with required feature counts.
* Ignored if job_ptr->details->req_node_layout is set (by wiki2).
* Selected nodes become part of job's required node list. */
if (job_ptr->details->feature_list &&
(job_ptr->details->req_node_layout == NULL)) {
ListIterator feat_iter;
struct feature_record *feat_ptr;
feat_iter = list_iterator_create(
job_ptr->details->feature_list);
while ((feat_ptr = (struct feature_record *)
list_next(feat_iter))) {
if (feat_ptr->count == 0)
continue;
tmp_node_set_size = 0;
/* _pick_best_nodes() is destructive of the node_set
* data structure, so we need to make a copy then
* purge it */
for (i=0; i<node_set_size; i++) {
if (!_match_feature(feat_ptr->name,
node_set_ptr+i))
continue;
tmp_node_set_ptr[tmp_node_set_size].
cpus_per_node =
node_set_ptr[i].cpus_per_node;
tmp_node_set_ptr[tmp_node_set_size].
real_memory =
node_set_ptr[i].real_memory;
tmp_node_set_ptr[tmp_node_set_size].nodes =
node_set_ptr[i].nodes;
tmp_node_set_ptr[tmp_node_set_size].weight =
node_set_ptr[i].weight;
tmp_node_set_ptr[tmp_node_set_size].features =
xstrdup(node_set_ptr[i].features);
tmp_node_set_ptr[tmp_node_set_size].
feature_bits =
bit_copy(node_set_ptr[i].feature_bits);
tmp_node_set_ptr[tmp_node_set_size].my_bitmap =
bit_copy(node_set_ptr[i].my_bitmap);
tmp_node_set_size++;
}
feature_bitmap = NULL;
min_nodes = feat_ptr->count;
req_nodes = feat_ptr->count;
job_ptr->details->min_nodes = feat_ptr->count;
job_ptr->num_procs = feat_ptr->count;
if (*preemptee_job_list) {
list_destroy(*preemptee_job_list);
*preemptee_job_list = NULL;
}
error_code = _pick_best_nodes(tmp_node_set_ptr,
tmp_node_set_size, &feature_bitmap,
job_ptr, part_ptr, min_nodes,
max_nodes, req_nodes, test_only,
preemptee_candidates,
preemptee_job_list);
#if 0
{
char *tmp_str = bitmap2node_name(feature_bitmap);
info("job %u needs %u nodes with feature %s, "
"using %s, error_code=%d",
job_ptr->job_id, feat_ptr->count,
feat_ptr->name, tmp_str, error_code);
xfree(tmp_str);
}
#endif
for (i=0; i<tmp_node_set_size; i++) {
xfree(tmp_node_set_ptr[i].features);
FREE_NULL_BITMAP(tmp_node_set_ptr[i].
feature_bits);
FREE_NULL_BITMAP(tmp_node_set_ptr[i].
my_bitmap);
}
if (error_code != SLURM_SUCCESS)
break;
if (feature_bitmap) {
if (job_ptr->details->req_node_bitmap) {
bit_or(job_ptr->details->
req_node_bitmap,
feature_bitmap);
} else {
job_ptr->details->req_node_bitmap =
bit_copy(feature_bitmap);
}
if (accumulate_bitmap) {
bit_or(accumulate_bitmap,
feature_bitmap);
bit_free(feature_bitmap);
} else
accumulate_bitmap = feature_bitmap;
}
}
list_iterator_destroy(feat_iter);
}
/* restore most of job state and accumulate remaining resources */
if (saved_req_node_bitmap) {
FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
job_ptr->details->req_node_bitmap =
bit_copy(saved_req_node_bitmap);
}
if (accumulate_bitmap) {
uint32_t node_cnt;
if (job_ptr->details->req_node_bitmap) {
bit_or(job_ptr->details->req_node_bitmap,
accumulate_bitmap);
FREE_NULL_BITMAP(accumulate_bitmap);
} else
job_ptr->details->req_node_bitmap = accumulate_bitmap;
node_cnt = bit_set_count(job_ptr->details->req_node_bitmap);
job_ptr->num_procs = MAX(saved_num_procs, node_cnt);
min_nodes = MAX(saved_min_nodes, node_cnt);
job_ptr->details->min_nodes = min_nodes;
req_nodes = MAX(min_nodes, req_nodes);
if (req_nodes > max_nodes)
error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
} else {
min_nodes = saved_min_nodes;
req_nodes = saved_req_nodes;
job_ptr->num_procs = saved_num_procs;
job_ptr->details->min_nodes = saved_job_min_nodes;
}
#if 0
{
char *tmp_str = bitmap2node_name(job_ptr->details->req_node_bitmap);
info("job %u requires %d:%d:%d nodes %s err:%u",
job_ptr->job_id, min_nodes, req_nodes, max_nodes,
tmp_str, error_code);
xfree(tmp_str);
}
#endif
xfree(tmp_node_set_ptr);
if (error_code == SLURM_SUCCESS) {
if (*preemptee_job_list) {
list_destroy(*preemptee_job_list);
*preemptee_job_list = NULL;
}
error_code = _pick_best_nodes(node_set_ptr, node_set_size,
select_bitmap, job_ptr, part_ptr, min_nodes,
max_nodes, req_nodes, test_only,
preemptee_candidates, preemptee_job_list);
}
#if 0
{
char *tmp_str = bitmap2node_name(*select_bitmap);
info("job %u allocated nodes %s err:%u",
job_ptr->job_id, tmp_str, error_code);
xfree(tmp_str);
}
#endif
if (preemptee_candidates)
list_destroy(preemptee_candidates);
/* restore job's initial required node bitmap */
FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
job_ptr->details->req_node_bitmap = saved_req_node_bitmap;
job_ptr->num_procs = saved_num_procs;
job_ptr->details->min_nodes = saved_job_min_nodes;
/* Restore available node bitmap, ignoring reservations */
if (save_avail_node_bitmap) {
bit_free(avail_node_bitmap);
avail_node_bitmap = save_avail_node_bitmap;
}
return error_code;
}
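#if 0
/* Hedged sketch of a counted feature request (hypothetical values,
 * not compiled): a constraint such as "fs1*2" parses into one
 * feature_record with count=2, so the loop above calls
 * _pick_best_nodes() with min_nodes = req_nodes = 2 over the node
 * sets matching fs1, then OR's the selection into the job's required
 * node bitmap before the final full selection pass. */
{
	struct feature_record feat;
	memset(&feat, 0, sizeof(feat));
	feat.name  = "fs1";	/* hypothetical feature name */
	feat.count = 2;		/* require two nodes with this feature */
	/* feat would be an element of job_ptr->details->feature_list */
}
#endif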
/*
* _pick_best_nodes - from a weight order list of all nodes satisfying a
* job's specifications, select the "best" for use
* IN node_set_ptr - pointer to node specification information
* IN node_set_size - number of entries in records pointed to by node_set_ptr
* OUT select_bitmap - returns bitmap of selected nodes, must FREE_NULL_BITMAP
* IN job_ptr - pointer to job being scheduled
* IN part_ptr - pointer to the partition in which the job is being scheduled
* IN min_nodes - minimum count of nodes required by the job
* IN max_nodes - maximum count of nodes required by the job (0==no limit)
* IN req_nodes - requested (or desired) count of nodes
* IN test_only - do not actually allocate resources
* IN/OUT preemptee_job_list - list of pointers to jobs to be preempted
* NULL on first entry
* RET SLURM_SUCCESS on success,
* ESLURM_NODES_BUSY if request can not be satisfied now,
* ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE if request can never
 *	be satisfied,
* ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE if the job can not be
 *	initiated until the partition's configuration changes, or
* ESLURM_NODE_NOT_AVAIL if required nodes are DOWN or DRAINED
* NOTE: the caller must FREE_NULL_BITMAP memory pointed to by select_bitmap
* Notes: The algorithm is
* 1) If required node list is specified, determine implicitly required
* processor and node count
* 2) Determine how many disjoint required "features" are represented
* (e.g. "FS1|FS2|FS3")
* 3) For each feature: find matching node table entries, identify nodes
* that are up and available (idle or shared) and add them to a bit
* map
 * 4) Call select_g_job_test() to select the "best" of those based upon
* topology and/or workload
* 5) If request can't be satisfied now, execute select_g_job_test()
 *    against the list of nodes that exist in any state (perhaps DOWN,
 *    DRAINED or ALLOCATED) to determine if the request can
 *    ever be satisfied.
*/
static int
_pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
bitstr_t ** select_bitmap, struct job_record *job_ptr,
struct part_record *part_ptr,
uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes,
bool test_only, List preemptee_candidates,
List *preemptee_job_list)
{
int error_code = SLURM_SUCCESS, i, j, pick_code;
int total_nodes = 0, avail_nodes = 0;
bitstr_t *avail_bitmap = NULL, *total_bitmap = NULL;
bitstr_t *backup_bitmap = NULL;
bitstr_t *possible_bitmap = NULL;
int max_feature, min_feature;
bool runable_ever = false; /* Job can ever run */
bool runable_avail = false; /* Job can run with available nodes */
bool tried_sched = false; /* Tried to schedule with avail nodes */
static uint32_t cr_enabled = NO_VAL;
bool preempt_flag = false;
int shared = 0, select_mode;
if (test_only)
select_mode = SELECT_MODE_TEST_ONLY;
else
select_mode = SELECT_MODE_RUN_NOW;
if (node_set_size == 0) {
info("_pick_best_nodes: empty node set for selection");
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
}
/* Are Consumable Resources enabled? Check once. */
if (cr_enabled == NO_VAL) {
cr_enabled = 0; /* select/linear and bluegene are no-ops */
error_code = select_g_get_info_from_plugin (SELECT_CR_PLUGIN,
NULL, &cr_enabled);
if (error_code != SLURM_SUCCESS) {
cr_enabled = NO_VAL;
return error_code;
}
}
shared = _resolve_shared_status(job_ptr->details->shared,
part_ptr->max_share, cr_enabled);
job_ptr->details->shared = shared;
if (cr_enabled)
job_ptr->cr_enabled = cr_enabled; /* CR enabled for this job */
/* If job preemption is enabled, then do NOT limit the set of available
* nodes by their current 'sharable' or 'idle' setting */
if (slurm_get_preempt_mode() != PREEMPT_MODE_OFF)
preempt_flag = true;
if (job_ptr->details->req_node_bitmap) { /* specific nodes required */
/* We have already confirmed that all of these nodes have a
* usable configuration and are in the proper partition.
* Check that these nodes can be used by this job. */
if (min_nodes != 0) {
total_nodes = bit_set_count(
job_ptr->details->req_node_bitmap);
}
if (total_nodes > max_nodes) { /* exceeds node limit */
return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
}
/* check the availability of these nodes */
/* Should we check memory availability on these nodes? */
if (!bit_super_set(job_ptr->details->req_node_bitmap,
avail_node_bitmap)) {
return ESLURM_NODE_NOT_AVAIL;
}
if (!preempt_flag) {
if (shared) {
if (!bit_super_set(job_ptr->details->
req_node_bitmap,
share_node_bitmap)) {
return ESLURM_NODES_BUSY;
}
} else {
if (!bit_super_set(job_ptr->details->
req_node_bitmap,
idle_node_bitmap)) {
return ESLURM_NODES_BUSY;
}
}
}
/* still must go through select_g_job_test() to
* determine validity of request and/or perform
* set-up before job launch */
total_nodes = 0; /* reinitialize */
}
/* identify the min and max feature values for exclusive OR */
max_feature = -1;
min_feature = MAX_FEATURES;
for (i = 0; i < node_set_size; i++) {
j = bit_ffs(node_set_ptr[i].feature_bits);
if ((j >= 0) && (j < min_feature))
min_feature = j;
j = bit_fls(node_set_ptr[i].feature_bits);
if ((j >= 0) && (j > max_feature))
max_feature = j;
}
debug3("_pick_best_nodes: job %u idle_nodes %u share_nodes %u",
job_ptr->job_id, bit_set_count(idle_node_bitmap),
bit_set_count(share_node_bitmap));
/* Accumulate resources for this job based upon its required
* features (possibly with node counts). */
for (j = min_feature; j <= max_feature; j++) {
for (i = 0; i < node_set_size; i++) {
if (!bit_test(node_set_ptr[i].feature_bits, j))
continue;
if (total_bitmap) {
bit_or(total_bitmap,
node_set_ptr[i].my_bitmap);
} else {
total_bitmap = bit_copy(
node_set_ptr[i].my_bitmap);
if (total_bitmap == NULL)
fatal("bit_copy malloc failure");
}
bit_and(node_set_ptr[i].my_bitmap, avail_node_bitmap);
if (!preempt_flag) {
if (shared) {
bit_and(node_set_ptr[i].my_bitmap,
share_node_bitmap);
} else {
bit_and(node_set_ptr[i].my_bitmap,
idle_node_bitmap);
}
}
if (avail_bitmap) {
bit_or(avail_bitmap,
node_set_ptr[i].my_bitmap);
} else {
avail_bitmap = bit_copy(node_set_ptr[i].
my_bitmap);
if (avail_bitmap == NULL)
fatal("bit_copy malloc failure");
}
avail_nodes = bit_set_count(avail_bitmap);
tried_sched = false; /* need to test these nodes */
if ((shared || preempt_flag) &&
((i+1) < node_set_size) &&
(node_set_ptr[i].weight ==
node_set_ptr[i+1].weight)) {
/* Keep accumulating so we can pick the
* most lightly loaded nodes */
continue;
}
if ((avail_nodes < min_nodes) ||
((avail_nodes >= min_nodes) &&
(avail_nodes < req_nodes) &&
((i+1) < node_set_size)))
continue; /* Keep accumulating nodes */
if ((job_ptr->details->req_node_bitmap) &&
(!bit_super_set(job_ptr->details->req_node_bitmap,
avail_bitmap)))
continue;
/* NOTE: select_g_job_test() is destructive of
* avail_bitmap, so save a backup copy */
backup_bitmap = bit_copy(avail_bitmap);
if (*preemptee_job_list) {
list_destroy(*preemptee_job_list);
*preemptee_job_list = NULL;
}
pick_code = select_g_job_test(job_ptr,
avail_bitmap,
min_nodes,
max_nodes,
req_nodes,
select_mode,
preemptee_candidates,
preemptee_job_list);
#if 0
{
char *tmp_str1 = bitmap2node_name(backup_bitmap);
char *tmp_str2 = bitmap2node_name(avail_bitmap);
info("pick job:%u err:%d nodes:%u:%u:%u mode:%u "
"select %s of %s",
job_ptr->job_id, pick_code,
min_nodes, req_nodes, max_nodes, select_mode,
tmp_str2, tmp_str1);
xfree(tmp_str1);
xfree(tmp_str2);
}
#endif
if (pick_code == SLURM_SUCCESS) {
FREE_NULL_BITMAP(backup_bitmap);
if (bit_set_count(avail_bitmap) > max_nodes) {
/* end of tests for this feature */
avail_nodes = 0;
break;
}
FREE_NULL_BITMAP(total_bitmap);
FREE_NULL_BITMAP(possible_bitmap);
*select_bitmap = avail_bitmap;
return SLURM_SUCCESS;
} else {
tried_sched = true; /* test failed */
FREE_NULL_BITMAP(avail_bitmap);
avail_bitmap = backup_bitmap;
}
} /* for (i = 0; i < node_set_size; i++) */
/* try to get req_nodes now for this feature */
if (avail_bitmap && (!tried_sched) &&
(avail_nodes >= min_nodes) &&
((job_ptr->details->req_node_bitmap == NULL) ||
bit_super_set(job_ptr->details->req_node_bitmap,
avail_bitmap))) {
if (*preemptee_job_list) {
list_destroy(*preemptee_job_list);
*preemptee_job_list = NULL;
}
pick_code = select_g_job_test(job_ptr, avail_bitmap,
min_nodes, max_nodes,
req_nodes,
select_mode,
preemptee_candidates,
preemptee_job_list);
if ((pick_code == SLURM_SUCCESS) &&
(bit_set_count(avail_bitmap) <= max_nodes)) {
FREE_NULL_BITMAP(total_bitmap);
FREE_NULL_BITMAP(possible_bitmap);
*select_bitmap = avail_bitmap;
return SLURM_SUCCESS;
}
}
/* determine if job could possibly run (if all configured
* nodes available) */
if (total_bitmap)
total_nodes = bit_set_count(total_bitmap);
if (total_bitmap &&
(!runable_ever || !runable_avail) &&
(total_nodes >= min_nodes) &&
((job_ptr->details->req_node_bitmap == NULL) ||
(bit_super_set(job_ptr->details->req_node_bitmap,
total_bitmap)))) {
if (!runable_avail) {
FREE_NULL_BITMAP(avail_bitmap);
avail_bitmap = bit_copy(total_bitmap);
if (avail_bitmap == NULL)
fatal("bit_copy malloc failure");
bit_and(avail_bitmap, avail_node_bitmap);
pick_code = select_g_job_test(job_ptr,
avail_bitmap,
min_nodes,
max_nodes,
req_nodes,
SELECT_MODE_TEST_ONLY,
preemptee_candidates, NULL);
if (pick_code == SLURM_SUCCESS) {
runable_ever = true;
if (bit_set_count(avail_bitmap) <=
max_nodes)
runable_avail = true;
FREE_NULL_BITMAP(possible_bitmap);
possible_bitmap = avail_bitmap;
avail_bitmap = NULL;
}
}
if (!runable_ever) {
pick_code = select_g_job_test(job_ptr,
total_bitmap,
min_nodes,
max_nodes,
req_nodes,
SELECT_MODE_TEST_ONLY,
preemptee_candidates, NULL);
if (pick_code == SLURM_SUCCESS) {
FREE_NULL_BITMAP(possible_bitmap);
possible_bitmap = total_bitmap;
total_bitmap = NULL;
runable_ever = true;
}
}
}
FREE_NULL_BITMAP(avail_bitmap);
FREE_NULL_BITMAP(total_bitmap);
if (error_code != SLURM_SUCCESS)
break;
}
	/* The job can not start right now; return a code indicating
	 * whether it could ever run */
if (!runable_avail)
error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
if (!runable_ever) {
error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
info("_pick_best_nodes: job %u never runnable",
job_ptr->job_id);
}
if (error_code == SLURM_SUCCESS) {
error_code = ESLURM_NODES_BUSY;
*select_bitmap = possible_bitmap;
} else {
FREE_NULL_BITMAP(possible_bitmap);
}
return error_code;
}
static void _preempt_jobs(List preemptee_job_list, int *error_code)
{
ListIterator iter;
struct job_record *job_ptr;
uint16_t mode;
int job_cnt = 0, rc = 0;
mode = slurm_get_preempt_mode();
mode &= (~PREEMPT_MODE_GANG);
if (mode == PREEMPT_MODE_SUSPEND)
return; /* just start job and let gang do suspend */
iter = list_iterator_create(preemptee_job_list);
if (!iter)
fatal("list_iterator_create: malloc failure");
while ((job_ptr = (struct job_record *) list_next(iter))) {
job_cnt++;
if (mode == PREEMPT_MODE_CANCEL) {
rc = job_signal(job_ptr->job_id, SIGKILL, 0, 0);
if (rc == SLURM_SUCCESS) {
info("preempted job %u has been killed",
job_ptr->job_id);
}
} else if (mode == PREEMPT_MODE_CHECKPOINT) {
checkpoint_msg_t ckpt_msg;
memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
ckpt_msg.op = CHECK_VACATE;
ckpt_msg.job_id = job_ptr->job_id;
rc = job_checkpoint(&ckpt_msg, 0, -1);
if (rc == SLURM_SUCCESS) {
info("preempted job %u has been checkpointed",
job_ptr->job_id);
}
} else if (mode == PREEMPT_MODE_REQUEUE) {
rc = job_requeue(0, job_ptr->job_id, -1);
if (rc == SLURM_SUCCESS) {
info("preempted job %u has been requeued",
job_ptr->job_id);
}
}
if (rc != SLURM_SUCCESS) {
rc = job_signal(job_ptr->job_id, SIGKILL, 0, 0);
if (rc == SLURM_SUCCESS)
info("preempted job %u had to be killed",
job_ptr->job_id);
else {
info("preempted job %u kill failure %s",
job_ptr->job_id, slurm_strerror(rc));
}
}
}
list_iterator_destroy(iter);
if (job_cnt > 0)
*error_code = ESLURM_NODES_BUSY;
}
/*
* select_nodes - select and allocate nodes to a specific job
* IN job_ptr - pointer to the job record
* IN test_only - if set do not allocate nodes, just confirm they
* could be allocated now
 * OUT select_node_bitmap - bitmap of nodes to be used for the
* job's resource allocation (not returned if NULL), caller
* must free
* RET 0 on success, ESLURM code from slurm_errno.h otherwise
* globals: list_part - global list of partition info
* default_part_loc - pointer to default partition
* config_list - global list of node configuration info
* Notes: The algorithm is
* 1) Build a table (node_set_ptr) of nodes with the requisite
* configuration. Each table entry includes their weight,
* node_list, features, etc.
* 2) Call _pick_best_nodes() to select those nodes best satisfying
* the request, (e.g. best-fit or other criterion)
* 3) Call allocate_nodes() to perform the actual allocation
*/
extern int select_nodes(struct job_record *job_ptr, bool test_only,
bitstr_t **select_node_bitmap)
{
int error_code = SLURM_SUCCESS, i, node_set_size = 0;
bitstr_t *select_bitmap = NULL;
struct node_set *node_set_ptr = NULL;
struct part_record *part_ptr = job_ptr->part_ptr;
uint32_t min_nodes, max_nodes, req_nodes;
enum job_state_reason fail_reason;
time_t now = time(NULL);
bool configuring = false;
List preemptee_job_list = NULL;
xassert(job_ptr);
xassert(job_ptr->magic == JOB_MAGIC);
if (!acct_policy_job_runnable(job_ptr))
return ESLURM_ACCOUNTING_POLICY;
/* identify partition */
if (part_ptr == NULL) {
part_ptr = find_part_record(job_ptr->partition);
xassert(part_ptr);
job_ptr->part_ptr = part_ptr;
error("partition pointer reset for job %u, part %s",
job_ptr->job_id, job_ptr->partition);
}
/* Confirm that partition is up and has compatible nodes limits */
fail_reason = WAIT_NO_REASON;
if (part_ptr->state_up == 0)
fail_reason = WAIT_PART_STATE;
else if (job_ptr->priority == 0) /* user or administrator hold */
fail_reason = WAIT_HELD;
else if ((job_ptr->time_limit != NO_VAL) &&
(job_ptr->time_limit > part_ptr->max_time))
fail_reason = WAIT_PART_TIME_LIMIT;
else if (((job_ptr->details->max_nodes != 0) &&
(job_ptr->details->max_nodes < part_ptr->min_nodes)) ||
(job_ptr->details->min_nodes > part_ptr->max_nodes))
fail_reason = WAIT_PART_NODE_LIMIT;
if (fail_reason != WAIT_NO_REASON) {
job_ptr->state_reason = fail_reason;
xfree(job_ptr->state_desc);
last_job_update = now;
if (job_ptr->priority == 0) /* user/admin hold */
return ESLURM_JOB_HELD;
job_ptr->priority = 1; /* sys hold, move to end of queue */
return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
}
/* build sets of usable nodes based upon their configuration */
error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size);
if (error_code)
return error_code;
	/* ensure that selected nodes are in these node sets */
if (job_ptr->details->req_node_bitmap) {
error_code = _nodes_in_sets(job_ptr->details->req_node_bitmap,
node_set_ptr, node_set_size);
if (error_code) {
info("No nodes satisfy requirements for JobId=%u",
job_ptr->job_id);
goto cleanup;
}
}
/* enforce both user's and partition's node limits */
/* info("req: %u-%u, %u", job_ptr->details->min_nodes,
job_ptr->details->max_nodes, part_ptr->max_nodes); */
min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
if (job_ptr->details->max_nodes == 0)
max_nodes = part_ptr->max_nodes;
else
max_nodes = MIN(job_ptr->details->max_nodes,
part_ptr->max_nodes);
max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
if (!job_ptr->limit_set_max_nodes &&
job_ptr->details->max_nodes)
req_nodes = max_nodes;
else
req_nodes = min_nodes;
/* info("nodes:%u:%u:%u", min_nodes, req_nodes, max_nodes); */
if (max_nodes < min_nodes) {
error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
} else {
/* Select resources for the job here */
error_code = _get_req_features(node_set_ptr, node_set_size,
&select_bitmap, job_ptr,
part_ptr, min_nodes, max_nodes,
req_nodes, test_only,
&preemptee_job_list);
}
/* set up the cpu_cnt here so we can decrement it as nodes
free up. total_procs is set within _get_req_features */
job_ptr->cpu_cnt = job_ptr->total_procs;
if (!test_only && preemptee_job_list && (error_code == SLURM_SUCCESS))
_preempt_jobs(preemptee_job_list, &error_code);
if (error_code) {
if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
/* Too many nodes requested */
debug3("JobId=%u not runnable with present config",
job_ptr->job_id);
job_ptr->state_reason = WAIT_PART_NODE_LIMIT;
xfree(job_ptr->state_desc);
if (job_ptr->priority != 0) /* Move to end of queue */
job_ptr->priority = 1;
last_job_update = now;
} else if (error_code == ESLURM_NODE_NOT_AVAIL) {
/* Required nodes are down or drained */
debug3("JobId=%u required nodes not avail",
job_ptr->job_id);
job_ptr->state_reason = WAIT_NODE_NOT_AVAIL;
xfree(job_ptr->state_desc);
if (job_ptr->priority != 0) /* Move to end of queue */
job_ptr->priority = 1;
last_job_update = now;
} else if (error_code == ESLURM_RESERVATION_NOT_USABLE) {
job_ptr->state_reason = WAIT_RESERVATION;
xfree(job_ptr->state_desc);
} else {
job_ptr->state_reason = WAIT_RESOURCES;
xfree(job_ptr->state_desc);
if (error_code == ESLURM_NODES_BUSY)
slurm_sched_job_is_pending();
}
goto cleanup;
}
if (test_only) { /* set if job not highest priority */
slurm_sched_job_is_pending();
error_code = SLURM_SUCCESS;
goto cleanup;
}
#ifdef HAVE_CRAY_XT
if (basil_reserve(job_ptr) != SLURM_SUCCESS) {
job_ptr->state_reason = WAIT_RESOURCES;
xfree(job_ptr->state_desc);
error_code = ESLURM_NODES_BUSY;
goto cleanup;
}
#endif /* HAVE_CRAY_XT */
/* This job may be getting requeued, clear vestigial
	 * state information before over-writing it and leaking
* memory. */
FREE_NULL_BITMAP(job_ptr->node_bitmap);
xfree(job_ptr->nodes);
job_ptr->node_bitmap = select_bitmap;
	/* We need these times set to know the job's end time
	 * when we place it
*/
job_ptr->start_time = job_ptr->time_last_active = now;
if (job_ptr->time_limit == NO_VAL) {
if (part_ptr->default_time != NO_VAL)
job_ptr->time_limit = part_ptr->default_time;
else
job_ptr->time_limit = part_ptr->max_time;
}
if (job_ptr->time_limit == INFINITE)
job_ptr->end_time = job_ptr->start_time +
(365 * 24 * 60 * 60); /* secs in year */
else
job_ptr->end_time = job_ptr->start_time +
(job_ptr->time_limit * 60); /* secs */
if (select_g_job_begin(job_ptr) != SLURM_SUCCESS) {
/* Leave job queued, something is hosed */
error("select_g_job_begin(%u): %m", job_ptr->job_id);
error_code = ESLURM_NODES_BUSY;
job_ptr->start_time = 0;
job_ptr->time_last_active = 0;
job_ptr->end_time = 0;
job_ptr->node_bitmap = NULL;
goto cleanup;
}
/* assign the nodes and stage_in the job */
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
job_ptr->nodes = bitmap2node_name(select_bitmap);
select_bitmap = NULL; /* nothing left to free */
allocate_nodes(job_ptr);
build_node_details(job_ptr);
/* This could be set in the select plugin so we want to keep
the flag. */
configuring = IS_JOB_CONFIGURING(job_ptr);
job_ptr->job_state = JOB_RUNNING;
if (configuring
|| bit_overlap(job_ptr->node_bitmap, power_node_bitmap))
job_ptr->job_state |= JOB_CONFIGURING;
if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) {
error("select_g_update_nodeinfo(%u): %m", job_ptr->job_id);
/* not critical ... by now */
}
if (job_ptr->mail_type & MAIL_JOB_BEGIN)
mail_job_info(job_ptr, MAIL_JOB_BEGIN);
acct_policy_job_begin(job_ptr);
jobacct_storage_g_job_start(acct_db_conn, slurmctld_cluster_name,
job_ptr);
prolog_slurmctld(job_ptr);
slurm_sched_newalloc(job_ptr);
cleanup:
if (preemptee_job_list)
list_destroy(preemptee_job_list);
if (select_node_bitmap)
*select_node_bitmap = select_bitmap;
else
FREE_NULL_BITMAP(select_bitmap);
if (node_set_ptr) {
for (i = 0; i < node_set_size; i++) {
xfree(node_set_ptr[i].features);
FREE_NULL_BITMAP(node_set_ptr[i].my_bitmap);
FREE_NULL_BITMAP(node_set_ptr[i].feature_bits);
}
xfree(node_set_ptr);
}
return error_code;
}
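#if 0
/* Hedged usage sketch (hypothetical caller, not compiled): test
 * whether a pending job could be allocated nodes right now without
 * actually allocating them; any returned bitmap must be freed. */
{
	bitstr_t *node_bitmap = NULL;
	int rc = select_nodes(job_ptr, true, &node_bitmap);
	if (rc == SLURM_SUCCESS)
		info("job %u could start now", job_ptr->job_id);
	FREE_NULL_BITMAP(node_bitmap);
}
#endif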
/*
* _list_find_feature - find an entry in the feature list, see list.h for
* documentation
* IN key - is feature name or NULL for all features
* RET 1 if found, 0 otherwise
*/
static int _list_find_feature(void *feature_entry, void *key)
{
struct features_record *feature_ptr;
if (key == NULL)
return 1;
feature_ptr = (struct features_record *) feature_entry;
if (strcmp(feature_ptr->name, (char *) key) == 0)
return 1;
return 0;
}
/*
 * _valid_feature_counts - validate that a job's features can be satisfied
 *	by the selected nodes (NOTE: does not process XOR operators)
 * IN detail_ptr - job details
 * IN/OUT node_bitmap - nodes available for use, clear if unusable
 * OUT has_xor - set true if the feature expression contains an XOR operator
 * RET true if valid, false otherwise
*/
static bool _valid_feature_counts(struct job_details *detail_ptr,
bitstr_t *node_bitmap, bool *has_xor)
{
ListIterator job_feat_iter;
struct feature_record *job_feat_ptr;
struct features_record *feat_ptr;
	bool have_count = false;
	int last_op = FEATURE_OP_AND;
bitstr_t *feature_bitmap, *tmp_bitmap;
bool rc = true;
xassert(detail_ptr);
xassert(node_bitmap);
xassert(has_xor);
*has_xor = false;
if (detail_ptr->feature_list == NULL) /* no constraints */
return rc;
feature_bitmap = bit_copy(node_bitmap);
if (feature_bitmap == NULL)
fatal("bit_copy malloc error");
job_feat_iter = list_iterator_create(detail_ptr->feature_list);
if (job_feat_iter == NULL)
fatal("list_iterator_create malloc error");
while ((job_feat_ptr = (struct feature_record *)
list_next(job_feat_iter))) {
feat_ptr = list_find_first(feature_list, _list_find_feature,
(void *) job_feat_ptr->name);
if (feat_ptr) {
if (last_op == FEATURE_OP_AND)
bit_and(feature_bitmap, feat_ptr->node_bitmap);
else if (last_op == FEATURE_OP_XOR) {
*has_xor = true;
bit_or(feature_bitmap, feat_ptr->node_bitmap);
} else /* FEATURE_OP_OR */
bit_or(feature_bitmap, feat_ptr->node_bitmap);
} else { /* feature not found */
if (last_op == FEATURE_OP_AND) {
bit_nclear(feature_bitmap, 0,
(node_record_count - 1));
}
}
last_op = job_feat_ptr->op_code;
if (job_feat_ptr->count)
have_count = true;
}
list_iterator_destroy(job_feat_iter);
if (have_count) {
job_feat_iter = list_iterator_create(detail_ptr->
feature_list);
if (job_feat_iter == NULL)
fatal("list_iterator_create malloc error");
while ((job_feat_ptr = (struct feature_record *)
list_next(job_feat_iter))) {
if (job_feat_ptr->count == 0)
continue;
feat_ptr = list_find_first(feature_list,
_list_find_feature,
(void *)job_feat_ptr->name);
if (!feat_ptr) {
rc = false;
break;
}
tmp_bitmap = bit_copy(feature_bitmap);
if (tmp_bitmap == NULL)
fatal("bit_copy malloc error");
bit_and(tmp_bitmap, feat_ptr->node_bitmap);
if (bit_set_count(tmp_bitmap) < job_feat_ptr->count)
rc = false;
bit_free(tmp_bitmap);
if (!rc)
break;
}
list_iterator_destroy(job_feat_iter);
bit_free(feature_bitmap);
} else {
bit_and(node_bitmap, feature_bitmap);
bit_free(feature_bitmap);
}
return rc;
}
/*
 * job_req_node_filter - job request node filter.
* clear from a bitmap the nodes which can not be used for a job
* test memory size, required features, processor count, etc.
* NOTE: Does not support exclusive OR of features.
 *	It just matches the first element of an XOR and ignores counts.
 * IN job_ptr - pointer to job record to be scheduled
 * IN/OUT avail_bitmap - set of nodes being considered for use
* RET SLURM_SUCCESS or EINVAL if can't filter (exclusive OR of features)
*/
extern int job_req_node_filter(struct job_record *job_ptr,
bitstr_t *avail_bitmap)
{
int i;
struct job_details *detail_ptr = job_ptr->details;
multi_core_data_t *mc_ptr;
struct node_record *node_ptr;
struct config_record *config_ptr;
bool has_xor = false;
if (detail_ptr == NULL) {
error("job_req_node_filter: job %u has no details",
job_ptr->job_id);
return EINVAL;
}
mc_ptr = detail_ptr->mc_ptr;
for (i=0; i< node_record_count; i++) {
if (!bit_test(avail_bitmap, i))
continue;
node_ptr = node_record_table_ptr + i;
config_ptr = node_ptr->config_ptr;
if (slurmctld_conf.fast_schedule) {
if ((detail_ptr->job_min_cpus > config_ptr->cpus) ||
((detail_ptr->job_min_memory & (~MEM_PER_CPU)) >
config_ptr->real_memory) ||
(detail_ptr->job_min_tmp_disk >
config_ptr->tmp_disk)) {
bit_clear(avail_bitmap, i);
continue;
}
if (mc_ptr &&
(((mc_ptr->min_sockets > config_ptr->sockets) &&
(mc_ptr->min_sockets != (uint16_t) NO_VAL)) ||
((mc_ptr->min_cores > config_ptr->cores) &&
(mc_ptr->min_cores != (uint16_t) NO_VAL)) ||
((mc_ptr->min_threads > config_ptr->threads) &&
(mc_ptr->min_threads != (uint16_t) NO_VAL)))) {
bit_clear(avail_bitmap, i);
continue;
}
} else {
if ((detail_ptr->job_min_cpus > node_ptr->cpus) ||
((detail_ptr->job_min_memory & (~MEM_PER_CPU)) >
node_ptr->real_memory) ||
(detail_ptr->job_min_tmp_disk >
node_ptr->tmp_disk)) {
bit_clear(avail_bitmap, i);
continue;
}
if (mc_ptr &&
(((mc_ptr->min_sockets > node_ptr->sockets) &&
(mc_ptr->min_sockets != (uint16_t) NO_VAL)) ||
((mc_ptr->min_cores > node_ptr->cores) &&
(mc_ptr->min_cores != (uint16_t) NO_VAL)) ||
((mc_ptr->min_threads > node_ptr->threads) &&
(mc_ptr->min_threads != (uint16_t) NO_VAL)))) {
bit_clear(avail_bitmap, i);
continue;
}
}
}
if (!_valid_feature_counts(detail_ptr, avail_bitmap, &has_xor))
return EINVAL;
return SLURM_SUCCESS;
}
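#if 0
/* Hedged usage sketch (hypothetical caller, not compiled): prune a
 * working copy of the available-node bitmap so that only nodes
 * meeting the job's CPU, memory, disk and feature requirements
 * remain set. */
{
	bitstr_t *usable = bit_copy(avail_node_bitmap);
	if (job_req_node_filter(job_ptr, usable) != SLURM_SUCCESS)
		error("job %u request can not be filtered",
		      job_ptr->job_id);
	FREE_NULL_BITMAP(usable);
}
#endif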
/*
* _build_node_list - identify which nodes could be allocated to a job
* based upon node features, memory, processors, etc. Note that a
 *	bitmap is set to indicate which of the job's features the
* nodes satisfy.
 * IN job_ptr - pointer to job record to be scheduled
* OUT node_set_pptr - list of node sets which could be used for the job
* OUT node_set_size - number of node_set entries
* RET error code
*/
static int _build_node_list(struct job_record *job_ptr,
struct node_set **node_set_pptr,
int *node_set_size)
{
int i, node_set_inx, power_cnt, rc;
struct node_set *node_set_ptr;
struct config_record *config_ptr;
struct part_record *part_ptr = job_ptr->part_ptr;
ListIterator config_iterator;
int check_node_config, config_filter = 0;
struct job_details *detail_ptr = job_ptr->details;
bitstr_t *power_up_bitmap = NULL, *usable_node_mask = NULL;
multi_core_data_t *mc_ptr = detail_ptr->mc_ptr;
bitstr_t *tmp_feature;
uint32_t max_weight = 0;
bool has_xor = false;
if (job_ptr->resv_name) {
/* Limit node selection to those in selected reservation */
time_t start_res = time(NULL);
rc = job_test_resv(job_ptr, &start_res, false,
&usable_node_mask);
if (rc != SLURM_SUCCESS) {
job_ptr->state_reason = WAIT_RESERVATION;
xfree(job_ptr->state_desc);
if (rc == ESLURM_INVALID_TIME_VALUE)
return ESLURM_RESERVATION_NOT_USABLE;
			/* Defunct reservation or access denied */
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
}
if ((detail_ptr->req_node_bitmap) &&
(!bit_super_set(detail_ptr->req_node_bitmap,
usable_node_mask))) {
job_ptr->state_reason = WAIT_RESERVATION;
xfree(job_ptr->state_desc);
FREE_NULL_BITMAP(usable_node_mask);
/* Required nodes outside of the reservation */
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
}
}
node_set_inx = 0;
node_set_ptr = (struct node_set *)
xmalloc(sizeof(struct node_set) * 2);
node_set_ptr[node_set_inx+1].my_bitmap = NULL;
if (detail_ptr->exc_node_bitmap) {
if (usable_node_mask) {
bit_not(detail_ptr->exc_node_bitmap);
bit_and(usable_node_mask, detail_ptr->exc_node_bitmap);
bit_not(detail_ptr->exc_node_bitmap);
} else {
usable_node_mask =
bit_copy(detail_ptr->exc_node_bitmap);
if (usable_node_mask == NULL)
fatal("bit_copy malloc failure");
bit_not(usable_node_mask);
}
} else {
usable_node_mask = bit_alloc(node_record_count);
if (usable_node_mask == NULL)
fatal("bit_alloc malloc failure");
bit_nset(usable_node_mask, 0, (node_record_count - 1));
}
if (!_valid_feature_counts(detail_ptr, usable_node_mask, &has_xor)) {
info("No job %u feature requirements can not be met",
job_ptr->job_id);
bit_free(usable_node_mask);
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
}
config_iterator = list_iterator_create(config_list);
if (config_iterator == NULL)
fatal("list_iterator_create malloc failure");
while ((config_ptr = (struct config_record *)
list_next(config_iterator))) {
config_filter = 0;
if ((detail_ptr->job_min_cpus > config_ptr->cpus ) ||
((detail_ptr->job_min_memory & (~MEM_PER_CPU)) >
config_ptr->real_memory) ||
(detail_ptr->job_min_tmp_disk > config_ptr->tmp_disk))
config_filter = 1;
if (mc_ptr &&
(((mc_ptr->min_sockets > config_ptr->sockets) &&
(mc_ptr->min_sockets != (uint16_t) NO_VAL)) ||
((mc_ptr->min_cores > config_ptr->cores) &&
(mc_ptr->min_cores != (uint16_t) NO_VAL)) ||
((mc_ptr->min_threads > config_ptr->threads) &&
(mc_ptr->min_threads != (uint16_t) NO_VAL))))
config_filter = 1;
/* since nodes can register with more resources than defined */
/* in the configuration, we want to use those higher values */
/* for scheduling, but only as needed (slower) */
if (slurmctld_conf.fast_schedule) {
if (config_filter)
continue;
check_node_config = 0;
} else if (config_filter) {
check_node_config = 1;
} else
check_node_config = 0;
node_set_ptr[node_set_inx].my_bitmap =
bit_copy(config_ptr->node_bitmap);
if (node_set_ptr[node_set_inx].my_bitmap == NULL)
fatal("bit_copy malloc failure");
bit_and(node_set_ptr[node_set_inx].my_bitmap,
part_ptr->node_bitmap);
if (usable_node_mask) {
bit_and(node_set_ptr[node_set_inx].my_bitmap,
usable_node_mask);
}
node_set_ptr[node_set_inx].nodes =
bit_set_count(node_set_ptr[node_set_inx].my_bitmap);
if (check_node_config &&
(node_set_ptr[node_set_inx].nodes != 0)) {
_filter_nodes_in_set(&node_set_ptr[node_set_inx],
detail_ptr);
}
if (node_set_ptr[node_set_inx].nodes == 0) {
FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap);
continue;
}
if (has_xor) {
tmp_feature = _valid_features(job_ptr->details,
config_ptr);
if (tmp_feature == NULL) {
FREE_NULL_BITMAP(node_set_ptr[node_set_inx].
my_bitmap);
continue;
}
} else {
/* We've already filtered for AND/OR features */
tmp_feature = bit_alloc(MAX_FEATURES);
bit_set(tmp_feature, 0);
}
/* NOTE: Must bit_free(tmp_feature) to avoid memory leak */
node_set_ptr[node_set_inx].cpus_per_node =
config_ptr->cpus;
node_set_ptr[node_set_inx].real_memory =
config_ptr->real_memory;
node_set_ptr[node_set_inx].weight =
config_ptr->weight;
max_weight = MAX(max_weight, config_ptr->weight);
node_set_ptr[node_set_inx].features =
xstrdup(config_ptr->feature);
node_set_ptr[node_set_inx].feature_bits = tmp_feature;
debug2("found %d usable nodes from config containing %s",
node_set_ptr[node_set_inx].nodes, config_ptr->nodes);
node_set_inx++;
xrealloc(node_set_ptr,
sizeof(struct node_set) * (node_set_inx + 2));
node_set_ptr[node_set_inx + 1].my_bitmap = NULL;
}
list_iterator_destroy(config_iterator);
/* eliminate last (incomplete) node_set record */
xfree(node_set_ptr[node_set_inx].features);
FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap);
FREE_NULL_BITMAP(node_set_ptr[node_set_inx].feature_bits);
FREE_NULL_BITMAP(usable_node_mask);
if (node_set_inx == 0) {
info("No nodes satisfy job %u requirements",
job_ptr->job_id);
xfree(node_set_ptr);
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
}
/* If any nodes are powered down, put them into a new node_set
 * record with a higher scheduling weight. This means we avoid
* scheduling jobs on powered down nodes where possible. */
for (i = (node_set_inx-1); i >= 0; i--) {
power_cnt = bit_overlap(node_set_ptr[i].my_bitmap,
power_node_bitmap);
if (power_cnt == 0)
continue; /* no nodes powered down */
if (power_cnt == node_set_ptr[i].nodes) {
node_set_ptr[i].weight += max_weight; /* avoid all */
continue; /* all nodes powered down */
}
/* Some nodes powered down, others up, split record */
node_set_ptr[node_set_inx].cpus_per_node =
node_set_ptr[i].cpus_per_node;
node_set_ptr[node_set_inx].real_memory =
node_set_ptr[i].real_memory;
node_set_ptr[node_set_inx].nodes = power_cnt;
node_set_ptr[i].nodes -= power_cnt;
node_set_ptr[node_set_inx].weight =
node_set_ptr[i].weight + max_weight;
node_set_ptr[node_set_inx].features =
xstrdup(node_set_ptr[i].features);
node_set_ptr[node_set_inx].feature_bits =
bit_copy(node_set_ptr[i].feature_bits);
node_set_ptr[node_set_inx].my_bitmap =
bit_copy(node_set_ptr[i].my_bitmap);
bit_and(node_set_ptr[node_set_inx].my_bitmap,
power_node_bitmap);
if (power_up_bitmap == NULL) {
power_up_bitmap = bit_copy(power_node_bitmap);
bit_not(power_up_bitmap);
}
bit_and(node_set_ptr[i].my_bitmap, power_up_bitmap);
node_set_inx++;
xrealloc(node_set_ptr,
sizeof(struct node_set) * (node_set_inx + 2));
node_set_ptr[node_set_inx + 1].my_bitmap = NULL;
}
FREE_NULL_BITMAP(power_up_bitmap);
*node_set_size = node_set_inx;
*node_set_pptr = node_set_ptr;
return SLURM_SUCCESS;
}
/* Remove from the node set any nodes which lack sufficient resources
* to satisfy the job's request */
static void _filter_nodes_in_set(struct node_set *node_set_ptr,
struct job_details *job_con)
{
int i;
multi_core_data_t *mc_ptr = job_con->mc_ptr;
if (slurmctld_conf.fast_schedule) { /* test config records */
struct config_record *node_con = NULL;
for (i = 0; i < node_record_count; i++) {
int job_ok = 0, job_mc_ptr_ok = 0;
if (bit_test(node_set_ptr->my_bitmap, i) == 0)
continue;
node_con = node_record_table_ptr[i].config_ptr;
if ((job_con->job_min_cpus <= node_con->cpus) &&
((job_con->job_min_memory & (~MEM_PER_CPU)) <=
node_con->real_memory) &&
(job_con->job_min_tmp_disk <= node_con->tmp_disk))
job_ok = 1;
if (mc_ptr &&
(((mc_ptr->min_sockets <= node_con->sockets) ||
(mc_ptr->min_sockets == (uint16_t) NO_VAL)) &&
((mc_ptr->min_cores <= node_con->cores) ||
(mc_ptr->min_cores == (uint16_t) NO_VAL)) &&
((mc_ptr->min_threads <= node_con->threads) ||
(mc_ptr->min_threads == (uint16_t) NO_VAL))))
job_mc_ptr_ok = 1;
if (job_ok && (!mc_ptr || job_mc_ptr_ok))
continue;
bit_clear(node_set_ptr->my_bitmap, i);
if ((--(node_set_ptr->nodes)) == 0)
break;
}
} else { /* fast_schedule == 0, test individual node records */
struct node_record *node_ptr = NULL;
for (i = 0; i < node_record_count; i++) {
int job_ok = 0, job_mc_ptr_ok = 0;
if (bit_test(node_set_ptr->my_bitmap, i) == 0)
continue;
node_ptr = &node_record_table_ptr[i];
if ((job_con->job_min_cpus <= node_ptr->cpus) &&
((job_con->job_min_memory & (~MEM_PER_CPU)) <=
node_ptr->real_memory) &&
(job_con->job_min_tmp_disk <= node_ptr->tmp_disk))
job_ok = 1;
if (mc_ptr &&
(((mc_ptr->min_sockets <= node_ptr->sockets) ||
(mc_ptr->min_sockets == (uint16_t) NO_VAL)) &&
((mc_ptr->min_cores <= node_ptr->cores) ||
(mc_ptr->min_cores == (uint16_t) NO_VAL)) &&
((mc_ptr->min_threads <= node_ptr->threads) ||
(mc_ptr->min_threads == (uint16_t) NO_VAL))))
job_mc_ptr_ok = 1;
if (job_ok && (!mc_ptr || job_mc_ptr_ok))
continue;
bit_clear(node_set_ptr->my_bitmap, i);
if ((--(node_set_ptr->nodes)) == 0)
break;
}
}
}
/*
* _nodes_in_sets - Determine if required nodes are included in node_set(s)
* IN req_bitmap - nodes specifically required by the job
* IN node_set_ptr - sets of valid nodes
* IN node_set_size - count of node_set entries
* RET 0 if in set, otherwise an error code
*/
static int _nodes_in_sets(bitstr_t *req_bitmap,
struct node_set * node_set_ptr,
int node_set_size)
{
bitstr_t *scratch_bitmap = NULL;
int error_code = SLURM_SUCCESS, i;
for (i=0; i<node_set_size; i++) {
if (scratch_bitmap)
bit_or(scratch_bitmap,
node_set_ptr[i].my_bitmap);
else {
scratch_bitmap =
bit_copy(node_set_ptr[i].my_bitmap);
if (scratch_bitmap == NULL)
fatal("bit_copy malloc failure");
}
}
if ((scratch_bitmap == NULL)
|| (bit_super_set(req_bitmap, scratch_bitmap) != 1))
error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
FREE_NULL_BITMAP(scratch_bitmap);
return error_code;
}
/*
* build_node_details - sets addresses for allocated nodes
* IN job_ptr - pointer to a job record
*/
extern void build_node_details(struct job_record *job_ptr)
{
hostlist_t host_list = NULL;
struct node_record *node_ptr;
char *this_node_name;
int node_inx = 0;
if ((job_ptr->node_bitmap == NULL) || (job_ptr->nodes == NULL)) {
/* No nodes allocated, we're done... */
job_ptr->node_cnt = 0;
job_ptr->node_addr = NULL;
return;
}
	/* Use hostlist here to ensure ordering of info matches that of srun */
if ((host_list = hostlist_create(job_ptr->nodes)) == NULL)
fatal("hostlist_create error for %s: %m", job_ptr->nodes);
job_ptr->node_cnt = hostlist_count(host_list);
xrealloc(job_ptr->node_addr,
(sizeof(slurm_addr) * job_ptr->node_cnt));
while ((this_node_name = hostlist_shift(host_list))) {
if ((node_ptr = find_node_record(this_node_name))) {
memcpy(&job_ptr->node_addr[node_inx++],
&node_ptr->slurm_addr, sizeof(slurm_addr));
} else {
error("Invalid node %s in JobId=%u",
this_node_name, job_ptr->job_id);
}
free(this_node_name);
}
hostlist_destroy(host_list);
if (job_ptr->node_cnt != node_inx) {
error("Node count mismatch for JobId=%u (%u,%u)",
job_ptr->job_id, job_ptr->node_cnt, node_inx);
}
}
/*
* _valid_features - Determine if the requested features are satisfied by
* the available nodes. This is only used for XOR operators.
* IN details_ptr - job requirement details, includes requested features
* IN config_ptr - node's configuration record
* RET NULL if request is not satisfied, otherwise a bitmap indicating
* which mutually exclusive features are satisfied. For example
* _valid_features("[fs1|fs2|fs3|fs4]", "fs3") returns a bitmap with
* the third bit set. For another example
* _valid_features("[fs1|fs2|fs3|fs4]", "fs1,fs3") returns a bitmap
* with the first and third bits set. The function returns a bitmap
* with the first bit set if requirements are satisfied without a
* mutually exclusive feature list.
*/
static bitstr_t *_valid_features(struct job_details *details_ptr,
struct config_record *config_ptr)
{
bitstr_t *result_bits = (bitstr_t *) NULL;
ListIterator feat_iter;
struct feature_record *job_feat_ptr;
struct features_record *feat_ptr;
int last_op = FEATURE_OP_AND, position = 0;
result_bits = bit_alloc(MAX_FEATURES);
if (result_bits == NULL)
fatal("bit_alloc malloc failure");
if (details_ptr->feature_list == NULL) { /* no constraints */
bit_set(result_bits, 0);
return result_bits;
}
feat_iter = list_iterator_create(details_ptr->feature_list);
if (feat_iter == NULL)
fatal("list_iterator_create malloc failure");
while ((job_feat_ptr = (struct feature_record *)
list_next(feat_iter))) {
if ((job_feat_ptr->op_code == FEATURE_OP_XOR) ||
(last_op == FEATURE_OP_XOR)) {
feat_ptr = list_find_first(feature_list,
_list_find_feature,
(void *)job_feat_ptr->name);
if (feat_ptr &&
bit_super_set(config_ptr->node_bitmap,
feat_ptr->node_bitmap)) {
bit_set(result_bits, position);
}
position++;
}
last_op = job_feat_ptr->op_code;
}
list_iterator_destroy(feat_iter);
return result_bits;
}
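#if 0
/* Hedged sketch of the prose examples above (hypothetical feature
 * records, not compiled): with a feature list parsed from
 * "[fs1|fs2|fs3|fs4]" and a config whose nodes all carry fs1 and
 * fs3, the loop sets bits 0 and 2 of the result. */
{
	bitstr_t *bits = _valid_features(details_ptr, config_ptr);
	xassert(bit_test(bits, 0));	/* fs1 satisfied */
	xassert(!bit_test(bits, 1));	/* fs2 not present */
	xassert(bit_test(bits, 2));	/* fs3 satisfied */
	bit_free(bits);
}
#endif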
/*
* re_kill_job - for a given job, deallocate its nodes for a second time,
 *	basically a cleanup for failed deallocate_nodes() calls
* IN job_ptr - pointer to terminating job (already in some COMPLETING state)
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
*/
extern void re_kill_job(struct job_record *job_ptr)
{
int i;
kill_job_msg_t *kill_job;
agent_arg_t *agent_args;
hostlist_t kill_hostlist = hostlist_create("");
char host_str[64];
static uint32_t last_job_id = 0;
xassert(job_ptr);
xassert(job_ptr->details);
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_TERMINATE_JOB;
agent_args->hostlist = hostlist_create("");
agent_args->retry = 0;
kill_job = xmalloc(sizeof(kill_job_msg_t));
kill_job->job_id = job_ptr->job_id;
kill_job->step_id = NO_VAL;
kill_job->job_uid = job_ptr->user_id;
kill_job->job_state = job_ptr->job_state;
kill_job->time = time(NULL);
kill_job->select_jobinfo = select_g_select_jobinfo_copy(
job_ptr->select_jobinfo);
kill_job->spank_job_env = xduparray(job_ptr->spank_job_env_size,
job_ptr->spank_job_env);
kill_job->spank_job_env_size = job_ptr->spank_job_env_size;
for (i = 0; i < node_record_count; i++) {
struct node_record *node_ptr = &node_record_table_ptr[i];
if ((job_ptr->node_bitmap == NULL) ||
(bit_test(job_ptr->node_bitmap, i) == 0))
continue;
if (IS_NODE_DOWN(node_ptr)) {
/* Consider job already completed */
bit_clear(job_ptr->node_bitmap, i);
job_update_cpu_cnt(job_ptr, i);
if (node_ptr->comp_job_cnt)
(node_ptr->comp_job_cnt)--;
if ((--job_ptr->node_cnt) == 0) {
last_node_update = time(NULL);
job_ptr->job_state &= (~JOB_COMPLETING);
delete_step_records(job_ptr, 0);
slurm_sched_schedule();
}
continue;
}
if (IS_NODE_NO_RESPOND(node_ptr))
continue;
(void) hostlist_push_host(kill_hostlist, node_ptr->name);
#ifdef HAVE_FRONT_END /* Operate only on front-end */
if (agent_args->node_count > 0)
continue;
#endif
hostlist_push(agent_args->hostlist, node_ptr->name);
agent_args->node_count++;
}
if (agent_args->node_count == 0) {
slurm_free_kill_job_msg(kill_job);
		if (agent_args->hostlist)
hostlist_destroy(agent_args->hostlist);
xfree(agent_args);
hostlist_destroy(kill_hostlist);
return;
}
hostlist_uniq(kill_hostlist);
hostlist_ranged_string(kill_hostlist,
sizeof(host_str), host_str);
#ifdef HAVE_BG
if (job_ptr->job_id != last_job_id) {
info("Resending TERMINATE_JOB request JobId=%u BPlist=%s",
job_ptr->job_id, host_str);
} else {
debug("Resending TERMINATE_JOB request JobId=%u BPlist=%s",
job_ptr->job_id, host_str);
}
#else
if (job_ptr->job_id != last_job_id) {
info("Resending TERMINATE_JOB request JobId=%u Nodelist=%s",
job_ptr->job_id, host_str);
} else {
debug("Resending TERMINATE_JOB request JobId=%u Nodelist=%s",
job_ptr->job_id, host_str);
}
#endif
last_job_id = job_ptr->job_id;
hostlist_destroy(kill_hostlist);
agent_args->msg_args = kill_job;
agent_queue_request(agent_args);
return;
}