/*****************************************************************************\
* read_config.c - read the overall slurm configuration file
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Copyright (C) SchedMD LLC.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include "src/common/assoc_mgr.h"
#include "src/common/cpu_frequency.h"
#include "src/common/hostlist.h"
#include "src/common/list.h"
#include "src/common/macros.h"
#include "src/common/node_features.h"
#include "src/common/port_mgr.h"
#include "src/common/read_config.h"
#include "src/common/slurm_rlimits_info.h"
#include "src/common/state_save.h"
#include "src/common/strnatcmp.h"
#include "src/common/xstring.h"
#include "src/interfaces/burst_buffer.h"
#include "src/interfaces/cgroup.h"
#include "src/interfaces/gres.h"
#include "src/interfaces/job_submit.h"
#include "src/interfaces/jobcomp.h"
#include "src/interfaces/mcs.h"
#include "src/interfaces/node_features.h"
#include "src/interfaces/preempt.h"
#include "src/interfaces/prep.h"
#include "src/interfaces/sched_plugin.h"
#include "src/interfaces/select.h"
#include "src/interfaces/switch.h"
#include "src/interfaces/topology.h"
#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/fed_mgr.h"
#include "src/slurmctld/gang.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/power_save.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/read_config.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/trigger_mgr.h"
#include "src/stepmgr/srun_comm.h"
#include "src/stepmgr/stepmgr.h"
/* Global variables */
bool slurmctld_init_db = true;
static void _acct_restore_active_jobs(void);
static void _build_bitmaps(void);
static void _gres_reconfig(void);
static void _init_all_slurm_conf(void);
static int _preserve_select_type_param(slurm_conf_t *ctl_conf_ptr,
uint16_t old_select_type_p);
static int _reset_node_bitmaps(void *x, void *arg);
static void _restore_job_accounting(void);
static void _set_features(node_record_t **old_node_table_ptr,
int old_node_record_count, int recover);
static void _stat_slurm_dirs(void);
static int _sync_nodes_to_comp_job(void);
static int _sync_nodes_to_jobs(void);
static int _sync_nodes_to_active_job(job_record_t *job_ptr);
static void _sync_nodes_to_suspended_job(job_record_t *job_ptr);
static void _sync_part_prio(void);
/*
* Setup the global response_cluster_rec
*/
static void _set_response_cluster_rec(void)
{
if (response_cluster_rec)
return;
response_cluster_rec = xmalloc(sizeof(slurmdb_cluster_rec_t));
response_cluster_rec->name = xstrdup(slurm_conf.cluster_name);
if (slurm_conf.slurmctld_addr) {
response_cluster_rec->control_host =
xstrdup(slurm_conf.slurmctld_addr);
} else {
response_cluster_rec->control_host =
xstrdup(slurm_conf.control_addr[0]);
}
response_cluster_rec->control_port = slurm_conf.slurmctld_port;
response_cluster_rec->rpc_version = SLURM_PROTOCOL_VERSION;
}
/*
* Free the global response_cluster_rec
*/
extern void cluster_rec_free(void)
{
if (response_cluster_rec) {
xfree(response_cluster_rec->control_host);
xfree(response_cluster_rec->name);
xfree(response_cluster_rec);
}
}
/* Verify that Slurm directories are secure, not world writable */
static void _stat_slurm_dirs(void)
{
struct stat stat_buf;
char *problem_dir = NULL;
/*
* PluginDir may have multiple values, and is checked by
* _is_valid_path() instead
*/
if (slurm_conf.plugstack &&
!stat(slurm_conf.plugstack, &stat_buf) &&
(stat_buf.st_mode & S_IWOTH)) {
problem_dir = "PlugStack";
}
if (!stat(slurm_conf.slurmd_spooldir, &stat_buf) &&
(stat_buf.st_mode & S_IWOTH)) {
problem_dir = "SlurmdSpoolDir";
}
if (!stat(slurm_conf.state_save_location, &stat_buf) &&
(stat_buf.st_mode & S_IWOTH)) {
problem_dir = "StateSaveLocation";
}
if (problem_dir) {
error("################################################");
error("### SEVERE SECURITY VULNERABILITY ###");
error("### %s DIRECTORY IS WORLD WRITABLE ###", problem_dir);
error("### CORRECT FILE PERMISSIONS ###");
error("################################################");
}
}
/*
* _sort_nodes_by_rank - order node table in ascending order of node_rank
* This depends on the TopologyPlugin, which may generate such a ranking.
*/
static int _sort_nodes_by_rank(const void *a, const void *b)
{
node_record_t *n1 = *(node_record_t **)a;
node_record_t *n2 = *(node_record_t **)b;
if (!n1)
return 1;
if (!n2)
return -1;
return slurm_sort_uint32_list_asc(&n1->node_rank, &n2->node_rank);
}
/*
* _sort_nodes_by_name - order node table in ascending order of name
*/
static int _sort_nodes_by_name(const void *a, const void *b)
{
node_record_t *n1 = *(node_record_t **)a;
node_record_t *n2 = *(node_record_t **)b;
if (!n1)
return 1;
if (!n2)
return -1;
return strnatcmp(n1->name, n2->name);
}
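/*
* Sort the global node_record_table_ptr, by node_rank if the topology
* plugin generates a ranking, otherwise by natural name order, then
* reassign each node's index to match its new table position.
*/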
static void _sort_node_record_table_ptr(void)
{
int (*compare_fn)(const void *, const void *);
if (topology_g_generate_node_ranking())
compare_fn = &_sort_nodes_by_rank;
else
compare_fn = &_sort_nodes_by_name;
qsort(node_record_table_ptr, node_record_count,
sizeof(node_record_t *), compare_fn);
for (int i = 0; i < node_record_count; i++) {
if (node_record_table_ptr[i])
node_record_table_ptr[i]->index = i;
}
#if _DEBUG
/* Log the results */
node_record_t *node_ptr;
for (int i = 0; (node_ptr = next_node(&i)); i++) {
info("node_rank[%d:%d]: %s",
node_ptr->index, node_ptr->node_rank, node_ptr->name);
}
#endif
}
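/*
* Push the names of all nodes having the given feature onto hostlist hl.
*/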
static void _add_nodes_with_feature(hostlist_t *hl, char *feature)
{
node_record_t *node_ptr;
bitstr_t *tmp_bitmap = bit_alloc(node_record_count);
add_nodes_with_feature_to_bitmap(tmp_bitmap, feature);
for (int i = 0; (node_ptr = next_node_bitmap(tmp_bitmap, &i)); i++) {
hostlist_push_host(hl, node_ptr->name);
}
FREE_NULL_BITMAP(tmp_bitmap);
}
static void _add_all_nodes_to_hostlist(hostlist_t *hl)
{
node_record_t *node_ptr;
for (int i = 0; (node_ptr = next_node(&i)); i++)
hostlist_push_host(hl, node_ptr->name);
}
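/*
* nodespec_to_hostlist - expand a node specification into a hostlist
* IN nodes - node names, nodeset names, and/or the keyword "ALL"
* IN uniq - if true, remove duplicate entries from the resulting hostlist
* OUT nodesets - if non-NULL, set to a comma separated list of the nodeset
*	names that were expanded (or "ALL")
* RET hostlist, or NULL on error
*/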
extern hostlist_t *nodespec_to_hostlist(const char *nodes, bool uniq,
char **nodesets)
{
int count;
slurm_conf_nodeset_t *ptr, **ptr_array;
hostlist_t *hl;
if (nodesets)
xfree(*nodesets);
if (!xstrcasecmp(nodes, "ALL")) {
if (!(hl = hostlist_create(NULL))) {
error("%s: hostlist_create() error for %s", __func__, nodes);
return NULL;
}
_add_all_nodes_to_hostlist(hl);
if (nodesets)
*nodesets = xstrdup("ALL");
return hl;
} else if (!(hl = hostlist_create(nodes))) {
error("%s: hostlist_create() error for %s", __func__, nodes);
return NULL;
}
if (!hostlist_count(hl)) {
/* no need to look for nodesets */
return hl;
}
count = slurm_conf_nodeset_array(&ptr_array);
for (int i = 0; i < count; i++) {
ptr = ptr_array[i];
/* swap the nodeset entry with the applicable nodes */
if (hostlist_delete_host(hl, ptr->name)) {
if (nodesets)
xstrfmtcat(*nodesets, "%s%s",
*nodesets ? "," : "",
ptr->name);
if (ptr->feature)
_add_nodes_with_feature(hl, ptr->feature);
/* Handle keywords for Nodes= in a NodeSet */
if (!xstrcasecmp(ptr->nodes, "ALL")) {
_add_all_nodes_to_hostlist(hl);
} else if (ptr->nodes) {
hostlist_push(hl, ptr->nodes);
}
}
}
if (xstrchr(nodes, '{'))
parse_hostlist_functions(&hl);
if (uniq)
hostlist_uniq(hl);
return hl;
}
static void _init_bitmaps(void)
{
/* initialize the idle and up bitmaps */
FREE_NULL_BITMAP(asap_node_bitmap);
FREE_NULL_BITMAP(avail_node_bitmap);
FREE_NULL_BITMAP(bf_ignore_node_bitmap);
FREE_NULL_BITMAP(booting_node_bitmap);
FREE_NULL_BITMAP(cg_node_bitmap);
FREE_NULL_BITMAP(cloud_node_bitmap);
FREE_NULL_BITMAP(external_node_bitmap);
FREE_NULL_BITMAP(future_node_bitmap);
FREE_NULL_BITMAP(idle_node_bitmap);
FREE_NULL_BITMAP(power_down_node_bitmap);
FREE_NULL_BITMAP(power_up_node_bitmap);
FREE_NULL_BITMAP(rs_node_bitmap);
FREE_NULL_BITMAP(share_node_bitmap);
FREE_NULL_BITMAP(up_node_bitmap);
asap_node_bitmap = bit_alloc(node_record_count);
avail_node_bitmap = bit_alloc(node_record_count);
bf_ignore_node_bitmap = bit_alloc(node_record_count);
booting_node_bitmap = bit_alloc(node_record_count);
cg_node_bitmap = bit_alloc(node_record_count);
cloud_node_bitmap = bit_alloc(node_record_count);
external_node_bitmap = bit_alloc(node_record_count);
future_node_bitmap = bit_alloc(node_record_count);
idle_node_bitmap = bit_alloc(node_record_count);
power_down_node_bitmap = bit_alloc(node_record_count);
power_up_node_bitmap = bit_alloc(node_record_count);
rs_node_bitmap = bit_alloc(node_record_count);
share_node_bitmap = bit_alloc(node_record_count);
up_node_bitmap = bit_alloc(node_record_count);
}
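/*
* Build each partition's node bitmap from its node list; fatal if a
* partition names nodes that do not exist.
*/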
static void _build_part_bitmaps(void)
{
part_record_t *part_ptr;
list_itr_t *part_iterator;
/* scan partition table and identify nodes in each */
part_iterator = list_iterator_create(part_list);
while ((part_ptr = list_next(part_iterator))) {
if (build_part_bitmap(part_ptr) == ESLURM_INVALID_NODE_NAME)
fatal("Invalid node names in partition %s",
part_ptr->name);
}
list_iterator_destroy(part_iterator);
}
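/*
* Rebuild the node bitmap of every config record, setting a bit for each
* node that references that record.
*/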
static void _build_node_config_bitmaps(void)
{
node_record_t *node_ptr;
/* initialize the configuration bitmaps */
list_for_each(config_list, _reset_node_bitmaps, NULL);
for (int i = 0; (node_ptr = next_node(&i)); i++) {
if (node_ptr->config_ptr)
bit_set(node_ptr->config_ptr->node_bitmap,
node_ptr->index);
}
}
static int _reset_node_bitmaps(void *x, void *arg)
{
config_record_t *config_ptr = x;
FREE_NULL_BITMAP(config_ptr->node_bitmap);
config_ptr->node_bitmap = bit_alloc(node_record_count);
return 0;
}
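/*
* list_for_each() callback: clear the share_node_bitmap bits of nodes
* allocated to running jobs that requested exclusive (non-shared) resources.
*/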
static int _set_share_node_bitmap(void *x, void *arg)
{
job_record_t *job_ptr = x;
if (!IS_JOB_RUNNING(job_ptr) ||
(job_ptr->node_bitmap == NULL) ||
(job_ptr->details == NULL) ||
(job_ptr->details->share_res != 0))
return 0;
bit_and_not(share_node_bitmap, job_ptr->node_bitmap);
return 0;
}
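/*
* Worker thread: pop node records from the list and resolve each node's
* communication address. Nodes whose address cannot be resolved are set to
* FUTURE state with an explanatory reason.
*/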
static void *_set_node_addrs(void *arg)
{
list_t *nodes = arg;
slurm_addr_t slurm_addr;
node_record_t *node_ptr;
while ((node_ptr = list_pop(nodes))) {
slurm_set_addr(&slurm_addr, node_ptr->port,
node_ptr->comm_name);
if (slurm_get_port(&slurm_addr))
continue;
error("%s: failure on %s", __func__, node_ptr->comm_name);
node_ptr->node_state = NODE_STATE_FUTURE;
node_ptr->port = 0;
xfree(node_ptr->reason);
node_ptr->reason = xstrdup("NO NETWORK ADDRESS FOUND");
node_ptr->reason_time = time(NULL);
node_ptr->reason_uid = slurm_conf.slurm_user_id;
}
return NULL;
}
/*
* Validate that nodes are addressable.
*/
static void _validate_slurmd_addr(void)
{
node_record_t *node_ptr;
DEF_TIMERS;
pthread_t *work_threads;
int threads_num = 1;
char *temp_str;
list_t *nodes = list_create(NULL);
xassert(verify_lock(CONF_LOCK, READ_LOCK));
START_TIMER;
if ((temp_str = xstrcasestr(slurm_conf.slurmctld_params,
"validate_nodeaddr_threads="))) {
int tmp_val = strtol(temp_str + 26, NULL, 10);
if ((tmp_val >= 1) && (tmp_val <= 64))
threads_num = tmp_val;
else
error("SlurmctldParameters option validate_nodeaddr_threads=%d out of range, ignored",
tmp_val);
}
for (int i = 0; (node_ptr = next_node(&i)); i++) {
if ((node_ptr->name == NULL) ||
(node_ptr->name[0] == '\0'))
continue;
if (IS_NODE_FUTURE(node_ptr))
continue;
if (IS_NODE_CLOUD(node_ptr) &&
(IS_NODE_POWERING_DOWN(node_ptr) ||
IS_NODE_POWERED_DOWN(node_ptr) ||
IS_NODE_POWERING_UP(node_ptr)))
continue;
if (node_ptr->port == 0)
node_ptr->port = slurm_conf.slurmd_port;
list_append(nodes, node_ptr);
}
work_threads = xcalloc(threads_num, sizeof(pthread_t));
for (int i = 0; i < threads_num; i++)
slurm_thread_create(&work_threads[i], _set_node_addrs, nodes);
for (int i = 0; i < threads_num; i++)
slurm_thread_join(work_threads[i]);
xfree(work_threads);
xassert(list_is_empty(nodes));
FREE_NULL_LIST(nodes);
END_TIMER2(__func__);
}
/*
* _build_bitmaps - build node state bitmaps (idle, up, share, power,
* booting, completing, etc.) to reflect the current state of each node
* Note: Operates on common variables, no arguments
* node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
* part_list - pointer to global partition list
*/
static void _build_bitmaps(void)
{
node_record_t *node_ptr;
last_node_update = time(NULL);
last_part_update = time(NULL);
/* Set all bits, all nodes initially available for sharing */
bit_set_all(share_node_bitmap);
/* identify all nodes non-sharable due to non-sharing jobs */
list_for_each(job_list, _set_share_node_bitmap, NULL);
/* scan all nodes and identify which are up, idle and
* their configuration, resync DRAINED vs. DRAINING state */
for (int i = 0; (node_ptr = next_node(&i)); i++) {
uint32_t drain_flag, job_cnt;
if (node_ptr->name[0] == '\0')
continue; /* defunct */
drain_flag = IS_NODE_DRAIN(node_ptr) |
IS_NODE_FAIL(node_ptr);
job_cnt = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
if (!IS_NODE_FUTURE(node_ptr))
bit_set(power_up_node_bitmap, node_ptr->index);
if ((IS_NODE_IDLE(node_ptr) && (job_cnt == 0)) ||
IS_NODE_DOWN(node_ptr))
bit_set(idle_node_bitmap, node_ptr->index);
if (IS_NODE_POWERING_UP(node_ptr))
bit_set(booting_node_bitmap, node_ptr->index);
if (IS_NODE_COMPLETING(node_ptr))
bit_set(cg_node_bitmap, node_ptr->index);
if (IS_NODE_CLOUD(node_ptr))
bit_set(cloud_node_bitmap, node_ptr->index);
if (IS_NODE_EXTERNAL(node_ptr))
bit_set(external_node_bitmap, node_ptr->index);
if (IS_NODE_IDLE(node_ptr) ||
IS_NODE_ALLOCATED(node_ptr) ||
((IS_NODE_REBOOT_REQUESTED(node_ptr) ||
IS_NODE_REBOOT_ISSUED(node_ptr)) &&
((node_ptr->next_state & NODE_STATE_FLAGS) &
NODE_RESUME))) {
if ((drain_flag == 0) &&
(!IS_NODE_NO_RESPOND(node_ptr)))
make_node_avail(node_ptr);
bit_set(up_node_bitmap, node_ptr->index);
}
if (IS_NODE_POWERED_DOWN(node_ptr)) {
bit_set(power_down_node_bitmap, node_ptr->index);
bit_clear(power_up_node_bitmap, node_ptr->index);
}
if (IS_NODE_POWERING_DOWN(node_ptr)) {
bit_set(power_down_node_bitmap, node_ptr->index);
bit_clear(power_up_node_bitmap, node_ptr->index);
bit_clear(avail_node_bitmap, node_ptr->index);
}
if (IS_NODE_FUTURE(node_ptr))
bit_set(future_node_bitmap, node_ptr->index);
if ((IS_NODE_REBOOT_REQUESTED(node_ptr) ||
IS_NODE_REBOOT_ISSUED(node_ptr)) &&
((node_ptr->next_state & NODE_STATE_FLAGS) & NODE_RESUME))
bit_set(rs_node_bitmap, node_ptr->index);
if (IS_NODE_REBOOT_ASAP(node_ptr))
bit_set(asap_node_bitmap, node_ptr->index);
}
}
static int _set_nodes_topo(void)
{
node_record_t *node_ptr;
int rc = SLURM_SUCCESS;
last_node_update = time(NULL);
for (int i = 0; (node_ptr = next_node(&i)); i++) {
if (node_ptr->topology_str &&
(rc = topology_g_add_rm_node(node_ptr))) {
error("Invalid node topology specified %s for %s",
node_ptr->topology_str, node_ptr->name);
break;
}
}
return rc;
}
/*
* _init_all_slurm_conf - initialize or re-initialize the slurm
* configuration values.
* NOTE: We leave the job table intact
* NOTE: Operates on common variables, no arguments
*/
static void _init_all_slurm_conf(void)
{
char *conf_name = xstrdup(slurm_conf.slurm_conf);
slurm_conf_reinit(conf_name);
xfree(conf_name);
init_node_conf();
init_part_conf();
init_job_conf();
}
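/*
* Apply a single DownNodes= configuration line: set each listed node to the
* specified state (DOWN by default) and record the reason, if given.
*/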
static int _handle_downnodes_line(slurm_conf_downnodes_t *down)
{
int error_code = 0;
node_record_t *node_rec = NULL;
hostlist_t *alias_list = NULL;
char *alias = NULL;
int state_val = NODE_STATE_DOWN;
if (down->state != NULL) {
state_val = state_str2int(down->state, down->nodenames);
if (state_val == NO_VAL) {
error("Invalid State \"%s\"", down->state);
goto cleanup;
}
}
if ((alias_list = hostlist_create(down->nodenames)) == NULL) {
error("Unable to create NodeName list from %s",
down->nodenames);
error_code = errno;
goto cleanup;
}
while ((alias = hostlist_shift(alias_list))) {
node_rec = find_node_record(alias);
if (node_rec == NULL) {
error("DownNode \"%s\" does not exist!", alias);
free(alias);
continue;
}
if ((state_val != NO_VAL) &&
(state_val != NODE_STATE_UNKNOWN))
node_rec->node_state = state_val;
if (down->reason) {
xfree(node_rec->reason);
node_rec->reason = xstrdup(down->reason);
node_rec->reason_time = time(NULL);
node_rec->reason_uid = slurm_conf.slurm_user_id;
}
free(alias);
}
cleanup:
if (alias_list)
hostlist_destroy(alias_list);
return error_code;
}
static void _handle_all_downnodes(void)
{
slurm_conf_downnodes_t *ptr, **ptr_array;
int count;
int i;
count = slurm_conf_downnodes_array(&ptr_array);
if (count == 0) {
debug("No DownNodes");
return;
}
for (i = 0; i < count; i++) {
ptr = ptr_array[i];
_handle_downnodes_line(ptr);
}
}
/*
* Convert a comma delimited string of account names into a list containing
* pointers to those associations.
*/
extern list_t *accounts_list_build(char *accounts, bool locked)
{
char *tmp_accts, *one_acct_name, *name_ptr = NULL;
list_t *acct_list = NULL;
slurmdb_assoc_rec_t *assoc_ptr = NULL;
assoc_mgr_lock_t locks = { .assoc = READ_LOCK };
if (!accounts)
return acct_list;
if (!locked)
assoc_mgr_lock(&locks);
tmp_accts = xstrdup(accounts);
one_acct_name = strtok_r(tmp_accts, ",", &name_ptr);
while (one_acct_name) {
slurmdb_assoc_rec_t assoc = {
.acct = one_acct_name,
.uid = NO_VAL,
};
if (assoc_mgr_fill_in_assoc(
acct_db_conn, &assoc,
accounting_enforce,
&assoc_ptr, true) != SLURM_SUCCESS) {
if (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) {
error("%s: No association for account %s",
__func__, assoc.acct);
} else {
verbose("%s: No association for account %s",
__func__, assoc.acct);
}
}
if (assoc_ptr) {
if (!acct_list)
acct_list = list_create(NULL);
list_append(acct_list, assoc_ptr);
}
one_acct_name = strtok_r(NULL, ",", &name_ptr);
}
xfree(tmp_accts);
if (!locked)
assoc_mgr_unlock(&locks);
return acct_list;
}
/* Convert a comma delimited list of QOS names into a bitmap */
extern void qos_list_build(char *qos, bool locked, bitstr_t **qos_bits)
{
char *tmp_qos, *one_qos_name, *name_ptr = NULL;
slurmdb_qos_rec_t qos_rec, *qos_ptr = NULL;
bitstr_t *tmp_qos_bitstr;
int rc;
assoc_mgr_lock_t locks = { .qos = READ_LOCK };
if (!qos) {
FREE_NULL_BITMAP(*qos_bits);
return;
}
/* Lock here to avoid g_qos_count changing under us */
if (!locked)
assoc_mgr_lock(&locks);
if (!g_qos_count) {
error("We have no QOS on the system Ignoring invalid "
"Allow/DenyQOS value(s) %s",
qos);
if (!locked)
assoc_mgr_unlock(&locks);
FREE_NULL_BITMAP(*qos_bits);
*qos_bits = NULL;
return;
}
tmp_qos_bitstr = bit_alloc(g_qos_count);
tmp_qos = xstrdup(qos);
one_qos_name = strtok_r(tmp_qos, ",", &name_ptr);
while (one_qos_name) {
memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t));
qos_rec.name = one_qos_name;
rc = assoc_mgr_fill_in_qos(acct_db_conn, &qos_rec,
accounting_enforce,
&qos_ptr, 1);
if ((rc != SLURM_SUCCESS) || (qos_rec.id >= g_qos_count)) {
error("Ignoring invalid Allow/DenyQOS value: %s",
one_qos_name);
} else {
bit_set(tmp_qos_bitstr, qos_rec.id);
}
one_qos_name = strtok_r(NULL, ",", &name_ptr);
}
if (!locked)
assoc_mgr_unlock(&locks);
xfree(tmp_qos);
FREE_NULL_BITMAP(*qos_bits);
*qos_bits = tmp_qos_bitstr;
}
/*
* _build_single_partitionline_info - create a partition record from one
* slurm_conf_partition_t structure provided by the slurm.conf reader
* RET 0 if no error, error code otherwise
* Note: Operates on common variables
* global: part_list - global partition list pointer
* default_part - default parameters for a partition
*/
static int _build_single_partitionline_info(slurm_conf_partition_t *part)
{
part_record_t *part_ptr;
if (list_find_first(part_list, &list_find_part, part->name))
fatal("%s: duplicate entry for partition %s",
__func__, part->name);
part_ptr = create_ctld_part_record(part->name);
if (part->default_flag) {
if (default_part_name &&
xstrcmp(default_part_name, part->name)) {
info("_parse_part_spec: changing default partition "
"from %s to %s", default_part_name, part->name);
default_part_loc->flags &= (~PART_FLAG_DEFAULT);
}
xfree(default_part_name);
default_part_name = xstrdup(part->name);
default_part_loc = part_ptr;
part_ptr->flags |= PART_FLAG_DEFAULT;
}
part_ptr->cpu_bind = part->cpu_bind;
if (part->preempt_mode != NO_VAL16)
part_ptr->preempt_mode = part->preempt_mode;
if (part->disable_root_jobs == NO_VAL8) {
if (slurm_conf.conf_flags & CONF_FLAG_DRJ)
part_ptr->flags |= PART_FLAG_NO_ROOT;
} else if (part->disable_root_jobs) {
part_ptr->flags |= PART_FLAG_NO_ROOT;
} else {
part_ptr->flags &= (~PART_FLAG_NO_ROOT);
}
if (part_ptr->flags & PART_FLAG_NO_ROOT)
debug2("partition %s does not allow root jobs", part_ptr->name);
if ((part->default_time != NO_VAL) &&
(part->default_time > part->max_time)) {
info("partition %s DefaultTime exceeds MaxTime (%u > %u)",
part->name, part->default_time, part->max_time);
part->default_time = NO_VAL;
}
if (part->exclusive_user)
part_ptr->flags |= PART_FLAG_EXCLUSIVE_USER;
if (part->exclusive_topo)
part_ptr->flags |= PART_FLAG_EXCLUSIVE_TOPO;
if (part->hidden_flag)
part_ptr->flags |= PART_FLAG_HIDDEN;
if (part->power_down_on_idle)
part_ptr->flags |= PART_FLAG_PDOI;
if (part->root_only_flag)
part_ptr->flags |= PART_FLAG_ROOT_ONLY;
if (part->req_resv_flag)
part_ptr->flags |= PART_FLAG_REQ_RESV;
if (part->lln_flag)
part_ptr->flags |= PART_FLAG_LLN;
part_ptr->max_time = part->max_time;
part_ptr->def_mem_per_cpu = part->def_mem_per_cpu;
part_ptr->default_time = part->default_time;
FREE_NULL_LIST(part_ptr->job_defaults_list);
part_ptr->job_defaults_list =
job_defaults_copy(part->job_defaults_list);
part_ptr->max_cpus_per_node = part->max_cpus_per_node;
part_ptr->max_cpus_per_socket = part->max_cpus_per_socket;
part_ptr->max_share = part->max_share;
part_ptr->max_mem_per_cpu = part->max_mem_per_cpu;
part_ptr->max_nodes = part->max_nodes;
part_ptr->max_nodes_orig = part->max_nodes;
part_ptr->min_nodes = part->min_nodes;
part_ptr->min_nodes_orig = part->min_nodes;
part_ptr->over_time_limit = part->over_time_limit;
part_ptr->preempt_mode = part->preempt_mode;
part_ptr->priority_job_factor = part->priority_job_factor;
part_ptr->priority_tier = part->priority_tier;
part_ptr->resume_timeout = part->resume_timeout;
part_ptr->state_up = part->state_up;
part_ptr->suspend_time = part->suspend_time;
part_ptr->suspend_timeout = part->suspend_timeout;
part_ptr->grace_time = part->grace_time;
part_ptr->cr_type = part->cr_type;
part_ptr->allow_alloc_nodes = xstrdup(part->allow_alloc_nodes);
part_ptr->allow_groups = xstrdup(part->allow_groups);
part_ptr->alternate = xstrdup(part->alternate);
part_ptr->nodes = xstrdup(part->nodes);
part_ptr->orig_nodes = xstrdup(part->nodes);
if (part->billing_weights_str) {
set_partition_billing_weights(part->billing_weights_str,
part_ptr, true);
}
if (part->allow_accounts) {
part_ptr->allow_accounts = xstrdup(part->allow_accounts);
part_ptr->allow_accts_list =
accounts_list_build(part_ptr->allow_accounts, false);
}
if (part->allow_qos) {
part_ptr->allow_qos = xstrdup(part->allow_qos);
qos_list_build(part_ptr->allow_qos, false,
&part_ptr->allow_qos_bitstr);
}
if (part->deny_accounts) {
part_ptr->deny_accounts = xstrdup(part->deny_accounts);
part_ptr->deny_accts_list =
accounts_list_build(part_ptr->deny_accounts, false);
}
if (part->deny_qos) {
part_ptr->deny_qos = xstrdup(part->deny_qos);
qos_list_build(part_ptr->deny_qos, false,
&part_ptr->deny_qos_bitstr);
}
if (part->qos_char) {
slurmdb_qos_rec_t qos_rec;
part_ptr->qos_char = xstrdup(part->qos_char);
memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t));
qos_rec.name = part_ptr->qos_char;
if (assoc_mgr_fill_in_qos(
acct_db_conn, &qos_rec, accounting_enforce,
(slurmdb_qos_rec_t **)&part_ptr->qos_ptr, 0)
!= SLURM_SUCCESS) {
fatal("Partition %s has an invalid qos (%s), "
"please check your configuration",
part_ptr->name, qos_rec.name);
}
if (part_ptr->qos_ptr) {
if ((part_ptr->qos_ptr->flags & QOS_FLAG_PART_QOS) &&
(part_ptr->qos_ptr->flags & QOS_FLAG_RELATIVE))
fatal("QOS %s is a relative QOS. A relative QOS must be unique per partition. Please check your configuration and adjust accordingly",
part_ptr->qos_ptr->name);
part_ptr->qos_ptr->flags |= QOS_FLAG_PART_QOS;
}
}
if (part->topology_name) {
part_ptr->topology_name = xstrdup(part->topology_name);
}
return 0;
}
/*
* _build_all_partitionline_info - get an array of slurm_conf_partition_t
* structures from the slurm.conf reader, build table, and set values
* Note: Operates on common variables
* global: part_list - global partition list pointer
* default_part - default parameters for a partition
*/
static void _build_all_partitionline_info(void)
{
slurm_conf_partition_t **ptr_array;
int count;
int i;
count = slurm_conf_partition_array(&ptr_array);
for (i = 0; i < count; i++)
_build_single_partitionline_info(ptr_array[i]);
}
static int _set_max_part_prio(void *x, void *arg)
{
part_record_t *part_ptr = x;
if (part_ptr->priority_job_factor > part_max_priority)
part_max_priority = part_ptr->priority_job_factor;
return 0;
}
static int _reset_part_prio(void *x, void *arg)
{
part_record_t *part_ptr = x;
/* protect against div0 if all partition priorities are zero */
if (part_max_priority == 0) {
part_ptr->norm_priority = 0;
return 0;
}
part_ptr->norm_priority = (double)part_ptr->priority_job_factor /
(double)part_max_priority;
return 0;
}
/* _sync_part_prio - Set normalized partition priorities */
static void _sync_part_prio(void)
{
/* reset global value from part list */
part_max_priority = DEF_PART_MAX_PRIORITY;
list_for_each(part_list, _set_max_part_prio, NULL);
/* renormalize values after finding new max */
list_for_each(part_list, _reset_part_prio, NULL);
}
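/*
* list_for_each() callback: requeue a batch job (or its hetjob leader) that
* was marked NODE_FAIL or REQUEUE during startup, then clear the REQUEUE flag.
*/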
static int _foreach_requeue_job_node_failed(void *x, void *arg)
{
job_record_t *job_ptr = x;
job_record_t *het_job_leader;
int rc = SLURM_SUCCESS;
xassert(job_ptr->magic == JOB_MAGIC);
if (!IS_JOB_NODE_FAILED(job_ptr) && !IS_JOB_REQUEUED(job_ptr))
return SLURM_SUCCESS;
het_job_leader = find_job_record(job_ptr->het_job_id);
if (het_job_leader && het_job_leader->batch_flag &&
het_job_leader->details &&
het_job_leader->details->requeue &&
het_job_leader->part_ptr) {
info("Requeue het job leader %pJ due to node failure on %pJ",
het_job_leader, job_ptr);
if ((rc = job_requeue(0, het_job_leader->job_id, NULL, false,
0)))
error("Unable to requeue %pJ: %s",
het_job_leader, slurm_strerror(rc));
} else if (job_ptr->batch_flag && job_ptr->details &&
job_ptr->details->requeue && job_ptr->part_ptr) {
info("Requeue job %pJ due to node failure",
job_ptr);
if ((rc = job_requeue(0, job_ptr->job_id, NULL, false, 0)))
error("Unable to requeue %pJ: %s",
job_ptr, slurm_strerror(rc));
}
job_state_unset_flag(job_ptr, JOB_REQUEUE);
return rc;
}
static void _requeue_job_node_failed(void)
{
xassert(job_list);
(void) list_for_each_nobreak(job_list,
_foreach_requeue_job_node_failed, NULL);
}
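/*
* Mark a job as terminated with the given state and reason, log its
* completion, and run the slurmctld epilog when appropriate.
*/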
static void _abort_job(job_record_t *job_ptr, uint32_t job_state,
uint16_t state_reason, char *reason_string)
{
time_t now = time(NULL);
job_state_set(job_ptr, (job_state | JOB_COMPLETING));
build_cg_bitmap(job_ptr);
job_ptr->end_time = MIN(job_ptr->end_time, now);
job_ptr->state_reason = state_reason;
xfree(job_ptr->state_desc);
job_ptr->state_desc = xstrdup(reason_string);
job_completion_logger(job_ptr, false);
if (job_ptr->job_state == JOB_NODE_FAIL) {
/* build_cg_bitmap() may clear JOB_COMPLETING */
epilog_slurmctld(job_ptr);
}
}
static int _mark_het_job_unused(void *x, void *arg)
{
job_record_t *job_ptr = x;
job_ptr->bit_flags &= (~HET_JOB_FLAG);
return 0;
}
static int _mark_het_job_used(void *x, void *arg)
{
job_record_t *job_ptr = x;
job_ptr->bit_flags |= HET_JOB_FLAG;
return 0;
}
static int _test_het_job_used(void *x, void *arg)
{
job_record_t *job_ptr = x;
if ((job_ptr->het_job_id == 0) || IS_JOB_FINISHED(job_ptr))
return 0;
if (job_ptr->bit_flags & HET_JOB_FLAG)
return 0;
error("Incomplete hetjob being aborted %pJ", job_ptr);
_abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM, "incomplete hetjob");
return 0;
}
/*
* Validate heterogeneous jobs
*
* Make sure that every active (not yet complete) job has all of its components
* and they are all in the same state. Also rebuild het_job_list.
* If hetjob is corrupted, aborts and removes it from job_list.
*/
static void _validate_het_jobs(void)
{
list_itr_t *job_iterator;
job_record_t *job_ptr, *het_job_ptr;
hostset_t *hs;
char *job_id_str;
uint32_t job_id;
bool het_job_valid;
list_for_each(job_list, _mark_het_job_unused, NULL);
job_iterator = list_iterator_create(job_list);
while ((job_ptr = list_next(job_iterator))) {
/* Checking for corrupted hetjob components */
if (job_ptr->het_job_offset != 0) {
het_job_ptr = find_job_record(job_ptr->het_job_id);
if (!het_job_ptr) {
error("Could not find hetjob leader (JobId=%u) of %pJ. Aborting and removing job as it is corrupted.",
job_ptr->het_job_id, job_ptr);
_abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM,
"invalid het_job_id_set");
if (list_delete_item(job_iterator) != 1)
error("Not able to remove the job.");
continue;
}
if (job_ptr->het_job_id &&
(job_ptr->job_id == job_ptr->het_job_id)) {
error("Invalid HetJob component %pJ HetJobIdSet=%s. Aborting and removing job.",
job_ptr,
job_ptr->het_job_id_set);
_abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM,
"Invalid HetJob component");
if (list_delete_item(job_iterator) != 1)
error("Not able to remove the job.");
continue;
}
}
if ((job_ptr->het_job_id == 0) ||
(job_ptr->het_job_offset != 0))
continue;
/* active het job leader found */
FREE_NULL_LIST(job_ptr->het_job_list);
job_id_str = NULL;
/* Need to wrap numbers with brackets for hostset functions */
xstrfmtcat(job_id_str, "[%s]", job_ptr->het_job_id_set);
hs = hostset_create(job_id_str);
xfree(job_id_str);
if (!hs) {
error("%pJ has invalid het_job_id_set(%s). Aborting and removing job as it is corrupted.",
job_ptr, job_ptr->het_job_id_set);
_abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM,
"invalid het_job_id_set");
if (list_delete_item(job_iterator) != 1)
error("Not able to remove the job.");
continue;
}
job_ptr->het_job_list = list_create(NULL);
het_job_valid = true; /* assume valid for now */
while (het_job_valid && (job_id_str = hostset_shift(hs))) {
job_id = (uint32_t) strtoll(job_id_str, NULL, 10);
het_job_ptr = find_job_record(job_id);
if (!het_job_ptr) {
error("Could not find JobId=%u, part of hetjob JobId=%u",
job_id, job_ptr->job_id);
het_job_valid = false;
} else if (het_job_ptr->het_job_id !=
job_ptr->job_id) {
error("Invalid state of JobId=%u, part of hetjob JobId=%u",
job_id, job_ptr->job_id);
het_job_valid = false;
} else {
list_append(job_ptr->het_job_list,
het_job_ptr);
}
free(job_id_str);
}
hostset_destroy(hs);
if (het_job_valid) {
list_for_each(job_ptr->het_job_list, _mark_het_job_used,
NULL);
}
}
list_iterator_destroy(job_iterator);
list_for_each(job_list, _test_het_job_used, NULL);
}
/* Log an error if SlurmdUser is not root and any cgroup plugin is used */
static void _test_cgroup_plugin_use(void)
{
if (xstrstr(slurm_conf.task_plugin, "cgroup"))
error("task/cgroup plugin will not work unless SlurmdUser is root");
if (xstrstr(slurm_conf.proctrack_type, "cgroup"))
error("proctrack/cgroup plugin will not work unless SlurmdUser is root");
}
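/*
* Rebuild the node bitmap of every active step of a job from its saved node
* list, deleting any step whose node list is missing or invalid.
*/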
static void _sync_steps_to_conf(job_record_t *job_ptr)
{
list_itr_t *step_iterator;
step_record_t *step_ptr;
step_iterator = list_iterator_create (job_ptr->step_list);
while ((step_ptr = list_next(step_iterator))) {
if (step_ptr->state < JOB_RUNNING)
continue;
FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
if (step_ptr->step_layout &&
step_ptr->step_layout->node_list &&
(node_name2bitmap(step_ptr->step_layout->node_list, false,
&step_ptr->step_node_bitmap, NULL))) {
error("Invalid step_node_list (%s) for %pS",
step_ptr->step_layout->node_list, step_ptr);
delete_step_record(job_ptr, step_ptr);
} else if (step_ptr->step_node_bitmap == NULL) {
error("Missing node_list for %pS", step_ptr);
delete_step_record(job_ptr, step_ptr);
}
}
list_iterator_destroy (step_iterator);
}
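/*
* Rebuild a job's required and excluded node bitmaps from the saved node
* name strings. Returns SLURM_ERROR if the required node list can no longer
* be satisfied.
*/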
static int _sync_detail_bitmaps(job_record_t *job_ptr)
{
if (job_ptr->details == NULL)
return SLURM_SUCCESS;
FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
if ((job_ptr->details->req_nodes) &&
(node_name2bitmap(job_ptr->details->req_nodes, false,
&job_ptr->details->req_node_bitmap, NULL))) {
error("Invalid req_nodes (%s) for %pJ",
job_ptr->details->req_nodes, job_ptr);
return SLURM_ERROR;
}
/*
* Ignore any errors if the exc_nodes list contains invalid entries.
* We can be pretty sure we won't schedule onto nodes that don't exist.
*/
FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
if (job_ptr->details->exc_nodes)
node_name2bitmap(job_ptr->details->exc_nodes, false,
&job_ptr->details->exc_node_bitmap, NULL);
/*
* If a nodelist has been provided with more nodes than are required
* for the job, translate this into an exclusion of all nodes except
* those requested.
*/
if (job_ptr->details->req_node_bitmap &&
(bit_set_count(job_ptr->details->req_node_bitmap) >
job_ptr->details->min_nodes)) {
if (!job_ptr->details->exc_node_bitmap)
job_ptr->details->exc_node_bitmap =
bit_alloc(node_record_count);
bit_or_not(job_ptr->details->exc_node_bitmap,
job_ptr->details->req_node_bitmap);
FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
}
return SLURM_SUCCESS;
}
/*
* _sync_jobs_to_conf - Sync current slurm.conf configuration for existing jobs.
* This should be called after rebuilding node, part, and gres information,
* but before using any job entries.
* global: last_job_update - time of last job table update
* job_list - pointer to global job list
*/
void _sync_jobs_to_conf(void)
{
list_itr_t *job_iterator;
job_record_t *job_ptr;
part_record_t *part_ptr;
list_t *part_ptr_list = NULL;
bool job_fail = false;
time_t now = time(NULL);
bool gang_flag = false;
xassert(job_list);
if (slurm_conf.preempt_mode & PREEMPT_MODE_GANG)
gang_flag = true;
job_iterator = list_iterator_create(job_list);
while ((job_ptr = list_next(job_iterator))) {
xassert (job_ptr->magic == JOB_MAGIC);
job_fail = false;
/*
* This resets the req/exc node bitmaps, so even if the job is
* finished it still needs to happen just in case the job is
* requeued.
*/
if (_sync_detail_bitmaps(job_ptr)) {
job_fail = true;
if (job_ptr->details) {
/*
* job can't be requeued because either
* req_nodes or exc_nodes can't be satisfied.
*/
job_ptr->details->requeue = false;
}
}
/*
* While the job is completed at this point there is code in
* _job_requeue_op() that requires the part_ptr to be set in
* order to requeue a job. We also need to set it to NULL if
* the partition was removed or we will be pointing at bad
* data. This is the safest/easiest place to do it.
*/
if (job_ptr->partition == NULL) {
error("No partition for %pJ", job_ptr);
part_ptr = NULL;
job_fail = true;
} else {
char *err_part = NULL;
get_part_list(job_ptr->partition, &part_ptr_list,
&part_ptr, &err_part);
if (part_ptr == NULL) {
error("Invalid partition (%s) for %pJ",
err_part, job_ptr);
xfree(err_part);
job_fail = true;
}
}
job_ptr->part_ptr = part_ptr;
FREE_NULL_LIST(job_ptr->part_ptr_list);
if (part_ptr_list) {
job_ptr->part_ptr_list = part_ptr_list;
part_ptr_list = NULL; /* clear for next job */
}
/*
* If the job is finished there is no reason to do anything
* below this.
*/
if (IS_JOB_COMPLETED(job_ptr))
continue;
FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
if (job_ptr->nodes_completing &&
node_name2bitmap(job_ptr->nodes_completing,
false, &job_ptr->node_bitmap_cg, NULL)) {
error("Invalid nodes_completing (%s) for %pJ",
job_ptr->nodes_completing, job_ptr);
job_fail = true;
}
FREE_NULL_BITMAP(job_ptr->node_bitmap);
if (job_ptr->nodes &&
node_name2bitmap(job_ptr->nodes, false,
&job_ptr->node_bitmap, NULL)) {
error("Invalid nodes (%s) for %pJ",
job_ptr->nodes, job_ptr);
job_fail = true;
}
FREE_NULL_BITMAP(job_ptr->node_bitmap_pr);
if (job_ptr->nodes_pr &&
node_name2bitmap(job_ptr->nodes_pr, false,
&job_ptr->node_bitmap_pr, NULL)) {
error("Invalid nodes_pr (%s) for %pJ",
job_ptr->nodes_pr, job_ptr);
job_fail = true;
}
if (reset_node_bitmap(job_ptr))
job_fail = true;
if (!job_fail &&
job_ptr->job_resrcs &&
(running_cons_tres() || gang_flag) &&
valid_job_resources(job_ptr->job_resrcs)) {
error("Aborting %pJ due to change in socket/core configuration of allocated nodes",
job_ptr);
job_fail = true;
}
if (!job_fail &&
gres_job_revalidate(job_ptr->gres_list_req)) {
error("Aborting %pJ due to use of unsupported GRES options",
job_ptr);
job_fail = true;
if (job_ptr->details) {
/* don't attempt to requeue job */
job_ptr->details->requeue = false;
}
}
if (!job_fail && job_ptr->job_resrcs &&
(IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) &&
gres_job_revalidate2(job_ptr->job_id,
job_ptr->gres_list_alloc,
job_ptr->job_resrcs->node_bitmap)) {
/*
* This can be due to the job being allocated GRES
* which no longer exist (i.e. the GRES count on some
* allocated node changed since when the job started).
*/
error("Aborting %pJ due to use of invalid GRES configuration",
job_ptr);
job_fail = true;
}
_sync_steps_to_conf(job_ptr);
build_node_details(job_ptr, false); /* set node_addr */
if (job_fail) {
bool was_running = false;
if (IS_JOB_PENDING(job_ptr)) {
job_ptr->start_time =
job_ptr->end_time = time(NULL);
job_state_set(job_ptr, JOB_NODE_FAIL);
} else if (IS_JOB_RUNNING(job_ptr)) {
job_ptr->end_time = time(NULL);
job_state_set(job_ptr, (JOB_NODE_FAIL |
JOB_COMPLETING));
build_cg_bitmap(job_ptr);
was_running = true;
} else if (IS_JOB_SUSPENDED(job_ptr)) {
job_ptr->end_time = job_ptr->suspend_time;
job_state_set(job_ptr, (JOB_NODE_FAIL |
JOB_COMPLETING));
build_cg_bitmap(job_ptr);
job_ptr->tot_sus_time +=
difftime(now, job_ptr->suspend_time);
jobacct_storage_g_job_suspend(acct_db_conn,
job_ptr);
was_running = true;
}
job_ptr->state_reason = FAIL_DOWN_NODE;
xfree(job_ptr->state_desc);
job_ptr->exit_code = 1;
job_completion_logger(job_ptr, false);
if (job_ptr->job_state == JOB_NODE_FAIL) {
/* build_cg_bitmap() may clear JOB_COMPLETING */
epilog_slurmctld(job_ptr);
}
if (was_running && job_ptr->batch_flag &&
job_ptr->details && job_ptr->details->requeue &&
job_ptr->part_ptr) {
/*
* Mark for requeue
* see _requeue_job_node_failed()
*/
info("Attempting to requeue failed job %pJ",
job_ptr);
job_state_set_flag(job_ptr, JOB_REQUEUE);
/* Reset node_cnt to exclude vanished nodes */
job_ptr->node_cnt = bit_set_count(
job_ptr->node_bitmap_cg);
/* Reset exit code from last run */
job_ptr->exit_code = 0;
}
}
}
list_iterator_reset(job_iterator);
/* This will reinitialize the select plugin database, which
* we can only do after ALL job's states and bitmaps are set
* (i.e. it needs to be in this second loop) */
while ((job_ptr = list_next(job_iterator))) {
if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) {
error("select_g_select_nodeinfo_set(%pJ): %m",
job_ptr);
}
}
list_iterator_destroy(job_iterator);
last_job_update = now;
}
/*
* read_slurm_conf - load the slurm configuration from the configured file.
* read_slurm_conf can be called more than once if so desired.
* IN recover - replace job, node and/or partition data with latest
* available information depending upon value
* 0 = use no saved state information, rebuild everything from
* slurm.conf contents
* 1 = recover saved job and trigger state,
* node DOWN/DRAIN/FAIL state and reason information
* 2 = recover all saved state
* RET SLURM_SUCCESS if no error, otherwise an error code
* Note: Operates on common variables only
*/
extern int read_slurm_conf(int recover)
{
DEF_TIMERS;
int error_code = SLURM_SUCCESS;
int rc = 0, load_job_ret = SLURM_SUCCESS;
char *old_auth_type = xstrdup(slurm_conf.authtype);
char *old_bb_type = xstrdup(slurm_conf.bb_type);
char *old_cred_type = xstrdup(slurm_conf.cred_type);
char *old_job_container_type = xstrdup(slurm_conf.job_container_plugin);
char *old_preempt_type = xstrdup(slurm_conf.preempt_type);
char *old_sched_type = xstrdup(slurm_conf.schedtype);
char *old_select_type = xstrdup(slurm_conf.select_type);
char *old_switch_type = xstrdup(slurm_conf.switch_type);
char *state_save_dir = xstrdup(slurm_conf.state_save_location);
char *tmp_ptr = NULL;
uint16_t old_select_type_p = slurm_conf.select_type_param;
bool cgroup_mem_confinement = false;
uint16_t reconfig_flags = slurm_conf.reconfig_flags;
/* initialization */
START_TIMER;
_init_all_slurm_conf();
cgroup_conf_init();
cgroup_mem_confinement = cgroup_memcg_job_confinement();
if (slurm_conf.job_acct_oom_kill && cgroup_mem_confinement)
fatal("Jobs memory is being constrained by both TaskPlugin cgroup and JobAcctGather plugin. This enables two incompatible memory enforcement mechanisms, one of them must be disabled.");
else if (slurm_conf.job_acct_oom_kill)
info("Memory enforcing by using JobAcctGather's mechanism is discouraged, task/cgroup is recommended where available.");
else if (!cgroup_mem_confinement)
info("No memory enforcing mechanism configured.");
if (slurm_conf.slurmd_user_id != 0)
_test_cgroup_plugin_use();
if (topology_g_init() != SLURM_SUCCESS)
fatal("Failed to initialize topology plugin");
if (xstrcasestr(slurm_conf.slurmctld_params, "enable_stepmgr") &&
!(slurm_conf.prolog_flags & PROLOG_FLAG_CONTAIN))
fatal("STEP_MGR not supported without PrologFlags=contain");
/* Build node and partition information based upon slurm.conf file */
if ((error_code = build_all_nodeline_info(false, slurmctld_tres_cnt)))
goto end_it;
/* Increase node table to handle dynamic nodes. */
if ((slurm_conf.max_node_cnt != NO_VAL) &&
node_record_count < slurm_conf.max_node_cnt) {
node_record_count = slurm_conf.max_node_cnt;
grow_node_record_table_ptr();
} else {
/* Lock node_record_table_ptr from growing */
slurm_conf.max_node_cnt = node_record_count;
}
if (slurm_conf.max_node_cnt == 0) {
/*
* Set to 1 so bitmaps will be created but don't allow any nodes
* to be created.
*/
node_record_count = 1;
grow_node_record_table_ptr();
}
bit_cache_init(node_record_count);
(void)acct_storage_g_reconfig(acct_db_conn, 0);
_handle_all_downnodes();
_build_all_partitionline_info();
/*
* Currently load/dump_state_lite has to run before load_all_job_state.
* FIXME: this stores a single string, this should probably move into
* the job state file as it's only pertinent to job accounting.
*/
load_config_state_lite();
dump_config_state_lite();
update_logging();
if (jobcomp_g_init() != SLURM_SUCCESS)
fatal("Failed to initialize jobcomp plugin");
if (controller_init_scheduling(
(slurm_conf.preempt_mode & PREEMPT_MODE_GANG)) != SLURM_SUCCESS) {
fatal("Failed to initialize the various schedulers");
}
if (default_part_loc == NULL)
error("%s: default partition not set.", __func__);
if (node_record_count < 1) {
error("%s: no nodes configured.", __func__);
error_code = EINVAL;
goto end_it;
}
/*
* Node reordering may be done by the topology plugin.
* Reordering the table must be done before hashing the
* nodes, and before any position-relative bitmaps are created.
*
* Sort the nodes read in from the slurm.conf first before restoring
* the dynamic nodes from the state file to prevent dynamic nodes from
* being sorted -- which can cause problems with heterogeneous jobs and
* the order of the sockets changing on startup.
*/
_sort_node_record_table_ptr();
/*
* Load node state which includes dynamic nodes so that dynamic nodes
* can be included in topology.
*/
if (recover == 0) { /* Build everything from slurm.conf */
_set_features(node_record_table_ptr, node_record_count,
recover);
} else if (recover == 1) { /* Load job & node state files */
(void) load_all_node_state(true);
_set_features(node_record_table_ptr, node_record_count,
recover);
} else if (recover > 1) { /* Load node, part & job state files */
(void) load_all_node_state(false);
_set_features(NULL, 0, recover);
}
rehash_node();
topology_g_build_config();
rehash_jobs();
_validate_slurmd_addr();
_stat_slurm_dirs();
_init_bitmaps();
/*
* Set standard features and preserve the plugin controlled ones.
*/
if (recover == 0) { /* Build everything from slurm.conf */
load_last_job_id();
reset_first_job_id();
controller_reconfig_scheduling();
} else if (recover == 1) { /* Load job & node state files */
load_job_ret = load_all_job_state();
} else if (recover > 1) { /* Load node, part & job state files */
reconfig_flags |= RECONFIG_KEEP_PART_INFO;
load_job_ret = load_all_job_state();
}
(void) load_all_part_state(reconfig_flags);
/*
* _build_node_config_bitmaps() must be called before
* build_features_list_*() and before restore_node_features()
*/
_build_node_config_bitmaps();
/* _gres_reconfig needs to happen before restore_node_features */
_gres_reconfig();
/* NOTE: Run restore_node_features before _restore_job_accounting */
restore_node_features(recover);
if ((node_features_g_count() > 0) &&
(node_features_g_get_node(NULL) != SLURM_SUCCESS))
error("failed to initialize node features");
/*
* _build_bitmaps() must follow node_features_g_get_node() and
* precede build_features_list_*()
*/
_build_bitmaps();
if (_set_nodes_topo()) {
error("Invalid node topology");
error_code = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
goto end_it;
}
/* Active and available features can be different on -R */
if ((node_features_g_count() == 0) && (recover != 2))
node_features_build_list_eq();
else
node_features_build_list_ne();
_sync_part_prio();
_build_part_bitmaps(); /* Must be called after build_feature_list_*() */
if (list_for_each(part_list, set_part_topology_idx, NULL) < 0) {
error("Invalid partition topology");
error_code = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
goto end_it;
}
if (select_g_node_init() != SLURM_SUCCESS)
fatal("Failed to initialize node selection plugin state, Clean start required.");
/*
* config_power_mgr() Must be after node and partitions have been loaded
* and before any calls to power_save_test().
*/
config_power_mgr();
_sync_jobs_to_conf(); /* must follow select_g_job_init() */
/*
* The burst buffer plugin must be initialized and state loaded before
* _sync_nodes_to_jobs(), which calls bb_g_job_init().
*/
rc = bb_g_load_state(true);
error_code = MAX(error_code, rc); /* not fatal */
(void) _sync_nodes_to_jobs();
(void) sync_job_files();
reserve_port_config(slurm_conf.mpi_params, job_list);
if (license_update(slurm_conf.licenses) != SLURM_SUCCESS)
fatal("Invalid Licenses value: %s", slurm_conf.licenses);
init_requeue_policy();
init_depend_policy();
/*
* Must be called after nodes and partitions (e.g.
* _build_part_bitmaps()) have been created and before
* _sync_nodes_to_comp_job().
*/
set_cluster_tres(false);
_validate_het_jobs();
(void) _sync_nodes_to_comp_job();/* must follow select_g_node_init() */
_requeue_job_node_failed();
load_part_uid_allow_list(true);
/* NOTE: Run load_all_resv_state() before _restore_job_accounting */
load_all_resv_state(recover);
if (recover >= 1) {
trigger_state_restore();
controller_reconfig_scheduling();
}
hres_init();
_restore_job_accounting();
/* sort config_list by weight for scheduling */
list_sort(config_list, &list_compare_config);
/* Update plugins as possible */
if (xstrcmp(old_auth_type, slurm_conf.authtype)) {
xfree(slurm_conf.authtype);
slurm_conf.authtype = old_auth_type;
old_auth_type = NULL;
rc = ESLURM_INVALID_AUTHTYPE_CHANGE;
}
if (xstrcmp(old_bb_type, slurm_conf.bb_type)) {
xfree(slurm_conf.bb_type);
slurm_conf.bb_type = old_bb_type;
old_bb_type = NULL;
rc = ESLURM_INVALID_BURST_BUFFER_CHANGE;
}
if (xstrcmp(old_cred_type, slurm_conf.cred_type)) {
xfree(slurm_conf.cred_type);
slurm_conf.cred_type = old_cred_type;
old_cred_type = NULL;
rc = ESLURM_INVALID_CRED_TYPE_CHANGE;
}
if (xstrcmp(old_job_container_type, slurm_conf.job_container_plugin)) {
xfree(slurm_conf.job_container_plugin);
slurm_conf.job_container_plugin = old_job_container_type;
old_job_container_type = NULL;
rc = ESLURM_INVALID_JOB_CONTAINER_CHANGE;
}
if (xstrcmp(old_sched_type, slurm_conf.schedtype)) {
xfree(slurm_conf.schedtype);
slurm_conf.schedtype = old_sched_type;
old_sched_type = NULL;
rc = ESLURM_INVALID_SCHEDTYPE_CHANGE;
}
if (xstrcmp(old_select_type, slurm_conf.select_type)) {
xfree(slurm_conf.select_type);
slurm_conf.select_type = old_select_type;
old_select_type = NULL;
rc = ESLURM_INVALID_SELECTTYPE_CHANGE;
}
if (xstrcmp(old_switch_type, slurm_conf.switch_type)) {
xfree(slurm_conf.switch_type);
slurm_conf.switch_type = old_switch_type;
old_switch_type = NULL;
rc = ESLURM_INVALID_SWITCHTYPE_CHANGE;
}
if ((slurm_conf.control_cnt < 2) ||
(slurm_conf.control_machine[1] == NULL))
info("%s: backup_controller not specified", __func__);
error_code = MAX(error_code, rc); /* not fatal */
if (xstrcmp(old_preempt_type, slurm_conf.preempt_type)) {
info("Changing PreemptType from %s to %s",
old_preempt_type, slurm_conf.preempt_type);
(void) preempt_g_fini();
if (preempt_g_init() != SLURM_SUCCESS)
fatal("failed to initialize preempt plugin");
}
/* Update plugin parameters as possible */
rc = _preserve_select_type_param(&slurm_conf, old_select_type_p);
error_code = MAX(error_code, rc); /* not fatal */
/*
* Restore job accounting info if file missing or corrupted,
* an extremely rare situation
*/
if (load_job_ret)
_acct_restore_active_jobs();
/* Sync select plugin with synchronized job/node/part data */
gres_reconfig(); /* Clear gres/mps counters */
select_g_reconfigure();
_set_response_cluster_rec();
consolidate_config_list(true, true);
cloud_dns = xstrcasestr(slurm_conf.slurmctld_params, "cloud_dns");
if ((tmp_ptr = xstrcasestr(slurm_conf.slurmctld_params,
"max_powered_nodes="))) {
max_powered_nodes =
strtol(tmp_ptr + strlen("max_powered_nodes="),
NULL, 10);
}
slurm_conf.last_update = time(NULL);
end_it:
xfree(old_auth_type);
xfree(old_bb_type);
xfree(old_cred_type);
xfree(old_job_container_type);
xfree(old_preempt_type);
xfree(old_sched_type);
xfree(old_select_type);
xfree(old_switch_type);
xfree(state_save_dir);
END_TIMER2(__func__);
return error_code;
}
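/*
* Rebuild each node's GRES list from the saved node state (recover == 2) or
* from its config record. For cloud nodes, load and validate the GRES
* configuration now rather than waiting for node registration.
*/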
static void _gres_reconfig(void)
{
node_record_t *node_ptr;
char *gres_name;
int i;
for (i = 0; (node_ptr = next_node(&i)); i++) {
/* node_ptr->gres is set when recover == 2 */
if (node_ptr->gres)
gres_name = node_ptr->gres;
else
gres_name = node_ptr->config_ptr->gres;
gres_init_node_config(gres_name, &node_ptr->gres_list);
if (!IS_NODE_CLOUD(node_ptr))
continue;
/*
* Load in GRES for node now. By default Slurm gets this
* information when the node registers for the first
* time, which can take a while for a node in the cloud
* to boot.
*/
if (gres_g_node_config_load(node_ptr->config_ptr->cpus,
node_ptr->name, node_ptr->gres_list,
NULL, NULL) != SLURM_SUCCESS)
continue; /* No need to validate if load failed */
gres_node_config_validate(node_ptr,
node_ptr->config_ptr->threads,
node_ptr->config_ptr->cores,
node_ptr->config_ptr->tot_sockets,
(slurm_conf.conf_flags &
CONF_FLAG_OR),
NULL);
}
}
/*
* Append changeable features in old_features and not in features to features.
*/
static void _merge_changeable_features(char *old_features, char **features)
{
char *save_ptr_old = NULL;
char *tok_old, *tmp_old, *tok_new;
char *sep;
if (*features)
sep = ",";
else
sep = "";
/* Merge features strings, skipping duplicates */
tmp_old = xstrdup(old_features);
for (tok_old = strtok_r(tmp_old, ",", &save_ptr_old);
tok_old;
tok_old = strtok_r(NULL, ",", &save_ptr_old)) {
bool match = false;
if (!node_features_g_changeable_feature(tok_old))
continue;
if (*features) {
char *tmp_new, *save_ptr_new = NULL;
/* Check if old feature already exists in features string */
tmp_new = xstrdup(*features);
for (tok_new = strtok_r(tmp_new, ",", &save_ptr_new);
tok_new;
tok_new = strtok_r(NULL, ",", &save_ptr_new)) {
if (!xstrcmp(tok_old, tok_new)) {
match = true;
break;
}
}
xfree(tmp_new);
}
if (match)
continue;
xstrfmtcat(*features, "%s%s", sep, tok_old);
sep = ",";
}
xfree(tmp_old);
}
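/*
* Append to *active any changeable feature from old_active that is still
* present in the available feature list.
*/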
static void _preserve_active_features(const char *available,
const char *old_active,
char **active)
{
char *old_feature, *saveptr_old;
char *tmp_old_active;
if (!available || !old_active)
return;
tmp_old_active = xstrdup(old_active);
for (old_feature = strtok_r(tmp_old_active, ",", &saveptr_old);
old_feature;
old_feature = strtok_r(NULL, ",", &saveptr_old)) {
char *new_feature, *saveptr_avail;
char *tmp_avail;
if (!node_features_g_changeable_feature(old_feature))
continue;
tmp_avail = xstrdup(available);
for (new_feature = strtok_r(tmp_avail, ",", &saveptr_avail);
new_feature;
new_feature = strtok_r(NULL, ",", &saveptr_avail)) {
if (!xstrcmp(old_feature, new_feature)) {
xstrfmtcat(*active, "%s%s",
*active ? "," : "", old_feature);
break;
}
}
xfree(tmp_avail);
}
xfree(tmp_old_active);
}
/*
* Configure node features.
* IN old_node_table_ptr IN - Previous nodes information
* IN old_node_record_count IN - Count of previous nodes information
* IN recover - replace node features data depending upon value.
* 0, 1 - use data from config record, built using slurm.conf
* 2 = use data from node record, built from saved state
*/
static void _set_features(node_record_t **old_node_table_ptr,
int old_node_record_count, int recover)
{
node_record_t *node_ptr, *old_node_ptr;
int i, node_features_cnt = node_features_g_count();
for (i = 0; i < old_node_record_count; i++) {
char *old_features_act;
if (!(old_node_ptr = old_node_table_ptr[i]))
continue;
node_ptr = find_node_record(old_node_ptr->name);
if (node_ptr == NULL)
continue;
/*
* Load all from state, ignore what has been read from
* slurm.conf. Features in node record just a placeholder
* for restore_node_features() to set up new config records.
*/
if (recover == 2) {
xfree(node_ptr->features);
xfree(node_ptr->features_act);
node_ptr->features = old_node_ptr->features;
node_ptr->features_act = old_node_ptr->features_act;
old_node_ptr->features = NULL;
old_node_ptr->features_act = NULL;
continue;
}
/* No changeable features so active == available */
if (node_features_cnt == 0) {
xfree(node_ptr->features_act);
node_ptr->features_act = xstrdup(node_ptr->features);
continue;
}
/* If we are here, there's a node_features plugin active */
/*
* Changeable features may be listed in the slurm.conf along
* with the non-changeable features (e.g. cloud nodes). So
* filter out the changeable features and leave only the
* non-changeable features. non-changeable features are active
* by default.
*/
old_features_act = node_ptr->features_act;
node_ptr->features_act =
filter_out_changeable_features(node_ptr->features);
/*
* Preserve active features on startup but make sure they are a
* subset of available features -- in case available features
* were changed.
*
* features_act has all non-changeable features now. We need to
* add back previous active features that are in available
* features.
*
* For cloud nodes, changeable features are added in slurm.conf.
* This will preserve the cloud active features on startup. When
* changeable features aren't defined in slurm.conf then
* features_act will be reset to all non-changeable features
* read in from slurm.conf and will expect to get the available
* and active features from the slurmd.
*/
_preserve_active_features(node_ptr->features, old_features_act,
&node_ptr->features_act);
xfree(old_features_act);
/*
* On startup, node_record_table_ptr is passed as
* old_node_table_ptr so no need to merge features.
*/
if (node_ptr == old_node_ptr)
continue;
/*
* The subset of plugin-controlled features_available and
* features_active found in the old node_ptr for this node is
* copied into the corresponding fields of the new node record.
* This ensures that KNL modes are preserved across a
* reconfigure. Otherwise, we would have to wait until the node
* registers to get the KNL available and active features.
*/
if (old_node_ptr->features != NULL) {
_merge_changeable_features(old_node_ptr->features,
&node_ptr->features);
}
if (old_node_ptr->features_act != NULL) {
_merge_changeable_features(old_node_ptr->features_act,
&node_ptr->features_act);
}
}
}
/*
* _preserve_select_type_param - preserve original plugin parameters.
* Daemons and/or commands must be restarted for some
* select plugin value changes to take effect.
* RET zero or error code
*/
static int _preserve_select_type_param(slurm_conf_t *ctl_conf_ptr,
uint16_t old_select_type_p)
{
int rc = SLURM_SUCCESS;
/* SelectTypeParameters cannot change */
if (old_select_type_p) {
if (old_select_type_p != ctl_conf_ptr->select_type_param) {
ctl_conf_ptr->select_type_param = old_select_type_p;
rc = ESLURM_INVALID_SELECTTYPE_CHANGE;
}
}
return rc;
}
/*
* _sync_nodes_to_jobs - sync node state to job states on slurmctld restart.
* This routine marks nodes allocated to a job as busy no matter what
* the node's last saved state was.
* RET count of nodes having state changed
* Note: Operates on common variables, no arguments
*/
static int _sync_nodes_to_jobs(void)
{
job_record_t *job_ptr;
list_itr_t *job_iterator;
int update_cnt = 0;
job_iterator = list_iterator_create(job_list);
while ((job_ptr = list_next(job_iterator))) {
if (job_ptr->details && job_ptr->details->prolog_running) {
job_ptr->details->prolog_running = 0;
if (IS_JOB_CONFIGURING(job_ptr)) {
prolog_slurmctld(job_ptr);
(void) bb_g_job_begin(job_ptr);
}
}
if (job_ptr->node_bitmap == NULL)
;
else if (IS_JOB_RUNNING(job_ptr) || IS_JOB_COMPLETING(job_ptr))
update_cnt += _sync_nodes_to_active_job(job_ptr);
else if (IS_JOB_SUSPENDED(job_ptr))
_sync_nodes_to_suspended_job(job_ptr);
}
list_iterator_destroy(job_iterator);
if (update_cnt) {
info("_sync_nodes_to_jobs updated state of %d nodes",
update_cnt);
}
return update_cnt;
}
/* For jobs which are in state COMPLETING, deallocate the nodes and
* issue the RPC to kill the job */
static int _sync_nodes_to_comp_job(void)
{
job_record_t *job_ptr;
list_itr_t *job_iterator;
int update_cnt = 0;
job_iterator = list_iterator_create(job_list);
while ((job_ptr = list_next(job_iterator))) {
if ((job_ptr->node_bitmap) && IS_JOB_COMPLETING(job_ptr)) {
/* If the controller is reconfiguring, the job is in the
* completing state, and the slurmctld epilog is already
* running (which means deallocate_nodes() was already
* called), do not invoke it again and don't start
* another epilog.
*/
if (job_ptr->epilog_running == true)
continue;
update_cnt++;
info("%s: %pJ in completing state", __func__, job_ptr);
if (!job_ptr->node_bitmap_cg)
build_cg_bitmap(job_ptr);
/* deallocate_nodes() will remove this job from the
* accounting limits before it was ever added, so add
* it now
*/
if (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
acct_policy_job_begin(job_ptr, false);
deallocate_nodes(job_ptr, false, false, false);
/* The job was already in the completing state at slurmctld
* restart or reconfiguration, so do not log its completion
* again.
* job_completion_logger(job_ptr, false); */
}
}
list_iterator_destroy(job_iterator);
if (update_cnt)
info("%s: completing %d jobs", __func__, update_cnt);
return update_cnt;
}
/* Synchronize states of nodes and active jobs (RUNNING or COMPLETING state)
* RET count of nodes with state changes */
static int _sync_nodes_to_active_job(job_record_t *job_ptr)
{
int cnt = 0;
uint32_t node_flags;
node_record_t *node_ptr;
bitstr_t *node_bitmap, *orig_job_node_bitmap = NULL;
if (job_ptr->node_bitmap_cg) /* job completing */
node_bitmap = job_ptr->node_bitmap_cg;
else
node_bitmap = job_ptr->node_bitmap;
job_ptr->node_cnt = bit_set_count(node_bitmap);
for (int i = 0; (node_ptr = next_node_bitmap(node_bitmap, &i)); i++) {
if ((job_ptr->details &&
(job_ptr->details->whole_node & WHOLE_NODE_USER)) ||
(job_ptr->part_ptr &&
(job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER))) {
node_ptr->owner_job_cnt++;
node_ptr->owner = job_ptr->user_id;
}
if (slurm_mcs_get_select(job_ptr) == 1) {
xfree(node_ptr->mcs_label);
node_ptr->mcs_label = xstrdup(job_ptr->mcs_label);
}
node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
if (IS_JOB_COMPLETING(job_ptr) && job_ptr->epilog_running) {
/*
* If the EpilogSlurmctld is still running,
* _sync_nodes_to_comp_job() won't call
* deallocate_nodes()/make_node_comp() to decrement
* run_job_cnt and increment comp_job_cnt, so just
* increment comp_job_cnt now.
*/
node_ptr->comp_job_cnt++;
} else {
/*
* run_job_cnt will be decremented by
* deallocate_nodes()/make_node_comp() in
* _sync_nodes_to_comp_job().
*/
node_ptr->run_job_cnt++;
}
if ((job_ptr->details) && (job_ptr->details->share_res == 0))
node_ptr->no_share_job_cnt++;
if (IS_NODE_DOWN(node_ptr) &&
IS_JOB_RUNNING(job_ptr) &&
(job_ptr->kill_on_node_fail == 0) &&
(job_ptr->node_cnt > 1)) {
/* This should only happen if a job was running
* on a node that was newly configured DOWN */
int save_accounting_enforce;
info("Removing failed node %s from %pJ",
node_ptr->name, job_ptr);
/*
* Disable accounting limit enforcement here. Accounting is
* reset for all jobs in _restore_job_accounting().
*/
save_accounting_enforce = accounting_enforce;
accounting_enforce &= (~ACCOUNTING_ENFORCE_LIMITS);
if (job_ptr->job_resrcs &&
job_ptr->job_resrcs->node_bitmap) {
/*
* node_bitmap is eventually changed within
* extract_job_resources_node() so we need to
* copy it before that.
*/
if (!orig_job_node_bitmap)
orig_job_node_bitmap = bit_copy(
job_ptr->job_resrcs->
node_bitmap);
} else {
error("We resized job %pJ, but the original node bitmap is unavailable. Unable to resize step node bitmaps for job's steps, this should never happen",
job_ptr);
}
job_pre_resize_acctg(job_ptr);
srun_node_fail(job_ptr, node_ptr->name);
kill_step_on_node(job_ptr, node_ptr, true);
excise_node_from_job(job_ptr, node_ptr);
job_post_resize_acctg(job_ptr);
accounting_enforce = save_accounting_enforce;
} else if (IS_NODE_DOWN(node_ptr) && IS_JOB_RUNNING(job_ptr)) {
info("Killing %pJ on DOWN node %s",
job_ptr, node_ptr->name);
job_ptr->exit_code = 1;
_abort_job(job_ptr, JOB_NODE_FAIL, FAIL_DOWN_NODE,
NULL);
cnt++;
} else if (IS_NODE_IDLE(node_ptr)) {
cnt++;
node_ptr->node_state = NODE_STATE_ALLOCATED |
node_flags;
}
}
/* If the job was resized then resize the bitmaps of the job's steps */
if (orig_job_node_bitmap)
rebuild_step_bitmaps(job_ptr, orig_job_node_bitmap);
FREE_NULL_BITMAP(orig_job_node_bitmap);
set_initial_job_alias_list(job_ptr);
return cnt;
}
/* Synchronize states of nodes and suspended jobs */
static void _sync_nodes_to_suspended_job(job_record_t *job_ptr)
{
node_record_t *node_ptr;
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
node_ptr->sus_job_cnt++;
}
set_initial_job_alias_list(job_ptr);
}
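/*
* _restore_job_licenses - rebuild the job's license lists from its
* requested and allocated license strings, then re-acquire the licenses
* for jobs in the RUNNING, SUSPENDED or COMPLETING state
*/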
static void _restore_job_licenses(job_record_t *job_ptr)
{
list_t *license_list = NULL, *license_list_alloc = NULL;
bool valid = true, alloc_valid = true;
license_list = license_validate(job_ptr->licenses, false, false, false,
job_ptr->tres_req_cnt, &valid);
license_list_alloc =
license_validate(job_ptr->licenses_allocated, false, false,
true, NULL, &alloc_valid);
FREE_NULL_LIST(job_ptr->license_list);
if (valid) {
job_ptr->license_list = license_list;
xfree(job_ptr->licenses);
job_ptr->licenses = license_list_to_string(license_list);
}
/*
* If there are allocated licenses, then set job_ptr->license_list to
* that so we get the correct licenses from the cluster.
*/
if (license_list_alloc && alloc_valid) {
FREE_NULL_LIST(job_ptr->license_list);
job_ptr->license_list = license_list_alloc;
xfree(job_ptr->licenses_allocated);
job_ptr->licenses_allocated =
license_list_to_string(job_ptr->license_list);
}
if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr) ||
IS_JOB_COMPLETING(job_ptr))
license_job_get(job_ptr, true);
}
/*
* Build license_list for every job.
* Reset accounting for every job.
* Reset the running job count for scheduling policy.
* This must be called after load_all_resv_state() and restore_node_features().
*/
static void _restore_job_accounting(void)
{
job_record_t *job_ptr;
list_itr_t *job_iterator;
assoc_mgr_clear_used_info();
job_iterator = list_iterator_create(job_list);
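/*
* Clear job array running-task counts first; job_array_start() below
* accounts again for the tasks that are still running or suspended.
*/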
while ((job_ptr = list_next(job_iterator))) {
if (job_ptr->array_recs)
job_ptr->array_recs->tot_run_tasks = 0;
}
list_iterator_reset(job_iterator);
while ((job_ptr = list_next(job_iterator))) {
(void) build_feature_list(job_ptr, false, false);
(void) build_feature_list(job_ptr, true, false);
if (job_ptr->details->features_use ==
job_ptr->details->features)
job_ptr->details->feature_list_use =
job_ptr->details->feature_list;
else if (job_ptr->details->features_use ==
job_ptr->details->prefer)
job_ptr->details->feature_list_use =
job_ptr->details->prefer_list;
(void) extra_constraints_parse(job_ptr->extra,
&job_ptr->extra_constraints);
if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
job_array_start(job_ptr);
if (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) {
if (!IS_JOB_FINISHED(job_ptr))
acct_policy_add_job_submit(job_ptr, false);
if (IS_JOB_RUNNING(job_ptr) ||
IS_JOB_SUSPENDED(job_ptr)) {
acct_policy_job_begin(job_ptr, false);
resv_replace_update(job_ptr);
} else if (IS_JOB_PENDING(job_ptr) &&
job_ptr->details &&
job_ptr->details->accrue_time) {
/*
* accrue usage was cleared above with
* assoc_mgr_clear_used_info(). Clear accrue
* time so that _handle_add_accrue() will add
* the usage back.
*/
time_t save_accrue_time =
job_ptr->details->accrue_time;
job_ptr->details->accrue_time = 0;
acct_policy_add_accrue_time(job_ptr, false);
if (job_ptr->details->accrue_time)
job_ptr->details->accrue_time =
save_accrue_time;
}
}
_restore_job_licenses(job_ptr);
}
list_iterator_destroy(job_iterator);
}
/* Flush accounting information on this cluster, then for each running or
* suspended job, restore its state in the accounting system */
static void _acct_restore_active_jobs(void)
{
job_record_t *job_ptr;
list_itr_t *job_iterator;
step_record_t *step_ptr;
list_itr_t *step_iterator;
info("Reinitializing job accounting state");
acct_storage_g_flush_jobs_on_cluster(acct_db_conn,
time(NULL));
job_iterator = list_iterator_create(job_list);
while ((job_ptr = list_next(job_iterator))) {
if (IS_JOB_SUSPENDED(job_ptr))
jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) {
if (job_ptr->db_index != NO_VAL64)
job_ptr->db_index = 0;
step_iterator = list_iterator_create(
job_ptr->step_list);
while ((step_ptr = list_next(step_iterator))) {
jobacct_storage_g_step_start(acct_db_conn,
step_ptr);
}
list_iterator_destroy (step_iterator);
}
}
list_iterator_destroy(job_iterator);
}
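/*
* dump_config_state_lite - write a minimal configuration state file
* ("last_config_lite") recording the protocol version, current time and
* AccountingStorageType for use by load_config_state_lite()
* RET zero or error code
*/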
extern int dump_config_state_lite(void)
{
static uint32_t high_buffer_size = (1024 * 1024);
int error_code = 0;
buf_t *buffer = init_buf(high_buffer_size);
DEF_TIMERS;
START_TIMER;
/* write header: version, time */
pack16(SLURM_PROTOCOL_VERSION, buffer);
pack_time(time(NULL), buffer);
packstr(slurm_conf.accounting_storage_type, buffer);
/* write the buffer to file */
error_code = save_buf_to_state("last_config_lite", buffer,
&high_buffer_size);
FREE_NULL_BUFFER(buffer);
END_TIMER2(__func__);
return error_code;
}
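/*
* load_config_state_lite - read the "last_config_lite" state file and,
* if the saved AccountingStorageType matches the one currently
* configured, clear the slurmctld_init_db flag
* RET zero or error code
*/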
extern int load_config_state_lite(void)
{
uint16_t ver = 0;
char *state_file;
buf_t *buffer;
time_t buf_time;
char *last_accounting_storage_type = NULL;
if (!(buffer = state_save_open("last_config_lite", &state_file))) {
debug2("No last_config_lite file (%s) to recover", state_file);
xfree(state_file);
return ENOENT;
}
xfree(state_file);
safe_unpack16(&ver, buffer);
debug3("Version in last_conf_lite header is %u", ver);
if (ver > SLURM_PROTOCOL_VERSION || ver < SLURM_MIN_PROTOCOL_VERSION) {
if (!ignore_state_errors)
fatal("Can not recover last_conf_lite, incompatible version, (%u not between %d and %d), start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.",
ver, SLURM_MIN_PROTOCOL_VERSION,
SLURM_PROTOCOL_VERSION);
error("***********************************************");
error("Can not recover last_conf_lite, incompatible version, "
"(%u not between %d and %d)",
ver, SLURM_MIN_PROTOCOL_VERSION, SLURM_PROTOCOL_VERSION);
error("***********************************************");
FREE_NULL_BUFFER(buffer);
return EFAULT;
} else {
safe_unpack_time(&buf_time, buffer);
safe_unpackstr(&last_accounting_storage_type, buffer);
}
if (last_accounting_storage_type
&& !xstrcmp(last_accounting_storage_type,
slurm_conf.accounting_storage_type))
slurmctld_init_db = 0;
xfree(last_accounting_storage_type);
FREE_NULL_BUFFER(buffer);
return SLURM_SUCCESS;
unpack_error:
if (!ignore_state_errors)
fatal("Incomplete last_config_lite checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
error("Incomplete last_config_lite checkpoint file");
FREE_NULL_BUFFER(buffer);
return SLURM_ERROR;
}