| /*****************************************************************************\ |
| * read_config.c - read the overall slurm configuration file |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Copyright (C) SchedMD LLC. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov>. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include <ctype.h> |
| #include <errno.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <syslog.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <time.h> |
| #include <unistd.h> |
| |
| #include "src/common/assoc_mgr.h" |
| #include "src/common/cpu_frequency.h" |
| #include "src/common/hostlist.h" |
| #include "src/common/list.h" |
| #include "src/common/macros.h" |
| #include "src/common/node_features.h" |
| #include "src/common/port_mgr.h" |
| #include "src/common/read_config.h" |
| #include "src/common/slurm_rlimits_info.h" |
| #include "src/common/state_save.h" |
| #include "src/common/strnatcmp.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/interfaces/burst_buffer.h" |
| #include "src/interfaces/cgroup.h" |
| #include "src/interfaces/gres.h" |
| #include "src/interfaces/job_submit.h" |
| #include "src/interfaces/jobcomp.h" |
| #include "src/interfaces/mcs.h" |
| #include "src/interfaces/node_features.h" |
| #include "src/interfaces/preempt.h" |
| #include "src/interfaces/prep.h" |
| #include "src/interfaces/sched_plugin.h" |
| #include "src/interfaces/select.h" |
| #include "src/interfaces/switch.h" |
| #include "src/interfaces/topology.h" |
| |
| #include "src/slurmctld/acct_policy.h" |
| #include "src/slurmctld/fed_mgr.h" |
| #include "src/slurmctld/gang.h" |
| #include "src/slurmctld/job_scheduler.h" |
| #include "src/slurmctld/licenses.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/node_scheduler.h" |
| #include "src/slurmctld/power_save.h" |
| #include "src/slurmctld/proc_req.h" |
| #include "src/slurmctld/read_config.h" |
| #include "src/slurmctld/reservation.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmctld/trigger_mgr.h" |
| |
| #include "src/stepmgr/srun_comm.h" |
| #include "src/stepmgr/stepmgr.h" |
| |
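| /* Set to 1 to log the node table ordering after sorting (see |
| * _sort_node_record_table_ptr()) */ |
| #define _DEBUG 0 |
| |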
| /* Global variables */ |
| bool slurmctld_init_db = true; |
| |
| static void _acct_restore_active_jobs(void); |
| static void _build_bitmaps(void); |
| static void _gres_reconfig(void); |
| static void _init_all_slurm_conf(void); |
| static int _preserve_select_type_param(slurm_conf_t *ctl_conf_ptr, |
| uint16_t old_select_type_p); |
| static int _reset_node_bitmaps(void *x, void *arg); |
| static void _restore_job_accounting(void); |
| |
| static void _set_features(node_record_t **old_node_table_ptr, |
| int old_node_record_count, int recover); |
| static void _stat_slurm_dirs(void); |
| static int _sync_nodes_to_comp_job(void); |
| static int _sync_nodes_to_jobs(void); |
| static int _sync_nodes_to_active_job(job_record_t *job_ptr); |
| static void _sync_nodes_to_suspended_job(job_record_t *job_ptr); |
| static void _sync_part_prio(void); |
| |
| /* |
| * Set up the global response_cluster_rec |
| */ |
| static void _set_response_cluster_rec(void) |
| { |
| if (response_cluster_rec) |
| return; |
| |
| response_cluster_rec = xmalloc(sizeof(slurmdb_cluster_rec_t)); |
| response_cluster_rec->name = xstrdup(slurm_conf.cluster_name); |
| if (slurm_conf.slurmctld_addr) { |
| response_cluster_rec->control_host = |
| xstrdup(slurm_conf.slurmctld_addr); |
| } else { |
| response_cluster_rec->control_host = |
| xstrdup(slurm_conf.control_addr[0]); |
| } |
| response_cluster_rec->control_port = slurm_conf.slurmctld_port; |
| response_cluster_rec->rpc_version = SLURM_PROTOCOL_VERSION; |
| } |
| |
| /* |
| * Free the global response_cluster_rec |
| */ |
| extern void cluster_rec_free(void) |
| { |
| if (response_cluster_rec) { |
| xfree(response_cluster_rec->control_host); |
| xfree(response_cluster_rec->name); |
| xfree(response_cluster_rec); |
| } |
| } |
| |
| /* Verify that Slurm directories are secure, not world writable */ |
| static void _stat_slurm_dirs(void) |
| { |
| struct stat stat_buf; |
| char *problem_dir = NULL; |
| |
| /* |
| * PluginDir may have multiple values, and is checked by |
| * _is_valid_path() instead |
| */ |
| |
| if (slurm_conf.plugstack && |
| !stat(slurm_conf.plugstack, &stat_buf) && |
| (stat_buf.st_mode & S_IWOTH)) { |
| problem_dir = "PlugStack"; |
| } |
| if (!stat(slurm_conf.slurmd_spooldir, &stat_buf) && |
| (stat_buf.st_mode & S_IWOTH)) { |
| problem_dir = "SlurmdSpoolDir"; |
| } |
| if (!stat(slurm_conf.state_save_location, &stat_buf) && |
| (stat_buf.st_mode & S_IWOTH)) { |
| problem_dir = "StateSaveLocation"; |
| } |
| |
| if (problem_dir) { |
| error("################################################"); |
| error("### SEVERE SECURITY VULNERABILITY ###"); |
| error("### %s DIRECTORY IS WORLD WRITABLE ###", problem_dir); |
| error("### CORRECT FILE PERMISSIONS ###"); |
| error("################################################"); |
| } |
| } |
| |
| /* |
| * _sort_nodes_by_rank - order node table in ascending order of node_rank |
| * This depends on the TopologyPlugin, which may generate such a ranking. |
| */ |
| static int _sort_nodes_by_rank(const void *a, const void *b) |
| { |
| node_record_t *n1 = *(node_record_t **)a; |
| node_record_t *n2 = *(node_record_t **)b; |
| |
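| /* Sort vacant (NULL) table entries to the end of the table */ |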
| if (!n1) |
| return 1; |
| if (!n2) |
| return -1; |
| |
| return slurm_sort_uint32_list_asc(&n1->node_rank, &n2->node_rank); |
| } |
| |
| /* |
| * _sort_nodes_by_name - order node table in ascending order of name |
| */ |
| static int _sort_nodes_by_name(const void *a, const void *b) |
| { |
| node_record_t *n1 = *(node_record_t **)a; |
| node_record_t *n2 = *(node_record_t **)b; |
| |
| if (!n1) |
| return 1; |
| if (!n2) |
| return -1; |
| |
| return strnatcmp(n1->name, n2->name); |
| } |
| |
| static void _sort_node_record_table_ptr(void) |
| { |
| int (*compare_fn)(const void *, const void *); |
| |
| if (topology_g_generate_node_ranking()) |
| compare_fn = &_sort_nodes_by_rank; |
| else |
| compare_fn = &_sort_nodes_by_name; |
| |
| qsort(node_record_table_ptr, node_record_count, |
| sizeof(node_record_t *), compare_fn); |
| |
| for (int i = 0; i < node_record_count; i++) { |
| if (node_record_table_ptr[i]) |
| node_record_table_ptr[i]->index = i; |
| } |
| |
| #if _DEBUG |
| /* Log the results */ |
| node_record_t *node_ptr; |
| for (int i = 0; (node_ptr = next_node(&i)); i++) { |
| info("node_rank[%d:%d]: %s", |
| node_ptr->index, node_ptr->node_rank, node_ptr->name); |
| } |
| #endif |
| } |
| |
| static void _add_nodes_with_feature(hostlist_t *hl, char *feature) |
| { |
| node_record_t *node_ptr; |
| bitstr_t *tmp_bitmap = bit_alloc(node_record_count); |
| |
| add_nodes_with_feature_to_bitmap(tmp_bitmap, feature); |
| for (int i = 0; (node_ptr = next_node_bitmap(tmp_bitmap, &i)); i++) { |
| hostlist_push_host(hl, node_ptr->name); |
| } |
| |
| FREE_NULL_BITMAP(tmp_bitmap); |
| } |
| |
| static void _add_all_nodes_to_hostlist(hostlist_t *hl) |
| { |
| node_record_t *node_ptr; |
| |
| for (int i = 0; (node_ptr = next_node(&i)); i++) |
| hostlist_push_host(hl, node_ptr->name); |
| } |
| |
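| /* |
| * nodespec_to_hostlist - expand a node specification into a hostlist |
| * The specification may be the keyword "ALL", a hostlist expression, |
| * and/or NodeSet names, which are replaced by their member nodes. |
| * e.g. with "NodeSet=gpuns Feature=gpu" in slurm.conf, a spec of |
| * "gpuns,tux[1-2]" expands gpuns to every node with the gpu feature. |
| * IN nodes - node specification (NodeNames and/or NodeSets) |
| * IN uniq - if true, sort the result and remove duplicate entries |
| * OUT nodesets (optional) - comma separated list of the NodeSet names |
| * (or "ALL") that were expanded |
| * RET hostlist (caller must free) or NULL on a parse error |
| */ |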
| extern hostlist_t *nodespec_to_hostlist(const char *nodes, bool uniq, |
| char **nodesets) |
| { |
| int count; |
| slurm_conf_nodeset_t *ptr, **ptr_array; |
| hostlist_t *hl; |
| |
| if (nodesets) |
| xfree(*nodesets); |
| |
| if (!xstrcasecmp(nodes, "ALL")) { |
| if (!(hl = hostlist_create(NULL))) { |
| error("%s: hostlist_create() error for %s", __func__, nodes); |
| return NULL; |
| } |
| _add_all_nodes_to_hostlist(hl); |
| if (nodesets) |
| *nodesets = xstrdup("ALL"); |
| return hl; |
| } else if (!(hl = hostlist_create(nodes))) { |
| error("%s: hostlist_create() error for %s", __func__, nodes); |
| return NULL; |
| } |
| |
| if (!hostlist_count(hl)) { |
| /* no need to look for nodesets */ |
| return hl; |
| } |
| |
| count = slurm_conf_nodeset_array(&ptr_array); |
| for (int i = 0; i < count; i++) { |
| ptr = ptr_array[i]; |
| |
| /* swap the nodeset entry with the applicable nodes */ |
| if (hostlist_delete_host(hl, ptr->name)) { |
| if (nodesets) |
| xstrfmtcat(*nodesets, "%s%s", |
| *nodesets ? "," : "", |
| ptr->name); |
| |
| if (ptr->feature) |
| _add_nodes_with_feature(hl, ptr->feature); |
| |
| /* Handle keywords for Nodes= in a NodeSet */ |
| if (!xstrcasecmp(ptr->nodes, "ALL")) { |
| _add_all_nodes_to_hostlist(hl); |
| } else if (ptr->nodes) { |
| hostlist_push(hl, ptr->nodes); |
| } |
| } |
| } |
| |
| if (xstrchr(nodes, '{')) |
| parse_hostlist_functions(&hl); |
| |
| if (uniq) |
| hostlist_uniq(hl); |
| return hl; |
| } |
| |
| static void _init_bitmaps(void) |
| { |
| /* free and rebuild all of the node state bitmaps */ |
| FREE_NULL_BITMAP(asap_node_bitmap); |
| FREE_NULL_BITMAP(avail_node_bitmap); |
| FREE_NULL_BITMAP(bf_ignore_node_bitmap); |
| FREE_NULL_BITMAP(booting_node_bitmap); |
| FREE_NULL_BITMAP(cg_node_bitmap); |
| FREE_NULL_BITMAP(cloud_node_bitmap); |
| FREE_NULL_BITMAP(external_node_bitmap); |
| FREE_NULL_BITMAP(future_node_bitmap); |
| FREE_NULL_BITMAP(idle_node_bitmap); |
| FREE_NULL_BITMAP(power_down_node_bitmap); |
| FREE_NULL_BITMAP(power_up_node_bitmap); |
| FREE_NULL_BITMAP(rs_node_bitmap); |
| FREE_NULL_BITMAP(share_node_bitmap); |
| FREE_NULL_BITMAP(up_node_bitmap); |
| asap_node_bitmap = bit_alloc(node_record_count); |
| avail_node_bitmap = bit_alloc(node_record_count); |
| bf_ignore_node_bitmap = bit_alloc(node_record_count); |
| booting_node_bitmap = bit_alloc(node_record_count); |
| cg_node_bitmap = bit_alloc(node_record_count); |
| cloud_node_bitmap = bit_alloc(node_record_count); |
| external_node_bitmap = bit_alloc(node_record_count); |
| future_node_bitmap = bit_alloc(node_record_count); |
| idle_node_bitmap = bit_alloc(node_record_count); |
| power_down_node_bitmap = bit_alloc(node_record_count); |
| power_up_node_bitmap = bit_alloc(node_record_count); |
| rs_node_bitmap = bit_alloc(node_record_count); |
| share_node_bitmap = bit_alloc(node_record_count); |
| up_node_bitmap = bit_alloc(node_record_count); |
| } |
| |
| static void _build_part_bitmaps(void) |
| { |
| part_record_t *part_ptr; |
| list_itr_t *part_iterator; |
| |
| /* scan partition table and identify nodes in each */ |
| part_iterator = list_iterator_create(part_list); |
| while ((part_ptr = list_next(part_iterator))) { |
| if (build_part_bitmap(part_ptr) == ESLURM_INVALID_NODE_NAME) |
| fatal("Invalid node names in partition %s", |
| part_ptr->name); |
| } |
| list_iterator_destroy(part_iterator); |
| } |
| |
| static void _build_node_config_bitmaps(void) |
| { |
| node_record_t *node_ptr; |
| |
| /* initialize the configuration bitmaps */ |
| list_for_each(config_list, _reset_node_bitmaps, NULL); |
| |
| for (int i = 0; (node_ptr = next_node(&i)); i++) { |
| if (node_ptr->config_ptr) |
| bit_set(node_ptr->config_ptr->node_bitmap, |
| node_ptr->index); |
| } |
| } |
| |
| static int _reset_node_bitmaps(void *x, void *arg) |
| { |
| config_record_t *config_ptr = x; |
| |
| FREE_NULL_BITMAP(config_ptr->node_bitmap); |
| config_ptr->node_bitmap = bit_alloc(node_record_count); |
| |
| return 0; |
| } |
| |
| static int _set_share_node_bitmap(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| |
| if (!IS_JOB_RUNNING(job_ptr) || |
| (job_ptr->node_bitmap == NULL) || |
| (job_ptr->details == NULL) || |
| (job_ptr->details->share_res != 0)) |
| return 0; |
| |
| bit_and_not(share_node_bitmap, job_ptr->node_bitmap); |
| |
| return 0; |
| } |
| |
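| /* |
| * Worker thread for _validate_slurmd_addr(): pop node records off the |
| * shared work list (Slurm list operations are internally locked) and |
| * mark any node whose comm_name cannot be resolved as FUTURE. |
| */ |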
| static void *_set_node_addrs(void *arg) |
| { |
| list_t *nodes = arg; |
| slurm_addr_t slurm_addr; |
| node_record_t *node_ptr; |
| |
| while ((node_ptr = list_pop(nodes))) { |
| slurm_set_addr(&slurm_addr, node_ptr->port, |
| node_ptr->comm_name); |
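| /* slurm_set_addr() leaves the port unset when the host |
| * lookup fails, so a non-zero port means comm_name resolved */ |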
| if (slurm_get_port(&slurm_addr)) |
| continue; |
| error("%s: failure on %s", __func__, node_ptr->comm_name); |
| node_ptr->node_state = NODE_STATE_FUTURE; |
| node_ptr->port = 0; |
| xfree(node_ptr->reason); |
| node_ptr->reason = xstrdup("NO NETWORK ADDRESS FOUND"); |
| node_ptr->reason_time = time(NULL); |
| node_ptr->reason_uid = slurm_conf.slurm_user_id; |
| } |
| |
| return NULL; |
| } |
| |
| /* |
| * Validate that nodes are addressable. |
| */ |
| static void _validate_slurmd_addr(void) |
| { |
| node_record_t *node_ptr; |
| DEF_TIMERS; |
| pthread_t *work_threads; |
| int threads_num = 1; |
| char *temp_str; |
| list_t *nodes = list_create(NULL); |
| xassert(verify_lock(CONF_LOCK, READ_LOCK)); |
| |
| START_TIMER; |
| |
| if ((temp_str = xstrcasestr(slurm_conf.slurmctld_params, |
| "validate_nodeaddr_threads="))) { |
| int tmp_val = strtol(temp_str + strlen("validate_nodeaddr_threads="), NULL, 10); |
| if ((tmp_val >= 1) && (tmp_val <= 64)) |
| threads_num = tmp_val; |
| else |
| error("SlurmctldParameters option validate_nodeaddr_threads=%d out of range, ignored", |
| tmp_val); |
| } |
| |
| for (int i = 0; (node_ptr = next_node(&i)); i++) { |
| if ((node_ptr->name == NULL) || |
| (node_ptr->name[0] == '\0')) |
| continue; |
| if (IS_NODE_FUTURE(node_ptr)) |
| continue; |
| if (IS_NODE_CLOUD(node_ptr) && |
| (IS_NODE_POWERING_DOWN(node_ptr) || |
| IS_NODE_POWERED_DOWN(node_ptr) || |
| IS_NODE_POWERING_UP(node_ptr))) |
| continue; |
| if (node_ptr->port == 0) |
| node_ptr->port = slurm_conf.slurmd_port; |
| list_append(nodes, node_ptr); |
| } |
| |
| work_threads = xcalloc(threads_num, sizeof(pthread_t)); |
| for (int i = 0; i < threads_num; i++) |
| slurm_thread_create(&work_threads[i], _set_node_addrs, nodes); |
| for (int i = 0; i < threads_num; i++) |
| slurm_thread_join(work_threads[i]); |
| xfree(work_threads); |
| xassert(list_is_empty(nodes)); |
| FREE_NULL_LIST(nodes); |
| |
| END_TIMER2(__func__); |
| } |
| |
| /* |
| * _build_bitmaps - rebuild the global node state bitmaps (share, idle, |
| * up, avail, booting, completing, cloud, external, power up/down, |
| * future, reboot) from the current node and job state |
| * Note: Operates on common variables, no arguments |
| * node_record_count - number of nodes in the system |
| * node_record_table_ptr - pointer to global node table |
| * job_list - pointer to global job list |
| */ |
| static void _build_bitmaps(void) |
| { |
| node_record_t *node_ptr; |
| |
| last_node_update = time(NULL); |
| last_part_update = time(NULL); |
| |
| /* Set all bits, all nodes initially available for sharing */ |
| bit_set_all(share_node_bitmap); |
| |
| /* identify all nodes non-sharable due to non-sharing jobs */ |
| list_for_each(job_list, _set_share_node_bitmap, NULL); |
| |
| /* scan all nodes and identify which are up, idle and |
| * their configuration, resync DRAINED vs. DRAINING state */ |
| for (int i = 0; (node_ptr = next_node(&i)); i++) { |
| uint32_t drain_flag, job_cnt; |
| |
| if (node_ptr->name[0] == '\0') |
| continue; /* defunct */ |
| drain_flag = IS_NODE_DRAIN(node_ptr) | |
| IS_NODE_FAIL(node_ptr); |
| job_cnt = node_ptr->run_job_cnt + node_ptr->comp_job_cnt; |
| if (!IS_NODE_FUTURE(node_ptr)) |
| bit_set(power_up_node_bitmap, node_ptr->index); |
| |
| if ((IS_NODE_IDLE(node_ptr) && (job_cnt == 0)) || |
| IS_NODE_DOWN(node_ptr)) |
| bit_set(idle_node_bitmap, node_ptr->index); |
| if (IS_NODE_POWERING_UP(node_ptr)) |
| bit_set(booting_node_bitmap, node_ptr->index); |
| if (IS_NODE_COMPLETING(node_ptr)) |
| bit_set(cg_node_bitmap, node_ptr->index); |
| if (IS_NODE_CLOUD(node_ptr)) |
| bit_set(cloud_node_bitmap, node_ptr->index); |
| if (IS_NODE_EXTERNAL(node_ptr)) |
| bit_set(external_node_bitmap, node_ptr->index); |
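| /* A node pending a requested reboot whose next state is |
| * RESUME is still considered up and available */ |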
| if (IS_NODE_IDLE(node_ptr) || |
| IS_NODE_ALLOCATED(node_ptr) || |
| ((IS_NODE_REBOOT_REQUESTED(node_ptr) || |
| IS_NODE_REBOOT_ISSUED(node_ptr)) && |
| ((node_ptr->next_state & NODE_STATE_FLAGS) & |
| NODE_RESUME))) { |
| if ((drain_flag == 0) && |
| (!IS_NODE_NO_RESPOND(node_ptr))) |
| make_node_avail(node_ptr); |
| bit_set(up_node_bitmap, node_ptr->index); |
| } |
| if (IS_NODE_POWERED_DOWN(node_ptr)) { |
| bit_set(power_down_node_bitmap, node_ptr->index); |
| bit_clear(power_up_node_bitmap, node_ptr->index); |
| } |
| if (IS_NODE_POWERING_DOWN(node_ptr)) { |
| bit_set(power_down_node_bitmap, node_ptr->index); |
| bit_clear(power_up_node_bitmap, node_ptr->index); |
| bit_clear(avail_node_bitmap, node_ptr->index); |
| } |
| if (IS_NODE_FUTURE(node_ptr)) |
| bit_set(future_node_bitmap, node_ptr->index); |
| |
| if ((IS_NODE_REBOOT_REQUESTED(node_ptr) || |
| IS_NODE_REBOOT_ISSUED(node_ptr)) && |
| ((node_ptr->next_state & NODE_STATE_FLAGS) & NODE_RESUME)) |
| bit_set(rs_node_bitmap, node_ptr->index); |
| |
| if (IS_NODE_REBOOT_ASAP(node_ptr)) |
| bit_set(asap_node_bitmap, node_ptr->index); |
| } |
| } |
| |
| static int _set_nodes_topo(void) |
| { |
| node_record_t *node_ptr; |
| int rc = SLURM_SUCCESS; |
| |
| last_node_update = time(NULL); |
| |
| for (int i = 0; (node_ptr = next_node(&i)); i++) { |
| if (node_ptr->topology_str && |
| (rc = topology_g_add_rm_node(node_ptr))) { |
| error("Invalid node topology specified %s for %s", |
| node_ptr->topology_str, node_ptr->name); |
| break; |
| } |
| } |
| |
| return rc; |
| } |
| |
| /* |
| * _init_all_slurm_conf - initialize or re-initialize the slurm |
| * configuration values. |
| * NOTE: We leave the job table intact |
| * NOTE: Operates on common variables, no arguments |
| */ |
| static void _init_all_slurm_conf(void) |
| { |
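| /* |
| * Copy the file name first: slurm_conf_reinit() tears down the |
| * current slurm_conf, invalidating slurm_conf.slurm_conf itself. |
| */ |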
| char *conf_name = xstrdup(slurm_conf.slurm_conf); |
| |
| slurm_conf_reinit(conf_name); |
| xfree(conf_name); |
| |
| init_node_conf(); |
| init_part_conf(); |
| init_job_conf(); |
| } |
| |
| static int _handle_downnodes_line(slurm_conf_downnodes_t *down) |
| { |
| int error_code = 0; |
| node_record_t *node_rec = NULL; |
| hostlist_t *alias_list = NULL; |
| char *alias = NULL; |
| int state_val = NODE_STATE_DOWN; |
| |
| if (down->state != NULL) { |
| state_val = state_str2int(down->state, down->nodenames); |
| if (state_val == NO_VAL) { |
| error("Invalid State \"%s\"", down->state); |
| goto cleanup; |
| } |
| } |
| |
| if ((alias_list = hostlist_create(down->nodenames)) == NULL) { |
| error("Unable to create NodeName list from %s", |
| down->nodenames); |
| error_code = errno; |
| goto cleanup; |
| } |
| |
| while ((alias = hostlist_shift(alias_list))) { |
| node_rec = find_node_record(alias); |
| if (node_rec == NULL) { |
| error("DownNode \"%s\" does not exist!", alias); |
| free(alias); |
| continue; |
| } |
| |
| if ((state_val != NO_VAL) && |
| (state_val != NODE_STATE_UNKNOWN)) |
| node_rec->node_state = state_val; |
| if (down->reason) { |
| xfree(node_rec->reason); |
| node_rec->reason = xstrdup(down->reason); |
| node_rec->reason_time = time(NULL); |
| node_rec->reason_uid = slurm_conf.slurm_user_id; |
| } |
| free(alias); |
| } |
| |
| cleanup: |
| if (alias_list) |
| hostlist_destroy(alias_list); |
| return error_code; |
| } |
| |
| static void _handle_all_downnodes(void) |
| { |
| slurm_conf_downnodes_t *ptr, **ptr_array; |
| int count; |
| int i; |
| |
| count = slurm_conf_downnodes_array(&ptr_array); |
| if (count == 0) { |
| debug("No DownNodes"); |
| return; |
| } |
| |
| for (i = 0; i < count; i++) { |
| ptr = ptr_array[i]; |
| |
| _handle_downnodes_line(ptr); |
| } |
| } |
| |
| /* |
| * Convert a comma delimited string of account names into a list of |
| * pointers to the matching associations. The returned list has no |
| * destructor since the association records belong to the assoc_mgr. |
| */ |
| extern list_t *accounts_list_build(char *accounts, bool locked) |
| { |
| char *tmp_accts, *one_acct_name, *name_ptr = NULL; |
| list_t *acct_list = NULL; |
| slurmdb_assoc_rec_t *assoc_ptr = NULL; |
| assoc_mgr_lock_t locks = { .assoc = READ_LOCK }; |
| |
| if (!accounts) |
| return acct_list; |
| |
| if (!locked) |
| assoc_mgr_lock(&locks); |
| tmp_accts = xstrdup(accounts); |
| one_acct_name = strtok_r(tmp_accts, ",", &name_ptr); |
| while (one_acct_name) { |
| slurmdb_assoc_rec_t assoc = { |
| .acct = one_acct_name, |
| .uid = NO_VAL, |
| }; |
| |
| if (assoc_mgr_fill_in_assoc( |
| acct_db_conn, &assoc, |
| accounting_enforce, |
| &assoc_ptr, true) != SLURM_SUCCESS) { |
| if (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) { |
| error("%s: No association for account %s", |
| __func__, assoc.acct); |
| } else { |
| verbose("%s: No association for account %s", |
| __func__, assoc.acct); |
| } |
| |
| } |
| if (assoc_ptr) { |
| if (!acct_list) |
| acct_list = list_create(NULL); |
| list_append(acct_list, assoc_ptr); |
| } |
| |
| one_acct_name = strtok_r(NULL, ",", &name_ptr); |
| } |
| xfree(tmp_accts); |
| if (!locked) |
| assoc_mgr_unlock(&locks); |
| return acct_list; |
| } |
| |
| /* |
| * Convert a comma delimited list of QOS names into a bitmap indexed |
| * by QOS id. Any previous *qos_bits bitmap is released. |
| */ |
| extern void qos_list_build(char *qos, bool locked, bitstr_t **qos_bits) |
| { |
| char *tmp_qos, *one_qos_name, *name_ptr = NULL; |
| slurmdb_qos_rec_t qos_rec, *qos_ptr = NULL; |
| bitstr_t *tmp_qos_bitstr; |
| int rc; |
| assoc_mgr_lock_t locks = { .qos = READ_LOCK }; |
| |
| if (!qos) { |
| FREE_NULL_BITMAP(*qos_bits); |
| return; |
| } |
| |
| /* Lock here to avoid g_qos_count changing under us */ |
| if (!locked) |
| assoc_mgr_lock(&locks); |
| if (!g_qos_count) { |
| error("We have no QOS on the system. Ignoring invalid " |
| "Allow/DenyQOS value(s) %s", qos); |
| if (!locked) |
| assoc_mgr_unlock(&locks); |
| FREE_NULL_BITMAP(*qos_bits); |
| return; |
| } |
| |
| tmp_qos_bitstr = bit_alloc(g_qos_count); |
| tmp_qos = xstrdup(qos); |
| one_qos_name = strtok_r(tmp_qos, ",", &name_ptr); |
| while (one_qos_name) { |
| memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t)); |
| qos_rec.name = one_qos_name; |
| rc = assoc_mgr_fill_in_qos(acct_db_conn, &qos_rec, |
| accounting_enforce, |
| &qos_ptr, 1); |
| if ((rc != SLURM_SUCCESS) || (qos_rec.id >= g_qos_count)) { |
| error("Ignoring invalid Allow/DenyQOS value: %s", |
| one_qos_name); |
| } else { |
| bit_set(tmp_qos_bitstr, qos_rec.id); |
| } |
| one_qos_name = strtok_r(NULL, ",", &name_ptr); |
| } |
| if (!locked) |
| assoc_mgr_unlock(&locks); |
| xfree(tmp_qos); |
| FREE_NULL_BITMAP(*qos_bits); |
| *qos_bits = tmp_qos_bitstr; |
| } |
| |
| /* |
| * _build_single_partitionline_info - get an array of slurm_conf_partition_t |
| * structures from the slurm.conf reader, build table, and set values |
| * RET 0 if no error, error code otherwise |
| * Note: Operates on common variables |
| * global: part_list - global partition list pointer |
| * default_part - default parameters for a partition |
| */ |
| static int _build_single_partitionline_info(slurm_conf_partition_t *part) |
| { |
| part_record_t *part_ptr; |
| |
| if (list_find_first(part_list, &list_find_part, part->name)) |
| fatal("%s: duplicate entry for partition %s", |
| __func__, part->name); |
| |
| part_ptr = create_ctld_part_record(part->name); |
| |
| if (part->default_flag) { |
| if (default_part_name && |
| xstrcmp(default_part_name, part->name)) { |
| info("%s: changing default partition from %s to %s", |
| __func__, default_part_name, part->name); |
| default_part_loc->flags &= (~PART_FLAG_DEFAULT); |
| } |
| xfree(default_part_name); |
| default_part_name = xstrdup(part->name); |
| default_part_loc = part_ptr; |
| part_ptr->flags |= PART_FLAG_DEFAULT; |
| } |
| |
| part_ptr->cpu_bind = part->cpu_bind; |
| |
| if (part->preempt_mode != NO_VAL16) |
| part_ptr->preempt_mode = part->preempt_mode; |
| |
| if (part->disable_root_jobs == NO_VAL8) { |
| if (slurm_conf.conf_flags & CONF_FLAG_DRJ) |
| part_ptr->flags |= PART_FLAG_NO_ROOT; |
| } else if (part->disable_root_jobs) { |
| part_ptr->flags |= PART_FLAG_NO_ROOT; |
| } else { |
| part_ptr->flags &= (~PART_FLAG_NO_ROOT); |
| } |
| if (part_ptr->flags & PART_FLAG_NO_ROOT) |
| debug2("partition %s does not allow root jobs", part_ptr->name); |
| |
| if ((part->default_time != NO_VAL) && |
| (part->default_time > part->max_time)) { |
| info("partition %s DefaultTime exceeds MaxTime (%u > %u)", |
| part->name, part->default_time, part->max_time); |
| part->default_time = NO_VAL; |
| } |
| |
| if (part->exclusive_user) |
| part_ptr->flags |= PART_FLAG_EXCLUSIVE_USER; |
| if (part->exclusive_topo) |
| part_ptr->flags |= PART_FLAG_EXCLUSIVE_TOPO; |
| if (part->hidden_flag) |
| part_ptr->flags |= PART_FLAG_HIDDEN; |
| if (part->power_down_on_idle) |
| part_ptr->flags |= PART_FLAG_PDOI; |
| if (part->root_only_flag) |
| part_ptr->flags |= PART_FLAG_ROOT_ONLY; |
| if (part->req_resv_flag) |
| part_ptr->flags |= PART_FLAG_REQ_RESV; |
| if (part->lln_flag) |
| part_ptr->flags |= PART_FLAG_LLN; |
| part_ptr->max_time = part->max_time; |
| part_ptr->def_mem_per_cpu = part->def_mem_per_cpu; |
| part_ptr->default_time = part->default_time; |
| FREE_NULL_LIST(part_ptr->job_defaults_list); |
| part_ptr->job_defaults_list = |
| job_defaults_copy(part->job_defaults_list); |
| part_ptr->max_cpus_per_node = part->max_cpus_per_node; |
| part_ptr->max_cpus_per_socket = part->max_cpus_per_socket; |
| part_ptr->max_share = part->max_share; |
| part_ptr->max_mem_per_cpu = part->max_mem_per_cpu; |
| part_ptr->max_nodes = part->max_nodes; |
| part_ptr->max_nodes_orig = part->max_nodes; |
| part_ptr->min_nodes = part->min_nodes; |
| part_ptr->min_nodes_orig = part->min_nodes; |
| part_ptr->over_time_limit = part->over_time_limit; |
| part_ptr->preempt_mode = part->preempt_mode; |
| part_ptr->priority_job_factor = part->priority_job_factor; |
| part_ptr->priority_tier = part->priority_tier; |
| part_ptr->resume_timeout = part->resume_timeout; |
| part_ptr->state_up = part->state_up; |
| part_ptr->suspend_time = part->suspend_time; |
| part_ptr->suspend_timeout = part->suspend_timeout; |
| part_ptr->grace_time = part->grace_time; |
| part_ptr->cr_type = part->cr_type; |
| |
| part_ptr->allow_alloc_nodes = xstrdup(part->allow_alloc_nodes); |
| part_ptr->allow_groups = xstrdup(part->allow_groups); |
| part_ptr->alternate = xstrdup(part->alternate); |
| part_ptr->nodes = xstrdup(part->nodes); |
| part_ptr->orig_nodes = xstrdup(part->nodes); |
| |
| if (part->billing_weights_str) { |
| set_partition_billing_weights(part->billing_weights_str, |
| part_ptr, true); |
| } |
| |
| if (part->allow_accounts) { |
| part_ptr->allow_accounts = xstrdup(part->allow_accounts); |
| part_ptr->allow_accts_list = |
| accounts_list_build(part_ptr->allow_accounts, false); |
| } |
| |
| if (part->allow_qos) { |
| part_ptr->allow_qos = xstrdup(part->allow_qos); |
| qos_list_build(part_ptr->allow_qos, false, |
| &part_ptr->allow_qos_bitstr); |
| } |
| |
| if (part->deny_accounts) { |
| part_ptr->deny_accounts = xstrdup(part->deny_accounts); |
| part_ptr->deny_accts_list = |
| accounts_list_build(part_ptr->deny_accounts, false); |
| } |
| |
| if (part->deny_qos) { |
| part_ptr->deny_qos = xstrdup(part->deny_qos); |
| qos_list_build(part_ptr->deny_qos, false, |
| &part_ptr->deny_qos_bitstr); |
| } |
| |
| if (part->qos_char) { |
| slurmdb_qos_rec_t qos_rec; |
| part_ptr->qos_char = xstrdup(part->qos_char); |
| |
| memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t)); |
| qos_rec.name = part_ptr->qos_char; |
| if (assoc_mgr_fill_in_qos( |
| acct_db_conn, &qos_rec, accounting_enforce, |
| (slurmdb_qos_rec_t **)&part_ptr->qos_ptr, 0) |
| != SLURM_SUCCESS) { |
| fatal("Partition %s has an invalid qos (%s), " |
| "please check your configuration", |
| part_ptr->name, qos_rec.name); |
| } |
| if (part_ptr->qos_ptr) { |
| if ((part_ptr->qos_ptr->flags & QOS_FLAG_PART_QOS) && |
| (part_ptr->qos_ptr->flags & QOS_FLAG_RELATIVE)) |
| fatal("QOS %s is a relative QOS. A relative QOS must be unique per partition. Please check your configuration and adjust accordingly", |
| part_ptr->qos_ptr->name); |
| part_ptr->qos_ptr->flags |= QOS_FLAG_PART_QOS; |
| } |
| } |
| |
| if (part->topology_name) { |
| part_ptr->topology_name = xstrdup(part->topology_name); |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * _build_all_partitionline_info - get an array of slurm_conf_partition_t |
| * structures from the slurm.conf reader, build table, and set values |
| * Note: Operates on common variables |
| * global: part_list - global partition list pointer |
| * default_part - default parameters for a partition |
| */ |
| static void _build_all_partitionline_info(void) |
| { |
| slurm_conf_partition_t **ptr_array; |
| int count; |
| int i; |
| |
| count = slurm_conf_partition_array(&ptr_array); |
| |
| for (i = 0; i < count; i++) |
| _build_single_partitionline_info(ptr_array[i]); |
| } |
| |
| static int _set_max_part_prio(void *x, void *arg) |
| { |
| part_record_t *part_ptr = x; |
| |
| if (part_ptr->priority_job_factor > part_max_priority) |
| part_max_priority = part_ptr->priority_job_factor; |
| |
| return 0; |
| } |
| |
| static int _reset_part_prio(void *x, void *arg) |
| { |
| part_record_t *part_ptr = x; |
| |
| /* protect against div0 if all partition priorities are zero */ |
| if (part_max_priority == 0) { |
| part_ptr->norm_priority = 0; |
| return 0; |
| } |
| |
| part_ptr->norm_priority = (double)part_ptr->priority_job_factor / |
| (double)part_max_priority; |
| |
| return 0; |
| } |
| |
| /* _sync_part_prio - Set normalized partition priorities */ |
| static void _sync_part_prio(void) |
| { |
| /* reset global value from part list */ |
| part_max_priority = DEF_PART_MAX_PRIORITY; |
| list_for_each(part_list, _set_max_part_prio, NULL); |
| /* renormalize values after finding new max */ |
| list_for_each(part_list, _reset_part_prio, NULL); |
| } |
| |
| static int _foreach_requeue_job_node_failed(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| job_record_t *het_job_leader; |
| int rc = SLURM_SUCCESS; |
| |
| xassert(job_ptr->magic == JOB_MAGIC); |
| |
| if (!IS_JOB_NODE_FAILED(job_ptr) && !IS_JOB_REQUEUED(job_ptr)) |
| return SLURM_SUCCESS; |
| |
| het_job_leader = find_job_record(job_ptr->het_job_id); |
| if (het_job_leader && het_job_leader->batch_flag && |
| het_job_leader->details && |
| het_job_leader->details->requeue && |
| het_job_leader->part_ptr) { |
| info("Requeue het job leader %pJ due to node failure on %pJ", |
| het_job_leader, job_ptr); |
| if ((rc = job_requeue(0, het_job_leader->job_id, NULL, false, |
| 0))) |
| error("Unable to requeue %pJ: %s", |
| het_job_leader, slurm_strerror(rc)); |
| } else if (job_ptr->batch_flag && job_ptr->details && |
| job_ptr->details->requeue && job_ptr->part_ptr) { |
| info("Requeue job %pJ due to node failure", |
| job_ptr); |
| if ((rc = job_requeue(0, job_ptr->job_id, NULL, false, 0))) |
| error("Unable to requeue %pJ: %s", |
| job_ptr, slurm_strerror(rc)); |
| } |
| job_state_unset_flag(job_ptr, JOB_REQUEUE); |
| |
| return rc; |
| } |
| |
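| /* |
| * Requeue batch jobs (and het job leaders) that _sync_jobs_to_conf() |
| * marked with JOB_REQUEUE, or left in NODE_FAIL, after their allocated |
| * nodes failed or vanished. |
| */ |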
| static void _requeue_job_node_failed(void) |
| { |
| xassert(job_list); |
| |
| (void) list_for_each_nobreak(job_list, |
| _foreach_requeue_job_node_failed, NULL); |
| } |
| |
| static void _abort_job(job_record_t *job_ptr, uint32_t job_state, |
| uint16_t state_reason, char *reason_string) |
| { |
| time_t now = time(NULL); |
| |
| job_state_set(job_ptr, (job_state | JOB_COMPLETING)); |
| build_cg_bitmap(job_ptr); |
| job_ptr->end_time = MIN(job_ptr->end_time, now); |
| job_ptr->state_reason = state_reason; |
| xfree(job_ptr->state_desc); |
| job_ptr->state_desc = xstrdup(reason_string); |
| job_completion_logger(job_ptr, false); |
| if (job_ptr->job_state == JOB_NODE_FAIL) { |
| /* build_cg_bitmap() may clear JOB_COMPLETING */ |
| epilog_slurmctld(job_ptr); |
| } |
| } |
| |
| static int _mark_het_job_unused(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| job_ptr->bit_flags &= (~HET_JOB_FLAG); |
| return 0; |
| } |
| |
| static int _mark_het_job_used(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| job_ptr->bit_flags |= HET_JOB_FLAG; |
| return 0; |
| } |
| |
| static int _test_het_job_used(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| |
| if ((job_ptr->het_job_id == 0) || IS_JOB_FINISHED(job_ptr)) |
| return 0; |
| if (job_ptr->bit_flags & HET_JOB_FLAG) |
| return 0; |
| |
| error("Incomplete hetjob being aborted %pJ", job_ptr); |
| _abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM, "incomplete hetjob"); |
| |
| return 0; |
| } |
| |
| /* |
| * Validate heterogeneous jobs |
| * |
| * Make sure that every active (not yet complete) job has all of its components |
| * and they are all in the same state. Also rebuild het_job_list. |
| * If hetjob is corrupted, aborts and removes it from job_list. |
| */ |
| static void _validate_het_jobs(void) |
| { |
| list_itr_t *job_iterator; |
| job_record_t *job_ptr, *het_job_ptr; |
| hostset_t *hs; |
| char *job_id_str; |
| uint32_t job_id; |
| bool het_job_valid; |
| |
| list_for_each(job_list, _mark_het_job_unused, NULL); |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = list_next(job_iterator))) { |
| /* Checking for corrupted hetjob components */ |
| if (job_ptr->het_job_offset != 0) { |
| het_job_ptr = find_job_record(job_ptr->het_job_id); |
| if (!het_job_ptr) { |
| error("Could not find hetjob leader (JobId=%u) of %pJ. Aborting and removing job as it is corrupted.", |
| job_ptr->het_job_id, job_ptr); |
| _abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM, |
| "invalid het_job_id_set"); |
| if (list_delete_item(job_iterator) != 1) |
| error("Not able to remove the job."); |
| continue; |
| } |
| if (job_ptr->het_job_id && |
| (job_ptr->job_id == job_ptr->het_job_id)) { |
| error("Invalid HetJob component %pJ HetJobIdSet=%s. Aborting and removing job.", |
| job_ptr, |
| job_ptr->het_job_id_set); |
| _abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM, |
| "Invalid HetJob component"); |
| if (list_delete_item(job_iterator) != 1) |
| error("Not able to remove the job."); |
| continue; |
| } |
| } |
| |
| if ((job_ptr->het_job_id == 0) || |
| (job_ptr->het_job_offset != 0)) |
| continue; |
| /* active het job leader found */ |
| FREE_NULL_LIST(job_ptr->het_job_list); |
| job_id_str = NULL; |
| /* Need to wrap numbers with brackets for hostset functions */ |
| xstrfmtcat(job_id_str, "[%s]", job_ptr->het_job_id_set); |
| hs = hostset_create(job_id_str); |
| xfree(job_id_str); |
| if (!hs) { |
| error("%pJ has invalid het_job_id_set(%s). Aborting and removing job as it is corrupted.", |
| job_ptr, job_ptr->het_job_id_set); |
| _abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM, |
| "invalid het_job_id_set"); |
| if (list_delete_item(job_iterator) != 1) |
| error("Not able to remove the job."); |
| continue; |
| } |
| job_ptr->het_job_list = list_create(NULL); |
| het_job_valid = true; /* assume valid for now */ |
| while (het_job_valid && (job_id_str = hostset_shift(hs))) { |
| job_id = (uint32_t) strtoll(job_id_str, NULL, 10); |
| het_job_ptr = find_job_record(job_id); |
| if (!het_job_ptr) { |
| error("Could not find JobId=%u, part of hetjob JobId=%u", |
| job_id, job_ptr->job_id); |
| het_job_valid = false; |
| } else if (het_job_ptr->het_job_id != |
| job_ptr->job_id) { |
| error("Invalid state of JobId=%u, part of hetjob JobId=%u", |
| job_id, job_ptr->job_id); |
| het_job_valid = false; |
| } else { |
| list_append(job_ptr->het_job_list, |
| het_job_ptr); |
| } |
| free(job_id_str); |
| } |
| hostset_destroy(hs); |
| if (het_job_valid) { |
| list_for_each(job_ptr->het_job_list, _mark_het_job_used, |
| NULL); |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| |
| list_for_each(job_list, _test_het_job_used, NULL); |
| } |
| |
| /* Log an error if SlurmdUser is not root and any cgroup plugin is used */ |
| static void _test_cgroup_plugin_use(void) |
| { |
| if (xstrstr(slurm_conf.task_plugin, "cgroup")) |
| error("task/cgroup plugin will not work unless SlurmdUser is root"); |
| |
| if (xstrstr(slurm_conf.proctrack_type, "cgroup")) |
| error("proctrack/cgroup plugin will not work unless SlurmdUser is root"); |
| } |
| |
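| /* |
| * Rebuild the node bitmap of each active step from its recorded node |
| * list, deleting any step whose node list is no longer valid. |
| */ |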
| static void _sync_steps_to_conf(job_record_t *job_ptr) |
| { |
| list_itr_t *step_iterator; |
| step_record_t *step_ptr; |
| |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = list_next(step_iterator))) { |
| if (step_ptr->state < JOB_RUNNING) |
| continue; |
| FREE_NULL_BITMAP(step_ptr->step_node_bitmap); |
| if (step_ptr->step_layout && |
| step_ptr->step_layout->node_list && |
| (node_name2bitmap(step_ptr->step_layout->node_list, false, |
| &step_ptr->step_node_bitmap, NULL))) { |
| error("Invalid step_node_list (%s) for %pS", |
| step_ptr->step_layout->node_list, step_ptr); |
| delete_step_record(job_ptr, step_ptr); |
| } else if (step_ptr->step_node_bitmap == NULL) { |
| error("Missing node_list for %pS", step_ptr); |
| delete_step_record(job_ptr, step_ptr); |
| } |
| } |
| |
| list_iterator_destroy (step_iterator); |
| } |
| |
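| /* |
| * Rebuild a job's required and excluded node bitmaps from its node |
| * name strings. RET SLURM_ERROR if req_nodes cannot be mapped. |
| */ |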
| static int _sync_detail_bitmaps(job_record_t *job_ptr) |
| { |
| if (job_ptr->details == NULL) |
| return SLURM_SUCCESS; |
| |
| FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); |
| |
| if ((job_ptr->details->req_nodes) && |
| (node_name2bitmap(job_ptr->details->req_nodes, false, |
| &job_ptr->details->req_node_bitmap, NULL))) { |
| error("Invalid req_nodes (%s) for %pJ", |
| job_ptr->details->req_nodes, job_ptr); |
| return SLURM_ERROR; |
| } |
| |
| /* |
| * Ignore any errors if the exc_nodes list contains invalid entries. |
| * We can then be pretty sure we won't schedule onto nodes that do not |
| * exist. |
| */ |
| FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); |
| if (job_ptr->details->exc_nodes) |
| node_name2bitmap(job_ptr->details->exc_nodes, false, |
| &job_ptr->details->exc_node_bitmap, NULL); |
| |
| /* |
| * If a nodelist has been provided with more nodes than are required |
| * for the job, translate this into an exclusion of all nodes except |
| * those requested. |
| */ |
| if (job_ptr->details->req_node_bitmap && |
| (bit_set_count(job_ptr->details->req_node_bitmap) > |
| job_ptr->details->min_nodes)) { |
| if (!job_ptr->details->exc_node_bitmap) |
| job_ptr->details->exc_node_bitmap = |
| bit_alloc(node_record_count); |
| bit_or_not(job_ptr->details->exc_node_bitmap, |
| job_ptr->details->req_node_bitmap); |
| FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * _sync_jobs_to_conf - Sync current slurm.conf configuration for existing jobs. |
| * This should be called after rebuilding node, part, and gres information, |
| * but before using any job entries. |
| * global: last_job_update - time of last job table update |
| * job_list - pointer to global job list |
| */ |
| void _sync_jobs_to_conf(void) |
| { |
| list_itr_t *job_iterator; |
| job_record_t *job_ptr; |
| part_record_t *part_ptr; |
| list_t *part_ptr_list = NULL; |
| bool job_fail = false; |
| time_t now = time(NULL); |
| bool gang_flag = false; |
| |
| xassert(job_list); |
| |
| if (slurm_conf.preempt_mode & PREEMPT_MODE_GANG) |
| gang_flag = true; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = list_next(job_iterator))) { |
| xassert (job_ptr->magic == JOB_MAGIC); |
| job_fail = false; |
| |
| /* |
| * This resets the req/exc node bitmaps, so even if the job is |
| * finished it still needs to happen just in case the job is |
| * requeued. |
| */ |
| if (_sync_detail_bitmaps(job_ptr)) { |
| job_fail = true; |
| if (job_ptr->details) { |
| /* |
| * job can't be requeued because either |
| * req_nodes or exc_nodes can't be satisfied. |
| */ |
| job_ptr->details->requeue = false; |
| } |
| } |
| |
| /* |
| * Even if the job is completed at this point, there is code in |
| * _job_requeue_op() that requires the part_ptr to be set in |
| * order to requeue a job. We also need to set it to NULL if |
| * the partition was removed, or we would be pointing at freed |
| * data. This is the safest/easiest place to do it. |
| */ |
| |
| if (job_ptr->partition == NULL) { |
| error("No partition for %pJ", job_ptr); |
| part_ptr = NULL; |
| job_fail = true; |
| } else { |
| char *err_part = NULL; |
| get_part_list(job_ptr->partition, &part_ptr_list, |
| &part_ptr, &err_part); |
| if (part_ptr == NULL) { |
| error("Invalid partition (%s) for %pJ", |
| err_part, job_ptr); |
| xfree(err_part); |
| job_fail = true; |
| } |
| } |
| job_ptr->part_ptr = part_ptr; |
| FREE_NULL_LIST(job_ptr->part_ptr_list); |
| if (part_ptr_list) { |
| job_ptr->part_ptr_list = part_ptr_list; |
| part_ptr_list = NULL; /* clear for next job */ |
| } |
| |
| /* |
| * If the job is finished there is no reason to do anything |
| * below this. |
| */ |
| if (IS_JOB_COMPLETED(job_ptr)) |
| continue; |
| |
| FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); |
| if (job_ptr->nodes_completing && |
| node_name2bitmap(job_ptr->nodes_completing, |
| false, &job_ptr->node_bitmap_cg, NULL)) { |
| error("Invalid nodes_completing (%s) for %pJ", |
| job_ptr->nodes_completing, job_ptr); |
| job_fail = true; |
| } |
| FREE_NULL_BITMAP(job_ptr->node_bitmap); |
| if (job_ptr->nodes && |
| node_name2bitmap(job_ptr->nodes, false, |
| &job_ptr->node_bitmap, NULL)) { |
| error("Invalid nodes (%s) for %pJ", |
| job_ptr->nodes, job_ptr); |
| job_fail = true; |
| } |
| FREE_NULL_BITMAP(job_ptr->node_bitmap_pr); |
| if (job_ptr->nodes_pr && |
| node_name2bitmap(job_ptr->nodes_pr, false, |
| &job_ptr->node_bitmap_pr, NULL)) { |
| error("Invalid nodes_pr (%s) for %pJ", |
| job_ptr->nodes_pr, job_ptr); |
| job_fail = true; |
| } |
| if (reset_node_bitmap(job_ptr)) |
| job_fail = true; |
| if (!job_fail && |
| job_ptr->job_resrcs && |
| (running_cons_tres() || gang_flag) && |
| valid_job_resources(job_ptr->job_resrcs)) { |
| error("Aborting %pJ due to change in socket/core configuration of allocated nodes", |
| job_ptr); |
| job_fail = true; |
| } |
| if (!job_fail && |
| gres_job_revalidate(job_ptr->gres_list_req)) { |
| error("Aborting %pJ due to use of unsupported GRES options", |
| job_ptr); |
| job_fail = true; |
| if (job_ptr->details) { |
| /* don't attempt to requeue job */ |
| job_ptr->details->requeue = false; |
| } |
| } |
| |
| if (!job_fail && job_ptr->job_resrcs && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) && |
| gres_job_revalidate2(job_ptr->job_id, |
| job_ptr->gres_list_alloc, |
| job_ptr->job_resrcs->node_bitmap)) { |
| /* |
| * This can be due to the job being allocated GRES |
| * which no longer exist (i.e. the GRES count on some |
| * allocated node changed since when the job started). |
| */ |
| error("Aborting %pJ due to use of invalid GRES configuration", |
| job_ptr); |
| job_fail = true; |
| } |
| |
| _sync_steps_to_conf(job_ptr); |
| |
| build_node_details(job_ptr, false); /* set node_addr */ |
| |
| if (job_fail) { |
| bool was_running = false; |
| if (IS_JOB_PENDING(job_ptr)) { |
| job_ptr->start_time = |
| job_ptr->end_time = time(NULL); |
| job_state_set(job_ptr, JOB_NODE_FAIL); |
| } else if (IS_JOB_RUNNING(job_ptr)) { |
| job_ptr->end_time = time(NULL); |
| job_state_set(job_ptr, (JOB_NODE_FAIL | |
| JOB_COMPLETING)); |
| build_cg_bitmap(job_ptr); |
| was_running = true; |
| } else if (IS_JOB_SUSPENDED(job_ptr)) { |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_state_set(job_ptr, (JOB_NODE_FAIL | |
| JOB_COMPLETING)); |
| build_cg_bitmap(job_ptr); |
| job_ptr->tot_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| jobacct_storage_g_job_suspend(acct_db_conn, |
| job_ptr); |
| was_running = true; |
| } |
| job_ptr->state_reason = FAIL_DOWN_NODE; |
| xfree(job_ptr->state_desc); |
| job_ptr->exit_code = 1; |
| job_completion_logger(job_ptr, false); |
| if (job_ptr->job_state == JOB_NODE_FAIL) { |
| /* build_cg_bitmap() may clear JOB_COMPLETING */ |
| epilog_slurmctld(job_ptr); |
| } |
| if (was_running && job_ptr->batch_flag && |
| job_ptr->details && job_ptr->details->requeue && |
| job_ptr->part_ptr) { |
| /* |
| * Mark for requeue |
| * see _requeue_job_node_failed() |
| */ |
| info("Attempting to requeue failed job %pJ", |
| job_ptr); |
| job_state_set_flag(job_ptr, JOB_REQUEUE); |
| |
| /* Reset node_cnt to exclude vanished nodes */ |
| job_ptr->node_cnt = bit_set_count( |
| job_ptr->node_bitmap_cg); |
| /* Reset exit code from last run */ |
| job_ptr->exit_code = 0; |
| } |
| } |
| } |
| |
| list_iterator_reset(job_iterator); |
| /* This will reinitialize the select plugin database, which |
| * we can only do after ALL jobs' states and bitmaps are set |
| * (i.e. it needs to be in this second loop) */ |
| while ((job_ptr = list_next(job_iterator))) { |
| if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) { |
| error("select_g_select_nodeinfo_set(%pJ): %m", |
| job_ptr); |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| |
| last_job_update = now; |
| } |
| |
| /* |
| * read_slurm_conf - load the slurm configuration from the configured file. |
| * read_slurm_conf can be called more than once if so desired. |
| * IN recover - replace job, node and/or partition data with latest |
| * available information depending upon value |
| * 0 = use no saved state information, rebuild everything from |
| * slurm.conf contents |
| * 1 = recover saved job and trigger state, |
| * node DOWN/DRAIN/FAIL state and reason information |
| * 2 = recover all saved state |
| * RET SLURM_SUCCESS if no error, otherwise an error code |
| * Note: Operates on common variables only |
| */ |
| extern int read_slurm_conf(int recover) |
| { |
| DEF_TIMERS; |
| int error_code = SLURM_SUCCESS; |
| int rc = 0, load_job_ret = SLURM_SUCCESS; |
| char *old_auth_type = xstrdup(slurm_conf.authtype); |
| char *old_bb_type = xstrdup(slurm_conf.bb_type); |
| char *old_cred_type = xstrdup(slurm_conf.cred_type); |
| char *old_job_container_type = xstrdup(slurm_conf.job_container_plugin); |
| char *old_preempt_type = xstrdup(slurm_conf.preempt_type); |
| char *old_sched_type = xstrdup(slurm_conf.schedtype); |
| char *old_select_type = xstrdup(slurm_conf.select_type); |
| char *old_switch_type = xstrdup(slurm_conf.switch_type); |
| char *state_save_dir = xstrdup(slurm_conf.state_save_location); |
| char *tmp_ptr = NULL; |
| uint16_t old_select_type_p = slurm_conf.select_type_param; |
| bool cgroup_mem_confinement = false; |
| uint16_t reconfig_flags = slurm_conf.reconfig_flags; |
| |
| /* initialization */ |
| START_TIMER; |
| |
| _init_all_slurm_conf(); |
| |
| cgroup_conf_init(); |
| |
| cgroup_mem_confinement = cgroup_memcg_job_confinement(); |
| |
| if (slurm_conf.job_acct_oom_kill && cgroup_mem_confinement) |
| fatal("Job memory is being constrained by both TaskPlugin cgroup and JobAcctGather plugin. These are two incompatible memory enforcement mechanisms; one of them must be disabled."); |
| else if (slurm_conf.job_acct_oom_kill) |
| info("Enforcing memory limits via the JobAcctGather mechanism is discouraged; task/cgroup is recommended where available."); |
| else if (!cgroup_mem_confinement) |
| info("No memory enforcement mechanism configured."); |
| |
| if (slurm_conf.slurmd_user_id != 0) |
| _test_cgroup_plugin_use(); |
| |
| if (topology_g_init() != SLURM_SUCCESS) |
| fatal("Failed to initialize topology plugin"); |
| |
| if (xstrcasestr(slurm_conf.slurmctld_params, "enable_stepmgr") && |
| !(slurm_conf.prolog_flags & PROLOG_FLAG_CONTAIN)) |
| fatal("STEP_MGR not supported without PrologFlags=contain"); |
| |
| /* Build node and partition information based upon slurm.conf file */ |
| if ((error_code = build_all_nodeline_info(false, slurmctld_tres_cnt))) |
| goto end_it; |
| /* Increase node table to handle dynamic nodes. */ |
| if ((slurm_conf.max_node_cnt != NO_VAL) && |
| node_record_count < slurm_conf.max_node_cnt) { |
| node_record_count = slurm_conf.max_node_cnt; |
| grow_node_record_table_ptr(); |
| } else { |
| /* Lock node_record_table_ptr from growing */ |
| slurm_conf.max_node_cnt = node_record_count; |
| } |
| if (slurm_conf.max_node_cnt == 0) { |
| /* |
| * Set to 1 so bitmaps will be created but don't allow any nodes |
| * to be created. |
| */ |
| node_record_count = 1; |
| grow_node_record_table_ptr(); |
| } |
| |
| bit_cache_init(node_record_count); |
| |
| (void)acct_storage_g_reconfig(acct_db_conn, 0); |
| _handle_all_downnodes(); |
| _build_all_partitionline_info(); |
| |
| /* |
| * Currently load/dump_state_lite has to run before load_all_job_state. |
| * FIXME: this stores a single string; it should probably move into |
| * the job state file as it's only pertinent to job accounting. |
| */ |
| load_config_state_lite(); |
| dump_config_state_lite(); |
| |
| update_logging(); |
| if (jobcomp_g_init() != SLURM_SUCCESS) |
| fatal("Failed to initialize jobcomp plugin"); |
| if (controller_init_scheduling( |
| (slurm_conf.preempt_mode & PREEMPT_MODE_GANG)) != SLURM_SUCCESS) { |
| fatal("Failed to initialize the various schedulers"); |
| } |
| |
| if (default_part_loc == NULL) |
| error("%s: default partition not set.", __func__); |
| |
| if (node_record_count < 1) { |
| error("%s: no nodes configured.", __func__); |
| error_code = EINVAL; |
| goto end_it; |
| } |
| |
| /* |
| * Node reordering may be done by the topology plugin. |
| * Reordering the table must be done before hashing the |
| * nodes, and before any position-relative bitmaps are created. |
| * |
| * Sort the nodes read in from the slurm.conf first before restoring |
| * the dynamic nodes from the state file to prevent dynamic nodes from |
| * being sorted -- which can cause problems with heterogeneous jobs and |
| * the order of the sockets changing on startup. |
| */ |
| _sort_node_record_table_ptr(); |
| |
| /* |
| * Load node state which includes dynamic nodes so that dynamic nodes |
| * can be included in topology. |
| */ |
| if (recover == 0) { /* Build everything from slurm.conf */ |
| _set_features(node_record_table_ptr, node_record_count, |
| recover); |
| } else if (recover == 1) { /* Load job & node state files */ |
| (void) load_all_node_state(true); |
| _set_features(node_record_table_ptr, node_record_count, |
| recover); |
| } else if (recover > 1) { /* Load node, part & job state files */ |
| (void) load_all_node_state(false); |
| _set_features(NULL, 0, recover); |
| } |
| |
| rehash_node(); |
| topology_g_build_config(); |
| |
| rehash_jobs(); |
| _validate_slurmd_addr(); |
| |
| _stat_slurm_dirs(); |
| |
| _init_bitmaps(); |
| |
| /* |
| * Set standard features and preserve the plugin controlled ones. |
| */ |
| if (recover == 0) { /* Build everything from slurm.conf */ |
| load_last_job_id(); |
| reset_first_job_id(); |
| controller_reconfig_scheduling(); |
| } else if (recover == 1) { /* Load job & node state files */ |
| load_job_ret = load_all_job_state(); |
| } else if (recover > 1) { /* Load node, part & job state files */ |
| reconfig_flags |= RECONFIG_KEEP_PART_INFO; |
| load_job_ret = load_all_job_state(); |
| } |
| (void) load_all_part_state(reconfig_flags); |
| |
| /* |
| * _build_node_config_bitmaps() must be called before |
| * build_features_list_*() and before restore_node_features() |
| */ |
| _build_node_config_bitmaps(); |
| /* _gres_reconfig needs to happen before restore_node_features */ |
| _gres_reconfig(); |
| /* NOTE: Run restore_node_features before _restore_job_accounting */ |
| restore_node_features(recover); |
| |
| if ((node_features_g_count() > 0) && |
| (node_features_g_get_node(NULL) != SLURM_SUCCESS)) |
| error("failed to initialize node features"); |
| |
| /* |
| * _build_bitmaps() must follow node_features_g_get_node() and |
| * precede build_features_list_*() |
| */ |
| _build_bitmaps(); |
| |
| if (_set_nodes_topo()) { |
| error("Invalid node topology"); |
| error_code = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE; |
| goto end_it; |
| } |
| |
| /* Active and available features can be different on -R */ |
| if ((node_features_g_count() == 0) && (recover != 2)) |
| node_features_build_list_eq(); |
| else |
| node_features_build_list_ne(); |
| |
| _sync_part_prio(); |
| _build_part_bitmaps(); /* Must be called after build_feature_list_*() */ |
| if (list_for_each(part_list, set_part_topology_idx, NULL) < 0) { |
| error("Invalid partition topology"); |
| error_code = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE; |
| goto end_it; |
| } |
| |
| if (select_g_node_init() != SLURM_SUCCESS) |
| fatal("Failed to initialize node selection plugin state, Clean start required."); |
| |
| /* |
| * config_power_mgr() Must be after node and partitions have been loaded |
| * and before any calls to power_save_test(). |
| */ |
| config_power_mgr(); |
| |
| _sync_jobs_to_conf(); /* must follow select_g_node_init() */ |
| |
| /* |
| * The burst buffer plugin must be initialized and state loaded before |
| * _sync_nodes_to_jobs(), which calls bb_g_job_init(). |
| */ |
| rc = bb_g_load_state(true); |
| error_code = MAX(error_code, rc); /* not fatal */ |
| |
| (void) _sync_nodes_to_jobs(); |
| (void) sync_job_files(); |
| |
| reserve_port_config(slurm_conf.mpi_params, job_list); |
| |
| if (license_update(slurm_conf.licenses) != SLURM_SUCCESS) |
| fatal("Invalid Licenses value: %s", slurm_conf.licenses); |
| |
| init_requeue_policy(); |
| init_depend_policy(); |
| |
| /* |
| * Must be called after nodes and partitions have been created (e.g. |
| * after _build_part_bitmaps()) and before |
| * _sync_nodes_to_comp_job(). |
| */ |
| set_cluster_tres(false); |
| |
| _validate_het_jobs(); |
| (void) _sync_nodes_to_comp_job(); /* must follow select_g_node_init() */ |
| _requeue_job_node_failed(); |
| load_part_uid_allow_list(true); |
| |
| /* NOTE: Run load_all_resv_state() before _restore_job_accounting */ |
| load_all_resv_state(recover); |
| if (recover >= 1) { |
| trigger_state_restore(); |
| controller_reconfig_scheduling(); |
| } |
| hres_init(); |
| _restore_job_accounting(); |
| |
| /* sort config_list by weight for scheduling */ |
| list_sort(config_list, &list_compare_config); |
| |
| /* |
| * Update plugins as possible. These plugin types cannot change on a |
| * reconfigure: if the value differs, restore the old one and report |
| * an error (a restart is required to change them). |
| */ |
| if (xstrcmp(old_auth_type, slurm_conf.authtype)) { |
| xfree(slurm_conf.authtype); |
| slurm_conf.authtype = old_auth_type; |
| old_auth_type = NULL; |
| rc = ESLURM_INVALID_AUTHTYPE_CHANGE; |
| } |
| |
| if (xstrcmp(old_bb_type, slurm_conf.bb_type)) { |
| xfree(slurm_conf.bb_type); |
| slurm_conf.bb_type = old_bb_type; |
| old_bb_type = NULL; |
| rc = ESLURM_INVALID_BURST_BUFFER_CHANGE; |
| } |
| |
| if (xstrcmp(old_cred_type, slurm_conf.cred_type)) { |
| xfree(slurm_conf.cred_type); |
| slurm_conf.cred_type = old_cred_type; |
| old_cred_type = NULL; |
| rc = ESLURM_INVALID_CRED_TYPE_CHANGE; |
| } |
| |
| if (xstrcmp(old_job_container_type, slurm_conf.job_container_plugin)) { |
| xfree(slurm_conf.job_container_plugin); |
| slurm_conf.job_container_plugin = old_job_container_type; |
| old_job_container_type = NULL; |
| rc = ESLURM_INVALID_JOB_CONTAINER_CHANGE; |
| } |
| |
| if (xstrcmp(old_sched_type, slurm_conf.schedtype)) { |
| xfree(slurm_conf.schedtype); |
| slurm_conf.schedtype = old_sched_type; |
| old_sched_type = NULL; |
| rc = ESLURM_INVALID_SCHEDTYPE_CHANGE; |
| } |
| |
| if (xstrcmp(old_select_type, slurm_conf.select_type)) { |
| xfree(slurm_conf.select_type); |
| slurm_conf.select_type = old_select_type; |
| old_select_type = NULL; |
| rc = ESLURM_INVALID_SELECTTYPE_CHANGE; |
| } |
| |
| if (xstrcmp(old_switch_type, slurm_conf.switch_type)) { |
| xfree(slurm_conf.switch_type); |
| slurm_conf.switch_type = old_switch_type; |
| old_switch_type = NULL; |
| rc = ESLURM_INVALID_SWITCHTYPE_CHANGE; |
| } |
| |
| if ((slurm_conf.control_cnt < 2) || |
| (slurm_conf.control_machine[1] == NULL)) |
| info("%s: backup_controller not specified", __func__); |
| |
| error_code = MAX(error_code, rc); /* not fatal */ |
| |
| if (xstrcmp(old_preempt_type, slurm_conf.preempt_type)) { |
| info("Changing PreemptType from %s to %s", |
| old_preempt_type, slurm_conf.preempt_type); |
| (void) preempt_g_fini(); |
| if (preempt_g_init() != SLURM_SUCCESS) |
| fatal("failed to initialize preempt plugin"); |
| } |
| |
	/* Preserve plugin parameters that cannot change without a restart */
| rc = _preserve_select_type_param(&slurm_conf, old_select_type_p); |
| error_code = MAX(error_code, rc); /* not fatal */ |
| |
| /* |
| * Restore job accounting info if file missing or corrupted, |
| * an extremely rare situation |
| */ |
| if (load_job_ret) |
| _acct_restore_active_jobs(); |
| |
| /* Sync select plugin with synchronized job/node/part data */ |
| gres_reconfig(); /* Clear gres/mps counters */ |
| select_g_reconfigure(); |
| |
| _set_response_cluster_rec(); |
| |
| consolidate_config_list(true, true); |
| cloud_dns = xstrcasestr(slurm_conf.slurmctld_params, "cloud_dns"); |
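	/* Parse optional "max_powered_nodes=<count>" from SlurmctldParameters */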
| if ((tmp_ptr = xstrcasestr(slurm_conf.slurmctld_params, |
| "max_powered_nodes="))) { |
| max_powered_nodes = |
| strtol(tmp_ptr + strlen("max_powered_nodes="), |
| NULL, 10); |
| } |
| |
| slurm_conf.last_update = time(NULL); |
| end_it: |
| xfree(old_auth_type); |
| xfree(old_bb_type); |
| xfree(old_cred_type); |
| xfree(old_job_container_type); |
| xfree(old_preempt_type); |
| xfree(old_sched_type); |
| xfree(old_select_type); |
| xfree(old_switch_type); |
| xfree(state_save_dir); |
| |
| END_TIMER2(__func__); |
	return error_code;
}
| |
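/*
 * Rebuild each node's GRES configuration list. For cloud nodes, load and
 * validate the GRES data immediately rather than waiting for the
 * (potentially slow) first node registration.
 */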
| static void _gres_reconfig(void) |
| { |
| node_record_t *node_ptr; |
| char *gres_name; |
| int i; |
| |
| for (i = 0; (node_ptr = next_node(&i)); i++) { |
| /* node_ptr->gres is set when recover == 2 */ |
| if (node_ptr->gres) |
| gres_name = node_ptr->gres; |
| else |
| gres_name = node_ptr->config_ptr->gres; |
| gres_init_node_config(gres_name, &node_ptr->gres_list); |
| if (!IS_NODE_CLOUD(node_ptr)) |
| continue; |
| |
| /* |
| * Load in GRES for node now. By default Slurm gets this |
| * information when the node registers for the first |
| * time, which can take a while for a node in the cloud |
| * to boot. |
| */ |
| if (gres_g_node_config_load(node_ptr->config_ptr->cpus, |
| node_ptr->name, node_ptr->gres_list, |
| NULL, NULL) != SLURM_SUCCESS) |
| continue; /* No need to validate if load failed */ |
| |
| gres_node_config_validate(node_ptr, |
| node_ptr->config_ptr->threads, |
| node_ptr->config_ptr->cores, |
| node_ptr->config_ptr->tot_sockets, |
| (slurm_conf.conf_flags & |
| CONF_FLAG_OR), |
| NULL); |
| } |
| } |
| |
| /* |
| * Append changeable features in old_features and not in features to features. |
| */ |
| static void _merge_changeable_features(char *old_features, char **features) |
| { |
| char *save_ptr_old = NULL; |
| char *tok_old, *tmp_old, *tok_new; |
| char *sep; |
| |
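	/* No leading separator is needed while *features is still empty */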
| if (*features) |
| sep = ","; |
| else |
| sep = ""; |
| |
	/* Merge feature strings, skipping duplicates */
| tmp_old = xstrdup(old_features); |
| for (tok_old = strtok_r(tmp_old, ",", &save_ptr_old); |
| tok_old; |
| tok_old = strtok_r(NULL, ",", &save_ptr_old)) { |
| bool match = false; |
| |
| if (!node_features_g_changeable_feature(tok_old)) |
| continue; |
| |
| if (*features) { |
| char *tmp_new, *save_ptr_new = NULL; |
| |
| /* Check if old feature already exists in features string */ |
| tmp_new = xstrdup(*features); |
| for (tok_new = strtok_r(tmp_new, ",", &save_ptr_new); |
| tok_new; |
| tok_new = strtok_r(NULL, ",", &save_ptr_new)) { |
| if (!xstrcmp(tok_old, tok_new)) { |
| match = true; |
| break; |
| } |
| } |
| xfree(tmp_new); |
| } |
| |
| if (match) |
| continue; |
| |
| xstrfmtcat(*features, "%s%s", sep, tok_old); |
| sep = ","; |
| } |
| xfree(tmp_old); |
| } |
| |
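/*
 * Append to *active each changeable feature from old_active that is still
 * present in available, preserving previously active features across a
 * restart or reconfiguration.
 */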
| static void _preserve_active_features(const char *available, |
| const char *old_active, |
| char **active) |
| { |
| char *old_feature, *saveptr_old; |
| char *tmp_old_active; |
| |
| if (!available || !old_active) |
| return; |
| |
| tmp_old_active = xstrdup(old_active); |
| for (old_feature = strtok_r(tmp_old_active, ",", &saveptr_old); |
| old_feature; |
| old_feature = strtok_r(NULL, ",", &saveptr_old)) { |
| char *new_feature, *saveptr_avail; |
| char *tmp_avail; |
| |
| if (!node_features_g_changeable_feature(old_feature)) |
| continue; |
| |
| tmp_avail = xstrdup(available); |
| for (new_feature = strtok_r(tmp_avail, ",", &saveptr_avail); |
| new_feature; |
| new_feature = strtok_r(NULL, ",", &saveptr_avail)) { |
| if (!xstrcmp(old_feature, new_feature)) { |
| xstrfmtcat(*active, "%s%s", |
| *active ? "," : "", old_feature); |
| break; |
| } |
| } |
| xfree(tmp_avail); |
| } |
| xfree(tmp_old_active); |
| } |
| |
| /* |
| * Configure node features. |
 * IN old_node_table_ptr - previous node records
 * IN old_node_record_count - count of previous node records
 * IN recover - replace node features data depending upon value:
 *              0, 1 - use data from config record, built using slurm.conf
 *              2 - use data from node record, built from saved state
| */ |
| static void _set_features(node_record_t **old_node_table_ptr, |
| int old_node_record_count, int recover) |
| { |
| node_record_t *node_ptr, *old_node_ptr; |
| int i, node_features_cnt = node_features_g_count(); |
| |
| for (i = 0; i < old_node_record_count; i++) { |
| char *old_features_act; |
| |
| if (!(old_node_ptr = old_node_table_ptr[i])) |
| continue; |
| |
| node_ptr = find_node_record(old_node_ptr->name); |
| |
| if (node_ptr == NULL) |
| continue; |
| |
| /* |
| * Load all from state, ignore what has been read from |
| * slurm.conf. Features in node record just a placeholder |
| * for restore_node_features() to set up new config records. |
| */ |
| if (recover == 2) { |
| xfree(node_ptr->features); |
| xfree(node_ptr->features_act); |
| node_ptr->features = old_node_ptr->features; |
| node_ptr->features_act = old_node_ptr->features_act; |
| old_node_ptr->features = NULL; |
| old_node_ptr->features_act = NULL; |
| continue; |
| } |
| |
| /* No changeable features so active == available */ |
| if (node_features_cnt == 0) { |
| xfree(node_ptr->features_act); |
| node_ptr->features_act = xstrdup(node_ptr->features); |
| continue; |
| } |
| |
| /* If we are here, there's a node_features plugin active */ |
| |
| /* |
| * Changeable features may be listed in the slurm.conf along |
| * with the non-changeable features (e.g. cloud nodes). So |
| * filter out the changeable features and leave only the |
| * non-changeable features. non-changeable features are active |
| * by default. |
| */ |
| old_features_act = node_ptr->features_act; |
| node_ptr->features_act = |
| filter_out_changeable_features(node_ptr->features); |
| |
| /* |
| * Preserve active features on startup but make sure they are a |
| * subset of available features -- in case available features |
| * were changed. |
| * |
| * features_act has all non-changeable features now. We need to |
| * add back previous active features that are in available |
| * features. |
| * |
| * For cloud nodes, changeable features are added in slurm.conf. |
| * This will preserve the cloud active features on startup. When |
| * changeable features aren't defined in slurm.conf then |
| * features_act will be reset to all non-changeable features |
| * read in from slurm.conf and will expect to get the available |
| * and active features from the slurmd. |
| */ |
| _preserve_active_features(node_ptr->features, old_features_act, |
| &node_ptr->features_act); |
| xfree(old_features_act); |
| |
| /* |
| * On startup, node_record_table_ptr is passed as |
| * old_node_table_ptr so no need to merge features. |
| */ |
| if (node_ptr == old_node_ptr) |
| continue; |
| |
| /* |
| * The subset of plugin-controlled features_available |
| * and features_active found in the old node_ptr for this node |
| * are copied into new node respective fields. |
| * This will make that KNL modes are preserved while doing a |
| * reconfigure. Otherwise, we should wait until node is |
| * registered to get KNL available and active features. |
| */ |
| if (old_node_ptr->features != NULL) { |
| _merge_changeable_features(old_node_ptr->features, |
| &node_ptr->features); |
| } |
| |
| if (old_node_ptr->features_act != NULL) { |
| _merge_changeable_features(old_node_ptr->features_act, |
| &node_ptr->features_act); |
| } |
| } |
| } |
| |
| /* |
| * _preserve_select_type_param - preserve original plugin parameters. |
| * Daemons and/or commands must be restarted for some |
| * select plugin value changes to take effect. |
| * RET zero or error code |
| */ |
| static int _preserve_select_type_param(slurm_conf_t *ctl_conf_ptr, |
| uint16_t old_select_type_p) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| /* SelectTypeParameters cannot change */ |
| if (old_select_type_p) { |
| if (old_select_type_p != ctl_conf_ptr->select_type_param) { |
| ctl_conf_ptr->select_type_param = old_select_type_p; |
| rc = ESLURM_INVALID_SELECTTYPE_CHANGE; |
| } |
| } |
| return rc; |
| } |
| |
| /* |
| * _sync_nodes_to_jobs - sync node state to job states on slurmctld restart. |
 * This routine marks nodes allocated to a job as busy regardless of the
 * node's last saved state.
 * RET count of nodes having their state changed
| * Note: Operates on common variables, no arguments |
| */ |
| static int _sync_nodes_to_jobs(void) |
| { |
| job_record_t *job_ptr; |
| list_itr_t *job_iterator; |
| int update_cnt = 0; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = list_next(job_iterator))) { |
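		/*
		 * The prolog was still running when the controller stopped;
		 * re-launch it for jobs that are still configuring.
		 */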
| if (job_ptr->details && job_ptr->details->prolog_running) { |
| job_ptr->details->prolog_running = 0; |
| if (IS_JOB_CONFIGURING(job_ptr)) { |
| prolog_slurmctld(job_ptr); |
| (void) bb_g_job_begin(job_ptr); |
| } |
| } |
| |
| if (job_ptr->node_bitmap == NULL) |
| ; |
| else if (IS_JOB_RUNNING(job_ptr) || IS_JOB_COMPLETING(job_ptr)) |
| update_cnt += _sync_nodes_to_active_job(job_ptr); |
| else if (IS_JOB_SUSPENDED(job_ptr)) |
| _sync_nodes_to_suspended_job(job_ptr); |
| |
| } |
| list_iterator_destroy(job_iterator); |
| |
| if (update_cnt) { |
| info("_sync_nodes_to_jobs updated state of %d nodes", |
| update_cnt); |
| } |
| return update_cnt; |
| } |
| |
| /* For jobs which are in state COMPLETING, deallocate the nodes and |
| * issue the RPC to kill the job */ |
| static int _sync_nodes_to_comp_job(void) |
| { |
| job_record_t *job_ptr; |
| list_itr_t *job_iterator; |
| int update_cnt = 0; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = list_next(job_iterator))) { |
| if ((job_ptr->node_bitmap) && IS_JOB_COMPLETING(job_ptr)) { |
| |
			/* If the controller is reconfiguring,
			 * the job is in completing state,
			 * and the slurmctld epilog is already
			 * running (which means deallocate_nodes()
			 * was already called), do not invoke it
			 * again and don't start another epilog.
			 */
| if (job_ptr->epilog_running == true) |
| continue; |
| |
| update_cnt++; |
| info("%s: %pJ in completing state", __func__, job_ptr); |
| if (!job_ptr->node_bitmap_cg) |
| build_cg_bitmap(job_ptr); |
| |
			/* deallocate_nodes() will remove this job
			 * from the accounting system before it was
			 * ever added, so add it now
			 */
| if (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) |
| acct_policy_job_begin(job_ptr, false); |
| |
| deallocate_nodes(job_ptr, false, false, false); |
			/* The job was in completing state at slurmctld
			 * restart or reconfiguration; do not log its
			 * completion again.
			 * job_completion_logger(job_ptr, false); */
| } |
| } |
| list_iterator_destroy(job_iterator); |
| if (update_cnt) |
| info("%s: completing %d jobs", __func__, update_cnt); |
| return update_cnt; |
| } |
| |
/* Synchronize states of nodes and active jobs (RUNNING or COMPLETING state)
 * RET count of node and job state changes made */
| static int _sync_nodes_to_active_job(job_record_t *job_ptr) |
| { |
| int cnt = 0; |
| uint32_t node_flags; |
| node_record_t *node_ptr; |
| bitstr_t *node_bitmap, *orig_job_node_bitmap = NULL; |
| |
| if (job_ptr->node_bitmap_cg) /* job completing */ |
| node_bitmap = job_ptr->node_bitmap_cg; |
| else |
| node_bitmap = job_ptr->node_bitmap; |
| |
| job_ptr->node_cnt = bit_set_count(node_bitmap); |
| for (int i = 0; (node_ptr = next_node_bitmap(node_bitmap, &i)); i++) { |
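		/* Restore exclusive-user ownership tracking for this node */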
| if ((job_ptr->details && |
| (job_ptr->details->whole_node & WHOLE_NODE_USER)) || |
| (job_ptr->part_ptr && |
| (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER))) { |
| node_ptr->owner_job_cnt++; |
| node_ptr->owner = job_ptr->user_id; |
| } |
| |
| if (slurm_mcs_get_select(job_ptr) == 1) { |
| xfree(node_ptr->mcs_label); |
| node_ptr->mcs_label = xstrdup(job_ptr->mcs_label); |
| } |
| |
| node_flags = node_ptr->node_state & NODE_STATE_FLAGS; |
| |
| if (IS_JOB_COMPLETING(job_ptr) && job_ptr->epilog_running) { |
| /* |
| * _sync_nodes_to_comp_job() won't call |
| * deallocate_nodes()/make_node_comp() if the |
| * EpilogSlurmctld is still running to decrement |
| * run_job_cnt and increment comp_job_cnt, so just |
| * increment comp_job_cnt now. |
| */ |
| node_ptr->comp_job_cnt++; |
| } else { |
| /* |
| * run_job_cnt will be decremented by |
| * deallocate_nodes()/make_node_comp() in |
| * _sync_nodes_to_comp_job(). |
| */ |
| node_ptr->run_job_cnt++; |
| } |
| |
| if ((job_ptr->details) && (job_ptr->details->share_res == 0)) |
| node_ptr->no_share_job_cnt++; |
| |
| if (IS_NODE_DOWN(node_ptr) && |
| IS_JOB_RUNNING(job_ptr) && |
| (job_ptr->kill_on_node_fail == 0) && |
| (job_ptr->node_cnt > 1)) { |
| /* This should only happen if a job was running |
| * on a node that was newly configured DOWN */ |
| int save_accounting_enforce; |
| info("Removing failed node %s from %pJ", |
| node_ptr->name, job_ptr); |
| |
| /* |
| * Disable accounting here. Accounting reset for all |
| * jobs in _restore_job_accounting() |
| */ |
| save_accounting_enforce = accounting_enforce; |
| accounting_enforce &= (~ACCOUNTING_ENFORCE_LIMITS); |
| if (job_ptr->job_resrcs && |
| job_ptr->job_resrcs->node_bitmap) { |
| /* |
| * node_bitmap is eventually changed within |
| * extract_job_resources_node() so we need to |
| * copy it before that. |
| */ |
| if (!orig_job_node_bitmap) |
| orig_job_node_bitmap = bit_copy( |
| job_ptr->job_resrcs-> |
| node_bitmap); |
| } else { |
| error("We resized job %pJ, but the original node bitmap is unavailable. Unable to resize step node bitmaps for job's steps, this should never happen", |
| job_ptr); |
| } |
| job_pre_resize_acctg(job_ptr); |
| srun_node_fail(job_ptr, node_ptr->name); |
| kill_step_on_node(job_ptr, node_ptr, true); |
| excise_node_from_job(job_ptr, node_ptr); |
| job_post_resize_acctg(job_ptr); |
| accounting_enforce = save_accounting_enforce; |
| } else if (IS_NODE_DOWN(node_ptr) && IS_JOB_RUNNING(job_ptr)) { |
| info("Killing %pJ on DOWN node %s", |
| job_ptr, node_ptr->name); |
| job_ptr->exit_code = 1; |
| _abort_job(job_ptr, JOB_NODE_FAIL, FAIL_DOWN_NODE, |
| NULL); |
| cnt++; |
| } else if (IS_NODE_IDLE(node_ptr)) { |
| cnt++; |
| node_ptr->node_state = NODE_STATE_ALLOCATED | |
| node_flags; |
| } |
| } |
| |
| /* If the job was resized then resize the bitmaps of the job's steps */ |
| if (orig_job_node_bitmap) |
| rebuild_step_bitmaps(job_ptr, orig_job_node_bitmap); |
| FREE_NULL_BITMAP(orig_job_node_bitmap); |
| |
| set_initial_job_alias_list(job_ptr); |
| |
| return cnt; |
| } |
| |
| /* Synchronize states of nodes and suspended jobs */ |
| static void _sync_nodes_to_suspended_job(job_record_t *job_ptr) |
| { |
| node_record_t *node_ptr; |
| |
| for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i)); |
| i++) { |
| node_ptr->sus_job_cnt++; |
| } |
| |
| set_initial_job_alias_list(job_ptr); |
| } |
| |
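/*
 * Rebuild a job's license lists from its requested and allocated license
 * strings, then re-acquire the licenses for jobs that are still active.
 */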
| static void _restore_job_licenses(job_record_t *job_ptr) |
| { |
| list_t *license_list = NULL, *license_list_alloc = NULL; |
| bool valid = true, alloc_valid = true; |
| |
| license_list = license_validate(job_ptr->licenses, false, false, false, |
| job_ptr->tres_req_cnt, &valid); |
| license_list_alloc = |
| license_validate(job_ptr->licenses_allocated, false, false, |
| true, NULL, &alloc_valid); |
| FREE_NULL_LIST(job_ptr->license_list); |
| |
| if (valid) { |
| job_ptr->license_list = license_list; |
| xfree(job_ptr->licenses); |
| job_ptr->licenses = license_list_to_string(license_list); |
| } |
| |
| /* |
| * If there are allocated licenses, then set job_ptr->license_list to |
| * that so we get the correct licenses from the cluster. |
| */ |
| if (license_list_alloc && alloc_valid) { |
| FREE_NULL_LIST(job_ptr->license_list); |
| job_ptr->license_list = license_list_alloc; |
| xfree(job_ptr->licenses_allocated); |
| job_ptr->licenses_allocated = |
| license_list_to_string(job_ptr->license_list); |
| } |
| |
| if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr) || |
| IS_JOB_COMPLETING(job_ptr)) |
| license_job_get(job_ptr, true); |
| } |
| |
| /* |
| * Build license_list for every job. |
| * Reset accounting for every job. |
| * Reset the running job count for scheduling policy. |
| * This must be called after load_all_resv_state() and restore_node_features(). |
| */ |
| static void _restore_job_accounting(void) |
| { |
| job_record_t *job_ptr; |
| list_itr_t *job_iterator; |
| |
| assoc_mgr_clear_used_info(); |
| |
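	/* First pass: reset array task run counts; rebuilt in the pass below */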
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = list_next(job_iterator))) { |
| if (job_ptr->array_recs) |
| job_ptr->array_recs->tot_run_tasks = 0; |
| } |
| |
| list_iterator_reset(job_iterator); |
| while ((job_ptr = list_next(job_iterator))) { |
| (void) build_feature_list(job_ptr, false, false); |
| (void) build_feature_list(job_ptr, true, false); |
| |
| if (job_ptr->details->features_use == |
| job_ptr->details->features) |
| job_ptr->details->feature_list_use = |
| job_ptr->details->feature_list; |
| else if (job_ptr->details->features_use == |
| job_ptr->details->prefer) |
| job_ptr->details->feature_list_use = |
| job_ptr->details->prefer_list; |
| (void) extra_constraints_parse(job_ptr->extra, |
| &job_ptr->extra_constraints); |
| |
| if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) |
| job_array_start(job_ptr); |
| |
| if (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) { |
| if (!IS_JOB_FINISHED(job_ptr)) |
| acct_policy_add_job_submit(job_ptr, false); |
| if (IS_JOB_RUNNING(job_ptr) || |
| IS_JOB_SUSPENDED(job_ptr)) { |
| acct_policy_job_begin(job_ptr, false); |
| resv_replace_update(job_ptr); |
| } else if (IS_JOB_PENDING(job_ptr) && |
| job_ptr->details && |
| job_ptr->details->accrue_time) { |
| /* |
| * accrue usage was cleared above with |
| * assoc_mgr_clear_used_info(). Clear accrue |
| * time so that _handle_add_accrue() will add |
| * the usage back. |
| */ |
| time_t save_accrue_time = |
| job_ptr->details->accrue_time; |
| job_ptr->details->accrue_time = 0; |
| acct_policy_add_accrue_time(job_ptr, false); |
| if (job_ptr->details->accrue_time) |
| job_ptr->details->accrue_time = |
| save_accrue_time; |
| } |
| } |
| _restore_job_licenses(job_ptr); |
| } |
| list_iterator_destroy(job_iterator); |
| } |
| |
| /* Flush accounting information on this cluster, then for each running or |
| * suspended job, restore its state in the accounting system */ |
| static void _acct_restore_active_jobs(void) |
| { |
| job_record_t *job_ptr; |
| list_itr_t *job_iterator; |
| step_record_t *step_ptr; |
| list_itr_t *step_iterator; |
| |
| info("Reinitializing job accounting state"); |
| acct_storage_g_flush_jobs_on_cluster(acct_db_conn, |
| time(NULL)); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = list_next(job_iterator))) { |
| if (IS_JOB_SUSPENDED(job_ptr)) |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) { |
| if (job_ptr->db_index != NO_VAL64) |
| job_ptr->db_index = 0; |
| step_iterator = list_iterator_create( |
| job_ptr->step_list); |
| while ((step_ptr = list_next(step_iterator))) { |
| jobacct_storage_g_step_start(acct_db_conn, |
| step_ptr); |
| } |
| list_iterator_destroy (step_iterator); |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| } |
| |
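/*
 * Save a minimal configuration state file ("last_config_lite") recording
 * the protocol version, save time, and accounting storage type.
 */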
| extern int dump_config_state_lite(void) |
| { |
| static uint32_t high_buffer_size = (1024 * 1024); |
| int error_code = 0; |
| buf_t *buffer = init_buf(high_buffer_size); |
| |
| DEF_TIMERS; |
| |
| START_TIMER; |
| /* write header: version, time */ |
| pack16(SLURM_PROTOCOL_VERSION, buffer); |
| pack_time(time(NULL), buffer); |
| packstr(slurm_conf.accounting_storage_type, buffer); |
| |
| /* write the buffer to file */ |
| error_code = save_buf_to_state("last_config_lite", buffer, |
| &high_buffer_size); |
| |
| FREE_NULL_BUFFER(buffer); |
| |
| END_TIMER2(__func__); |
| return error_code; |
| } |
| |
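/*
 * Load the "last_config_lite" state file and compare the recorded
 * accounting storage type against the current configuration.
 */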
| extern int load_config_state_lite(void) |
| { |
| uint16_t ver = 0; |
| char *state_file; |
| buf_t *buffer; |
| time_t buf_time; |
| char *last_accounting_storage_type = NULL; |
| |
| if (!(buffer = state_save_open("last_config_lite", &state_file))) { |
| debug2("No last_config_lite file (%s) to recover", state_file); |
| xfree(state_file); |
| return ENOENT; |
| } |
| xfree(state_file); |
| |
| safe_unpack16(&ver, buffer); |
| debug3("Version in last_conf_lite header is %u", ver); |
| if (ver > SLURM_PROTOCOL_VERSION || ver < SLURM_MIN_PROTOCOL_VERSION) { |
| if (!ignore_state_errors) |
| fatal("Can not recover last_conf_lite, incompatible version, (%u not between %d and %d), start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.", |
| ver, SLURM_MIN_PROTOCOL_VERSION, |
| SLURM_PROTOCOL_VERSION); |
| error("***********************************************"); |
| error("Can not recover last_conf_lite, incompatible version, " |
| "(%u not between %d and %d)", |
| ver, SLURM_MIN_PROTOCOL_VERSION, SLURM_PROTOCOL_VERSION); |
| error("***********************************************"); |
| FREE_NULL_BUFFER(buffer); |
| return EFAULT; |
| } else { |
| safe_unpack_time(&buf_time, buffer); |
| safe_unpackstr(&last_accounting_storage_type, buffer); |
| } |
| |
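	/*
	 * If the accounting storage type is unchanged since the last start,
	 * skip database re-initialization.
	 */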
| if (last_accounting_storage_type |
| && !xstrcmp(last_accounting_storage_type, |
| slurm_conf.accounting_storage_type)) |
| slurmctld_init_db = 0; |
| xfree(last_accounting_storage_type); |
| |
| FREE_NULL_BUFFER(buffer); |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| if (!ignore_state_errors) |
| fatal("Incomplete last_config_lite checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered."); |
| error("Incomplete last_config_lite checkpoint file"); |
| FREE_NULL_BUFFER(buffer); |
| |
| return SLURM_ERROR; |
| } |