| /*****************************************************************************\ |
| * read_config.c - read the overall slurm configuration file |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Portions Copyright (C) 2010 SchedMD <http://www.schedmd.com>. |
| *  Produced at Lawrence Livermore National Laboratory (cf. DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov>. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <http://www.schedmd.com/slurmdocs/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #ifdef HAVE_CONFIG_H |
| # include "config.h" |
| #endif |
| |
| #ifdef HAVE_SYS_SYSLOG_H |
| # include <sys/syslog.h> |
| #endif |
| |
| #include <ctype.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <syslog.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <time.h> |
| #include <unistd.h> |
| |
| #include "src/common/assoc_mgr.h" |
| #include "src/common/gres.h" |
| #include "src/common/hostlist.h" |
| #include "src/common/list.h" |
| #include "src/common/macros.h" |
| #include "src/common/node_select.h" |
| #include "src/common/parse_spec.h" |
| #include "src/common/read_config.h" |
| #include "src/common/slurm_jobcomp.h" |
| #include "src/common/slurm_topology.h" |
| #include "src/common/slurm_rlimits_info.h" |
| #include "src/common/switch.h" |
| #include "src/common/xstring.h" |
| #include "src/common/strnatcmp.h" |
| |
| #include "src/slurmctld/acct_policy.h" |
| #include "src/slurmctld/front_end.h" |
| #include "src/slurmctld/gang.h" |
| #include "src/slurmctld/job_scheduler.h" |
| #include "src/slurmctld/job_submit.h" |
| #include "src/slurmctld/licenses.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/node_scheduler.h" |
| #include "src/slurmctld/port_mgr.h" |
| #include "src/slurmctld/preempt.h" |
| #include "src/slurmctld/proc_req.h" |
| #include "src/slurmctld/read_config.h" |
| #include "src/slurmctld/reservation.h" |
| #include "src/slurmctld/sched_plugin.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmctld/srun_comm.h" |
| #include "src/slurmctld/trigger_mgr.h" |
| |
| static void _acct_restore_active_jobs(void); |
| static int _build_bitmaps(void); |
| static void _build_bitmaps_pre_select(void); |
| static void _gres_reconfig(bool reconfig); |
| static int _init_all_slurm_conf(void); |
| static int _preserve_select_type_param(slurm_ctl_conf_t * ctl_conf_ptr, |
| uint16_t old_select_type_p); |
| static int _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr, |
| char *old_auth_type, char *old_checkpoint_type, |
| char *old_crypto_type, char *old_sched_type, |
| char *old_select_type, char *old_switch_type); |
| static void _purge_old_node_state(struct node_record *old_node_table_ptr, |
| int old_node_record_count); |
| static void _purge_old_part_state(List old_part_list, char *old_def_part_name); |
| static int _restore_job_dependencies(void); |
| static int _restore_node_state(int recover, |
| struct node_record *old_node_table_ptr, |
| int old_node_record_count); |
| static int _restore_part_state(List old_part_list, char *old_def_part_name, |
| uint16_t flags); |
| static int _strcmp(const char *s1, const char *s2); |
| static int _sync_nodes_to_comp_job(void); |
| static int _sync_nodes_to_jobs(void); |
| static int _sync_nodes_to_active_job(struct job_record *job_ptr); |
| static void _sync_nodes_to_suspended_job(struct job_record *job_ptr); |
| static void _sync_part_prio(void); |
| static int _update_preempt(uint16_t old_enable_preempt); |
| #ifdef HAVE_ELAN |
| static void _validate_node_proc_count(void); |
| #endif |
| |
| /* |
| * _reorder_nodes_by_name - order node table in ascending order of name |
| */ |
| static void _reorder_nodes_by_name(void) |
| { |
| struct node_record *node_ptr, *node_ptr2; |
| int i, j, min_inx; |
| |
| 	/* Selection sort of the node records, ordered by name */ |
| for (i = 0; i < node_record_count; i++) { |
| min_inx = i; |
| for (j = i + 1; j < node_record_count; j++) { |
| if (strnatcmp(node_record_table_ptr[j].name, |
| node_record_table_ptr[min_inx].name) < 0) |
| min_inx = j; |
| } |
| |
| if (min_inx != i) { /* swap records */ |
| struct node_record node_record_tmp; |
| |
| j = sizeof(struct node_record); |
| node_ptr = node_record_table_ptr + i; |
| node_ptr2 = node_record_table_ptr + min_inx; |
| |
| memcpy(&node_record_tmp, node_ptr, j); |
| memcpy(node_ptr, node_ptr2, j); |
| memcpy(node_ptr2, &node_record_tmp, j); |
| } |
| } |
| |
| #if _DEBUG |
| /* Log the results */ |
| for (i=0, node_ptr = node_record_table_ptr; i < node_record_count; |
| i++, node_ptr++) { |
| info("node_rank[%d]: %s", i, node_ptr->name); |
| } |
| #endif |
| } |
| |
| /* |
| * _reorder_nodes_by_rank - order node table in ascending order of node_rank |
| * This depends on the TopologyPlugin and/or SelectPlugin, which may generate |
| * such a ranking. |
| */ |
| static void _reorder_nodes_by_rank(void) |
| { |
| struct node_record *node_ptr, *node_ptr2; |
| int i, j, min_inx; |
| uint32_t min_val; |
| |
| 	/* Selection sort of the node records, ordered by node_rank */ |
| for (i = 0; i < node_record_count; i++) { |
| min_val = node_record_table_ptr[i].node_rank; |
| min_inx = i; |
| for (j = i + 1; j < node_record_count; j++) { |
| if (node_record_table_ptr[j].node_rank < min_val) { |
| min_val = node_record_table_ptr[j].node_rank; |
| min_inx = j; |
| } |
| } |
| |
| if (min_inx != i) { /* swap records */ |
| struct node_record node_record_tmp; |
| |
| j = sizeof(struct node_record); |
| node_ptr = node_record_table_ptr + i; |
| node_ptr2 = node_record_table_ptr + min_inx; |
| |
| memcpy(&node_record_tmp, node_ptr, j); |
| memcpy(node_ptr, node_ptr2, j); |
| memcpy(node_ptr2, &node_record_tmp, j); |
| } |
| } |
| |
| #if _DEBUG |
| /* Log the results */ |
| for (i=0, node_ptr = node_record_table_ptr; i < node_record_count; |
| i++, node_ptr++) { |
| info("node_rank[%u]: %s", node_ptr->node_rank, node_ptr->name); |
| } |
| #endif |
| } |
| |
| |
| /* |
| * _build_bitmaps_pre_select - rebuild partition node bitmaps, partition |
| *	node/cpu totals and node-to-partition pointers prior to calling |
| *	the select_* functions |
| */ |
| static void _build_bitmaps_pre_select(void) |
| { |
| struct part_record *part_ptr; |
| struct node_record *node_ptr; |
| ListIterator part_iterator; |
| int i; |
| |
| /* scan partition table and identify nodes in each */ |
| part_iterator = list_iterator_create(part_list); |
| if (part_iterator == NULL) |
| fatal ("memory allocation failure"); |
| |
| while ((part_ptr = (struct part_record *) list_next(part_iterator))) { |
| FREE_NULL_BITMAP(part_ptr->node_bitmap); |
| |
| if ((part_ptr->nodes == NULL) || (part_ptr->nodes[0] == '\0')) { |
| /* Partitions need a bitmap, even if empty */ |
| part_ptr->node_bitmap = bit_alloc(node_record_count); |
| continue; |
| } |
| |
| if (node_name2bitmap(part_ptr->nodes, false, |
| &part_ptr->node_bitmap)) { |
| fatal("Invalid node names in partition %s", |
| part_ptr->name); |
| } |
| |
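| 		/* Accumulate this partition's node and cpu totals and |
| 		 * link each of its nodes back to the partition record */ |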
| for (i=0; i<node_record_count; i++) { |
| if (bit_test(part_ptr->node_bitmap, i) == 0) |
| continue; |
| node_ptr = &node_record_table_ptr[i]; |
| part_ptr->total_nodes++; |
| if (slurmctld_conf.fast_schedule) |
| part_ptr->total_cpus += |
| node_ptr->config_ptr->cpus; |
| else |
| part_ptr->total_cpus += node_ptr->cpus; |
| node_ptr->part_cnt++; |
| xrealloc(node_ptr->part_pptr, (node_ptr->part_cnt * |
| sizeof(struct part_record *))); |
| node_ptr->part_pptr[node_ptr->part_cnt-1] = part_ptr; |
| } |
| } |
| list_iterator_destroy(part_iterator); |
| return; |
| } |
| |
| /* |
| * _build_bitmaps - build node bitmaps to define which nodes are in which |
| *	1) configuration record 2) up state 3) idle state, plus the |
| *	completing, available, power-save and sharable node bitmaps; |
| *	also rebuilds the feature list of each configuration record |
| * RET 0 if no error, errno otherwise |
| * Note: Operates on common variables, no arguments |
| *	node_record_count - number of nodes in the system |
| *	node_record_table_ptr - pointer to global node table |
| *	config_list - pointer to global configuration list |
| *	job_list - pointer to global job list |
| */ |
| static int _build_bitmaps(void) |
| { |
| int i, error_code = SLURM_SUCCESS; |
| ListIterator config_iterator; |
| struct config_record *config_ptr; |
| struct job_record *job_ptr; |
| struct node_record *node_ptr; |
| ListIterator job_iterator; |
| |
| last_node_update = time(NULL); |
| last_part_update = time(NULL); |
| |
| /* initialize the idle and up bitmaps */ |
| FREE_NULL_BITMAP(avail_node_bitmap); |
| FREE_NULL_BITMAP(cg_node_bitmap); |
| FREE_NULL_BITMAP(idle_node_bitmap); |
| FREE_NULL_BITMAP(power_node_bitmap); |
| FREE_NULL_BITMAP(share_node_bitmap); |
| FREE_NULL_BITMAP(up_node_bitmap); |
| avail_node_bitmap = (bitstr_t *) bit_alloc(node_record_count); |
| cg_node_bitmap = (bitstr_t *) bit_alloc(node_record_count); |
| idle_node_bitmap = (bitstr_t *) bit_alloc(node_record_count); |
| power_node_bitmap = (bitstr_t *) bit_alloc(node_record_count); |
| share_node_bitmap = (bitstr_t *) bit_alloc(node_record_count); |
| up_node_bitmap = (bitstr_t *) bit_alloc(node_record_count); |
| if ((avail_node_bitmap == NULL) || |
| (cg_node_bitmap == NULL) || |
| (idle_node_bitmap == NULL) || |
| (power_node_bitmap == NULL) || |
| (share_node_bitmap == NULL) || |
| (up_node_bitmap == NULL)) |
| fatal ("bit_alloc malloc failure"); |
| |
| /* initialize the configuration bitmaps */ |
| config_iterator = list_iterator_create(config_list); |
| if (config_iterator == NULL) |
| fatal ("memory allocation failure"); |
| while ((config_ptr = (struct config_record *) |
| list_next(config_iterator))) { |
| FREE_NULL_BITMAP(config_ptr->node_bitmap); |
| config_ptr->node_bitmap = |
| (bitstr_t *) bit_alloc(node_record_count); |
| if (config_ptr->node_bitmap == NULL) |
| fatal ("bit_alloc malloc failure"); |
| } |
| list_iterator_destroy(config_iterator); |
| |
| /* Set all bits, all nodes initially available for sharing */ |
| bit_nset(share_node_bitmap, 0, (node_record_count-1)); |
| |
| /* identify all nodes non-sharable due to non-sharing jobs */ |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| bitstr_t *tmp_bits; |
| if (!IS_JOB_RUNNING(job_ptr) || |
| (job_ptr->node_bitmap == NULL) || |
| (job_ptr->details == NULL) || |
| (job_ptr->details->shared != 0)) |
| continue; |
| tmp_bits = bit_copy(job_ptr->node_bitmap); |
| if (tmp_bits == NULL) |
| fatal ("bit_copy malloc failure"); |
| bit_not(tmp_bits); |
| bit_and(share_node_bitmap, tmp_bits); |
| FREE_NULL_BITMAP(tmp_bits); |
| } |
| list_iterator_destroy(job_iterator); |
| |
| /* scan all nodes and identify which are up, idle and |
| * their configuration, resync DRAINED vs. DRAINING state */ |
| for (i=0, node_ptr=node_record_table_ptr; |
| i<node_record_count; i++, node_ptr++) { |
| uint16_t drain_flag, job_cnt; |
| |
| if (node_ptr->name[0] == '\0') |
| continue; /* defunct */ |
| drain_flag = IS_NODE_DRAIN(node_ptr) | |
| IS_NODE_FAIL(node_ptr); |
| job_cnt = node_ptr->run_job_cnt + node_ptr->comp_job_cnt; |
| |
| if ((IS_NODE_IDLE(node_ptr) && (job_cnt == 0)) || |
| IS_NODE_DOWN(node_ptr)) |
| bit_set(idle_node_bitmap, i); |
| if (IS_NODE_COMPLETING(node_ptr)) |
| bit_set(cg_node_bitmap, i); |
| if (IS_NODE_IDLE(node_ptr) || IS_NODE_ALLOCATED(node_ptr)) { |
| if ((drain_flag == 0) && |
| (!IS_NODE_NO_RESPOND(node_ptr))) |
| bit_set(avail_node_bitmap, i); |
| bit_set(up_node_bitmap, i); |
| } |
| if (IS_NODE_POWER_SAVE(node_ptr)) |
| bit_set(power_node_bitmap, i); |
| if (node_ptr->config_ptr) |
| bit_set(node_ptr->config_ptr->node_bitmap, i); |
| } |
| |
| config_iterator = list_iterator_create(config_list); |
| if (config_iterator == NULL) |
| fatal ("memory allocation failure"); |
| while ((config_ptr = (struct config_record *) |
| list_next(config_iterator))) { |
| build_config_feature_list(config_ptr); |
| } |
| list_iterator_destroy(config_iterator); |
| |
| return error_code; |
| } |
| |
| |
| /* |
| * _init_all_slurm_conf - initialize or re-initialize the slurm |
| * configuration values. |
| * RET 0 if no error, otherwise an error code. |
| * NOTE: We leave the job table intact |
| * NOTE: Operates on common variables, no arguments |
| */ |
| static int _init_all_slurm_conf(void) |
| { |
| int error_code; |
| char *conf_name = xstrdup(slurmctld_conf.slurm_conf); |
| |
| slurm_conf_reinit(conf_name); |
| xfree(conf_name); |
| |
| if ((error_code = init_node_conf())) |
| return error_code; |
| |
| if ((error_code = init_part_conf())) |
| return error_code; |
| |
| if ((error_code = init_job_conf())) |
| return error_code; |
| |
| return 0; |
| } |
| |
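| /* |
| * _handle_downnodes_line - apply one DownNodes= line from slurm.conf, |
| *	setting the node state and, if given, the reason information |
| *	for each node named on the line |
| * RET 0 if no error, errno otherwise |
| */ |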
| static int _handle_downnodes_line(slurm_conf_downnodes_t *down) |
| { |
| int error_code = 0; |
| struct node_record *node_rec = NULL; |
| hostlist_t alias_list = NULL; |
| char *alias = NULL; |
| int state_val = NODE_STATE_DOWN; |
| |
| if (down->state != NULL) { |
| state_val = state_str2int(down->state, down->nodenames); |
| if (state_val == NO_VAL) { |
| error("Invalid State \"%s\"", down->state); |
| goto cleanup; |
| } |
| } |
| |
| if ((alias_list = hostlist_create(down->nodenames)) == NULL) { |
| error("Unable to create NodeName list from %s", |
| down->nodenames); |
| error_code = errno; |
| goto cleanup; |
| } |
| |
| while ((alias = hostlist_shift(alias_list))) { |
| node_rec = find_node_record(alias); |
| if (node_rec == NULL) { |
| error("DownNode \"%s\" does not exist!", alias); |
| free(alias); |
| continue; |
| } |
| |
| if ((state_val != NO_VAL) && |
| (state_val != NODE_STATE_UNKNOWN)) |
| node_rec->node_state = state_val; |
| if (down->reason) { |
| xfree(node_rec->reason); |
| node_rec->reason = xstrdup(down->reason); |
| node_rec->reason_time = time(NULL); |
| node_rec->reason_uid = getuid(); |
| } |
| free(alias); |
| } |
| |
| cleanup: |
| if (alias_list) |
| hostlist_destroy(alias_list); |
| return error_code; |
| } |
| |
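| /* _handle_all_downnodes - process every DownNodes= line from slurm.conf */ |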
| static void _handle_all_downnodes(void) |
| { |
| slurm_conf_downnodes_t *ptr, **ptr_array; |
| int count; |
| int i; |
| |
| count = slurm_conf_downnodes_array(&ptr_array); |
| if (count == 0) { |
| debug("No DownNodes"); |
| return; |
| } |
| |
| for (i = 0; i < count; i++) { |
| ptr = ptr_array[i]; |
| |
| _handle_downnodes_line(ptr); |
| } |
| } |
| |
| /* |
| * _build_all_nodeline_info - get an array of slurm_conf_node_t structures |
| * from the slurm.conf reader, build table, and set values |
| * RET 0 if no error, error code otherwise |
| * Note: Operates on common variables |
| * default_node_record - default node configuration values |
| */ |
| static int _build_all_nodeline_info(void) |
| { |
| int rc; |
| |
| /* Load the node table here */ |
| rc = build_all_nodeline_info(false); |
| rc = MAX(build_all_frontend_info(false), rc); |
| |
| /* Now perform operations on the node table as needed by slurmctld */ |
| #ifdef HAVE_BG |
| { |
| slurm_ctl_conf_t *conf = slurm_conf_lock(); |
| char *node_000 = NULL; |
| struct node_record *node_rec = NULL; |
| if (conf->node_prefix) |
| node_000 = xstrdup(conf->node_prefix); |
| slurm_conf_unlock(); |
| #if (SYSTEM_DIMENSIONS == 3) |
| xstrcat(node_000, "000"); |
| #endif |
| #if (SYSTEM_DIMENSIONS == 4) |
| xstrcat(node_000, "0000"); |
| #endif |
| #if (SYSTEM_DIMENSIONS == 5) |
| xstrcat(node_000, "00000"); |
| #endif |
| node_rec = find_node_record(node_000); |
| if (node_rec == NULL) |
| info("WARNING: No node %s configured", node_000); |
| xfree(node_000); |
| } |
| #endif /* HAVE_BG */ |
| |
| return rc; |
| } |
| |
| /* |
| * _build_single_partitionline_info - build a part_record from one |
| *	slurm_conf_partition_t structure supplied by the slurm.conf reader |
| * RET 0 if no error, error code otherwise |
| * Note: Operates on common variables |
| * global: part_list - global partition list pointer |
| * default_part - default parameters for a partition |
| */ |
| static int _build_single_partitionline_info(slurm_conf_partition_t *part) |
| { |
| struct part_record *part_ptr; |
| |
| part_ptr = list_find_first(part_list, &list_find_part, part->name); |
| if (part_ptr == NULL) { |
| part_ptr = create_part_record(); |
| xfree(part_ptr->name); |
| part_ptr->name = xstrdup(part->name); |
| } else { |
| /* FIXME - maybe should be fatal? */ |
| error("_parse_part_spec: duplicate entry for partition %s, " |
| "ignoring", part->name); |
| return EEXIST; |
| } |
| |
| if (part->default_flag) { |
| if (default_part_name && |
| strcmp(default_part_name, part->name)) { |
| info("_parse_part_spec: changing default partition " |
| "from %s to %s", default_part_name, part->name); |
| default_part_loc->flags &= (~PART_FLAG_DEFAULT); |
| } |
| xfree(default_part_name); |
| default_part_name = xstrdup(part->name); |
| default_part_loc = part_ptr; |
| part_ptr->flags |= PART_FLAG_DEFAULT; |
| } |
| |
| if (part->preempt_mode != (uint16_t) NO_VAL) |
| part_ptr->preempt_mode = part->preempt_mode; |
| |
| if (part->disable_root_jobs == (uint16_t)NO_VAL) { |
| if (slurmctld_conf.disable_root_jobs) |
| part_ptr->flags |= PART_FLAG_NO_ROOT; |
| } else if (part->disable_root_jobs) { |
| part_ptr->flags |= PART_FLAG_NO_ROOT; |
| } else { |
| part_ptr->flags &= (~PART_FLAG_NO_ROOT); |
| } |
| |
| if (part_ptr->flags & PART_FLAG_NO_ROOT) |
| debug2("partition %s does not allow root jobs", part_ptr->name); |
| |
| if ((part->default_time != NO_VAL) && |
| (part->default_time > part->max_time)) { |
| info("partition %s DefaultTime exceeds MaxTime (%u > %u)", |
| part->name, part->default_time, part->max_time); |
| part->default_time = NO_VAL; |
| } |
| |
| if (part->hidden_flag) |
| part_ptr->flags |= PART_FLAG_HIDDEN; |
| if (part->root_only_flag) |
| part_ptr->flags |= PART_FLAG_ROOT_ONLY; |
| if (part->req_resv_flag) |
| part_ptr->flags |= PART_FLAG_REQ_RESV; |
| part_ptr->max_time = part->max_time; |
| part_ptr->def_mem_per_cpu = part->def_mem_per_cpu; |
| part_ptr->default_time = part->default_time; |
| part_ptr->max_share = part->max_share; |
| part_ptr->max_mem_per_cpu = part->max_mem_per_cpu; |
| part_ptr->max_nodes = part->max_nodes; |
| part_ptr->max_nodes_orig = part->max_nodes; |
| part_ptr->min_nodes = part->min_nodes; |
| part_ptr->min_nodes_orig = part->min_nodes; |
| part_ptr->preempt_mode = part->preempt_mode; |
| part_ptr->priority = part->priority; |
| part_ptr->state_up = part->state_up; |
| part_ptr->grace_time = part->grace_time; |
| |
| if (part->allow_groups) { |
| xfree(part_ptr->allow_groups); |
| part_ptr->allow_groups = xstrdup(part->allow_groups); |
| } |
| if (part->allow_alloc_nodes) { |
| if (part_ptr->allow_alloc_nodes) { |
| int cnt_tot, cnt_uniq; |
| hostlist_t hl = hostlist_create(part_ptr-> |
| allow_alloc_nodes); |
| |
| hostlist_push(hl, part->allow_alloc_nodes); |
| cnt_tot = hostlist_count(hl); |
| hostlist_uniq(hl); |
| cnt_uniq = hostlist_count(hl); |
| if (cnt_tot != cnt_uniq) { |
| fatal("Duplicate Allowed Allocating Nodes for " |
| "Partition %s", part->name); |
| } |
| xfree(part_ptr->allow_alloc_nodes); |
| part_ptr->allow_alloc_nodes = |
| hostlist_ranged_string_xmalloc(hl); |
| hostlist_destroy(hl); |
| } else { |
| part_ptr->allow_alloc_nodes = |
| xstrdup(part->allow_alloc_nodes); |
| } |
| } |
| if (part->alternate) { |
| xfree(part_ptr->alternate); |
| part_ptr->alternate = xstrdup(part->alternate); |
| } |
| if (part->nodes) { |
| if (part_ptr->nodes) { |
| int cnt_tot, cnt_uniq; |
| hostlist_t hl = hostlist_create(part_ptr->nodes); |
| |
| hostlist_push(hl, part->nodes); |
| cnt_tot = hostlist_count(hl); |
| hostlist_uniq(hl); |
| cnt_uniq = hostlist_count(hl); |
| if (cnt_tot != cnt_uniq) { |
| fatal("Duplicate Nodes for Partition %s", |
| part->name); |
| } |
| xfree(part_ptr->nodes); |
| part_ptr->nodes = hostlist_ranged_string_xmalloc(hl); |
| hostlist_destroy(hl); |
| } else { |
| part_ptr->nodes = xstrdup(part->nodes); |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * _build_all_partitionline_info - get an array of slurm_conf_partition_t |
| * structures from the slurm.conf reader, build table, and set values |
| * RET 0 if no error, error code otherwise |
| * Note: Operates on common variables |
| * global: part_list - global partition list pointer |
| * default_part - default parameters for a partition |
| */ |
| static int _build_all_partitionline_info(void) |
| { |
| slurm_conf_partition_t **ptr_array; |
| int count; |
| int i; |
| |
| count = slurm_conf_partition_array(&ptr_array); |
| if (count == 0) |
| fatal("No PartitionName information available!"); |
| |
| for (i = 0; i < count; i++) |
| _build_single_partitionline_info(ptr_array[i]); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* _sync_part_prio - Set normalized partition priorities */ |
| static void _sync_part_prio(void) |
| { |
| ListIterator itr = NULL; |
| struct part_record *part_ptr = NULL; |
| |
| part_max_priority = 0; |
| itr = list_iterator_create(part_list); |
| if (itr == NULL) |
| fatal("list_iterator_create malloc failure"); |
| while ((part_ptr = list_next(itr))) { |
| if (part_ptr->priority > part_max_priority) |
| part_max_priority = part_ptr->priority; |
| } |
| list_iterator_destroy(itr); |
| |
| if (part_max_priority) { |
| itr = list_iterator_create(part_list); |
| if (itr == NULL) |
| fatal("list_iterator_create malloc failure"); |
| while ((part_ptr = list_next(itr))) { |
| part_ptr->norm_priority = (double)part_ptr->priority / |
| (double)part_max_priority; |
| } |
| list_iterator_destroy(itr); |
| } |
| } |
| |
| /* |
| * read_slurm_conf - load the slurm configuration from the configured file. |
| * read_slurm_conf can be called more than once if so desired. |
| * IN recover - replace job, node and/or partition data with latest |
| * available information depending upon value |
| * 0 = use no saved state information, rebuild everything from |
| * slurm.conf contents |
| * 1 = recover saved job and trigger state, |
| * node DOWN/DRAIN/FAIL state and reason information |
| * 2 = recover all saved state |
| * IN reconfig - true if SIGHUP or "scontrol reconfig" and there is state in |
| * memory to preserve, otherwise recover state from disk |
| * RET SLURM_SUCCESS if no error, otherwise an error code |
| * Note: Operates on common variables only |
| */ |
| int read_slurm_conf(int recover, bool reconfig) |
| { |
| DEF_TIMERS; |
| int error_code, i, rc, load_job_ret = SLURM_SUCCESS; |
| int old_node_record_count = 0; |
| struct node_record *old_node_table_ptr = NULL, *node_ptr; |
| bool do_reorder_nodes = false; |
| List old_part_list = NULL; |
| char *old_def_part_name = NULL; |
| char *old_auth_type = xstrdup(slurmctld_conf.authtype); |
| uint16_t old_preempt_mode = slurmctld_conf.preempt_mode; |
| char *old_checkpoint_type = xstrdup(slurmctld_conf.checkpoint_type); |
| char *old_crypto_type = xstrdup(slurmctld_conf.crypto_type); |
| char *old_preempt_type = xstrdup(slurmctld_conf.preempt_type); |
| char *old_sched_type = xstrdup(slurmctld_conf.schedtype); |
| char *old_select_type = xstrdup(slurmctld_conf.select_type); |
| char *old_switch_type = xstrdup(slurmctld_conf.switch_type); |
| char *state_save_dir = xstrdup(slurmctld_conf.state_save_location); |
| char *mpi_params; |
| uint16_t old_select_type_p = slurmctld_conf.select_type_param; |
| |
| /* initialization */ |
| START_TIMER; |
| |
| if (reconfig) { |
| /* in order to re-use job state information, |
| * update nodes_completing string (based on node_bitmap) */ |
| update_job_nodes_completing(); |
| |
| /* save node and partition states for reconfig RPC */ |
| old_node_record_count = node_record_count; |
| old_node_table_ptr = node_record_table_ptr; |
| for (i=0, node_ptr=old_node_table_ptr; i<node_record_count; |
| i++, node_ptr++) { |
| xfree(node_ptr->features); |
| node_ptr->features = xstrdup( |
| node_ptr->config_ptr->feature); |
| /* Store the original configured CPU count somewhere |
| * (port is reused here for that purpose) so we can |
| * report changes in its configuration. */ |
| node_ptr->port = node_ptr->config_ptr->cpus; |
| node_ptr->weight = node_ptr->config_ptr->weight; |
| } |
| node_record_table_ptr = NULL; |
| node_record_count = 0; |
| old_part_list = part_list; |
| part_list = NULL; |
| old_def_part_name = default_part_name; |
| default_part_name = NULL; |
| } |
| |
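| 	/* On failure, restore the saved node table and partition list so |
| 	 * slurmctld can continue to operate with its previous state */ |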
| if ((error_code = _init_all_slurm_conf())) { |
| node_record_table_ptr = old_node_table_ptr; |
| part_list = old_part_list; |
| return error_code; |
| } |
| |
| if (slurm_topo_init() != SLURM_SUCCESS) |
| fatal("Failed to initialize topology plugin"); |
| |
| /* Build node and partition information based upon slurm.conf file */ |
| _build_all_nodeline_info(); |
| _handle_all_downnodes(); |
| _build_all_partitionline_info(); |
| if (!reconfig) |
| restore_front_end_state(recover); |
| |
| update_logging(); |
| g_slurm_jobcomp_init(slurmctld_conf.job_comp_loc); |
| if (slurm_sched_init() != SLURM_SUCCESS) |
| fatal("Failed to initialize sched plugin"); |
| if (!reconfig && (old_preempt_mode & PREEMPT_MODE_GANG) && |
| (gs_init() != SLURM_SUCCESS)) { |
| /* gs_init() must immediately follow slurm_sched_init() */ |
| fatal("Failed to initialize gang scheduler"); |
| } |
| if (switch_init() != SLURM_SUCCESS) |
| fatal("Failed to initialize switch plugin"); |
| |
| if (default_part_loc == NULL) |
| error("read_slurm_conf: default partition not set."); |
| |
| if (node_record_count < 1) { |
| error("read_slurm_conf: no nodes configured."); |
| _purge_old_node_state(old_node_table_ptr, |
| old_node_record_count); |
| _purge_old_part_state(old_part_list, old_def_part_name); |
| return EINVAL; |
| } |
| |
| /* |
| * Node reordering needs to be done by the topology and/or select |
| * plugin. Reordering the table must be done before hashing the |
| * nodes, and before any position-relative bitmaps are created. |
| */ |
| do_reorder_nodes |= slurm_topo_generate_node_ranking(); |
| do_reorder_nodes |= select_g_node_ranking(node_record_table_ptr, |
| node_record_count); |
| if (do_reorder_nodes) |
| _reorder_nodes_by_rank(); |
| else |
| _reorder_nodes_by_name(); |
| slurm_topo_build_config(); |
| |
| rehash_node(); |
| rehash_jobs(); |
| set_slurmd_addr(); |
| |
| if (reconfig) { /* Preserve state from memory */ |
| if (old_node_table_ptr) { |
| info("restoring original state of nodes"); |
| rc = _restore_node_state(recover, old_node_table_ptr, |
| old_node_record_count); |
| error_code = MAX(error_code, rc); /* not fatal */ |
| } |
| if (old_part_list && ((recover > 1) || |
| (slurmctld_conf.reconfig_flags & RECONFIG_KEEP_PART_INFO))) { |
| info("restoring original partition state"); |
| rc = _restore_part_state(old_part_list, |
| old_def_part_name, |
| slurmctld_conf.reconfig_flags); |
| error_code = MAX(error_code, rc); /* not fatal */ |
| } else if (old_part_list && (slurmctld_conf.reconfig_flags & |
| RECONFIG_KEEP_PART_STAT)) { |
| info("restoring original partition state only (up/down)"); |
| rc = _restore_part_state(old_part_list, |
| old_def_part_name, |
| slurmctld_conf.reconfig_flags); |
| error_code = MAX(error_code, rc); /* not fatal */ |
| } |
| load_last_job_id(); |
| reset_first_job_id(); |
| (void) slurm_sched_reconfig(); |
| } else if (recover == 0) { /* Build everything from slurm.conf */ |
| load_last_job_id(); |
| reset_first_job_id(); |
| (void) slurm_sched_reconfig(); |
| } else if (recover == 1) { /* Load job & node state files */ |
| (void) load_all_node_state(true); |
| (void) load_all_front_end_state(true); |
| load_job_ret = load_all_job_state(); |
| sync_job_priorities(); |
| } else if (recover > 1) { /* Load node, part & job state files */ |
| (void) load_all_node_state(false); |
| (void) load_all_front_end_state(false); |
| (void) load_all_part_state(); |
| load_job_ret = load_all_job_state(); |
| sync_job_priorities(); |
| } |
| |
| _sync_part_prio(); |
| _build_bitmaps_pre_select(); |
| if ((select_g_node_init(node_record_table_ptr, node_record_count) |
| != SLURM_SUCCESS) || |
| (select_g_block_init(part_list) != SLURM_SUCCESS) || |
| (select_g_state_restore(state_save_dir) != SLURM_SUCCESS) || |
| (select_g_job_init(job_list) != SLURM_SUCCESS)) { |
| fatal("failed to initialize node selection plugin state, " |
| "Clean start required."); |
| } |
| xfree(state_save_dir); |
| _gres_reconfig(reconfig); |
| reset_job_bitmaps(); /* must follow select_g_job_init() */ |
| |
| (void) _sync_nodes_to_jobs(); |
| (void) sync_job_files(); |
| _purge_old_node_state(old_node_table_ptr, old_node_record_count); |
| _purge_old_part_state(old_part_list, old_def_part_name); |
| |
| if ((rc = _build_bitmaps())) |
| fatal("_build_bitmaps failure"); |
| mpi_params = slurm_get_mpi_params(); |
| reserve_port_config(mpi_params); |
| xfree(mpi_params); |
| |
| license_free(); |
| if (license_init(slurmctld_conf.licenses) != SLURM_SUCCESS) |
| fatal("Invalid Licenses value: %s", slurmctld_conf.licenses); |
| |
| /* NOTE: Run restore_node_features before _restore_job_dependencies */ |
| restore_node_features(recover); |
| _restore_job_dependencies(); |
| #ifdef HAVE_ELAN |
| _validate_node_proc_count(); |
| #endif |
| (void) _sync_nodes_to_comp_job();/* must follow select_g_node_init() */ |
| load_part_uid_allow_list(1); |
| |
| if (reconfig) { |
| load_all_resv_state(0); |
| } else { |
| load_all_resv_state(recover); |
| if (recover >= 1) { |
| (void) trigger_state_restore(); |
| (void) slurm_sched_reconfig(); |
| } |
| } |
| |
| /* sort config_list by weight for scheduling */ |
| list_sort(config_list, &list_compare_config); |
| |
| 	/* Update plugins where possible */ |
| rc = _preserve_plugins(&slurmctld_conf, |
| old_auth_type, old_checkpoint_type, |
| old_crypto_type, old_sched_type, |
| old_select_type, old_switch_type); |
| error_code = MAX(error_code, rc); /* not fatal */ |
| |
| if (strcmp(old_preempt_type, slurmctld_conf.preempt_type)) { |
| info("Changing PreemptType from %s to %s", |
| old_preempt_type, slurmctld_conf.preempt_type); |
| (void) slurm_preempt_fini(); |
| if (slurm_preempt_init() != SLURM_SUCCESS) |
| fatal( "failed to initialize preempt plugin" ); |
| } |
| xfree(old_preempt_type); |
| rc = _update_preempt(old_preempt_mode); |
| error_code = MAX(error_code, rc); /* not fatal */ |
| |
| 	/* Update plugin parameters where possible */ |
| rc = job_submit_plugin_reconfig(); |
| error_code = MAX(error_code, rc); /* not fatal */ |
| rc = switch_g_reconfig(); |
| error_code = MAX(error_code, rc); /* not fatal */ |
| rc = _preserve_select_type_param(&slurmctld_conf, old_select_type_p); |
| error_code = MAX(error_code, rc); /* not fatal */ |
| |
| /* Restore job accounting info if file missing or corrupted, |
| * an extremely rare situation */ |
| if (load_job_ret) |
| _acct_restore_active_jobs(); |
| |
| /* Sync select plugin with synchronized job/node/part data */ |
| select_g_reconfigure(); |
| |
| slurmctld_conf.last_update = time(NULL); |
| END_TIMER2("read_slurm_conf"); |
| return error_code; |
| } |
| |
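| /* |
| * _gres_reconfig - Reload GRES state. On reconfiguration, let the GRES |
| *	plugin rescan its configuration. Otherwise build each node's GRES |
| *	state from the node's own GRES value if set, or else from the value |
| *	in its configuration record. |
| */ |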
| static void _gres_reconfig(bool reconfig) |
| { |
| struct node_record *node_ptr; |
| char *gres_name; |
| bool gres_changed; |
| int i; |
| |
| if (reconfig) { |
| gres_plugin_reconfig(&gres_changed); |
| } else { |
| for (i = 0, node_ptr = node_record_table_ptr; |
| i < node_record_count; i++, node_ptr++) { |
| if (node_ptr->gres) |
| gres_name = node_ptr->gres; |
| else |
| gres_name = node_ptr->config_ptr->gres; |
| gres_plugin_init_node_config(node_ptr->name, gres_name, |
| &node_ptr->gres_list); |
| } |
| } |
| } |
| |
| /* Restore node state and size information from saved records which match |
| * the node registration message. If a node was re-configured to be down or |
| * drained, we set those states. We only recover a node's Features if |
| * recover==2. */ |
| static int _restore_node_state(int recover, |
| struct node_record *old_node_table_ptr, |
| int old_node_record_count) |
| { |
| struct node_record *node_ptr, *old_node_ptr; |
| int i, rc = SLURM_SUCCESS; |
| hostset_t hs = NULL; |
| slurm_ctl_conf_t *conf = slurm_conf_lock(); |
| bool power_save_mode = false; |
| |
| if (conf->suspend_program && conf->resume_program) |
| power_save_mode = true; |
| slurm_conf_unlock(); |
| |
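| 	/* Flag every node as not responding; the flag is cleared below for |
| 	 * each node found in the old table, so any node still flagged was |
| 	 * newly added to the configuration */ |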
| for (i=0, node_ptr=node_record_table_ptr; i<node_record_count; |
| i++, node_ptr++) { |
| node_ptr->not_responding = true; |
| } |
| |
| for (i=0, old_node_ptr=old_node_table_ptr; i<old_node_record_count; |
| i++, old_node_ptr++) { |
| uint16_t drain_flag = false, down_flag = false; |
| node_ptr = find_node_record(old_node_ptr->name); |
| if (node_ptr == NULL) |
| continue; |
| |
| node_ptr->not_responding = false; |
| if (IS_NODE_DOWN(node_ptr)) |
| down_flag = true; |
| if (IS_NODE_DRAIN(node_ptr)) |
| drain_flag = true; |
| node_ptr->node_state = old_node_ptr->node_state; |
| if (down_flag) { |
| node_ptr->node_state &= NODE_STATE_FLAGS; |
| node_ptr->node_state |= NODE_STATE_DOWN; |
| } |
| if (drain_flag) |
| node_ptr->node_state |= NODE_STATE_DRAIN; |
| if ((!power_save_mode) && |
| (IS_NODE_POWER_SAVE(node_ptr) || |
| IS_NODE_POWER_UP(node_ptr))) { |
| node_ptr->node_state &= (~NODE_STATE_POWER_SAVE); |
| node_ptr->node_state &= (~NODE_STATE_POWER_UP); |
| if (hs) |
| hostset_insert(hs, node_ptr->name); |
| else |
| hs = hostset_create(node_ptr->name); |
| } |
| |
| if (IS_NODE_CLOUD(node_ptr) && !IS_NODE_POWER_SAVE(node_ptr)) { |
| /* Preserve NodeHostname + NodeAddr set by scontrol */ |
| xfree(node_ptr->comm_name); |
| node_ptr->comm_name = old_node_ptr->comm_name; |
| old_node_ptr->comm_name = NULL; |
| xfree(node_ptr->node_hostname); |
| node_ptr->node_hostname = old_node_ptr->node_hostname; |
| old_node_ptr->node_hostname = NULL; |
| slurm_reset_alias(node_ptr->name, node_ptr->comm_name, |
| node_ptr->node_hostname); |
| } |
| |
| node_ptr->last_response = old_node_ptr->last_response; |
| |
| #ifndef HAVE_BG |
| 		/* On a BlueGene system the cpu count never changes, |
| 		 * so this check is skipped there. */ |
| if (old_node_ptr->port != node_ptr->config_ptr->cpus) { |
| rc = ESLURM_NEED_RESTART; |
| error("Configured cpu count change on %s (%u to %u)", |
| node_ptr->name, old_node_ptr->port, |
| node_ptr->config_ptr->cpus); |
| } |
| #endif |
| node_ptr->boot_time = old_node_ptr->boot_time; |
| node_ptr->cpus = old_node_ptr->cpus; |
| node_ptr->cores = old_node_ptr->cores; |
| node_ptr->last_idle = old_node_ptr->last_idle; |
| node_ptr->boards = old_node_ptr->boards; |
| node_ptr->sockets = old_node_ptr->sockets; |
| node_ptr->threads = old_node_ptr->threads; |
| node_ptr->real_memory = old_node_ptr->real_memory; |
| node_ptr->slurmd_start_time = old_node_ptr->slurmd_start_time; |
| node_ptr->tmp_disk = old_node_ptr->tmp_disk; |
| node_ptr->weight = old_node_ptr->weight; |
| |
| node_ptr->sus_job_cnt = old_node_ptr->sus_job_cnt; |
| |
| if (node_ptr->gres_list) |
| list_destroy(node_ptr->gres_list); |
| node_ptr->gres_list = old_node_ptr->gres_list; |
| old_node_ptr->gres_list = NULL; |
| |
| if (node_ptr->reason == NULL) { |
| /* Recover only if not explicitly set in slurm.conf */ |
| node_ptr->reason = old_node_ptr->reason; |
| node_ptr->reason_time = old_node_ptr->reason_time; |
| old_node_ptr->reason = NULL; |
| } |
| if (recover == 2) { |
| 			/* NOTE: features in the node record are just a |
| 			 * placeholder for restore_node_features() to set |
| 			 * up new config records. */ |
| xfree(node_ptr->features); |
| node_ptr->features = old_node_ptr->features; |
| old_node_ptr->features = NULL; |
| xfree(node_ptr->gres); |
| node_ptr->gres = old_node_ptr->gres; |
| old_node_ptr->gres = NULL; |
| } |
| if (old_node_ptr->arch) { |
| xfree(node_ptr->arch); |
| node_ptr->arch = old_node_ptr->arch; |
| old_node_ptr->arch = NULL; |
| } |
| if (old_node_ptr->os) { |
| xfree(node_ptr->os); |
| node_ptr->os = old_node_ptr->os; |
| old_node_ptr->os = NULL; |
| } |
| } |
| |
| if (hs) { |
| char node_names[128]; |
| hostset_ranged_string(hs, sizeof(node_names), node_names); |
| info("Cleared POWER_SAVE flag from nodes %s", node_names); |
| hostset_destroy(hs); |
| hs = NULL; |
| } |
| |
| for (i=0, node_ptr=node_record_table_ptr; i<node_record_count; |
| i++, node_ptr++) { |
| if (!node_ptr->not_responding) |
| continue; |
| node_ptr->not_responding = false; |
| if (hs) |
| hostset_insert(hs, node_ptr->name); |
| else |
| hs = hostset_create(node_ptr->name); |
| } |
| if (hs) { |
| char node_names[128]; |
| hostset_ranged_string(hs, sizeof(node_names), node_names); |
| error("Nodes added to configuration (%s)", node_names); |
| error("Reboot of all slurm daemons is recommended"); |
| hostset_destroy(hs); |
| } |
| |
| return rc; |
| } |
| |
| /* Purge old node state information */ |
| static void _purge_old_node_state(struct node_record *old_node_table_ptr, |
| int old_node_record_count) |
| { |
| int i; |
| struct node_record *node_ptr; |
| |
| node_ptr = old_node_table_ptr; |
| for (i=0; i< old_node_record_count; i++, node_ptr++) |
| purge_node_rec(node_ptr); |
| |
| xfree(old_node_table_ptr); |
| } |
| |
| /* Variant of strcmp that will accept NULL string pointers */ |
| static int _strcmp(const char *s1, const char *s2) |
| { |
| if ((s1 != NULL) && (s2 == NULL)) |
| return 1; |
| if ((s1 == NULL) && (s2 == NULL)) |
| return 0; |
| if ((s1 == NULL) && (s2 != NULL)) |
| return -1; |
| return strcmp(s1, s2); |
| } |
| |
| /* Restore partition information from saved records */ |
| static int _restore_part_state(List old_part_list, char *old_def_part_name, |
| uint16_t flags) |
| { |
| int rc = SLURM_SUCCESS; |
| ListIterator part_iterator; |
| struct part_record *old_part_ptr, *part_ptr; |
| |
| if (!old_part_list) |
| return rc; |
| |
| /* For each part in list, find and update recs */ |
| part_iterator = list_iterator_create(old_part_list); |
| if (!part_iterator) |
| fatal("list_iterator_create malloc"); |
| while ((old_part_ptr = (struct part_record *) |
| list_next(part_iterator))) { |
| xassert(old_part_ptr->magic == PART_MAGIC); |
| part_ptr = find_part_record(old_part_ptr->name); |
| if (part_ptr) { |
| if ( !(flags & RECONFIG_KEEP_PART_INFO) && |
| (flags & RECONFIG_KEEP_PART_STAT) ) { |
| if (part_ptr->state_up != old_part_ptr->state_up) { |
| info("Partition %s State differs from " |
| "slurm.conf", part_ptr->name); |
| part_ptr->state_up = old_part_ptr->state_up; |
| } |
| continue; |
| } |
| /* Current partition found in slurm.conf, |
| * report differences from slurm.conf configuration */ |
| if (_strcmp(part_ptr->allow_groups, |
| old_part_ptr->allow_groups)) { |
| error("Partition %s AllowGroups differs from " |
| "slurm.conf", part_ptr->name); |
| xfree(part_ptr->allow_groups); |
| part_ptr->allow_groups = xstrdup(old_part_ptr-> |
| allow_groups); |
| } |
| if (_strcmp(part_ptr->allow_alloc_nodes, |
| old_part_ptr->allow_alloc_nodes)) { |
| error("Partition %s AllowNodes differs from " |
| "slurm.conf", part_ptr->name); |
| xfree(part_ptr->allow_alloc_nodes); |
| part_ptr->allow_alloc_nodes = |
| xstrdup(old_part_ptr-> |
| allow_alloc_nodes); |
| } |
| if (part_ptr->default_time != |
| old_part_ptr->default_time) { |
| error("Partition %s DefaultTime differs from " |
| "slurm.conf", part_ptr->name); |
| part_ptr->default_time = old_part_ptr-> |
| default_time; |
| } |
| if ((part_ptr->flags & PART_FLAG_HIDDEN) != |
| (old_part_ptr->flags & PART_FLAG_HIDDEN)) { |
| error("Partition %s Hidden differs from " |
| "slurm.conf", part_ptr->name); |
| if (old_part_ptr->flags & PART_FLAG_HIDDEN) |
| part_ptr->flags |= PART_FLAG_HIDDEN; |
| else |
| part_ptr->flags &= (~PART_FLAG_HIDDEN); |
| } |
| if ((part_ptr->flags & PART_FLAG_NO_ROOT) != |
| (old_part_ptr->flags & PART_FLAG_NO_ROOT)) { |
| error("Partition %s DisableRootJobs differs " |
| "from slurm.conf", part_ptr->name); |
| if (old_part_ptr->flags & PART_FLAG_NO_ROOT) |
| part_ptr->flags |= PART_FLAG_NO_ROOT; |
| else |
| part_ptr->flags &= (~PART_FLAG_NO_ROOT); |
| } |
| if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) != |
| (old_part_ptr->flags & PART_FLAG_ROOT_ONLY)) { |
| error("Partition %s RootOnly differs from " |
| "slurm.conf", part_ptr->name); |
| if (old_part_ptr->flags & PART_FLAG_ROOT_ONLY) |
| part_ptr->flags |= PART_FLAG_ROOT_ONLY; |
| else |
| part_ptr->flags &= (~PART_FLAG_ROOT_ONLY); |
| } |
| if ((part_ptr->flags & PART_FLAG_REQ_RESV) != |
| (old_part_ptr->flags & PART_FLAG_REQ_RESV)) { |
| error("Partition %s ReqResv differs from " |
| "slurm.conf", part_ptr->name); |
| if (old_part_ptr->flags & PART_FLAG_REQ_RESV) |
| part_ptr->flags |= PART_FLAG_REQ_RESV; |
| else |
| part_ptr->flags &= (~PART_FLAG_REQ_RESV); |
| } |
| if (part_ptr->max_nodes_orig != |
| old_part_ptr->max_nodes_orig) { |
| error("Partition %s MaxNodes differs from " |
| "slurm.conf (%u != %u)", part_ptr->name, |
| part_ptr->max_nodes_orig, |
| old_part_ptr->max_nodes_orig); |
| part_ptr->max_nodes = old_part_ptr-> |
| max_nodes_orig; |
| part_ptr->max_nodes_orig = old_part_ptr-> |
| max_nodes_orig; |
| } |
| if (part_ptr->max_share != old_part_ptr->max_share) { |
| error("Partition %s Shared differs from " |
| "slurm.conf", part_ptr->name); |
| part_ptr->max_share = old_part_ptr->max_share; |
| } |
| if (part_ptr->max_time != old_part_ptr->max_time) { |
| error("Partition %s MaxTime differs from " |
| "slurm.conf", part_ptr->name); |
| part_ptr->max_time = old_part_ptr->max_time; |
| } |
| if (part_ptr->grace_time != old_part_ptr->grace_time) { |
| error("Partition %s GraceTime differs from " |
| "slurm.conf", part_ptr->name); |
| part_ptr->grace_time = old_part_ptr->grace_time; |
| } |
| if (part_ptr->min_nodes_orig != |
| old_part_ptr->min_nodes_orig) { |
| error("Partition %s MinNodes differs from " |
| "slurm.conf (%u != %u)", part_ptr->name, |
| part_ptr->min_nodes_orig, |
| old_part_ptr->min_nodes_orig); |
| part_ptr->min_nodes = old_part_ptr-> |
| min_nodes_orig; |
| part_ptr->min_nodes_orig = old_part_ptr-> |
| min_nodes_orig; |
| } |
| if (_strcmp(part_ptr->nodes, old_part_ptr->nodes)) { |
| error("Partition %s Nodes differs from " |
| "slurm.conf", part_ptr->name); |
| xfree(part_ptr->nodes); |
| part_ptr->nodes = xstrdup(old_part_ptr->nodes); |
| } |
| if (part_ptr->preempt_mode != |
| old_part_ptr->preempt_mode) { |
| error("Partition %s PreemptMode differs from " |
| "slurm.conf", part_ptr->name); |
| part_ptr->preempt_mode = old_part_ptr-> |
| preempt_mode; |
| } |
| if (part_ptr->priority != old_part_ptr->priority) { |
| error("Partition %s Priority differs from " |
| "slurm.conf", part_ptr->name); |
| part_ptr->priority = old_part_ptr->priority; |
| } |
| if (part_ptr->state_up != old_part_ptr->state_up) { |
| error("Partition %s State differs from " |
| "slurm.conf", part_ptr->name); |
| part_ptr->state_up = old_part_ptr->state_up; |
| } |
| } else { |
| if ( !(flags & RECONFIG_KEEP_PART_INFO) && |
| (flags & RECONFIG_KEEP_PART_STAT) ) { |
| info("Partition %s missing from slurm.conf, " |
| "not restoring it", old_part_ptr->name); |
| continue; |
| } |
| error("Partition %s missing from slurm.conf, " |
| "restoring it", old_part_ptr->name); |
| part_ptr = create_part_record(); |
| part_ptr->name = xstrdup(old_part_ptr->name); |
| part_ptr->allow_alloc_nodes = xstrdup(old_part_ptr-> |
| allow_alloc_nodes); |
| part_ptr->allow_groups = xstrdup(old_part_ptr-> |
| allow_groups); |
| part_ptr->default_time = old_part_ptr->default_time; |
| part_ptr->flags = old_part_ptr->flags; |
| part_ptr->max_nodes = old_part_ptr->max_nodes; |
| part_ptr->max_nodes_orig = old_part_ptr-> |
| max_nodes_orig; |
| part_ptr->max_share = old_part_ptr->max_share; |
| part_ptr->max_time = old_part_ptr->max_time; |
| part_ptr->grace_time = old_part_ptr->grace_time; |
| part_ptr->min_nodes = old_part_ptr->min_nodes; |
| part_ptr->min_nodes_orig = old_part_ptr-> |
| min_nodes_orig; |
| part_ptr->nodes = xstrdup(old_part_ptr->nodes); |
| part_ptr->priority = old_part_ptr->priority; |
| part_ptr->state_up = old_part_ptr->state_up; |
| } |
| } |
| list_iterator_destroy(part_iterator); |
| |
| if (old_def_part_name && |
| ((default_part_name == NULL) || |
| strcmp(old_def_part_name, default_part_name))) { |
| part_ptr = find_part_record(old_def_part_name); |
| if (part_ptr) { |
| error("Default partition reset to %s", |
| old_def_part_name); |
| default_part_loc = part_ptr; |
| xfree(default_part_name); |
| default_part_name = xstrdup(old_def_part_name); |
| } |
| } |
| |
| return rc; |
| } |
| |
| /* Purge old partition state information */ |
| static void _purge_old_part_state(List old_part_list, char *old_def_part_name) |
| { |
| xfree(old_def_part_name); |
| |
| if (!old_part_list) |
| return; |
| list_destroy(old_part_list); |
| } |
| |
| /* |
| * _preserve_select_type_param - preserve original plugin parameters. |
| * Daemons and/or commands must be restarted for some |
| * select plugin value changes to take effect. |
| * RET zero or error code |
| */ |
| static int _preserve_select_type_param(slurm_ctl_conf_t *ctl_conf_ptr, |
| uint16_t old_select_type_p) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| /* SelectTypeParameters cannot change */ |
| if (old_select_type_p) { |
| if (old_select_type_p != ctl_conf_ptr->select_type_param) { |
| ctl_conf_ptr->select_type_param = old_select_type_p; |
| rc = ESLURM_INVALID_SELECTTYPE_CHANGE; |
| } |
| } |
| return rc; |
| } |
| |
| /* Start or stop the gang scheduler module as needed based upon changes in |
| * configuration */ |
| static int _update_preempt(uint16_t old_preempt_mode) |
| { |
| uint16_t new_preempt_mode = slurm_get_preempt_mode(); |
| |
| if ((old_preempt_mode & PREEMPT_MODE_GANG) == |
| (new_preempt_mode & PREEMPT_MODE_GANG)) |
| return SLURM_SUCCESS; |
| |
| if (new_preempt_mode & PREEMPT_MODE_GANG) { |
| info("Enabling gang scheduling"); |
| return gs_init(); |
| } |
| |
| 	if (old_preempt_mode & PREEMPT_MODE_GANG) { |
| info("Disabling gang scheduling"); |
| gs_wake_jobs(); |
| return gs_fini(); |
| } |
| |
| error("Invalid gang scheduling mode change"); |
| return EINVAL; |
| } |
| |
| /* |
| * _preserve_plugins - preserve original plugin values over reconfiguration |
| *	as required. Daemons and/or commands must be restarted for some |
| * plugin value changes to take effect. |
| * RET zero or error code |
| */ |
| static int _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr, |
| char *old_auth_type, char *old_checkpoint_type, |
| char *old_crypto_type, char *old_sched_type, |
| char *old_select_type, char *old_switch_type) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| if (old_auth_type) { |
| if (strcmp(old_auth_type, ctl_conf_ptr->authtype)) { |
| xfree(ctl_conf_ptr->authtype); |
| ctl_conf_ptr->authtype = old_auth_type; |
| rc = ESLURM_INVALID_AUTHTYPE_CHANGE; |
| } else /* free duplicate value */ |
| xfree(old_auth_type); |
| } |
| |
| if (old_checkpoint_type) { |
| if (strcmp(old_checkpoint_type, |
| ctl_conf_ptr->checkpoint_type)) { |
| xfree(ctl_conf_ptr->checkpoint_type); |
| ctl_conf_ptr->checkpoint_type = old_checkpoint_type; |
| rc = ESLURM_INVALID_CHECKPOINT_TYPE_CHANGE; |
| } else /* free duplicate value */ |
| xfree(old_checkpoint_type); |
| } |
| |
| if (old_crypto_type) { |
| if (strcmp(old_crypto_type, |
| ctl_conf_ptr->crypto_type)) { |
| xfree(ctl_conf_ptr->crypto_type); |
| ctl_conf_ptr->crypto_type = old_crypto_type; |
| rc = ESLURM_INVALID_CRYPTO_TYPE_CHANGE; |
| } else |
| xfree(old_crypto_type); |
| } |
| |
| if (old_sched_type) { |
| if (strcmp(old_sched_type, ctl_conf_ptr->schedtype)) { |
| xfree(ctl_conf_ptr->schedtype); |
| ctl_conf_ptr->schedtype = old_sched_type; |
| rc = ESLURM_INVALID_SCHEDTYPE_CHANGE; |
| } else /* free duplicate value */ |
| xfree(old_sched_type); |
| } |
| |
| |
| if (old_select_type) { |
| if (strcmp(old_select_type, ctl_conf_ptr->select_type)) { |
| xfree(ctl_conf_ptr->select_type); |
| ctl_conf_ptr->select_type = old_select_type; |
| rc = ESLURM_INVALID_SELECTTYPE_CHANGE; |
| } else /* free duplicate value */ |
| xfree(old_select_type); |
| } |
| |
| if (old_switch_type) { |
| if (strcmp(old_switch_type, ctl_conf_ptr->switch_type)) { |
| xfree(ctl_conf_ptr->switch_type); |
| ctl_conf_ptr->switch_type = old_switch_type; |
| rc = ESLURM_INVALID_SWITCHTYPE_CHANGE; |
| } else /* free duplicate value */ |
| xfree(old_switch_type); |
| } |
| |
| if (ctl_conf_ptr->backup_controller == NULL) |
| info("read_slurm_conf: backup_controller not specified."); |
| |
| return rc; |
| } |
| |
| |
| /* |
| * _sync_nodes_to_jobs - sync node state to job states on slurmctld restart. |
| * This routine marks nodes allocated to a job as busy no matter what |
| *	the node's last saved state was |
| * RET count of nodes having state changed |
| * Note: Operates on common variables, no arguments |
| */ |
| static int _sync_nodes_to_jobs(void) |
| { |
| struct job_record *job_ptr; |
| ListIterator job_iterator; |
| int update_cnt = 0; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (job_ptr->node_bitmap == NULL) |
| continue; |
| |
| if (IS_JOB_RUNNING(job_ptr) || IS_JOB_COMPLETING(job_ptr)) |
| update_cnt += _sync_nodes_to_active_job(job_ptr); |
| else if (IS_JOB_SUSPENDED(job_ptr)) |
| _sync_nodes_to_suspended_job(job_ptr); |
| } |
| list_iterator_destroy(job_iterator); |
| |
| if (update_cnt) { |
| info("_sync_nodes_to_jobs updated state of %d nodes", |
| update_cnt); |
| } |
| sync_front_end_state(); |
| return update_cnt; |
| } |
| |
| /* For jobs which are in state COMPLETING, deallocate the nodes and |
| * issue the RPC to kill the job */ |
| static int _sync_nodes_to_comp_job(void) |
| { |
| struct job_record *job_ptr; |
| ListIterator job_iterator; |
| int update_cnt = 0; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if ((job_ptr->node_bitmap) && IS_JOB_COMPLETING(job_ptr)) { |
| update_cnt++; |
| info("Job %u in completing state", job_ptr->job_id); |
| if (!job_ptr->node_bitmap_cg) |
| build_cg_bitmap(job_ptr); |
| deallocate_nodes(job_ptr, false, false, false); |
| 			/* The job was already in the completing state when |
| 			 * slurmctld restarted or was reconfigured, so do not |
| 			 * log its completion again. |
| 			 * job_completion_logger(job_ptr, false); */ |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| if (update_cnt) |
| info("_sync_nodes_to_comp_job completing %d jobs", update_cnt); |
| return update_cnt; |
| } |
| |
| /* Synchronize states of nodes and active jobs (RUNNING or COMPLETING state) |
| * RET count of nodes with state changes */ |
| static int _sync_nodes_to_active_job(struct job_record *job_ptr) |
| { |
| int i, cnt = 0; |
| uint16_t node_flags; |
| struct node_record *node_ptr = node_record_table_ptr; |
| |
| job_ptr->node_cnt = bit_set_count(job_ptr->node_bitmap); |
| for (i = 0; i < node_record_count; i++, node_ptr++) { |
| if (bit_test(job_ptr->node_bitmap, i) == 0) |
| continue; |
| |
| node_flags = node_ptr->node_state & NODE_STATE_FLAGS; |
| |
| node_ptr->run_job_cnt++; /* NOTE: |
| * This counter moved to comp_job_cnt |
| * by _sync_nodes_to_comp_job() */ |
| if ((job_ptr->details) && (job_ptr->details->shared == 0)) |
| node_ptr->no_share_job_cnt++; |
| |
| if (IS_NODE_DOWN(node_ptr) && |
| IS_JOB_RUNNING(job_ptr) && |
| (job_ptr->kill_on_node_fail == 0) && |
| (job_ptr->node_cnt > 1)) { |
| /* This should only happen if a job was running |
| * on a node that was newly configured DOWN */ |
| int save_accounting_enforce; |
| info("Removing failed node %s from job_id %u", |
| node_ptr->name, job_ptr->job_id); |
| /* Disable accounting here. Accounting reset for all |
| * jobs in _restore_job_dependencies() */ |
| save_accounting_enforce = accounting_enforce; |
| accounting_enforce &= (~ACCOUNTING_ENFORCE_LIMITS); |
| job_pre_resize_acctg(job_ptr); |
| srun_node_fail(job_ptr->job_id, node_ptr->name); |
| kill_step_on_node(job_ptr, node_ptr, true); |
| excise_node_from_job(job_ptr, node_ptr); |
| job_post_resize_acctg(job_ptr); |
| accounting_enforce = save_accounting_enforce; |
| } else if (IS_NODE_DOWN(node_ptr) && IS_JOB_RUNNING(job_ptr)) { |
| time_t now = time(NULL); |
| info("Killing job %u on DOWN node %s", |
| job_ptr->job_id, node_ptr->name); |
| job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; |
| build_cg_bitmap(job_ptr); |
| job_ptr->end_time = MIN(job_ptr->end_time, now); |
| job_ptr->exit_code = MAX(job_ptr->exit_code, 1); |
| job_ptr->state_reason = FAIL_DOWN_NODE; |
| xfree(job_ptr->state_desc); |
| job_completion_logger(job_ptr, false); |
| cnt++; |
| } else if (IS_NODE_IDLE(node_ptr)) { |
| cnt++; |
| node_ptr->node_state = NODE_STATE_ALLOCATED | |
| node_flags; |
| } |
| } |
| |
| if (IS_JOB_RUNNING(job_ptr) && job_ptr->front_end_ptr) |
| job_ptr->front_end_ptr->job_cnt_run++; |
| |
| return cnt; |
| } |
| |
| /* Synchronize states of nodes and suspended jobs */ |
| static void _sync_nodes_to_suspended_job(struct job_record *job_ptr) |
| { |
| int i; |
| struct node_record *node_ptr = node_record_table_ptr; |
| |
| for (i = 0; i < node_record_count; i++, node_ptr++) { |
| if (bit_test(job_ptr->node_bitmap, i) == 0) |
| continue; |
| |
| node_ptr->sus_job_cnt++; |
| } |
| return; |
| } |
| |
| #ifdef HAVE_ELAN |
| /* At present, every node in a given partition must have the same |
| * processor count; this function enforces that restriction. */ |
| static void _validate_node_proc_count(void) |
| { |
| ListIterator part_iterator; |
| struct part_record *part_ptr; |
| struct node_record *node_ptr; |
| int first_bit, last_bit, i, node_size, part_size; |
| |
| part_iterator = list_iterator_create(part_list); |
| while ((part_ptr = (struct part_record *) list_next(part_iterator))) { |
| first_bit = bit_ffs(part_ptr->node_bitmap); |
| last_bit = bit_fls(part_ptr->node_bitmap); |
| part_size = -1; |
| for (i = first_bit; i <= last_bit; i++) { |
| if (bit_test(part_ptr->node_bitmap, i) == 0) |
| continue; |
| node_ptr = node_record_table_ptr + i; |
| |
| if (slurmctld_conf.fast_schedule) |
| node_size = node_ptr->config_ptr->cpus; |
| else if (node_ptr->cpus < node_ptr->config_ptr->cpus) |
| continue; /* node too small, will be DOWN */ |
| else if (IS_NODE_DOWN(node_ptr)) |
| continue; |
| else |
| node_size = node_ptr->cpus; |
| |
| if (part_size == -1) |
| part_size = node_size; |
| else if (part_size != node_size) |
| fatal("Partition %s has inconsistent " |
| "processor count", part_ptr->name); |
| } |
| } |
| list_iterator_destroy(part_iterator); |
| } |
| #endif |
| |
| /* |
| * _restore_job_dependencies - Build depend_list and license_list for every job |
| *	also reset the running job count for scheduling policy |
| */ |
| static int _restore_job_dependencies(void) |
| { |
| int error_code = SLURM_SUCCESS, rc; |
| struct job_record *job_ptr; |
| ListIterator job_iterator; |
| char *new_depend; |
| bool valid; |
| List license_list; |
| |
| assoc_mgr_clear_used_info(); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| (void) build_feature_list(job_ptr); |
| |
| if (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) { |
| if (!IS_JOB_FINISHED(job_ptr)) |
| acct_policy_add_job_submit(job_ptr); |
| if (IS_JOB_RUNNING(job_ptr) || |
| IS_JOB_SUSPENDED(job_ptr)) |
| acct_policy_job_begin(job_ptr); |
| } |
| |
| license_list = license_validate(job_ptr->licenses, &valid); |
| if (job_ptr->license_list) |
| list_destroy(job_ptr->license_list); |
| if (valid) |
| job_ptr->license_list = license_list; |
| if (IS_JOB_RUNNING(job_ptr)) |
| license_job_get(job_ptr); |
| |
| if ((job_ptr->details == NULL) || |
| (job_ptr->details->dependency == NULL)) |
| continue; |
| new_depend = job_ptr->details->dependency; |
| job_ptr->details->dependency = NULL; |
| rc = update_job_dependency(job_ptr, new_depend); |
| if (rc != SLURM_SUCCESS) { |
| error("Invalid dependencies discarded for job %u: %s", |
| job_ptr->job_id, new_depend); |
| error_code = rc; |
| } |
| xfree(new_depend); |
| } |
| list_iterator_destroy(job_iterator); |
| return error_code; |
| } |
| |
| /* Flush accounting information on this cluster, then for each running or |
| * suspended job, restore its state in the accounting system */ |
| static void _acct_restore_active_jobs(void) |
| { |
| struct job_record *job_ptr; |
| ListIterator job_iterator; |
| struct step_record *step_ptr; |
| ListIterator step_iterator; |
| |
| info("Reinitializing job accounting state"); |
| acct_storage_g_flush_jobs_on_cluster(acct_db_conn, |
| time(NULL)); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (IS_JOB_SUSPENDED(job_ptr)) |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) { |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| step_iterator = list_iterator_create( |
| job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) |
| list_next(step_iterator))) { |
| jobacct_storage_g_step_start(acct_db_conn, |
| step_ptr); |
| } |
| list_iterator_destroy (step_iterator); |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| } |