| /*****************************************************************************\ |
| * backfill.c - simple backfill scheduler plugin. |
| * |
| * If a partition does not have root-only access and nodes are not shared |
| * then raise the priority of pending jobs if doing so does not adversely |
| * affect the expected initiation of any higher priority job. We do not alter |
| * a job's required or excluded node list, so this is a conservative |
| * algorithm. |
| * |
| * For example, consider a cluster "lx[01-08]" with one job executing on |
| * nodes "lx[01-04]". The highest priority pending job requires five nodes |
| * including "lx05". The next highest priority pending job requires any |
| * three nodes. Without explicitly forcing the second job to use nodes |
| * "lx[06-08]", we can't start it without possibly delaying the higher |
| * priority job. |
| ***************************************************************************** |
| * Copyright (C) 2003-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #if HAVE_SYS_PRCTL_H |
| # include <sys/prctl.h> |
| #endif |
| |
| #include <pthread.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <time.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm.h" |
| #include "slurm/slurmdb.h" |
| #include "slurm/slurm_errno.h" |
| |
| #include "src/common/assoc_mgr.h" |
| #include "src/common/job_features.h" |
| #include "src/common/list.h" |
| #include "src/common/macros.h" |
| #include "src/common/parse_time.h" |
| #include "src/common/read_config.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/interfaces/accounting_storage.h" |
| #include "src/interfaces/burst_buffer.h" |
| #include "src/interfaces/gres.h" |
| #include "src/interfaces/node_features.h" |
| #include "src/interfaces/mcs.h" |
| #include "src/interfaces/preempt.h" |
| #include "src/interfaces/select.h" |
| #include "src/interfaces/topology.h" |
| |
| #include "src/slurmctld/acct_policy.h" |
| #include "src/slurmctld/fed_mgr.h" |
| #include "src/slurmctld/job_scheduler.h" |
| #include "src/slurmctld/licenses.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/node_scheduler.h" |
| #include "src/slurmctld/proc_req.h" |
| #include "src/slurmctld/reservation.h" |
| #include "src/slurmctld/slurmctld.h" |
| |
| #include "src/stepmgr/gres_stepmgr.h" |
| #include "src/stepmgr/srun_comm.h" |
| |
| #include "backfill.h" |
| #include "oracle.h" |
| |
| #define BACKFILL_INTERVAL 30 |
| #define BACKFILL_RESOLUTION 60 |
| #define BACKFILL_WINDOW (24 * 60 * 60) |
| #define BF_MAX_JOB_ARRAY_RESV 20 |
| |
| #define YIELD_INTERVAL 2000000 /* time in micro-seconds */ |
| #define YIELD_SLEEP 500000 /* time in micro-seconds */ |
| |
| #define MAX_BACKFILL_INTERVAL 10800 /* 3 hours */ |
| #define MAX_BACKFILL_RESOLUTION 3600 /* 1 hour */ |
| #define MAX_BACKFILL_WINDOW (30 * 24 * 60 * 60) /* 30 days */ |
| #define MAX_BF_JOB_PART_COUNT_RESERVE 100000 |
| #define MAX_BF_MAX_JOB_ARRAY_RESV 1000 |
| #define MAX_BF_MAX_JOB_START 10000 |
| #define DEF_BF_MAX_JOB_TEST 500 |
| #define MAX_BF_MAX_JOB_TEST 1000000 |
| #define MAX_BF_MAX_TIME 3600 |
| #define MAX_BF_MIN_AGE_RESERVE (30 * 24 * 60 * 60) /* 30 days */ |
| #define MAX_BF_MIN_PRIO_RESERVE INFINITE |
| #define MAX_BF_YIELD_INTERVAL 10000000 /* 10 seconds in usec */ |
| #define MAX_MAX_RPC_CNT 1000 |
| #define MAX_YIELD_RPC_CNT 200 |
| #define MAX_YIELD_SLEEP 10000000 /* 10 seconds in usec */ |
| |
| #define MAX_BF_MAX_JOB_ASSOC MAX_BF_MAX_JOB_TEST |
| #define MAX_BF_MAX_JOB_USER MAX_BF_MAX_JOB_TEST |
| #define MAX_BF_MAX_JOB_USER_PART MAX_BF_MAX_JOB_TEST |
| #define MAX_BF_MAX_JOB_PART MAX_BF_MAX_JOB_TEST |
| |
| typedef struct { |
| node_space_map_t *node_space; |
| int *node_space_recs; |
| } node_space_handler_t; |
| |
| /* |
| * HetJob scheduling structures |
| * NOTE: An individual hetjob component can be submitted to multiple |
| * partitions and have different start times in each |
| */ |
| typedef struct { |
| uint32_t job_id; |
| job_record_t *job_ptr; |
| time_t latest_start; /* Latest expected start time */ |
| part_record_t *part_ptr; |
| slurmctld_resv_t *resv_ptr; |
| } het_job_rec_t; |
| |
| typedef struct { |
| uint32_t comp_time_limit; /* Time limit for hetjob */ |
| uint32_t het_job_id; |
| list_t *het_job_rec_list; /* list of het_job_rec_t */ |
| time_t prev_start; /* Expected start time from last test */ |
| } het_job_map_t; |
| |
| typedef struct { |
| uint32_t het_job_id; |
| time_t start_time; |
| } deadlock_job_struct_t; |
| |
| typedef struct { |
| list_t *deadlock_job_list; |
| part_record_t *part_ptr; |
| } deadlock_part_struct_t; |
| |
| /* Diagnostic statistics */ |
| extern diag_stats_t slurmctld_diag_stats; |
| uint32_t bf_sleep_usec = 0; |
| |
| typedef struct { |
| slurmdb_bf_usage_t bf_usage; |
| uid_t uid; |
| } bf_user_usage_t; |
| |
| typedef struct { |
| bool allocated; /* A job is running on this node */ |
| time_t last_job_end; /* Last end time of running job on node*/ |
| char *mcs_label; |
| bool mixed_user; /* multiple users running on node */ |
| bool needs_sorting; /* nodes_used_list needs re-sorting after |
| * new entries are added */ |
| uint32_t node_index; |
| bool owned; /* Node has exclusive=user job */ |
| uint32_t uid; /* user id of a job running on the node */ |
| } node_used_t; |
| |
| typedef struct { |
| bool delay_start; |
| bool is_exclusive_user; |
| uint32_t job_user; |
| time_t *later_start; |
| char *mcs_label; |
| uint32_t min_nodes; |
| bitstr_t *node_bitmap; |
| int node_cnt; |
| time_t prev_time; |
| bitstr_t *req_nodes; |
| bool set_later_start; |
| time_t start_time; |
| } filter_exclusive_args_t; |
| |
| /*********************** local variables *********************/ |
| static bool stop_backfill = false; |
| static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER; |
| static pthread_cond_t term_cond = PTHREAD_COND_INITIALIZER; |
| static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER; |
| static bool config_flag = false; |
| static int backfill_interval = BACKFILL_INTERVAL; |
| static int bf_max_time = BACKFILL_INTERVAL; |
| static int backfill_resolution = BACKFILL_RESOLUTION; |
| static int backfill_window = BACKFILL_WINDOW; |
| static int bf_job_part_count_reserve = 0; |
| static int bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV; |
| static int bf_min_age_reserve = 0; |
| static int bf_node_space_size = 0; |
| static bool bf_running_job_reserve = false; |
| static bool bf_licenses = false; |
| static uint32_t bf_min_prio_reserve = 0; |
| static list_t *deadlock_global_list = NULL; |
| static bool bf_hetjob_immediate = false; |
| static uint16_t bf_hetjob_prio = 0; |
| static bool bf_one_resv_per_job = false; |
| static bool bf_allow_magnetic_slot = false; |
| static bool bf_topopt_enable = false; |
| static uint32_t job_start_cnt = 0; |
| static uint32_t job_test_cnt = 0; |
| static int max_backfill_job_cnt = DEF_BF_MAX_JOB_TEST; |
| static int max_backfill_job_per_assoc = 0; |
| static int max_backfill_job_per_part = 0; |
| static int max_backfill_job_per_user = 0; |
| static int max_backfill_job_per_user_part = 0; |
| static int max_backfill_jobs_start = 0; |
| static bool backfill_continue = false; |
| static bool assoc_limit_stop = false; |
| static int max_rpc_cnt = 0; |
| static int yield_rpc_cnt = 0; |
| static int yield_interval = YIELD_INTERVAL; |
| static int yield_sleep = YIELD_SLEEP; |
| static list_t *het_job_list = NULL; |
| static xhash_t *user_usage_map = NULL; /* look up user usage when no assoc */ |
| static bitstr_t *planned_bitmap = NULL; |
| static bool soft_time_limit = false; |
| |
| /*********************** local functions *********************/ |
| static void _add_reservation(time_t start_time, time_t end_reserve, |
| bitstr_t *res_bitmap, job_record_t *job_ptr, |
| node_space_map_t *node_space, int *node_space_recs, |
| time_t orig_start_time); |
| static void _adjust_hetjob_prio(uint32_t *prio, uint32_t val); |
| static void _attempt_backfill(void); |
| static int _clear_job_estimates(void *x, void *arg); |
| static int _clear_qos_blocked_times(void *x, void *arg); |
| static void _do_diag_stats(struct timeval *tv1, struct timeval *tv2, |
| int node_space_recs); |
| static uint32_t _get_job_max_tl(job_record_t *job_ptr, time_t now, |
| node_space_map_t *node_space); |
| static bool _hetjob_any_resv(job_record_t *het_leader); |
| static uint32_t _hetjob_calc_prio(job_record_t *het_leader); |
| static uint32_t _hetjob_calc_prio_tier(job_record_t *het_leader); |
| static void _het_job_deadlock_fini(void); |
| static bool _het_job_deadlock_test(job_record_t *job_ptr); |
| static bool _job_part_valid(job_record_t *job_ptr, part_record_t *part_ptr); |
| static void _load_config(void); |
| static bool _many_pending_rpcs(void); |
| static bool _more_work(time_t last_backfill_time); |
| static uint32_t _my_sleep(int64_t usec); |
| static int _num_feature_count(job_record_t *job_ptr, bool *has_xand, |
| bool *has_mor); |
| static int _het_job_find_map(void *x, void *key); |
| static void _het_job_map_del(void *x); |
| static void _het_job_start_clear(void); |
| static time_t _het_job_start_find(job_record_t *job_ptr); |
| static void _het_job_start_set(job_record_t *job_ptr, time_t latest_start, |
| uint32_t comp_time_limit); |
| static bool _het_job_start_test_single(node_space_map_t *node_space, |
| het_job_map_t *map, bool single); |
| static int _het_job_start_test_list(void *map, void *node_space); |
| static void _het_job_start_test(node_space_map_t *node_space, |
| uint32_t het_job_id, node_used_t *nodes_used, |
| list_t *nodes_used_list); |
| static void _reset_job_time_limit(job_record_t *job_ptr, time_t now, |
| node_space_map_t *node_space); |
| static void _set_bf_exit(bf_exit_t code); |
| static int _set_hetjob_details(void *x, void *arg); |
| static int _start_job(job_record_t *job_ptr, bitstr_t *avail_bitmap); |
| static bool _test_resv_overlap(node_space_map_t *node_space, |
| bitstr_t *use_bitmap, job_record_t *job_ptr, |
| uint32_t start_time, uint32_t end_reserve); |
| static int _try_sched(job_record_t *job_ptr, bitstr_t **avail_bitmap, |
| uint32_t min_nodes, uint32_t max_nodes, |
| uint32_t req_nodes, resv_exc_t *resv_exc_ptr, |
| will_run_data_t *will_run); |
| static int _yield_locks(int64_t usec); |
| static void _bf_map_key_id(void *item, const char **key, uint32_t *key_len); |
| static void _bf_map_free(void *item); |
| |
| /* Log resources to be allocated to a pending job */ |
| static void _dump_job_sched(job_record_t *job_ptr, time_t end_time, |
| bitstr_t *avail_bitmap) |
| { |
| char begin_buf[256], end_buf[256], *node_list; |
| |
| slurm_make_time_str(&job_ptr->start_time, begin_buf, sizeof(begin_buf)); |
| slurm_make_time_str(&end_time, end_buf, sizeof(end_buf)); |
| node_list = bitmap2node_name(avail_bitmap); |
| log_flag(BACKFILL, "%pJ to start at %s, end at %s on nodes %s in partition %s", |
| job_ptr, begin_buf, end_buf, node_list, |
| job_ptr->part_ptr->name); |
| xfree(node_list); |
| } |
| |
| static void _dump_job_test(job_record_t *job_ptr, bitstr_t *avail_bitmap, |
| time_t start_time, time_t later_start) |
| { |
| char begin_buf[256], *node_list; |
| char end_buf[256]; |
| char later_buf[256]; |
| |
| if (start_time == 0) |
| strcpy(begin_buf, "NOW"); |
| else |
| slurm_make_time_str(&start_time, begin_buf, sizeof(begin_buf)); |
| if (later_start == 0) { |
| strcpy(later_buf, "NO"); |
| strcpy(end_buf, "N/A"); |
| } else { |
| slurm_make_time_str(&later_start, later_buf, sizeof(later_buf)); |
| /* Expected end time if the job starts at later_start */ |
| later_start += job_ptr->time_limit * 60; |
| slurm_make_time_str(&later_start, end_buf, sizeof(end_buf)); |
| } |
| |
| node_list = bitmap2node_name(avail_bitmap); |
| log_flag(BACKFILL, "Test %pJ at %s to %s (later_start: %s) on %s", |
| job_ptr, begin_buf, end_buf, later_buf, node_list); |
| xfree(node_list); |
| } |
| |
| /* Log resource allocate table */ |
| static void _dump_node_space_table(node_space_map_t *node_space_ptr) |
| { |
| int i = 0; |
| char begin_buf[256], end_buf[256], *node_list, *licenses; |
| |
| log_flag(BACKFILL, "========================================="); |
| while (1) { |
| slurm_make_time_str(&node_space_ptr[i].begin_time, |
| begin_buf, sizeof(begin_buf)); |
| slurm_make_time_str(&node_space_ptr[i].end_time, |
| end_buf, sizeof(end_buf)); |
| node_list = bitmap2node_name(node_space_ptr[i].avail_bitmap); |
| licenses = bf_licenses_to_string(node_space_ptr[i].licenses); |
| log_flag(BACKFILL, "Begin:%s End:%s Nodes:%s Licenses:%s Fragmentation:%u", |
| begin_buf, end_buf, node_list, licenses, |
| node_space_ptr[i].fragmentation); |
| xfree(node_list); |
| xfree(licenses); |
| if ((i = node_space_ptr[i].next) == 0) |
| break; |
| } |
| log_flag(BACKFILL, "========================================="); |
| } |
| |
| static void _set_job_time_limit(job_record_t *job_ptr, uint32_t new_limit) |
| { |
| job_ptr->time_limit = new_limit; |
| /* reset flag if we have a NO_VAL time_limit */ |
| if (job_ptr->time_limit == NO_VAL) |
| job_ptr->limit_set.time = 0; |
| } |
| |
| /* |
| * _many_pending_rpcs - Determine if slurmctld is busy with many active RPCs |
| * RET - True if slurmctld currently has more than max_rpc_cnt active RPCs |
| */ |
| static bool _many_pending_rpcs(void) |
| { |
| bool many_pending_rpcs = false; |
| |
| slurm_mutex_lock(&slurmctld_config.thread_count_lock); |
| //info("thread_count = %u", slurmctld_config.server_thread_count); |
| if ((max_rpc_cnt > 0) && |
| (slurmctld_config.server_thread_count >= max_rpc_cnt)) |
| many_pending_rpcs = true; |
| slurm_mutex_unlock(&slurmctld_config.thread_count_lock); |
| |
| return many_pending_rpcs; |
| } |
| |
| /* |
| * Report summary of job's feature specification |
| * IN job_ptr - job to schedule |
| * OUT has_xand - true if features are XANDed together |
| * OUT has_mor - true if features are MORed together |
| * RET Total count for ALL job features, even counts with XAND separator |
| */ |
| static int _num_feature_count(job_record_t *job_ptr, bool *has_xand, |
| bool *has_mor) |
| { |
| job_details_t *detail_ptr = job_ptr->details; |
| int rc = 0; |
| list_itr_t *feat_iter; |
| job_feature_t *feat_ptr; |
| |
| *has_xand = false; |
| *has_mor = false; |
| if (detail_ptr->feature_list_use == NULL) /* no constraints */ |
| return rc; |
| |
| feat_iter = list_iterator_create(detail_ptr->feature_list_use); |
| while ((feat_ptr = list_next(feat_iter))) { |
| if (feat_ptr->count) |
| rc++; |
| if (feat_ptr->op_code == FEATURE_OP_XAND) |
| *has_xand = true; |
| if (feat_ptr->op_code == FEATURE_OP_MOR) |
| *has_mor = true; |
| } |
| list_iterator_destroy(feat_iter); |
| |
| return rc; |
| } |
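| |
| /* |
| * Illustrative mapping (feature syntax per standard --constraint usage; |
| * examples hypothetical): "[rack1*2&rack2*4]" is two counted features |
| * joined by XAND, so this returns 2 with *has_xand set, while |
| * "[rack1|rack2]" is a matching OR, returning 0 with *has_mor set. |
| */ |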
| |
| static int _clear_qos_blocked_times(void *x, void *arg) |
| { |
| slurmdb_qos_rec_t *qos_ptr = (slurmdb_qos_rec_t *) x; |
| qos_ptr->blocked_until = 0; |
| |
| return 0; |
| } |
| |
| /* |
| * Attempt to schedule a specific job on specific available nodes |
| * IN job_ptr - job to schedule |
| * IN/OUT avail_bitmap - nodes available/selected to use |
| * IN resv_exc_ptr - Various TRES which can not be used |
| * RET SLURM_SUCCESS on success, otherwise an error code |
| */ |
| static int _try_sched(job_record_t *job_ptr, bitstr_t **avail_bitmap, |
| uint32_t min_nodes, uint32_t max_nodes, |
| uint32_t req_nodes, resv_exc_t *resv_exc_ptr, |
| will_run_data_t *will_run) |
| { |
| bitstr_t *low_bitmap = NULL, *tmp_bitmap = NULL; |
| int rc = SLURM_SUCCESS; |
| bool has_xand = false, has_mor = false; |
| int feat_cnt = _num_feature_count(job_ptr, &has_xand, &has_mor); |
| job_details_t *detail_ptr = job_ptr->details; |
| list_t *feature_cache = detail_ptr->feature_list_use; |
| list_t *preemptee_candidates = NULL; |
| list_itr_t *feat_iter; |
| job_feature_t *feat_ptr; |
| job_feature_t *feature_base; |
| |
| if (has_xand || feat_cnt) { |
| /* |
| * Cache the feature information and test the individual |
| * features (or sets of features in parentheses), one at a time |
| */ |
| time_t high_start = 0; |
| uint32_t feat_min_node; |
| uint32_t feat_node_cnt; |
| |
| tmp_bitmap = bit_copy(*avail_bitmap); |
| preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); |
| feat_iter = list_iterator_create(feature_cache); |
| while ((feat_ptr = list_next(feat_iter)) && |
| (rc == SLURM_SUCCESS)) { |
| detail_ptr->feature_list_use = |
| list_create(feature_list_delete); |
| feature_base = xmalloc(sizeof(job_feature_t)); |
| feature_base->name = xstrdup(feat_ptr->name); |
| feature_base->op_code = feat_ptr->op_code; |
| list_append(detail_ptr->feature_list_use, feature_base); |
| feat_min_node = feat_ptr->count; |
| while ((feat_ptr->paren > 0) && |
| ((feat_ptr = list_next(feat_iter)))) { |
| feature_base = xmalloc(sizeof(job_feature_t)); |
| feature_base->name = xstrdup(feat_ptr->name); |
| feature_base->op_code = feat_ptr->op_code; |
| feat_min_node = feat_ptr->count; |
| list_append(detail_ptr->feature_list_use, |
| feature_base); |
| } |
| feature_base->op_code = FEATURE_OP_END; |
| feat_min_node = MAX(1, feat_min_node); |
| |
| if ((job_req_node_filter(job_ptr, *avail_bitmap, true) |
| == SLURM_SUCCESS) && |
| (bit_set_count(*avail_bitmap) >= feat_min_node)) { |
| rc = select_g_job_test(job_ptr, *avail_bitmap, |
| feat_min_node, max_nodes, |
| feat_min_node, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, |
| NULL, |
| resv_exc_ptr, |
| will_run); |
| if (rc == SLURM_SUCCESS) { |
| if ((high_start == 0) || |
| (high_start < job_ptr->start_time)) |
| high_start = |
| job_ptr->start_time; |
| |
| if (low_bitmap) { |
| bit_or(low_bitmap, |
| *avail_bitmap); |
| } else { |
| low_bitmap = *avail_bitmap; |
| *avail_bitmap = NULL; |
| } |
| } |
| } else { |
| rc = ESLURM_NODES_BUSY; |
| } |
| FREE_NULL_BITMAP(*avail_bitmap); |
| *avail_bitmap = bit_copy(tmp_bitmap); |
| if (low_bitmap) |
| bit_and_not(*avail_bitmap, low_bitmap); |
| FREE_NULL_LIST(detail_ptr->feature_list_use); |
| } |
| list_iterator_destroy(feat_iter); |
| |
| if (low_bitmap) |
| feat_node_cnt = bit_set_count(low_bitmap); |
| else |
| feat_node_cnt = 0; |
| if (feat_node_cnt < req_nodes) { |
| detail_ptr->feature_list_use = NULL; |
| rc = select_g_job_test(job_ptr, *avail_bitmap, |
| min_nodes - feat_node_cnt, |
| max_nodes - feat_node_cnt, |
| req_nodes - feat_node_cnt, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, |
| NULL, |
| resv_exc_ptr, |
| will_run); |
| |
| if (low_bitmap) { |
| bit_or(low_bitmap, *avail_bitmap); |
| } else { |
| low_bitmap = *avail_bitmap; |
| *avail_bitmap = NULL; |
| } |
| } |
| FREE_NULL_LIST(preemptee_candidates); |
| FREE_NULL_BITMAP(tmp_bitmap); |
| if (high_start && rc == SLURM_SUCCESS) { |
| job_ptr->start_time = high_start; |
| FREE_NULL_BITMAP(*avail_bitmap); |
| *avail_bitmap = low_bitmap; |
| } else { |
| rc = ESLURM_NODES_BUSY; |
| job_ptr->start_time = 0; |
| FREE_NULL_BITMAP(*avail_bitmap); |
| FREE_NULL_BITMAP(low_bitmap); |
| } |
| |
| /* Restore the original feature information */ |
| detail_ptr->feature_list_use = feature_cache; |
| } else if (has_mor) { |
| /* |
| * Cache the feature information and test the individual |
| * features (or sets of features in parentheses), one at a time |
| */ |
| time_t low_start = 0; |
| |
| tmp_bitmap = bit_copy(*avail_bitmap); |
| preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); |
| feat_iter = list_iterator_create(feature_cache); |
| while ((feat_ptr = list_next(feat_iter))) { |
| detail_ptr->feature_list_use = |
| list_create(feature_list_delete); |
| feature_base = xmalloc(sizeof(job_feature_t)); |
| feature_base->name = xstrdup(feat_ptr->name); |
| feature_base->op_code = feat_ptr->op_code; |
| list_append(detail_ptr->feature_list_use, feature_base); |
| while ((feat_ptr->paren > 0) && |
| ((feat_ptr = list_next(feat_iter)))) { |
| feature_base = xmalloc(sizeof(job_feature_t)); |
| feature_base->name = xstrdup(feat_ptr->name); |
| feature_base->op_code = feat_ptr->op_code; |
| list_append(detail_ptr->feature_list_use, |
| feature_base); |
| } |
| feature_base->op_code = FEATURE_OP_END; |
| |
| if ((job_req_node_filter(job_ptr, *avail_bitmap, true) |
| == SLURM_SUCCESS) && |
| (bit_set_count(*avail_bitmap) >= min_nodes)) { |
| rc = select_g_job_test(job_ptr, *avail_bitmap, |
| min_nodes, max_nodes, |
| req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, |
| NULL, |
| resv_exc_ptr, |
| will_run); |
| if ((rc == SLURM_SUCCESS) && |
| ((low_start == 0) || |
| (low_start > job_ptr->start_time))) { |
| low_start = job_ptr->start_time; |
| low_bitmap = *avail_bitmap; |
| *avail_bitmap = NULL; |
| } |
| } |
| FREE_NULL_BITMAP(*avail_bitmap); |
| *avail_bitmap = bit_copy(tmp_bitmap); |
| FREE_NULL_LIST(detail_ptr->feature_list_use); |
| } |
| list_iterator_destroy(feat_iter); |
| FREE_NULL_LIST(preemptee_candidates); |
| FREE_NULL_BITMAP(tmp_bitmap); |
| if (low_start) { |
| job_ptr->start_time = low_start; |
| rc = SLURM_SUCCESS; |
| FREE_NULL_BITMAP(*avail_bitmap); |
| *avail_bitmap = low_bitmap; |
| } else { |
| rc = ESLURM_NODES_BUSY; |
| FREE_NULL_BITMAP(low_bitmap); |
| } |
| |
| /* Restore the original feature information */ |
| detail_ptr->feature_list_use = feature_cache; |
| } else if (detail_ptr->feature_list_use) { |
| if ((job_req_node_filter(job_ptr, *avail_bitmap, true) != |
| SLURM_SUCCESS) || |
| (bit_set_count(*avail_bitmap) < min_nodes)) { |
| rc = ESLURM_NODES_BUSY; |
| } else { |
| preemptee_candidates = |
| slurm_find_preemptable_jobs(job_ptr); |
| rc = select_g_job_test(job_ptr, *avail_bitmap, |
| min_nodes, max_nodes, req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, |
| NULL, |
| resv_exc_ptr, |
| will_run); |
| } |
| } else { |
| /* Try to schedule the job. First on dedicated nodes |
| * then on shared nodes (if so configured). */ |
| uint16_t orig_shared; |
| time_t now = time(NULL); |
| char str[100]; |
| |
| preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); |
| orig_shared = job_ptr->details->share_res; |
| job_ptr->details->share_res = 0; |
| tmp_bitmap = bit_copy(*avail_bitmap); |
| |
| if (resv_exc_ptr && resv_exc_ptr->core_bitmap) { |
| bit_fmt(str, (sizeof(str) - 1), |
| resv_exc_ptr->core_bitmap); |
| debug2("exclude core bitmap: %s", str); |
| } |
| |
| rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, |
| max_nodes, req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, |
| NULL, |
| resv_exc_ptr, |
| will_run); |
| |
| job_ptr->details->share_res = orig_shared; |
| |
| if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) && |
| (orig_shared != 0)) { |
| FREE_NULL_BITMAP(*avail_bitmap); |
| *avail_bitmap = tmp_bitmap; |
| rc = select_g_job_test(job_ptr, *avail_bitmap, |
| min_nodes, max_nodes, req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, |
| NULL, |
| resv_exc_ptr, |
| will_run); |
| } else |
| FREE_NULL_BITMAP(tmp_bitmap); |
| } |
| |
| FREE_NULL_LIST(preemptee_candidates); |
| return rc; |
| } |
| |
| /* Terminate backfill_agent */ |
| extern void stop_backfill_agent(void) |
| { |
| slurm_mutex_lock(&term_lock); |
| stop_backfill = true; |
| slurm_cond_signal(&term_cond); |
| slurm_mutex_unlock(&term_lock); |
| } |
| |
| /* Sleep for at least specified time, returns actual sleep time in usec */ |
| static uint32_t _my_sleep(int64_t usec) |
| { |
| int64_t nsec; |
| uint32_t sleep_time = 0; |
| struct timespec ts = {0, 0}; |
| struct timeval tv1 = {0, 0}, tv2 = {0, 0}; |
| |
| if (gettimeofday(&tv1, NULL)) { /* Some error */ |
| sleep(1); |
| return 1000000; |
| } |
| |
| nsec = tv1.tv_usec + usec; |
| nsec *= 1000; |
| ts.tv_sec = tv1.tv_sec + (nsec / 1000000000); |
| ts.tv_nsec = nsec % 1000000000; |
| slurm_mutex_lock(&term_lock); |
| if (!stop_backfill) |
| slurm_cond_timedwait(&term_cond, &term_lock, &ts); |
| slurm_mutex_unlock(&term_lock); |
| if (gettimeofday(&tv2, NULL)) |
| return usec; |
| sleep_time = (tv2.tv_sec - tv1.tv_sec) * 1000000; |
| sleep_time += tv2.tv_usec; |
| sleep_time -= tv1.tv_usec; |
| return sleep_time; |
| } |
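| |
| /* |
| * Worked example of the deadline math above (hypothetical values): with |
| * tv1 = {100s, 600000us} and usec = 2500000, nsec = 3100000 * 1000, so |
| * the absolute wakeup time is ts = {103s, 100000000ns}. An early wakeup |
| * via term_cond still returns the actually measured sleep_time. |
| */ |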
| |
| static void _load_config(void) |
| { |
| char *sched_params = slurm_conf.sched_params, *tmp_ptr; |
| long tmp_val = 0; |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_interval="))) { |
| backfill_interval = atoi(tmp_ptr + 12); |
| if (((backfill_interval != -1) && (backfill_interval < 1)) || |
| backfill_interval > MAX_BACKFILL_INTERVAL) { |
| error("Invalid SchedulerParameters bf_interval: %d", |
| backfill_interval); |
| backfill_interval = BACKFILL_INTERVAL; |
| } |
| } else { |
| backfill_interval = BACKFILL_INTERVAL; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_time="))) { |
| bf_max_time = atoi(tmp_ptr + 12); |
| if (bf_max_time < 1 || bf_max_time > MAX_BF_MAX_TIME) { |
| error("Invalid SchedulerParameters bf_max_time:" |
| " %d", bf_max_time); |
| bf_max_time = backfill_interval; |
| } |
| } else { |
| bf_max_time = backfill_interval; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_window="))) { |
| backfill_window = atoi(tmp_ptr + 10) * 60; /* mins to secs */ |
| if (backfill_window < 1 || |
| backfill_window > MAX_BACKFILL_WINDOW) { |
| error("Invalid SchedulerParameters bf_window: %d", |
| backfill_window); |
| backfill_window = BACKFILL_WINDOW; |
| } |
| } else { |
| backfill_window = BACKFILL_WINDOW; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_test="))) |
| max_backfill_job_cnt = atoi(tmp_ptr + 16); |
| else if ((tmp_ptr = xstrcasestr(sched_params, "max_job_bf="))) |
| fatal("Invalid parameter max_job_bf. The option is no longer supported, please use bf_max_job_test instead."); |
| else |
| max_backfill_job_cnt = DEF_BF_MAX_JOB_TEST; |
| |
| if (max_backfill_job_cnt < 1 || |
| max_backfill_job_cnt > MAX_BF_MAX_JOB_TEST) { |
| error("Invalid SchedulerParameters bf_max_job_test: %d", |
| max_backfill_job_cnt); |
| max_backfill_job_cnt = DEF_BF_MAX_JOB_TEST; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_node_space_size="))) |
| bf_node_space_size = atoi(tmp_ptr + 19); |
| else |
| bf_node_space_size = max_backfill_job_cnt; |
| |
| if (bf_node_space_size < 2 || |
| bf_node_space_size > 2 * MAX_BF_MAX_JOB_TEST) { |
| error("Invalid SchedulerParameters bf_node_space_size: %d", |
| bf_node_space_size); |
| bf_node_space_size = max_backfill_job_cnt; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_resolution="))) { |
| backfill_resolution = atoi(tmp_ptr + 14); |
| if (backfill_resolution < 1 || |
| backfill_resolution > MAX_BACKFILL_RESOLUTION) { |
| error("Invalid SchedulerParameters bf_resolution: %d", |
| backfill_resolution); |
| backfill_resolution = BACKFILL_RESOLUTION; |
| } |
| } else { |
| backfill_resolution = BACKFILL_RESOLUTION; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_array_resv="))) { |
| bf_max_job_array_resv = atoi(tmp_ptr + 22); |
| if (bf_max_job_array_resv < 0 || |
| bf_max_job_array_resv > MAX_BF_MAX_JOB_ARRAY_RESV) { |
| error("Invalid SchedulerParameters bf_max_job_array_resv: %d", |
| bf_max_job_array_resv); |
| bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV; |
| } |
| } else { |
| bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_part="))) { |
| max_backfill_job_per_part = atoi(tmp_ptr + 16); |
| if (max_backfill_job_per_part < 0) { |
| error("Invalid SchedulerParameters bf_max_job_part: %d", |
| max_backfill_job_per_part); |
| max_backfill_job_per_part = 0; |
| } |
| } else { |
| max_backfill_job_per_part = 0; |
| } |
| if ((max_backfill_job_per_part != 0) && |
| (max_backfill_job_per_part >= max_backfill_job_cnt)) { |
| error("bf_max_job_part >= bf_max_job_test (%u >= %u)", |
| max_backfill_job_per_part, max_backfill_job_cnt); |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_start="))) { |
| max_backfill_jobs_start = atoi(tmp_ptr + 17); |
| if (max_backfill_jobs_start < 0 || |
| max_backfill_jobs_start > MAX_BF_MAX_JOB_START) { |
| error("Invalid SchedulerParameters bf_max_job_start: %d", |
| max_backfill_jobs_start); |
| max_backfill_jobs_start = 0; |
| } |
| } else { |
| max_backfill_jobs_start = 0; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_user="))) { |
| max_backfill_job_per_user = atoi(tmp_ptr + 16); |
| if (max_backfill_job_per_user < 0) { |
| error("Invalid SchedulerParameters bf_max_job_user: %d", |
| max_backfill_job_per_user); |
| max_backfill_job_per_user = 0; |
| } |
| } else { |
| max_backfill_job_per_user = 0; |
| } |
| if ((max_backfill_job_per_user != 0) && |
| (max_backfill_job_per_user > max_backfill_job_cnt)) { |
| warning("bf_max_job_user > bf_max_job_test (%u > %u)", |
| max_backfill_job_per_user, max_backfill_job_cnt); |
| } |
| |
| bf_job_part_count_reserve = 0; |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_job_part_count_reserve="))) { |
| int job_cnt = atoi(tmp_ptr + 26); |
| if (job_cnt < 0 || job_cnt > MAX_BF_JOB_PART_COUNT_RESERVE) { |
| error("Invalid SchedulerParameters bf_job_part_count_reserve: %d", |
| job_cnt); |
| } else { |
| bf_job_part_count_reserve = job_cnt; |
| } |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_user_part="))) { |
| max_backfill_job_per_user_part = atoi(tmp_ptr + 21); |
| if (max_backfill_job_per_user_part < 0) { |
| error("Invalid SchedulerParameters bf_max_job_user_part: %d", |
| max_backfill_job_per_user_part); |
| max_backfill_job_per_user_part = 0; |
| } |
| } else { |
| max_backfill_job_per_user_part = 0; |
| } |
| if ((max_backfill_job_per_user_part != 0) && |
| (max_backfill_job_per_user_part > max_backfill_job_cnt)) { |
| warning("bf_max_job_user_part > bf_max_job_test (%u > %u)", |
| max_backfill_job_per_user_part, max_backfill_job_cnt); |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_assoc="))) { |
| max_backfill_job_per_assoc = atoi(tmp_ptr + 17); |
| if (max_backfill_job_per_assoc < 0) { |
| error("Invalid SchedulerParameters bf_max_job_assoc: %d", |
| max_backfill_job_per_assoc); |
| max_backfill_job_per_assoc = 0; |
| } |
| } else { |
| max_backfill_job_per_assoc = 0; |
| } |
| if ((max_backfill_job_per_assoc != 0) && |
| (max_backfill_job_per_assoc > max_backfill_job_cnt)) { |
| warning("bf_max_job_assoc > bf_max_job_test (%u > %u)", |
| max_backfill_job_per_assoc, max_backfill_job_cnt); |
| } |
| if ((max_backfill_job_per_assoc != 0) && |
| (max_backfill_job_per_user != 0)) { |
| error("Both bf_max_job_user and bf_max_job_assoc are set: " |
| "bf_max_job_assoc taking precedence."); |
| max_backfill_job_per_user = 0; |
| } |
| |
| bf_min_age_reserve = 0; |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_min_age_reserve="))) { |
| int min_age = atoi(tmp_ptr + 19); |
| if (min_age < 0 || min_age > MAX_BF_MIN_AGE_RESERVE) { |
| error("Invalid SchedulerParameters bf_min_age_reserve: %d", |
| min_age); |
| } else { |
| bf_min_age_reserve = min_age; |
| } |
| } |
| |
| bf_min_prio_reserve = 0; |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_min_prio_reserve="))) { |
| unsigned long long int min_prio; |
| tmp_ptr += 20; |
| min_prio = strtoull(tmp_ptr, NULL, 10); |
| if (!min_prio || min_prio > MAX_BF_MIN_PRIO_RESERVE) { |
| error("Invalid SchedulerParameters bf_min_prio_reserve: %llu", |
| min_prio); |
| } else { |
| bf_min_prio_reserve = (uint32_t) min_prio; |
| } |
| } |
| |
| /* bf_continue makes backfill continue where it was if interrupted */ |
| if (xstrcasestr(sched_params, "bf_continue")) { |
| backfill_continue = true; |
| } else { |
| backfill_continue = false; |
| } |
| |
| if (xstrcasestr(sched_params, "assoc_limit_stop")) { |
| assoc_limit_stop = true; |
| } else { |
| assoc_limit_stop = false; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_interval="))) { |
| yield_interval = atoi(tmp_ptr + 18); |
| if ((yield_interval <= 0) || |
| (yield_interval > MAX_BF_YIELD_INTERVAL)) { |
| error("Invalid backfill scheduler bf_yield_interval: %d", |
| yield_interval); |
| yield_interval = YIELD_INTERVAL; |
| } |
| } else { |
| yield_interval = YIELD_INTERVAL; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_sleep="))) { |
| yield_sleep = (int64_t) atoll(tmp_ptr + 15); |
| if (yield_sleep <= 0 || yield_sleep > MAX_YIELD_SLEEP) { |
| error("Invalid backfill scheduler bf_yield_sleep: %d", |
| yield_sleep); |
| yield_sleep = YIELD_SLEEP; |
| } |
| } else { |
| yield_sleep = YIELD_SLEEP; |
| } |
| |
| bf_hetjob_prio = 0; |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_hetjob_prio="))) { |
| tmp_ptr += 15; |
| if (!xstrncasecmp(tmp_ptr, "min", 3)) |
| bf_hetjob_prio |= HETJOB_PRIO_MIN; |
| else if (!xstrncasecmp(tmp_ptr, "max", 3)) |
| bf_hetjob_prio |= HETJOB_PRIO_MAX; |
| else if (!xstrncasecmp(tmp_ptr, "avg", 3)) |
| bf_hetjob_prio |= HETJOB_PRIO_AVG; |
| else |
| error("Invalid SchedulerParameters bf_hetjob_prio: %s", |
| tmp_ptr); |
| } |
| |
| bf_hetjob_immediate = false; |
| if (xstrcasestr(sched_params, "bf_hetjob_immediate")) |
| bf_hetjob_immediate = true; |
| |
| if (bf_hetjob_immediate && !bf_hetjob_prio) { |
| bf_hetjob_prio |= HETJOB_PRIO_MIN; |
| info("bf_hetjob_immediate automatically sets bf_hetjob_prio=min"); |
| } |
| |
| if (xstrcasestr(sched_params, "bf_one_resv_per_job")) |
| bf_one_resv_per_job = true; |
| else |
| bf_one_resv_per_job = false; |
| |
| if (xstrcasestr(sched_params, "bf_allow_magnetic_slot")) |
| bf_allow_magnetic_slot = true; |
| else |
| bf_allow_magnetic_slot = false; |
| |
| if (xstrcasestr(sched_params, "bf_running_job_reserve")) |
| bf_running_job_reserve = true; |
| else |
| bf_running_job_reserve = false; |
| |
| if (xstrcasestr(sched_params, "bf_licenses")) { |
| bf_licenses = true; |
| bf_running_job_reserve = true; |
| } else { |
| bf_licenses = false; |
| } |
| |
| if (xstrcasestr(sched_params, "bf_topopt_enable")) { |
| bf_topopt_enable = true; |
| } else { |
| bf_topopt_enable = false; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_topopt_iterations="))) { |
| bf_topopt_iterations = atoi(tmp_ptr + 21); |
| if ((bf_topopt_iterations <= 1) || |
| (bf_topopt_iterations > MAX_ORACLE_DEPTH)) { |
| error("Invalid backfill scheduler bf_topopt_iterations: %d", |
| bf_topopt_iterations); |
| bf_topopt_iterations = ORACLE_DEPTH; |
| } |
| } else { |
| bf_topopt_iterations = ORACLE_DEPTH; |
| } |
| if ((tmp_ptr = xstrcasestr(sched_params, "max_rpc_cnt="))) |
| max_rpc_cnt = atoi(tmp_ptr + 12); |
| else if ((tmp_ptr = xstrcasestr(sched_params, "max_rpc_count="))) |
| max_rpc_cnt = atoi(tmp_ptr + 14); |
| else |
| max_rpc_cnt = 0; |
| if ((max_rpc_cnt < 0) || (max_rpc_cnt > MAX_MAX_RPC_CNT)) { |
| error("Invalid SchedulerParameters max_rpc_cnt: %d", |
| max_rpc_cnt); |
| max_rpc_cnt = 0; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_rpc_cnt="))) |
| tmp_val = strtol(tmp_ptr + 17, NULL, 10); |
| else if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_rpc_count="))) |
| tmp_val = strtol(tmp_ptr + 19, NULL, 10); |
| else |
| tmp_val = MAX((max_rpc_cnt / 10), 20); |
| if ((tmp_val < 0) || (tmp_val > MAX_YIELD_RPC_CNT)) { |
| error("Invalid SchedulerParameters bf_yield_rpc_cnt: %ld", |
| tmp_val); |
| yield_rpc_cnt = MAX((max_rpc_cnt / 10), 20); |
| } else { |
| yield_rpc_cnt = tmp_val; |
| } |
| |
| if (xstrcasestr(sched_params, "time_min_as_soft_limit")) |
| soft_time_limit = true; |
| } |
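| |
| /* |
| * For reference, a slurm.conf line exercising several of the options |
| * parsed above could look like this (values illustrative only): |
| * |
| * SchedulerParameters=bf_interval=30,bf_window=1440,bf_max_job_test=1000,bf_continue |
| * |
| * Note that bf_window is specified in minutes and converted to seconds. |
| */ |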
| |
| /* Note that slurm.conf has changed */ |
| extern void backfill_reconfig(void) |
| { |
| slurm_mutex_lock(&config_lock); |
| config_flag = true; |
| slurm_mutex_unlock(&config_lock); |
| } |
| |
| /* Update backfill scheduling statistics |
| * IN tv1 - start time |
| * IN tv2 - end (current) time |
| * IN node_space_recs - count of records in resources/time table being tested |
| */ |
| static void _do_diag_stats(struct timeval *tv1, struct timeval *tv2, |
| int node_space_recs) |
| { |
| uint32_t delta_t, real_time; |
| |
| delta_t = (tv2->tv_sec - tv1->tv_sec) * 1000000; |
| delta_t += tv2->tv_usec; |
| delta_t -= tv1->tv_usec; |
| real_time = delta_t - bf_sleep_usec; |
| |
| slurmctld_diag_stats.bf_cycle_counter++; |
| slurmctld_diag_stats.bf_cycle_sum += real_time; |
| slurmctld_diag_stats.bf_cycle_last = real_time; |
| |
| slurmctld_diag_stats.bf_depth_sum += slurmctld_diag_stats.bf_last_depth; |
| slurmctld_diag_stats.bf_depth_try_sum += |
| slurmctld_diag_stats.bf_last_depth_try; |
| if (slurmctld_diag_stats.bf_cycle_last > |
| slurmctld_diag_stats.bf_cycle_max) { |
| slurmctld_diag_stats.bf_cycle_max = |
| slurmctld_diag_stats.bf_cycle_last; |
| } |
| slurmctld_diag_stats.bf_table_size = node_space_recs; |
| slurmctld_diag_stats.bf_table_size_sum += node_space_recs; |
| } |
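| |
| /* |
| * Example of the timing math above (hypothetical values): tv1 = |
| * {10s, 200000us} and tv2 = {12s, 100000us} give delta_t = 1900000us; |
| * with bf_sleep_usec = 400000 the cycle is charged real_time = |
| * 1500000us, i.e. time spent yielding locks is excluded. |
| */ |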
| |
| static void _init_planned_bitmap(void) |
| { |
| slurmctld_lock_t read_node_lock = { .node = READ_LOCK }; |
| node_record_t *node_ptr = NULL; |
| |
| xassert(!planned_bitmap); |
| planned_bitmap = bit_alloc(node_record_count); |
| |
| /* Sync planned_bitmap with NODE_STATE_PLANNED nodes from state save */ |
| lock_slurmctld(read_node_lock); |
| for (int i = 0; (node_ptr = next_node(&i)); i++) |
| if (IS_NODE_PLANNED(node_ptr)) |
| bit_set(planned_bitmap, i); |
| unlock_slurmctld(read_node_lock); |
| } |
| |
| extern void __attempt_backfill(void) |
| { |
| _load_config(); |
| het_job_list = list_create(_het_job_map_del); |
| _init_planned_bitmap(); |
| _attempt_backfill(); |
| FREE_NULL_LIST(het_job_list); |
| FREE_NULL_BITMAP(planned_bitmap); |
| } |
| |
| /* backfill_agent - detached thread periodically attempts to backfill jobs */ |
| extern void *backfill_agent(void *args) |
| { |
| time_t now; |
| double wait_time; |
| static time_t last_backfill_time = 0; |
| /* Read config and partitions; Write jobs and nodes */ |
| slurmctld_lock_t all_locks = { |
| READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; |
| bool load_config; |
| bool short_sleep = false; |
| int backfill_cnt = 0; |
| |
| #if HAVE_SYS_PRCTL_H |
| if (prctl(PR_SET_NAME, "bckfl", NULL, NULL, NULL) < 0) { |
| error("cannot set my name to %s %m", "bckfl"); |
| } |
| #endif |
| _load_config(); |
| last_backfill_time = time(NULL); |
| _init_planned_bitmap(); |
| het_job_list = list_create(_het_job_map_del); |
| while (!stop_backfill) { |
| if (short_sleep) |
| _my_sleep(USEC_IN_SEC); |
| else if (backfill_interval == -1) |
| _my_sleep(BACKFILL_INTERVAL * USEC_IN_SEC); |
| else |
| _my_sleep((int64_t) backfill_interval * USEC_IN_SEC); |
| if (stop_backfill) |
| break; |
| |
| if (slurmctld_config.scheduling_disabled) |
| continue; |
| |
| list_flush(het_job_list); |
| slurm_mutex_lock(&config_lock); |
| if (config_flag) { |
| config_flag = false; |
| load_config = true; |
| } else { |
| load_config = false; |
| } |
| slurm_mutex_unlock(&config_lock); |
| if (load_config) |
| _load_config(); |
| if (backfill_interval == -1) { |
| log_flag(BACKFILL, "skipping backfill cycle for %ds", |
| BACKFILL_INTERVAL); |
| continue; |
| } |
| now = time(NULL); |
| wait_time = difftime(now, last_backfill_time); |
| if ((wait_time < backfill_interval) || |
| job_is_completing(NULL) || _many_pending_rpcs() || |
| !_more_work(last_backfill_time)) { |
| short_sleep = true; |
| continue; |
| } |
| |
| slurm_mutex_lock(&check_bf_running_lock); |
| slurmctld_diag_stats.bf_active = 1; |
| slurm_mutex_unlock(&check_bf_running_lock); |
| |
| lock_slurmctld(all_locks); |
| validate_all_reservations(true, false); |
| if ((backfill_cnt++ % 2) == 0) |
| _het_job_start_clear(); |
| _attempt_backfill(); |
| last_backfill_time = time(NULL); |
| (void) bb_g_job_try_stage_in(); |
| unlock_slurmctld(all_locks); |
| |
| slurm_mutex_lock(&check_bf_running_lock); |
| slurmctld_diag_stats.bf_active = 0; |
| slurm_mutex_unlock(&check_bf_running_lock); |
| |
| short_sleep = false; |
| } |
| FREE_NULL_LIST(het_job_list); |
| xhash_free(user_usage_map); /* May have been init'ed if used */ |
| FREE_NULL_BITMAP(planned_bitmap); |
| |
| return NULL; |
| } |
| |
| /* |
| * Clear the start_time and sched_nodes for all pending jobs. This is used to |
| * ensure that a job which can run in multiple partitions has its start_time and |
| * sched_nodes set to the partition offering the earliest start_time. |
| */ |
| static int _clear_job_estimates(void *x, void *arg) |
| { |
| job_record_t *job_ptr = (job_record_t *) x; |
| if (IS_JOB_PENDING(job_ptr)) { |
| job_ptr->start_time = 0; |
| xfree(job_ptr->sched_nodes); |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Return non-zero to break the backfill loop if change in job, node, |
| * reservation or partition state or the backfill scheduler needs to be stopped. |
| */ |
| static int _yield_locks(int64_t usec) |
| { |
| slurmctld_lock_t all_locks = { |
| READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; |
| time_t job_update, node_update, part_update, config_update, resv_update; |
| bool load_config = false; |
| |
| job_update = last_job_update; |
| node_update = last_node_update; |
| part_update = last_part_update; |
| config_update = slurm_conf.last_update; |
| resv_update = last_resv_update; |
| |
| unlock_slurmctld(all_locks); |
| while (!stop_backfill) { |
| bf_sleep_usec += _my_sleep(usec); |
| slurm_mutex_lock(&slurmctld_config.thread_count_lock); |
| if ((max_rpc_cnt == 0) || |
| (slurmctld_config.server_thread_count <= yield_rpc_cnt)) { |
| slurm_mutex_unlock(&slurmctld_config.thread_count_lock); |
| break; |
| } |
| verbose("continuing to yield locks, %d RPCs pending", |
| slurmctld_config.server_thread_count); |
| slurm_mutex_unlock(&slurmctld_config.thread_count_lock); |
| } |
| lock_slurmctld(all_locks); |
| slurm_mutex_lock(&config_lock); |
| if (config_flag) |
| load_config = true; |
| slurm_mutex_unlock(&config_lock); |
| |
| if (((!backfill_continue) && ((last_job_update != job_update) || |
| (last_node_update != node_update))) || |
| (last_part_update != part_update) || |
| (slurm_conf.last_update != config_update) || |
| (validate_resv_cnt != 0) || |
| (last_resv_update != resv_update) || |
| stop_backfill || load_config) |
| return 1; |
| else |
| return 0; |
| } |
| |
| /* Test if this job still has access to the specified partition. The job's |
| * available partitions may have changed when locks were released */ |
| static bool _job_part_valid(job_record_t *job_ptr, part_record_t *part_ptr) |
| { |
| part_record_t *avail_part_ptr; |
| list_itr_t *part_iterator; |
| bool rc = false; |
| |
| if (job_ptr->part_ptr_list) { |
| part_iterator = list_iterator_create(job_ptr->part_ptr_list); |
| while ((avail_part_ptr = list_next(part_iterator))) { |
| if (avail_part_ptr == part_ptr) { |
| rc = true; |
| break; |
| } |
| } |
| list_iterator_destroy(part_iterator); |
| } else if (job_ptr->part_ptr == part_ptr) { |
| rc = true; |
| } |
| |
| return rc; |
| } |
| |
| /* Determine if job in the backfill queue is still runnable. |
| * Job state could change when locks are periodically released */ |
| static bool _job_runnable_now(job_record_t *job_ptr) |
| { |
| if (IS_JOB_REVOKED(job_ptr)) { |
| log_flag(BACKFILL, "%pJ revoked during bf yield", job_ptr); |
| return false; |
| } |
| if (!IS_JOB_PENDING(job_ptr)) { /* Started in other partition */ |
| log_flag(BACKFILL, "%pJ started in other partition during bf yield", |
| job_ptr); |
| return false; |
| } |
| if (job_ptr->priority == 0) { /* Job has been held */ |
| log_flag(BACKFILL, "%pJ job held during bf yield", job_ptr); |
| return false; |
| } |
| if (IS_JOB_COMPLETING(job_ptr)) { /* Started, requeue and completing */ |
| log_flag(BACKFILL, "%pJ job started during bf yield", job_ptr); |
| return false; |
| } |
| /* |
| * Already reserved resources for either bf_max_job_array_resv or |
| * max_run_tasks number of jobs in the array. If max_run_tasks is 0, it |
| * wasn't set, so ignore it. |
| */ |
| if (job_ptr->array_recs && |
| ((job_ptr->array_recs->pend_run_tasks >= bf_max_job_array_resv) || |
| (job_ptr->array_recs->max_run_tasks && |
| ((job_ptr->array_recs->pend_run_tasks + |
| job_ptr->array_recs->tot_run_tasks) >= |
| job_ptr->array_recs->max_run_tasks)))) |
| return false; |
| |
| return true; |
| } |
| |
| static void _restore_preempt_state(job_record_t *job_ptr, |
| time_t *tmp_preempt_start_time, |
| bool *tmp_preempt_in_progress) |
| { |
| if ((*tmp_preempt_start_time != 0) |
| && (job_ptr->details->preempt_start_time == 0)) { |
| job_ptr->details->preempt_start_time = |
| *tmp_preempt_start_time; |
| job_ptr->preempt_in_progress = *tmp_preempt_in_progress; |
| } |
| |
| *tmp_preempt_start_time = 0; |
| *tmp_preempt_in_progress = false; |
| } |
| |
| /* |
| * IN/OUT: prio to be adjusted |
| * IN: value from current component partition |
| */ |
| static void _adjust_hetjob_prio(uint32_t *prio, uint32_t val) |
| { |
| if (!*prio) |
| *prio = val; |
| else if (bf_hetjob_prio & HETJOB_PRIO_MIN) |
| *prio = MIN(*prio, val); |
| else if (bf_hetjob_prio & HETJOB_PRIO_MAX) |
| *prio = MAX(*prio, val); |
| else if (bf_hetjob_prio & HETJOB_PRIO_AVG) |
| *prio += val; |
| } |
| |
| /* |
| * IN: job_record pointer of a hetjob leader (caller responsible) |
| * RET: [min|max|avg] Priority of all components from same hetjob |
| */ |
| static uint32_t _hetjob_calc_prio(job_record_t *het_leader) |
| { |
| job_record_t *het_comp = NULL; |
| uint32_t prio = 0, tmp = 0, cnt = 0, i = 0, nparts = 0; |
| list_itr_t *iter = NULL; |
| |
| if (bf_hetjob_prio & HETJOB_PRIO_MIN) |
| prio = INFINITE; |
| |
| iter = list_iterator_create(het_leader->het_job_list); |
| while ((het_comp = list_next(iter))) { |
| if (het_comp->part_ptr_list && |
| het_comp->prio_mult && |
| het_comp->prio_mult->priority_array && |
| (nparts = list_count(het_comp->part_ptr_list))) { |
| for (i = 0; i < nparts; i++) { |
| tmp = het_comp->prio_mult->priority_array[i]; |
| if (tmp == 0) { /* job held */ |
| prio = 0; |
| break; |
| } |
| _adjust_hetjob_prio(&prio, tmp); |
| cnt++; |
| } |
| if (prio == 0) /* job held */ |
| break; |
| } else { |
| tmp = het_comp->priority; |
| if (tmp == 0) { /* job held */ |
| prio = 0; |
| break; |
| } |
| _adjust_hetjob_prio(&prio, tmp); |
| cnt++; |
| } |
| if ((bf_hetjob_prio & HETJOB_PRIO_MIN) && (prio == 1)) |
| break; /* Can not get lower */ |
| } |
| list_iterator_destroy(iter); |
| if (prio && cnt && (bf_hetjob_prio & HETJOB_PRIO_AVG)) |
| prio /= cnt; |
| |
| return prio; |
| } |
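| |
| /* |
| * Example (hypothetical priorities): for components with priorities |
| * {10, 20, 60}, bf_hetjob_prio=min yields 10, max yields 60 and avg |
| * yields (10 + 20 + 60) / 3 = 30. A held component (priority 0) forces |
| * the whole hetjob's priority to 0. |
| */ |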
| |
| /* |
| * IN: job_record pointer of a hetjob leader (caller responsible) |
| * RET: [min|max|avg] PriorityTier of all components from same hetjob |
| */ |
| static uint32_t _hetjob_calc_prio_tier(job_record_t *het_leader) |
| { |
| job_record_t *het_comp = NULL; |
| part_record_t *part_ptr = NULL; |
| uint32_t prio_tier = 0, tmp = 0, cnt = 0; |
| list_itr_t *iter = NULL, *iter2 = NULL; |
| |
| if (bf_hetjob_prio & HETJOB_PRIO_MIN) |
| prio_tier = NO_VAL16 - 1; |
| |
| iter = list_iterator_create(het_leader->het_job_list); |
| while ((het_comp = list_next(iter))) { |
| if (het_comp->part_ptr_list && |
| list_count(het_comp->part_ptr_list)) { |
| iter2 = list_iterator_create(het_comp->part_ptr_list); |
| while ((part_ptr = list_next(iter2))) { |
| tmp = part_ptr->priority_tier; |
| _adjust_hetjob_prio(&prio_tier, tmp); |
| cnt++; |
| } |
| list_iterator_destroy(iter2); |
| } else { |
| tmp = het_comp->part_ptr->priority_tier; |
| _adjust_hetjob_prio(&prio_tier, tmp); |
| cnt++; |
| } |
| if ((bf_hetjob_prio & HETJOB_PRIO_MIN) && (prio_tier == 0)) |
| break; /* Minimum found. */ |
| if ((bf_hetjob_prio & HETJOB_PRIO_MAX) && |
| (prio_tier == (NO_VAL16 - 1))) |
| break; /* Maximum found. */ |
| } |
| list_iterator_destroy(iter); |
| if (prio_tier && cnt && (bf_hetjob_prio & HETJOB_PRIO_AVG)) |
| prio_tier /= cnt; |
| |
| return prio_tier; |
| } |
| |
| /* |
| * IN: job_record pointer of a hetjob leader (caller responsible) |
| * RET: true if any component from same hetjob has a reservation |
| */ |
| static bool _hetjob_any_resv(job_record_t *het_leader) |
| { |
| job_record_t *het_comp = NULL; |
| list_itr_t *iter = NULL; |
| bool any_resv = false; |
| |
| iter = list_iterator_create(het_leader->het_job_list); |
| while (!any_resv && (het_comp = list_next(iter))) { |
| if (het_comp->resv_id != 0) |
| any_resv = true; |
| } |
| list_iterator_destroy(iter); |
| |
| return any_resv; |
| } |
| |
| static int _foreach_het_job_details(void *x, void *arg) |
| { |
| job_record_t *job_ptr = (job_record_t *) x; |
| job_ptr->het_details = (het_job_details_t *)arg; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _bf_reserve_resv_licenses(void *x, void *arg) |
| { |
| slurmctld_resv_t *resv_ptr = x; |
| node_space_handler_t *ns_h = arg; |
| node_space_map_t *node_space = ns_h->node_space; |
| int *ns_recs_ptr = ns_h->node_space_recs; |
| time_t start_time, end_time; |
| job_record_t fake_job = { |
| .license_list = resv_ptr->license_list, |
| .resv_ptr = resv_ptr, |
| }; |
| |
| if (!resv_ptr->license_list) |
| return 0; |
| |
| if (resv_ptr->end_time < node_space[0].begin_time) |
| return 0; |
| |
| /* treat flex reservations as always active */ |
| if (resv_ptr->flags & RESERVE_FLAG_FLEX) { |
| start_time = 0; |
| end_time = INFINITE; |
| } else { |
| /* align start and end to the backfill resolution */ |
| start_time = resv_ptr->start_time / backfill_resolution; |
| start_time *= backfill_resolution; |
| end_time = ROUNDUP(resv_ptr->end_time, backfill_resolution); |
| end_time *= backfill_resolution; |
| } |
| |
| _add_reservation(start_time, end_time, NULL, &fake_job, node_space, |
| ns_recs_ptr, 0); |
| |
| return 0; |
| } |
| |
| static int _bf_reserve_running(void *x, void *arg) |
| { |
| job_record_t *job_ptr = (job_record_t *) x; |
| node_space_handler_t *ns_h = (node_space_handler_t *) arg; |
| node_space_map_t *node_space = ns_h->node_space; |
| int *ns_recs_ptr = ns_h->node_space_recs; |
| time_t end_time = job_ptr->end_time; |
| bool licenses, whole, preemptable; |
| bitstr_t *tmp_bitmap; |
| |
| if (!job_ptr || !IS_JOB_RUNNING(job_ptr) || !job_ptr->job_resrcs) |
| return SLURM_SUCCESS; |
| |
| whole = (job_ptr->job_resrcs->whole_node & WHOLE_NODE_REQUIRED) || |
| (IS_JOB_WHOLE_TOPO(job_ptr)); |
| |
| licenses = (job_ptr->license_list); |
| |
| if (!whole && !licenses) |
| return SLURM_SUCCESS; |
| |
| preemptable = (slurm_job_preempt_mode(job_ptr) != PREEMPT_MODE_OFF); |
| |
| if (preemptable && !licenses) |
| return SLURM_SUCCESS; |
| |
| if (*ns_recs_ptr >= bf_node_space_size) |
| return SLURM_ERROR; |
| |
| if (soft_time_limit && job_ptr->time_min) { |
| time_t now = time(NULL); |
| time_t soft_end = job_ptr->start_time + job_ptr->time_min * 60; |
| /* |
| * If over the soft limit, assume the job will use half of the |
| * remaining time until the hard limit. |
| */ |
| if (soft_end < now) |
| soft_end = now + (end_time - now) / 2; |
| end_time = soft_end; |
| } |
| |
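| /* |
| * Round the end time up to the next backfill_resolution boundary, |
| * e.g. with a (hypothetical) 60s resolution an end at t=1219 maps |
| * to t=1260 in the node space map. |
| */ |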
| end_time = ROUNDUP(end_time, backfill_resolution) * backfill_resolution; |
| |
| if (preemptable || !whole) { |
| /* Reservation only needed for licenses. */ |
| tmp_bitmap = bit_alloc(node_record_count); |
| } else { |
| tmp_bitmap = bit_copy(job_ptr->node_bitmap); |
| } |
| |
| /* |
| * Ensure reservation start time is aligned to the start of the |
| * backfill map by sending 0 in instead of the actual start time. |
| * A long-running backfill cycle could lead to a skew of a few |
| * seconds - or significantly longer with bf_continue set - which |
| * would fragment the start of the backfill map. |
| */ |
| _add_reservation(0, end_time, tmp_bitmap, job_ptr, node_space, |
| ns_recs_ptr, 0); |
| |
| FREE_NULL_BITMAP(tmp_bitmap); |
| |
| return SLURM_SUCCESS; |
| } |
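| |
| /* |
| * soft_time_limit example (hypothetical times): with now = 1000, a hard |
| * end_time = 2000 and the time_min window already expired, the job above |
| * is assumed to end at 1000 + (2000 - 1000) / 2 = 1500 in the map. |
| */ |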
| |
| static int _set_hetjob_details(void *x, void *arg) |
| { |
| job_record_t *job_ptr = (job_record_t *) x; |
| het_job_details_t *details = NULL; |
| |
| if (IS_JOB_PENDING(job_ptr) && job_ptr->het_job_id && |
| !job_ptr->het_job_offset && job_ptr->het_job_list) { |
| /* |
| * Pending hetjob leader component. Do calculations only once |
| * for whole hetjob. xmalloc memory for 1 het_details struct, |
| * but make the pointer accessible in all hetjob components. |
| */ |
| if (!job_ptr->het_details) |
| job_ptr->het_details = |
| xmalloc(sizeof(het_job_details_t)); |
| |
| details = job_ptr->het_details; |
| details->any_resv = _hetjob_any_resv(job_ptr); |
| details->priority_tier = _hetjob_calc_prio_tier(job_ptr); |
| details->priority = _hetjob_calc_prio(job_ptr); |
| |
| list_for_each(job_ptr->het_job_list, |
| _foreach_het_job_details, details); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* Fetch key from xhash_t item. Called from function ptr */ |
| static void _bf_map_key_id(void *item, const char **key, uint32_t *key_len) |
| { |
| bf_user_usage_t *user = (bf_user_usage_t *)item; |
| |
| xassert(user); |
| |
| *key = (char *)&user->uid; |
| *key_len = sizeof(uid_t); |
| } |
| |
| /* Free item from xhash_t. Called from function ptr */ |
| static void _bf_map_free(void *item) |
| { |
| bf_user_usage_t *user = (bf_user_usage_t *)item; |
| |
| if (!user) |
| return; |
| |
| slurmdb_destroy_bf_usage_members(&user->bf_usage); |
| xfree(user); |
| } |
| |
| /* Allocate new user and add to xhash_t map */ |
| static bf_user_usage_t *_bf_map_add_user(xhash_t *map, uid_t uid) |
| { |
| bf_user_usage_t *user = xmalloc(sizeof(bf_user_usage_t)); |
| user->uid = uid; |
| xhash_add(map, user); |
| return user; |
| } |
| |
| /* Find user usage from uid. Add new empty entry to map if not found */ |
| static slurmdb_bf_usage_t *_bf_map_find_add(xhash_t* map, uid_t uid) |
| { |
| bf_user_usage_t *user; |
| xassert(map != NULL); |
| |
| if (!(user = xhash_get(map, (char *)&uid, sizeof(uid_t)))) |
| user = _bf_map_add_user(map, uid); |
| return &user->bf_usage; |
| } |
| |
| /* |
| * Check if limit exceeded. Reset usage if usage time is before current |
| * scheduling iteration time |
| */ |
| static bool _check_bf_usage( |
| slurmdb_bf_usage_t *usage, int limit, time_t sched_time) |
| { |
| if (usage->last_sched < sched_time) { |
| usage->last_sched = sched_time; |
| usage->count = 0; |
| return false; |
| } |
| return usage->count >= limit; |
| } |
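| |
| /* |
| * Example (hypothetical limit = 2): the first call in a new scheduling |
| * iteration resets count to 0 and returns false; after the caller has |
| * incremented count twice within the same iteration, the next call |
| * returns true and the job is skipped. |
| */ |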
| |
| /* |
| * Check if job exceeds configured count limits |
| * returns true if count exceeded |
| */ |
| static bool _job_exceeds_max_bf_param(job_record_t *job_ptr, |
| time_t sched_start) |
| { |
| slurmdb_bf_usage_t *part_usage = NULL, *user_usage = NULL, |
| *assoc_usage = NULL, *user_part_usage = NULL; |
| |
| slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr; |
| part_record_t *part_ptr = job_ptr->part_ptr; |
| |
| if (max_backfill_job_per_user_part) { |
| xassert(part_ptr->bf_data); |
| user_part_usage = _bf_map_find_add( |
| part_ptr->bf_data->user_usage, |
| job_ptr->user_id); |
| if (_check_bf_usage(user_part_usage, |
| max_backfill_job_per_user_part, |
| sched_start)) { |
| log_flag(BACKFILL, "have already checked %u jobs for user %u on partition %s; skipping job %u, %pJ", |
| max_backfill_job_per_user_part, |
| job_ptr->user_id, job_ptr->part_ptr->name, |
| job_ptr->job_id, job_ptr); |
| return true; |
| } |
| } |
| |
| if (max_backfill_job_per_part) { |
| xassert(part_ptr->bf_data); |
| part_usage = part_ptr->bf_data->job_usage; |
| if (_check_bf_usage(part_usage, max_backfill_job_per_part, |
| sched_start)) { |
| log_flag(BACKFILL, "have already checked %u jobs for partition %s; skipping %pJ", |
| max_backfill_job_per_part, |
| job_ptr->part_ptr->name, job_ptr); |
| return true; |
| } |
| } |
| |
| if (max_backfill_job_per_assoc) { |
| if (assoc_ptr) { |
| if (!assoc_ptr->bf_usage) |
| assoc_ptr->bf_usage = |
| xmalloc(sizeof(slurmdb_bf_usage_t)); |
| assoc_usage = assoc_ptr->bf_usage; |
| |
| if (_check_bf_usage(assoc_usage, |
| max_backfill_job_per_assoc, |
| sched_start)) { |
| log_flag(BACKFILL, "have already checked %u jobs for user %u, assoc %u; skipping %pJ", |
| max_backfill_job_per_assoc, |
| job_ptr->user_id, job_ptr->assoc_id, |
| job_ptr); |
| return true; |
| } |
| } else { |
| /* Null assoc_ptr indicates no database */ |
| log_flag(BACKFILL, "no assoc for job %u, required for parameter bf_max_job_per_assoc", |
| job_ptr->job_id); |
| assoc_usage = NULL; |
| } |
| } |
| |
| if (max_backfill_job_per_user) { |
| if (assoc_ptr && assoc_ptr->user_rec) { |
| if (!assoc_ptr->user_rec->bf_usage) |
| assoc_ptr->user_rec->bf_usage = |
| xmalloc(sizeof(slurmdb_bf_usage_t)); |
| user_usage = assoc_ptr->user_rec->bf_usage; |
| } else { |
| /* No database, or user rec missing from assoc */ |
| if (!user_usage_map) |
| user_usage_map = xhash_init(_bf_map_key_id, |
| _bf_map_free); |
| user_usage = _bf_map_find_add(user_usage_map, |
| job_ptr->user_id); |
| } |
| |
| if (_check_bf_usage(user_usage, max_backfill_job_per_user, |
| sched_start)) { |
| log_flag(BACKFILL, "have already checked %u jobs for user %u; skipping %pJ", |
| max_backfill_job_per_user, job_ptr->user_id, |
| job_ptr); |
| return true; |
| } |
| } |
| |
| /* |
| * Don't count queue records for magnetic reservation against |
| * backfill limits. |
| */ |
| if ((job_ptr->bit_flags & JOB_MAGNETIC) && !bf_allow_magnetic_slot) |
| return false; |
| |
| /* Increment our user/partition limit counters as needed */ |
| if (user_part_usage) |
| user_part_usage->count++; |
| if (part_usage) |
| part_usage->count++; |
| if (user_usage) |
| user_usage->count++; |
| if (assoc_usage) |
| assoc_usage->count++; |
| return false; |
| } |
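| |
| /* |
|  * Note: the counters above are only incremented after every applicable |
|  * limit check has passed, so a job skipped by one limit does not consume |
|  * quota from the others. |
|  */ |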
| |
| /* |
| * Handle the planned list. |
| * set - If true we are setting states, else we clear them. |
| */ |
| static void _handle_planned(bool set) |
| { |
| node_record_t *node_ptr; |
| bool node_update = false, select_synced = false; |
| |
| if (!planned_bitmap) |
| return; |
| |
| for (int n = 0; (n = bit_ffs_from_bit(planned_bitmap, n)) >= 0; n++) { |
| if (!(node_ptr = node_record_table_ptr[n])) { |
| /* Node could have been deleted while planned */ |
| bit_clear(planned_bitmap, n); |
| continue; |
| } |
| if (set) { |
| /* |
| * If the node is fully allocated ignore this flag. |
| * This only really matters for IDLE and MIXED. |
| */ |
| if (IS_NODE_ALLOCATED(node_ptr)) { |
| uint16_t idle_cpus = 0; |
| |
| if (!select_synced) { |
| select_g_select_nodeinfo_set_all(); |
| select_synced = true; |
| } |
| |
| idle_cpus = node_ptr->cpus_efctv - |
| node_ptr->alloc_cpus; |
| if (idle_cpus && |
| (idle_cpus < node_ptr->cpus_efctv)) |
| /* Mixed node as planned */ |
| goto mixed; |
| |
| /* |
| * Node fully allocated. Remove from planned. |
| 				 * This happens when a mixed node becomes |
| 				 * fully allocated while looping in |
| 				 * _attempt_backfill() (the BF sched loop). |
| */ |
| node_ptr->node_state &= ~NODE_STATE_PLANNED; |
| node_update = true; |
| bit_clear(planned_bitmap, n); |
| } else { |
| /* Idle node as planned */ |
| mixed: |
| node_ptr->node_state |= NODE_STATE_PLANNED; |
| node_update = true; |
| } |
| } else { |
| /* Reset planned state for all nodes */ |
| node_ptr->node_state &= ~NODE_STATE_PLANNED; |
| node_update = true; |
| bit_clear(planned_bitmap, n); |
| } |
| |
| log_flag(BACKFILL, "%s: %s state is %s", |
| set ? "set" : "cleared", |
| node_ptr->name, |
| node_state_string(node_ptr->node_state)); |
| } |
| |
| if (node_update) |
| last_node_update = time(NULL); |
| } |
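| |
| /* |
|  * Sketch of the intended call pattern (inferred from its use in |
|  * _attempt_backfill() below): |
|  * |
|  *	_handle_planned(false);	clear stale PLANNED state at cycle start |
|  *	... bits set in planned_bitmap as reservations are created ... |
|  *	_handle_planned(true);	publish PLANNED state before yielding locks |
|  */ |
| |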
| static void _set_slot_time(job_record_t *job_ptr, uint32_t time_limit, |
| uint32_t boot_time, uint32_t *start, uint32_t *end) |
| { |
| *start = job_ptr->start_time; |
| *end = *start + boot_time + (time_limit * 60) + backfill_resolution - 1; |
| |
| *start = (*start / backfill_resolution) * backfill_resolution; |
| *end = (*end / backfill_resolution) * backfill_resolution; |
| } |
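| |
| /* |
|  * Worked example (assumed values): with backfill_resolution = 60, |
|  * boot_time = 0, time_limit = 1 (minute) and job start_time = 130: |
|  * |
|  *	end   = 130 + 60 + 59 = 249 |
|  *	start = (130 / 60) * 60 = 120 |
|  *	end   = (249 / 60) * 60 = 240 |
|  * |
|  * The "+ backfill_resolution - 1" rounds end up to the next boundary, |
|  * so the [120, 240] slot still covers the full run ending at 190. |
|  */ |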
| |
| /* |
|  * Mark each node's user status and last job end time. |
|  * Returns 1 if a node's last_job_end was updated, otherwise 0. |
|  */ |
| static int _mark_nodes_usage(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| node_used_t *nodes_used = arg; |
| bool last_job_end_updated = false; |
| bool owned; |
| |
| int i; |
| |
| xassert(job_ptr); |
| xassert(nodes_used); |
| |
| if (IS_JOB_PENDING(job_ptr) || IS_JOB_COMPLETED(job_ptr) || |
| !job_ptr->node_bitmap) |
| return last_job_end_updated; |
| |
| owned = ((job_ptr->details->whole_node & WHOLE_NODE_USER) || |
| (job_ptr->part_ptr && |
| (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER))); |
| |
| for (i = 0; (i = bit_ffs_from_bit(job_ptr->node_bitmap, i)) >= 0; i++) { |
| if (!nodes_used[i].allocated) { |
| nodes_used[i].allocated = true; |
| nodes_used[i].uid = job_ptr->user_id; |
| nodes_used[i].node_index = i; |
| nodes_used[i].owned = owned; |
| } else if (!nodes_used[i].owned && !nodes_used[i].mixed_user) { |
| nodes_used[i].mixed_user = |
| nodes_used[i].uid != job_ptr->user_id; |
| nodes_used[i].owned = owned; |
| } |
| |
| if (!nodes_used[i].mcs_label && job_ptr->mcs_label && |
| slurm_mcs_get_select(job_ptr) == 1) { |
| /* |
| * We do not need to copy mcs_label, jobs are not purged |
| * during backfill, so this memory should always be |
| * valid. |
| */ |
| nodes_used[i].mcs_label = job_ptr->mcs_label; |
| } |
| |
| if (nodes_used[i].last_job_end < job_ptr->end_time) { |
| nodes_used[i].last_job_end = job_ptr->end_time; |
| last_job_end_updated = true; |
| } |
| } |
| |
| return last_job_end_updated; |
| } |
| |
| static int _cmp_last_job_end(void *x, void *y) |
| { |
| node_used_t *node1 = *(node_used_t **) x; |
| node_used_t *node2 = *(node_used_t **) y; |
| if (node1->last_job_end < node2->last_job_end) |
| return 1; |
| else if (node1->last_job_end > node2->last_job_end) |
| return -1; |
| return 0; |
| } |
| |
| /* For each node, find whether it has multiple users and the latest job end */ |
| static void _init_node_used_array_and_list(node_used_t **nodes_used, |
| list_t **nodes_used_list) |
| { |
| xassert(nodes_used && !*nodes_used); |
| xassert(nodes_used_list && !*nodes_used_list); |
| |
| *nodes_used = xcalloc(node_record_count, sizeof(**nodes_used)); |
| *nodes_used_list = list_create(NULL); /* NULL to avoid double free */ |
| |
| list_for_each(job_list, _mark_nodes_usage, *nodes_used); |
| |
| for (int i = 0; i < node_record_count; i++) |
| list_append(*nodes_used_list, &(*nodes_used)[i]); |
| /* Sort list in descending order of last_job_end */ |
| list_sort(*nodes_used_list, _cmp_last_job_end); |
| } |
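| |
| /* |
|  * Note: the array and the list reference the same node_used_t records. |
|  * The array is indexed by node index for direct lookups, while the list |
|  * orders the same entries by descending last_job_end so scans can stop |
|  * early (see _rm_node_or_delay_start()). The NULL list destructor above |
|  * exists so that freeing the array frees each record exactly once. |
|  */ |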
| |
| static bool _user_conflicts(bool is_exclusive_user, bool job_user_on_node, |
| node_used_t *node) |
| { |
| if (is_exclusive_user && !node->mixed_user && job_user_on_node) |
| return false; /* user alone on node */ |
| if (!is_exclusive_user && (!node->owned || job_user_on_node)) |
| return false; /* node not owned or the user owns the node */ |
| return true; /* can't use node due to user conflict */ |
| } |
| |
| static bool _mcs_label_conflicts(char *job_mcs_label, char *node_mcs_label) |
| { |
| if (job_mcs_label && !xstrcmp(node_mcs_label, job_mcs_label)) |
| return false; /* node already has required mcs_label */ |
| if (!job_mcs_label && !node_mcs_label) |
| return false; /* node can't have mcs_label and it doesn't */ |
| return true; /* can't use node due to mcs_label conflict */ |
| } |
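| |
| /* |
|  * Summary of the two conflict helpers above: _user_conflicts() returns |
|  * false when the job's user may share the node under the exclusivity |
|  * rules in force, and _mcs_label_conflicts() returns false when the |
|  * node's mcs_label matches the job's or neither side has one. A node is |
|  * rejected only if at least one of them returns true. |
|  */ |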
| |
| /* |
|  * Check if a node can be used; if not, remove it. If the node can't be |
|  * removed, delay the job's start time instead. |
|  * Return true to stop scanning (the start was delayed, or the remaining |
|  * nodes in the sorted list cannot conflict). |
| */ |
| static int _rm_node_or_delay_start(void *x, void *arg) |
| { |
| node_used_t *node = x; |
| filter_exclusive_args_t *args = arg; |
| bool job_user_on_node = node->uid == args->job_user; |
| |
| if (!node->allocated) |
| return true; /* following nodes are idle */ |
| if (node->last_job_end <= args->start_time) |
| return true; /* following nodes will be idle by start_time */ |
| if (!bit_test(args->node_bitmap, node->node_index)) |
| return false; /* not available to start with */ |
| if (!_user_conflicts(args->is_exclusive_user, job_user_on_node, node) && |
| !_mcs_label_conflicts(args->mcs_label, node->mcs_label)) |
| return false; /* job user and mcs don't conflict with node's */ |
| |
| /* can't use this node */ |
| *(args->later_start) = node->last_job_end; |
| |
| if ((args->node_cnt > args->min_nodes) && |
| (!args->req_nodes || |
| !bit_test(args->req_nodes, node->node_index))) { |
| 		/* able to remove the node */ |
| bit_clear(args->node_bitmap, node->node_index); |
| args->node_cnt--; |
| return false; |
| } |
| |
| /* can't remove the node, delay job start */ |
| args->delay_start = true; |
| return true; |
| } |
| |
| /* Return true if start_time was delayed */ |
| static bool _filter_exclusive_user_mcs_nodes(job_record_t *job_ptr, |
| int mcs_select, |
| uint32_t min_nodes, |
| list_t *nodes_used_list, |
| time_t start_time, |
| time_t *later_filter_start, |
| bitstr_t *node_bitmap) |
| { |
| *later_filter_start = 0; |
| filter_exclusive_args_t args = { |
| .min_nodes = min_nodes, |
| .job_user = job_ptr->user_id, |
| .node_bitmap = node_bitmap, |
| .req_nodes = job_ptr->details->req_node_bitmap, |
| .node_cnt = bit_set_count(node_bitmap), |
| .later_start = later_filter_start, |
| .start_time = start_time, |
| }; |
| |
| /* |
| 	 * If is_exclusive_user is true, filter out any nodes used by other |
| 	 * users; otherwise filter out nodes owned by other users. |
| */ |
| if ((job_ptr->details->whole_node & WHOLE_NODE_USER) || |
| (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)) |
| args.is_exclusive_user = true; |
| |
| /* Need to filter out any nodes allocated with other mcs */ |
| args.mcs_label = (mcs_select == 1) ? job_ptr->mcs_label : NULL; |
| |
| /* Note that nodes_used_list is sorted in descending order of job end */ |
| list_find_first(nodes_used_list, _rm_node_or_delay_start, &args); |
| |
| return args.delay_start; |
| } |
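| |
| /* |
|  * On return, *later_filter_start (when non-zero) holds the last_job_end |
|  * of a conflicting node, which callers use as a candidate time to retry |
|  * the job when its start had to be delayed. |
|  */ |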
| |
| /* This is for use in _attempt_backfill() only */ |
| #define SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, later_start, \ |
| orig_time_limit, orig_start_time) \ |
| { \ |
| _set_job_time_limit(job_ptr, orig_time_limit); \ |
| if (later_start && !job_no_reserve) { \ |
| log_flag(BACKFILL, "Try later %pJ later_start %ld", \ |
| job_ptr, later_start); \ |
| job_ptr->start_time = 0; \ |
| goto TRY_LATER; \ |
| } \ |
| /* \ |
| 	 * Job cannot start until too far in the future. \ |
| 	 * Fall back to orig_start_time; if the job can't start in \ |
| 	 * a different partition it will be 0. \ |
| */ \ |
| log_flag(BACKFILL, "Can't schedule %pJ in partition %s", \ |
| job_ptr, job_ptr->part_ptr->name); \ |
| job_ptr->start_time = orig_start_time; \ |
| continue; /* not runnable in this partition */ \ |
| } |
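| |
| /* |
|  * Note: the macro expands to a "goto TRY_LATER" or a "continue", so it |
|  * is only valid inside the main job loop of _attempt_backfill() below. |
|  */ |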
| |
| static void _attempt_backfill(void) |
| { |
| DEF_TIMERS; |
| list_t *job_queue = NULL; |
| job_queue_rec_t *job_queue_rec = NULL; |
| int bb, i, j, node_space_recs, mcs_select = 0; |
| slurmdb_qos_rec_t *qos_ptr = NULL; |
| job_record_t *job_ptr = NULL; |
| part_record_t *part_ptr; |
| uint32_t end_time, end_reserve, deadline_time_limit, boot_time; |
| uint32_t orig_end_time; |
| 	uint32_t time_limit, comp_time_limit, orig_time_limit = 0; |
| 	uint32_t part_time_limit; |
| uint32_t min_nodes, max_nodes, req_nodes; |
| bitstr_t *active_bitmap = NULL, *avail_bitmap = NULL; |
| bitstr_t *resv_bitmap = NULL, *excluded_topo_bitmap = NULL; |
| time_t now, sched_start, later_start, start_res, resv_end, window_end; |
| time_t het_job_time, orig_sched_start, orig_start_time = (time_t) 0; |
| time_t later_filter_start; |
| node_space_map_t *node_space; |
| node_used_t *nodes_used = NULL; |
| list_t *nodes_used_list = NULL; |
| struct timeval bf_time1, bf_time2; |
| int error_code; |
| int job_test_count = 0, test_time_count = 0, pend_time; |
| bool already_counted, many_rpcs = false; |
| job_record_t *reject_array_job = NULL; |
| part_record_t *reject_array_part = NULL; |
| slurmdb_qos_rec_t *reject_array_qos = NULL; |
| slurmctld_resv_t *reject_array_resv = NULL; |
| bool reject_array_use_prefer = false; |
| uint32_t start_time, array_start_time = 0; |
| struct timeval start_tv; |
| uint32_t test_array_job_id = 0; |
| uint32_t test_array_count = 0; |
| uint32_t job_no_reserve; |
| bool is_job_array_head, resv_overlap = false; |
| uint8_t save_share_res = 0, save_whole_node = 0; |
| int test_fini; |
| uint32_t qos_flags = 0; |
| time_t qos_blocked_until = 0, qos_part_blocked_until = 0; |
| time_t tmp_preempt_start_time = 0; |
| bool tmp_preempt_in_progress = false; |
| bitstr_t *tmp_bitmap = NULL; |
| bool state_changed_break = false, nodes_planned = false; |
| bitstr_t *next_bitmap = NULL, *current_bitmap = NULL; |
| resv_exc_t resv_exc = { 0 }; |
| will_run_data_t will_run_data = { 0 }; |
| bool overlap_tested = false; |
| /* QOS Read lock */ |
| assoc_mgr_lock_t qos_read_lock = { |
| .qos = READ_LOCK, |
| }; |
| |
| bf_sleep_usec = 0; |
| job_start_cnt = 0; |
| job_test_cnt = 0; |
| |
| if (!fed_mgr_sibs_synced()) { |
| info("returning, federation siblings not synced yet"); |
| return; |
| } |
| |
| (void) bb_g_load_state(false); |
| |
| START_TIMER; |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) |
| info("beginning"); |
| else |
| debug("beginning"); |
| sched_start = orig_sched_start = now = time(NULL); |
| gettimeofday(&start_tv, NULL); |
| |
| _handle_planned(nodes_planned); |
| |
| job_queue = build_job_queue(true, true); |
| job_test_count = list_count(job_queue); |
| if (job_test_count == 0) { |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) |
| info("no jobs to backfill"); |
| else |
| debug("no jobs to backfill"); |
| FREE_NULL_LIST(job_queue); |
| return; |
| } else |
| debug("%u jobs to backfill", job_test_count); |
| |
| list_for_each(job_list, _clear_job_estimates, NULL); |
| |
| if (bf_hetjob_prio) |
| list_for_each(job_list, _set_hetjob_details, NULL); |
| |
| gettimeofday(&bf_time1, NULL); |
| |
| slurmctld_diag_stats.bf_queue_len = job_test_count; |
| 	slurmctld_diag_stats.bf_queue_len_sum += |
| 		slurmctld_diag_stats.bf_queue_len; |
| job_test_count = 0; |
| |
| slurmctld_diag_stats.bf_last_depth = 0; |
| slurmctld_diag_stats.bf_last_depth_try = 0; |
| slurmctld_diag_stats.bf_when_last_cycle = now; |
| |
| node_space = xcalloc((bf_node_space_size + 1), |
| sizeof(node_space_map_t)); |
| node_space[0].begin_time = sched_start / backfill_resolution; |
| node_space[0].begin_time *= backfill_resolution; |
| window_end = (sched_start + backfill_window) / backfill_resolution; |
| window_end *= backfill_resolution; |
| node_space[0].end_time = window_end; |
| |
| node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); |
| /* Make "resuming" nodes available to be scheduled in backfill */ |
| bit_or(node_space[0].avail_bitmap, rs_node_bitmap); |
| |
| if (bf_licenses) |
| node_space[0].licenses = |
| bf_licenses_initial(bf_running_job_reserve); |
| |
| if (bf_topopt_enable) { |
| node_space[0].fragmentation = topology_g_get_fragmentation( |
| node_space[0].avail_bitmap); |
| } |
| |
| node_space[0].next = 0; |
| node_space_recs = 1; |
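| |
| 	/* |
| 	 * At this point node_space is a single record spanning the whole |
| 	 * backfill window with every currently available node. As |
| 	 * reservations are added, _add_reservation() splits it into a |
| 	 * time-ordered linked list; illustrative shape, assuming one job |
| 	 * reserved to start in an hour on node set N: |
| 	 * |
| 	 *	[now, now+1h) all nodes -> [now+1h, window_end) all - N |
| 	 */ |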
| |
| if (bf_running_job_reserve) { |
| node_space_handler_t node_space_handler; |
| node_space_handler.node_space = node_space; |
| node_space_handler.node_space_recs = &node_space_recs; |
| |
| if (bf_licenses) |
| list_for_each(resv_list, _bf_reserve_resv_licenses, |
| &node_space_handler); |
| |
| list_for_each(job_list, _bf_reserve_running, |
| &node_space_handler); |
| } |
| |
| _init_node_used_array_and_list(&nodes_used, &nodes_used_list); |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL_MAP) |
| _dump_node_space_table(node_space); |
| |
| if (assoc_limit_stop) { |
| assoc_mgr_lock(&qos_read_lock); |
| list_for_each(assoc_mgr_qos_list, |
| _clear_qos_blocked_times, NULL); |
| assoc_mgr_unlock(&qos_read_lock); |
| } |
| |
| sort_job_queue(job_queue); |
| |
| /* Ignore nodes that have been set as available during this cycle. */ |
| bit_clear_all(bf_ignore_node_bitmap); |
| |
| if (bf_topopt_enable) |
| init_oracle(); |
| |
| while (1) { |
| uint32_t bf_job_priority, prio_reserve; |
| bool get_boot_time = false; |
| bool licenses_unavail; |
| bool use_prefer = false; |
| slurmctld_resv_t *resv_ptr = NULL; |
| |
| /* Run some final guaranteed logic after each job iteration */ |
| if (job_ptr) { |
| job_resv_clear_magnetic_flag(job_ptr); |
| fill_array_reasons(job_ptr, reject_array_job); |
| |
| /* Restore preemption state if needed. */ |
| _restore_preempt_state(job_ptr, &tmp_preempt_start_time, |
| &tmp_preempt_in_progress); |
| |
| /* |
| * Restore the original time limit in every corner case |
| * we didn't have done yet, like when we are looping |
| * through array tasks. |
| */ |
| if ((qos_flags & QOS_FLAG_NO_RESERVE) && |
| slurm_conf.preempt_mode && orig_time_limit && |
| (orig_time_limit != job_ptr->time_limit)) |
| job_ptr->time_limit = orig_time_limit; |
| |
| /* |
| * An array job with pending tasks should take on the |
| * start_time of the earliest pending task in the |
| * array. |
| */ |
| if (job_ptr->array_recs && array_start_time) |
| job_ptr->start_time = array_start_time; |
| } |
| array_start_time = 0; |
| xfree(job_queue_rec); |
| job_queue_rec = list_pop(job_queue); |
| if (!job_queue_rec) { |
| log_flag(BACKFILL, "reached end of job queue"); |
| _set_bf_exit(BF_EXIT_END); |
| break; |
| } |
| |
| 		if (job_test_cnt >= max_backfill_job_cnt) { |
| log_flag(BACKFILL, "bf_max_job_test: limit of %d reached", |
| max_backfill_job_cnt); |
| _set_bf_exit(BF_EXIT_MAX_JOB_TEST); |
| break; |
| } |
| |
| if (window_end < now) { |
| log_flag(BACKFILL, "Now after current backfill window"); |
| _set_bf_exit(BF_EXIT_TIMEOUT); |
| break; |
| } |
| job_ptr = job_queue_rec->job_ptr; |
| part_ptr = job_queue_rec->part_ptr; |
| bf_job_priority = job_queue_rec->priority; |
| qos_ptr = job_queue_rec->qos_ptr; |
| use_prefer = job_queue_rec->use_prefer; |
| |
| if (job_ptr->array_recs && |
| (job_queue_rec->array_task_id == NO_VAL)) |
| is_job_array_head = true; |
| else |
| is_job_array_head = false; |
| |
| if (slurmctld_config.shutdown_time || |
| 		    (difftime(time(NULL), orig_sched_start) >= bf_max_time)) { |
| _set_bf_exit(BF_EXIT_TIMEOUT); |
| break; |
| } |
| |
| many_rpcs = false; |
| slurm_mutex_lock(&slurmctld_config.thread_count_lock); |
| if ((max_rpc_cnt > 0) && |
| (slurmctld_config.server_thread_count >= max_rpc_cnt)) |
| many_rpcs = true; |
| slurm_mutex_unlock(&slurmctld_config.thread_count_lock); |
| |
| if (many_rpcs || (slurm_delta_tv(&start_tv) >= yield_interval)) { |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) { |
| END_TIMER; |
| info("yielding locks after testing %u(%d) jobs, %s", |
| slurmctld_diag_stats.bf_last_depth, |
| job_test_count, TIME_STR); |
| } |
| /* Sync planned nodes before yielding locks */ |
| nodes_planned = true; |
| _handle_planned(nodes_planned); |
| if (_yield_locks(yield_sleep)) { |
| log_flag(BACKFILL, "system state changed, breaking out after testing %u(%d) jobs", |
| slurmctld_diag_stats.bf_last_depth, |
| job_test_count); |
| state_changed_break = true; |
| _set_bf_exit(BF_EXIT_STATE_CHANGED); |
| break; |
| } |
| /* Reset backfill scheduling timers, resume testing */ |
| sched_start = time(NULL); |
| gettimeofday(&start_tv, NULL); |
| job_test_count = 0; |
| test_time_count = 0; |
| nodes_planned = false; |
| START_TIMER; |
| } |
| |
| if (is_job_array_head && |
| (job_ptr->array_task_id != NO_VAL)) { |
| /* Job array element started in other partition, |
| * reset pointer to "master" job array record */ |
| log_flag(BACKFILL, "%pJ array scheduled during bf yield, try master", |
| job_ptr); |
| job_ptr = find_job_record(job_ptr->array_job_id); |
| if (!job_ptr) /* All task array elements started */ |
| continue; |
| job_queue_rec->job_ptr = job_ptr; |
| } |
| |
| /* |
| 		 * Establish baseline (worst case) start time for the hetjob. |
| 		 * The time is updated once a start estimate is established. |
| */ |
| _het_job_start_set(job_ptr, (now + YEAR_SECONDS), NO_VAL); |
| |
| if (job_ptr->het_job_id && |
| (job_ptr->state_reason == WAIT_NO_REASON)) { |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = WAIT_RESOURCES; |
| } |
| |
| if (!_job_runnable_now(job_ptr)) |
| continue; |
| if (!part_ptr) |
| continue; |
| if (!_job_part_valid(job_ptr, part_ptr)) |
| continue; /* Partition change during lock yield */ |
| |
| if (job_ptr->resv_list) |
| job_queue_rec_resv_list(job_queue_rec); |
| else |
| job_queue_rec_magnetic_resv(job_queue_rec); |
| resv_ptr = job_ptr->resv_ptr; |
| xfree(job_queue_rec); |
| |
| job_ptr->bit_flags |= BACKFILL_SCHED; |
| job_ptr->last_sched_eval = now; |
| job_ptr->part_ptr = part_ptr; |
| job_ptr->priority = bf_job_priority; |
| job_ptr->qos_ptr = qos_ptr; |
| |
| mcs_select = slurm_mcs_get_select(job_ptr); |
| het_job_time = _het_job_start_find(job_ptr); |
| if (het_job_time > (now + backfill_window)) |
| continue; |
| |
| if (job_ptr->qos_ptr) { |
| assoc_mgr_lock_t locks = { |
| .assoc = READ_LOCK, |
| .qos = READ_LOCK, |
| }; |
| |
| assoc_mgr_lock(&locks); |
| if (job_ptr->assoc_ptr |
| && (accounting_enforce & ACCOUNTING_ENFORCE_QOS) |
| && ((job_ptr->qos_ptr->id >= g_qos_count) || |
| !job_ptr->assoc_ptr->usage || |
| !job_ptr->assoc_ptr->usage->valid_qos || |
| !bit_test(job_ptr->assoc_ptr->usage->valid_qos, |
| job_ptr->qos_ptr->id)) |
| && !job_ptr->limit_set.qos) { |
| 				debug("%pJ has invalid QOS", job_ptr); |
| assoc_mgr_unlock(&locks); |
| job_fail_qos(job_ptr, __func__, false); |
| last_job_update = now; |
| continue; |
| } else if (job_ptr->state_reason == FAIL_QOS) { |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = WAIT_NO_REASON; |
| last_job_update = now; |
| } |
| assoc_mgr_unlock(&locks); |
| } |
| |
| assoc_mgr_lock(&qos_read_lock); |
| if (job_ptr->qos_ptr) { |
| qos_flags = job_ptr->qos_ptr->flags; |
| qos_blocked_until = job_ptr->qos_ptr->blocked_until; |
| } else { |
| qos_flags = 0; |
| qos_blocked_until = 0; |
| } |
| |
| if (job_ptr->part_ptr->qos_ptr) |
| qos_part_blocked_until = |
| job_ptr->part_ptr->qos_ptr->blocked_until; |
| else |
| qos_part_blocked_until = 0; |
| |
| if (part_policy_valid_qos(job_ptr->part_ptr, job_ptr->qos_ptr, |
| job_ptr->user_id, job_ptr) != |
| SLURM_SUCCESS) { |
| assoc_mgr_unlock(&qos_read_lock); |
| continue; |
| } |
| assoc_mgr_unlock(&qos_read_lock); |
| |
| if (!assoc_limit_stop && |
| !acct_policy_job_runnable_pre_select(job_ptr, false)) { |
| continue; |
| } |
| |
| if (!(prio_reserve = acct_policy_get_prio_thresh( |
| job_ptr, false))) |
| prio_reserve = bf_min_prio_reserve; |
| |
| if (prio_reserve) |
| log_flag(BACKFILL, "%pJ has a prio_reserve of %u", |
| job_ptr, prio_reserve); |
| |
| job_no_reserve = 0; |
| if (prio_reserve && |
| (job_ptr->priority < prio_reserve)) { |
| job_no_reserve = TEST_NOW_ONLY; |
| } else if (bf_min_age_reserve && job_ptr->details->begin_time) { |
| pend_time = difftime(time(NULL), |
| job_ptr->details->begin_time); |
| if (pend_time < bf_min_age_reserve) |
| job_no_reserve = TEST_NOW_ONLY; |
| } |
| |
| if (bf_one_resv_per_job && job_ptr->start_time) { |
| log_flag(BACKFILL, "%pJ already added a backfill reservation. Test immediate start only for partition %s", |
| job_ptr, job_ptr->part_ptr->name); |
| job_no_reserve = TEST_NOW_ONLY; |
| } |
| |
| /* |
| * If we are trying to schedule preferred features don't |
| * reserve. |
| */ |
| if (use_prefer) |
| job_no_reserve = TEST_NOW_ONLY; |
| |
| /* If partition data is needed and not yet initialized, do so */ |
| if (!job_ptr->part_ptr->bf_data && |
| (bf_job_part_count_reserve || |
| max_backfill_job_per_user_part || |
| max_backfill_job_per_part)) { |
| bf_part_data_t *part_data = |
| xmalloc(sizeof(bf_part_data_t)); |
| part_data->job_usage = |
| xmalloc(sizeof(slurmdb_bf_usage_t)); |
| part_data->resv_usage = |
| xmalloc(sizeof(slurmdb_bf_usage_t)); |
| part_data->user_usage = xhash_init(_bf_map_key_id, |
| _bf_map_free); |
| job_ptr->part_ptr->bf_data = part_data; |
| } |
| |
| if ((job_no_reserve == 0) && bf_job_part_count_reserve) { |
| if (_check_bf_usage( |
| job_ptr->part_ptr->bf_data->resv_usage, |
| bf_job_part_count_reserve, |
| orig_sched_start)) |
| job_no_reserve = TEST_NOW_ONLY; |
| } |
| |
| if (job_ptr->preempt_in_progress) |
| continue; /* scheduled in another partition */ |
| |
| orig_start_time = job_ptr->start_time; |
| orig_time_limit = job_ptr->time_limit; |
| |
| next_task: |
| /* |
| * Restore time_limit for array tasks, just in case it has been |
| 		 * overridden. This is a no-op in all other cases. |
| */ |
| job_ptr->time_limit = orig_time_limit; |
| |
| /* |
| * Save the current preemption state. Reset preemption state |
| * in the job_ptr so a job array can preempt multiple jobs. |
| */ |
| if (job_ptr->preempt_in_progress) { |
| tmp_preempt_in_progress = job_ptr->preempt_in_progress; |
| tmp_preempt_start_time = job_ptr->details->preempt_start_time; |
| job_ptr->details->preempt_start_time = 0; |
| job_ptr->preempt_in_progress = false; |
| } |
| |
| /* |
| * Don't count queue records for magnetic reservation against |
| * backfill limits. |
| */ |
| if ((job_ptr->bit_flags & JOB_MAGNETIC) && |
| !bf_allow_magnetic_slot) { |
| already_counted = true; |
| } else { |
| job_test_count++; |
| slurmctld_diag_stats.bf_last_depth++; |
| already_counted = false; |
| } |
| |
| if (!IS_JOB_PENDING(job_ptr) || /* Started in other partition */ |
| (job_ptr->priority == 0)) /* Job has been held */ |
| continue; |
| if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) { |
| if (reject_array_job && |
| (reject_array_job->array_job_id == |
| job_ptr->array_job_id) && |
| (reject_array_part == part_ptr) && |
| (reject_array_qos == qos_ptr) && |
| (reject_array_resv == resv_ptr) && |
| (reject_array_use_prefer == use_prefer)) |
| continue; /* already rejected array element */ |
| |
| /* assume reject whole array for now, clear if OK */ |
| reject_array_job = job_ptr; |
| reject_array_part = part_ptr; |
| reject_array_qos = qos_ptr; |
| reject_array_resv = resv_ptr; |
| reject_array_use_prefer = use_prefer; |
| |
| if (!job_array_start_test(job_ptr)) |
| continue; |
| } |
| /* |
| * If we are on a different task (see goto next_task) set it up |
| * the same way as we did it before. |
| */ |
| job_ptr->part_ptr = part_ptr; |
| job_ptr->qos_ptr = qos_ptr; |
| job_ptr->resv_ptr = resv_ptr; |
| if (resv_ptr) |
| job_ptr->resv_id = resv_ptr->resv_id; |
| |
| if (job_limits_check(&job_ptr, true) != WAIT_NO_REASON) { |
| /* should never happen */ |
| continue; |
| } |
| |
| log_flag(BACKFILL, "test for %pJ Prio=%u Partition=%s Reservation=%s", |
| job_ptr, job_ptr->priority, job_ptr->part_ptr->name, |
| job_ptr->resv_ptr ? job_ptr->resv_ptr->name : "NONE"); |
| |
| /* Test to see if we've exceeded any per user/partition limit */ |
| if (_job_exceeds_max_bf_param(job_ptr, orig_sched_start)) |
| continue; |
| |
| if (((part_ptr->state_up & PARTITION_SCHED) == 0) || |
| (part_ptr->node_bitmap == NULL)) { |
| log_flag(BACKFILL, "partition %s not usable", |
| job_ptr->part_ptr->name); |
| continue; |
| } |
| |
| if (!bf_licenses && |
| license_job_test(job_ptr, time(NULL), true)) { |
| 			log_flag(BACKFILL, "%pJ not runnable now due to licenses", |
| job_ptr); |
| continue; |
| } |
| |
| if (!job_independent(job_ptr)) { |
| 			log_flag(BACKFILL, "%pJ not runnable now", |
| job_ptr); |
| continue; |
| } |
| |
| /* Determine minimum and maximum node counts */ |
| error_code = get_node_cnts(job_ptr, qos_flags, part_ptr, |
| &min_nodes, &req_nodes, &max_nodes); |
| |
| if (error_code == ESLURM_ACCOUNTING_POLICY) { |
| log_flag(BACKFILL, "%pJ acct policy node limit", |
| job_ptr); |
| continue; |
| } else if (error_code == |
| ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) { |
| log_flag(BACKFILL, "%pJ node count too high", |
| job_ptr); |
| continue; |
| } else if (error_code != SLURM_SUCCESS) { |
| log_flag(BACKFILL, "error setting nodes for %pJ: %s", |
| job_ptr, slurm_strerror(error_code)); |
| continue; |
| } |
| |
| /* test of deadline */ |
| now = time(NULL); |
| deadline_time_limit = 0; |
| if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) { |
| if (!deadline_ok(job_ptr, __func__)) |
| continue; |
| |
| deadline_time_limit = (job_ptr->deadline - now) / 60; |
| } |
| |
| /* Determine job's expected completion time */ |
| if (part_ptr->max_time == INFINITE) |
| part_time_limit = YEAR_MINUTES; |
| else |
| part_time_limit = part_ptr->max_time; |
| if ((job_ptr->time_limit == NO_VAL) || |
| (job_ptr->time_limit == INFINITE)) { |
| time_limit = part_time_limit; |
| job_ptr->limit_set.time = 1; |
| } else { |
| if (part_ptr->max_time == INFINITE) |
| time_limit = job_ptr->time_limit; |
| else |
| time_limit = MIN(job_ptr->time_limit, |
| part_time_limit); |
| } |
| if (deadline_time_limit) |
| comp_time_limit = MIN(time_limit, deadline_time_limit); |
| else if (job_ptr->time_min && |
| (job_ptr->time_min < time_limit)) { |
| comp_time_limit = time_limit; |
| time_limit = job_ptr->time_limit = job_ptr->time_min; |
| } else |
| comp_time_limit = time_limit; |
| if ((qos_flags & QOS_FLAG_NO_RESERVE) && |
| slurm_conf.preempt_mode) |
| time_limit = job_ptr->time_limit = 1; |
| |
| later_start = now; |
| used_slots = 0; |
| |
| if (assoc_limit_stop) { |
| if (qos_blocked_until > later_start) { |
| later_start = qos_blocked_until; |
| log_flag(BACKFILL, "QOS blocked_until move start_res to %ld", |
| later_start); |
| } |
| if (qos_part_blocked_until > later_start) { |
| later_start = qos_part_blocked_until; |
| log_flag(BACKFILL, "Part QOS blocked_until move start_res to %ld", |
| later_start); |
| } |
| } |
| |
| TRY_LATER: |
| if (slurmctld_config.shutdown_time || |
| (difftime(time(NULL), orig_sched_start) >= |
| bf_max_time)) { |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| _set_bf_exit(BF_EXIT_TIMEOUT); |
| break; |
| } |
| test_time_count++; |
| |
| many_rpcs = false; |
| slurm_mutex_lock(&slurmctld_config.thread_count_lock); |
| if ((max_rpc_cnt > 0) && |
| (slurmctld_config.server_thread_count >= max_rpc_cnt)) |
| many_rpcs = true; |
| slurm_mutex_unlock(&slurmctld_config.thread_count_lock); |
| |
| if (many_rpcs || (slurm_delta_tv(&start_tv) >= yield_interval)) { |
| uint32_t save_time_limit = job_ptr->time_limit; |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) { |
| END_TIMER; |
| log_flag(BACKFILL, "yielding locks after testing %u(%d) jobs tested, %u time slots, %s", |
| slurmctld_diag_stats.bf_last_depth, |
| job_test_count, test_time_count, |
| TIME_STR); |
| } |
| /* Sync planned nodes before yielding locks */ |
| nodes_planned = true; |
| _handle_planned(nodes_planned); |
| if (_yield_locks(yield_sleep)) { |
| log_flag(BACKFILL, "system state changed, breaking out after testing %u(%d) jobs", |
| slurmctld_diag_stats.bf_last_depth, |
| job_test_count); |
| state_changed_break = true; |
| _set_bf_exit(BF_EXIT_STATE_CHANGED); |
| break; |
| } |
| |
| /* Reset backfill scheduling timers, resume testing */ |
| sched_start = time(NULL); |
| gettimeofday(&start_tv, NULL); |
| job_test_count = 1; |
| test_time_count = 0; |
| nodes_planned = false; |
| START_TIMER; |
| |
| if (is_job_array_head && |
| (job_ptr->array_task_id != NO_VAL)) { |
| /* |
| * Job array element started in other partition, |
| * reset pointer to "master" job array record |
| */ |
| log_flag(BACKFILL, "%pJ array scheduled during bf yield, try master", |
| job_ptr); |
| job_ptr = find_job_record( |
| job_ptr->array_job_id); |
| if (!job_ptr) |
| /* All task array elements started */ |
| continue; |
| } |
| |
| /* |
| * With bf_continue configured, the original job could |
| * have been scheduled. Revalidate the job record here. |
| */ |
| if (!_job_runnable_now(job_ptr)) |
| continue; |
| |
| /* |
| 			 * If the job wasn't scheduled while we didn't hold |
| 			 * the locks, restore the pointers we were last using, |
| 			 * in case the main scheduler changed them. |
| */ |
| job_ptr->resv_ptr = resv_ptr; |
| if (resv_ptr) |
| job_ptr->resv_id = resv_ptr->resv_id; |
| if (!_job_part_valid(job_ptr, part_ptr)) |
| continue; /* Partition change during lock yield */ |
| if (!job_independent(job_ptr)) { |
| log_flag(BACKFILL, "%pJ no longer independent after bf yield", |
| job_ptr); |
| /* No longer independent |
| * (e.g. another singleton started) */ |
| continue; |
| } |
| |
| job_ptr->time_limit = save_time_limit; |
| job_ptr->part_ptr = part_ptr; |
| job_ptr->qos_ptr = qos_ptr; |
| } |
| |
| /* |
| * feature_list_use is a temporary variable and should |
| * be reset before each use. |
| * Do this after bf_yield to ensure the pointers are valid even |
| * if the job was updated during the bf_yield. |
| */ |
| if (use_prefer) { |
| /* |
| * Prefer was removed from the job since the |
| * job_queue_rec was created (during bf_yield). |
| * This is a separate queue record for prefer. Skip it. |
| */ |
| if (!job_ptr->details->prefer) |
| continue; |
| job_ptr->details->features_use = |
| job_ptr->details->prefer; |
| job_ptr->details->feature_list_use = |
| job_ptr->details->prefer_list; |
| } else { |
| job_ptr->details->features_use = |
| job_ptr->details->features; |
| job_ptr->details->feature_list_use = |
| job_ptr->details->feature_list; |
| } |
| |
| FREE_NULL_BITMAP(avail_bitmap); |
| reservation_delete_resv_exc_parts(&resv_exc); |
| start_res = MAX(later_start, het_job_time); |
| resv_end = 0; |
| later_start = 0; |
| licenses_unavail = false; |
| /* |
| * Restore the original time limit before checking against |
| * reservations, and revert it after. |
| */ |
| if ((qos_flags & QOS_FLAG_NO_RESERVE) && |
| slurm_conf.preempt_mode) |
| job_ptr->time_limit = orig_time_limit; |
| /* Determine impact of any advance reservations */ |
| j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, |
| &resv_exc, &resv_overlap, false); |
| if (j != SLURM_SUCCESS) { |
| log_flag(BACKFILL, "%pJ reservation defer", |
| job_ptr); |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| continue; |
| } else if ((qos_flags & QOS_FLAG_NO_RESERVE) && |
| slurm_conf.preempt_mode) |
| job_ptr->time_limit = time_limit; |
| |
| if (window_end < start_res) { |
| log_flag(BACKFILL, "%pJ start_res after current backfill window", |
| job_ptr); |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| continue; |
| } |
| |
| if (start_res > now) |
| end_time = (time_limit * 60) + start_res; |
| else |
| end_time = (time_limit * 60) + now; |
| if (end_time < now) /* Overflow 32-bits */ |
| end_time = INFINITE; |
| if (resv_overlap) |
| resv_end = find_resv_end(start_res, |
| backfill_resolution); |
| /* Identify usable nodes for this job */ |
| bit_and(avail_bitmap, part_ptr->node_bitmap); |
| bit_and(avail_bitmap, up_node_bitmap); |
| bit_and_not(avail_bitmap, bf_ignore_node_bitmap); |
| |
| if (job_ptr->details->exc_node_bitmap) { |
| bit_and_not(avail_bitmap, |
| job_ptr->details->exc_node_bitmap); |
| } |
| |
| if (_filter_exclusive_user_mcs_nodes(job_ptr, mcs_select, |
| min_nodes, nodes_used_list, |
| start_res, |
| &later_filter_start, |
| avail_bitmap)) { |
| /* start_res delayed must check resv times again */ |
| later_start = later_filter_start; |
| SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, |
| later_start, orig_time_limit, |
| orig_start_time); |
| } |
| |
| if (IS_JOB_WHOLE_TOPO(job_ptr)) { |
| if (excluded_topo_bitmap) |
| bit_clear_all(excluded_topo_bitmap); |
| else |
| excluded_topo_bitmap = |
| bit_alloc(node_record_count); |
| } |
| |
| COPY_BITMAP(tmp_bitmap, avail_bitmap); |
| for (j = 0; ; ) { |
| if ((node_space[j].end_time > start_res) && |
| node_space[j].next && (later_start == 0)) { |
| int tmp = node_space[j].next; |
| |
| if (job_ptr->license_list && |
| !bf_licenses_equal(node_space[tmp].licenses, |
| node_space[j] |
| .licenses)) { |
| later_start = node_space[j].end_time; |
| goto later_start_set; |
| } |
| |
| COPY_BITMAP(next_bitmap, tmp_bitmap); |
| COPY_BITMAP(current_bitmap, avail_bitmap); |
| bit_and(next_bitmap, |
| node_space[tmp].avail_bitmap); |
| bit_and(current_bitmap, |
| node_space[j].avail_bitmap); |
| /* |
| * Normally later_start is set at the end of the |
| * first backfill reservation when the select |
| * plugin predicts start time after later_start. |
| * Then it goes to TRY_LATER and tries again on |
| * a new set of nodes to check if the job can |
| * start earlier. But if the next set of nodes |
| * is a subset of the currently tested ones then |
| * calling _try_sched (expensive function) would |
| * be useless and would impact performance. |
| */ |
| if (!bit_super_set(next_bitmap, current_bitmap)) |
| later_start = node_space[j].end_time; |
| } |
| later_start_set: |
| if (node_space[j].end_time <= start_res) |
| ; |
| else if (node_space[j].begin_time <= end_time) { |
| bit_and(avail_bitmap, |
| node_space[j].avail_bitmap); |
| bf_hres_filter(job_ptr, avail_bitmap, |
| node_space[j].licenses); |
| if (!bf_licenses_avail(node_space[j].licenses, |
| job_ptr, NULL)) { |
| licenses_unavail = true; |
| later_start = node_space[j].end_time; |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = WAIT_LICENSES; |
| break; |
| } |
| if (IS_JOB_WHOLE_TOPO(job_ptr)) { |
| bit_or_not(excluded_topo_bitmap, |
| node_space[j].avail_bitmap); |
| } |
| } else { |
| int next = node_space[j].next; |
| if ((later_start == 0) && next && |
| node_space[next].next) |
| later_start = node_space[next].end_time; |
| break; |
| } |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| if (resv_end && (++resv_end < window_end) && |
| ((later_start == 0) || (resv_end < later_start))) { |
| later_start = resv_end; |
| } |
| |
| if (IS_JOB_WHOLE_TOPO(job_ptr)) { |
| bit_and(excluded_topo_bitmap, |
| node_space[0].avail_bitmap); |
| topology_g_whole_topo(excluded_topo_bitmap, |
| job_ptr->part_ptr->topology_idx); |
| bit_and_not(avail_bitmap, excluded_topo_bitmap); |
| } |
| |
| /* Test if licenses are unavailable OR |
| * required nodes missing OR |
| * nodes lack features OR |
| * no change since previously tested nodes (only changes |
| * in other partition nodes) */ |
| if (licenses_unavail || |
| ((job_ptr->details->req_node_bitmap) && |
| (!bit_super_set(job_ptr->details->req_node_bitmap, |
| avail_bitmap))) || |
| (job_req_node_filter(job_ptr, avail_bitmap, true))) { |
| SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, |
| later_start, orig_time_limit, |
| orig_start_time); |
| } |
| |
| if (!later_start && later_filter_start) |
| later_start = later_filter_start; /* filter out fewer */ |
| |
| /* Test if insufficient nodes remain */ |
| if (bit_set_count(avail_bitmap) < min_nodes) { |
| SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, |
| later_start, orig_time_limit, |
| orig_start_time); |
| } |
| |
| /* Identify nodes which are definitely off limits */ |
| FREE_NULL_BITMAP(resv_bitmap); |
| resv_bitmap = bit_copy(avail_bitmap); |
| bit_not(resv_bitmap); |
| |
| 		/* This is the time-consuming operation */ |
| 		debug2("entering _try_sched for %pJ.", job_ptr); |
| |
| if (!already_counted) { |
| slurmctld_diag_stats.bf_last_depth_try++; |
| job_test_cnt++; |
| already_counted = true; |
| } |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL_MAP) |
| _dump_job_test(job_ptr, avail_bitmap, start_res, |
| later_start); |
| test_fini = -1; |
| build_active_feature_bitmap(job_ptr, avail_bitmap, |
| &active_bitmap); |
| job_ptr->bit_flags |= BACKFILL_TEST; |
| job_ptr->bit_flags |= job_no_reserve; /* 0 or TEST_NOW_ONLY */ |
| |
| if (active_bitmap) { |
| will_run_data.start = start_res; |
| will_run_data.end = later_start; |
| j = _try_sched(job_ptr, &active_bitmap, min_nodes, |
| max_nodes, req_nodes, &resv_exc, |
| &will_run_data); |
| if (j == SLURM_SUCCESS) { |
| FREE_NULL_BITMAP(avail_bitmap); |
| avail_bitmap = active_bitmap; |
| active_bitmap = NULL; |
| test_fini = 1; |
| } else { |
| if (node_features_g_overlap(active_bitmap)) |
| get_boot_time = true; |
| FREE_NULL_BITMAP(active_bitmap); |
| save_share_res = job_ptr->details->share_res; |
| save_whole_node = job_ptr->details->whole_node; |
| job_ptr->details->share_res = 0; |
| job_ptr->details->whole_node |= |
| WHOLE_NODE_REQUIRED; |
| if (!save_whole_node) |
| job_ptr->bit_flags |= BF_WHOLE_NODE_TEST; |
| test_fini = 0; |
| } |
| } |
| boot_time = 0; |
| if (test_fini == 0) { |
| /* Unable to start job using currently active features, |
| * need to try using features which can be made |
| * available after node reboot */ |
| resv_exc_t tmp_resv_exc = { 0 }; |
| bitstr_t *tmp_node_bitmap = NULL; |
| debug2("entering _try_sched for %pJ. Need to use features which can be made available after node reboot", |
| job_ptr); |
| /* |
| * Restore the original time limit before checking against |
| * reservations, and revert it after. |
| */ |
| if ((qos_flags & QOS_FLAG_NO_RESERVE) && |
| slurm_conf.preempt_mode) |
| job_ptr->time_limit = orig_time_limit; |
| /* Determine impact of any advance reservations */ |
| resv_end = 0; |
| j = job_test_resv(job_ptr, &start_res, false, |
| &tmp_node_bitmap, &tmp_resv_exc, |
| &resv_overlap, true); |
| if ((qos_flags & QOS_FLAG_NO_RESERVE) && |
| slurm_conf.preempt_mode) |
| job_ptr->time_limit = time_limit; |
| if (resv_overlap) |
| resv_end = find_resv_end(start_res, |
| backfill_resolution); |
| |
| if (resv_end && (++resv_end < window_end) && |
| ((later_start == 0) || (resv_end < later_start))) { |
| later_start = resv_end; |
| } |
| if (j == SLURM_SUCCESS) { |
| reservation_delete_resv_exc_parts(&resv_exc); |
| memcpy(&resv_exc, &tmp_resv_exc, |
| sizeof(resv_exc)); |
| bit_and(avail_bitmap, tmp_node_bitmap); |
| FREE_NULL_BITMAP(tmp_node_bitmap); |
| } |
| if (get_boot_time) |
| boot_time = node_features_g_boot_time(); |
| orig_end_time = end_time; |
| end_time += boot_time; |
| |
| for (j = 0; ; ) { |
| if (node_space[j].end_time <= start_res) |
| ; |
| else if (node_space[j].begin_time <= end_time) { |
| if (node_space[j].begin_time > |
| orig_end_time) |
| bit_and(avail_bitmap, |
| node_space[j].avail_bitmap); |
| } else |
| break; |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| } |
| if (test_fini != 1) { |
| /* Either active_bitmap was NULL or not usable by the |
| * job. Test using avail_bitmap instead */ |
| will_run_data.start = start_res; |
| will_run_data.end = later_start; |
| j = _try_sched(job_ptr, &avail_bitmap, min_nodes, |
| max_nodes, req_nodes, &resv_exc, |
| &will_run_data); |
| if (test_fini == 0) { |
| job_ptr->details->share_res = save_share_res; |
| job_ptr->details->whole_node = save_whole_node; |
| } |
| } |
| job_ptr->bit_flags &= ~BACKFILL_TEST; |
| job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST; |
| job_ptr->bit_flags &= ~TEST_NOW_ONLY; |
| |
| now = time(NULL); |
| if (j != SLURM_SUCCESS) { |
| SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, |
| later_start, orig_time_limit, |
| orig_start_time); |
| } |
| |
| if (start_res > job_ptr->start_time) { |
| job_ptr->start_time = start_res; |
| last_job_update = now; |
| } |
| |
| if (job_ptr->start_time > now) { |
| _set_slot_time(job_ptr, time_limit, boot_time, |
| &start_time, &end_reserve); |
| |
| if (_test_resv_overlap(node_space, avail_bitmap, |
| job_ptr, start_time, |
| end_reserve)) { |
| later_start = job_ptr->start_time; |
| |
| if (start_res == job_ptr->start_time) { |
| later_start += backfill_resolution; |
| log_flag(BACKFILL, "%pJ inf loop detect", job_ptr); |
| } |
| |
| job_ptr->start_time = 0; |
| log_flag(BACKFILL, "%pJ overlaps with existing reservation start_time=%u end_reserve=%u boot_time=%u later_start %ld", |
| job_ptr, start_time, end_reserve, |
| boot_time, later_start); |
| goto TRY_LATER; |
| } |
| overlap_tested = true; |
| } else |
| overlap_tested = false; |
| |
| if (!job_no_reserve && bf_topopt_enable) { |
| if (oracle(job_ptr, avail_bitmap, later_start, |
| &time_limit, &boot_time, node_space)) { |
| log_flag(BACKFILL, "%pJ used_slots:%u later_start %ld", |
| job_ptr, used_slots, later_start); |
| goto TRY_LATER; |
| } |
| _set_slot_time(job_ptr, time_limit, boot_time, |
| &start_time, &end_reserve); |
| } |
| |
| /* |
| * avail_bitmap at this point contains a bitmap of nodes |
| * selected for this job to be allocated |
| */ |
| if ((job_ptr->start_time <= now) && |
| (bit_overlap_any(avail_bitmap, cg_node_bitmap) || |
| bit_overlap_any(avail_bitmap, rs_node_bitmap))) { |
| /* Need to wait for in-progress completion/epilog */ |
| job_ptr->start_time = now + 1; |
| later_start = 0; |
| } |
| if ((job_ptr->start_time <= now) && |
| ((bb = bb_g_job_test_stage_in(job_ptr, true)) != 1)) { |
| if (job_ptr->state_reason != WAIT_NO_REASON) { |
| /* |
| * Don't change state_reason if it was already |
| * set. |
| */ |
| ; |
| } else if (bb == -1) { |
| /* |
| 				 * Set the reason now, rather than in the |
| 				 * if (bb == -1) block below, so the |
| 				 * sched_debug3() call reports it. |
| */ |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = |
| WAIT_BURST_BUFFER_RESOURCE; |
| } else { /* bb == 0 */ |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason=WAIT_BURST_BUFFER_STAGING; |
| /* |
| * Cannot start now, set start time in the |
| * future. |
| */ |
| job_ptr->start_time = now + 1; |
| } |
| sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u.", |
| job_ptr, |
| job_state_string(job_ptr->job_state), |
| job_state_reason_string( |
| job_ptr->state_reason), |
| job_ptr->priority); |
| last_job_update = now; |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| later_start = 0; |
| if (bb == -1) { |
| /* |
| * bb == -1 means that burst buffer stage-in |
| * hasn't started yet. Set an estimated start |
| * time so stage-in can start. |
| * |
| * Clear reject_array_job; otherwise we'll skip |
| * looking at other jobs in this array (if this |
| * is a job array), therefore we won't set |
| * estimated start times, therefore we won't be |
| * able to start stage-in for any other jobs in |
| * this array. |
| */ |
| job_ptr->start_time = |
| bb_g_job_get_est_start(job_ptr); |
| reject_array_job = NULL; |
| reject_array_part = NULL; |
| reject_array_qos = NULL; |
| reject_array_resv = NULL; |
| continue; |
| } |
| } else if ((job_ptr->het_job_id == 0) && |
| (job_ptr->start_time <= now)) { /* Can start now */ |
| uint32_t save_time_limit = job_ptr->time_limit; |
| uint32_t hard_limit; |
| bool reset_time = false; |
| int rc; |
| |
| /* get fed job lock from origin cluster */ |
| if (fed_mgr_job_lock(job_ptr)) { |
| log_flag(BACKFILL, "%pJ can't get fed job lock from origin cluster to backfill job", |
| job_ptr); |
| rc = ESLURM_FED_JOB_LOCK; |
| goto skip_start; |
| } |
| |
| rc = _start_job(job_ptr, resv_bitmap); |
| |
| if (rc == SLURM_SUCCESS) { |
| /* |
| * If the following fails because of network |
| * connectivity, the origin cluster should ask |
| * when it comes back up if the cluster_lock |
| * cluster actually started the job |
| */ |
| fed_mgr_job_start(job_ptr, job_ptr->start_time); |
| } else { |
| fed_mgr_job_unlock(job_ptr); |
| } |
| |
| skip_start: |
| if (qos_flags & QOS_FLAG_NO_RESERVE) { |
| if (orig_time_limit == NO_VAL) { |
| acct_policy_alter_job( |
| job_ptr, comp_time_limit); |
| job_ptr->time_limit = comp_time_limit; |
| job_ptr->limit_set.time = 1; |
| } else { |
| acct_policy_alter_job( |
| job_ptr, orig_time_limit); |
| _set_job_time_limit(job_ptr, |
| orig_time_limit); |
| } |
| } else if ((rc == SLURM_SUCCESS) && soft_time_limit && |
| job_ptr->time_min) { |
| acct_policy_alter_job(job_ptr, orig_time_limit); |
| job_ptr->time_limit = orig_time_limit; |
| } else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { |
| /* Set time limit as high as possible */ |
| acct_policy_alter_job(job_ptr, comp_time_limit); |
| job_ptr->time_limit = comp_time_limit; |
| reset_time = true; |
| } else if (orig_time_limit == NO_VAL) { |
| acct_policy_alter_job(job_ptr, comp_time_limit); |
| job_ptr->time_limit = comp_time_limit; |
| job_ptr->limit_set.time = 1; |
| } else if (deadline_time_limit && |
| (rc == SLURM_SUCCESS)) { |
| acct_policy_alter_job(job_ptr, comp_time_limit); |
| job_ptr->time_limit = comp_time_limit; |
| reset_time = true; |
| } else { |
| acct_policy_alter_job(job_ptr, orig_time_limit); |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| } |
| /* |
| * Only set end_time if start_time is set, |
| 			 * or else end_time will be small (i.e. 1969). |
| */ |
| if (IS_JOB_FINISHED(job_ptr)) { |
| /* Zero size or killed on startup */ |
| } else if (job_ptr->start_time) { |
| node_space_handler_t ns_handler = { |
| .node_space = node_space, |
| .node_space_recs = &node_space_recs, |
| }; |
| |
| if (job_ptr->time_limit == INFINITE) |
| hard_limit = YEAR_SECONDS; |
| else |
| hard_limit = job_ptr->time_limit * 60; |
| job_ptr->end_time = job_ptr->start_time + |
| hard_limit; |
| /* |
| 				 * Only reset the time limit if start_time is |
| 				 * set; end_time must be set beforehand for |
| 				 * _reset_job_time_limit(). |
| */ |
| if (reset_time) { |
| _reset_job_time_limit(job_ptr, now, |
| node_space); |
| time_limit = job_ptr->time_limit; |
| } |
| |
| _bf_reserve_running(job_ptr, &ns_handler); |
| } else if (rc == SLURM_SUCCESS) { |
| 				error("start_time of 0 on successful backfill. This shouldn't happen."); |
| } |
| |
| if ((rc == ESLURM_RESERVATION_BUSY) || |
| (rc == ESLURM_ACCOUNTING_POLICY && |
| !assoc_limit_stop) || |
| ((rc == ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && |
| job_ptr->extra_constraints)) { |
| /* Unknown future start time, just skip job */ |
| job_ptr->start_time = orig_start_time; |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| continue; |
| } else if (rc == ESLURM_ACCOUNTING_POLICY) { |
| 				/* Unknown future start time. Determining |
| 				 * when it can start with certainty requires |
| 				 * knowing when every running and pending job |
| 				 * starts and ends, and tracking all of their |
| 				 * resources. That requires very high overhead |
| 				 * that we don't want to add. Estimate that it |
| 				 * can start after the next job ends (or in |
| 				 * roughly 500 seconds if we don't have that |
| 				 * information yet). */ |
| if (later_start) |
| job_ptr->start_time = later_start; |
| else |
| job_ptr->start_time = now + 500; |
| if (job_ptr->qos_blocking_ptr && |
| job_state_reason_check( |
| job_ptr->state_reason, |
| JSR_QOS_GRP)) { |
| assoc_mgr_lock(&qos_read_lock); |
| qos_ptr = job_ptr->qos_blocking_ptr; |
| if (qos_ptr->blocked_until < |
| job_ptr->start_time) { |
| qos_ptr->blocked_until = |
| job_ptr->start_time; |
| } |
| assoc_mgr_unlock(&qos_read_lock); |
| } |
| } else if (rc != SLURM_SUCCESS) { |
| log_flag(BACKFILL, "planned start of %pJ failed: %s", |
| job_ptr, slurm_strerror(rc)); |
| /* Drop through and reserve these resources. |
| * Likely due to state changes during sleep. |
| * Make best-effort based upon original state */ |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| later_start = 0; |
| } else { |
| /* Started this job, move to next one */ |
| |
| /* Clear assumed rejected array status */ |
| reject_array_job = NULL; |
| reject_array_part = NULL; |
| reject_array_qos = NULL; |
| reject_array_resv = NULL; |
| |
| /* Update the database if job time limit |
| * changed and move to next job */ |
| if (save_time_limit != job_ptr->time_limit) |
| jobacct_storage_g_job_start( |
| acct_db_conn, job_ptr); |
| job_start_cnt++; |
| if (max_backfill_jobs_start && |
| (job_start_cnt >= max_backfill_jobs_start)){ |
| log_flag(BACKFILL, "bf_max_job_start limit of %d reached", |
| max_backfill_jobs_start); |
| _set_bf_exit(BF_EXIT_MAX_JOB_START); |
| break; |
| } |
| if (job_test_cnt >= max_backfill_job_cnt) { |
| log_flag(BACKFILL, "bf_max_job_test: limit of %d reached", |
| max_backfill_job_cnt); |
| _set_bf_exit(BF_EXIT_MAX_JOB_TEST); |
| break; |
| } |
| |
| if (_mark_nodes_usage(job_ptr, nodes_used)) |
| list_sort(nodes_used_list, |
| _cmp_last_job_end); |
| |
| if (is_job_array_head && |
| (job_ptr->array_task_id != NO_VAL)) { |
| /* Try starting next task of job array */ |
| job_record_t *tmp = job_ptr; |
| job_ptr = find_job_record(job_ptr-> |
| array_job_id); |
| if (job_ptr && (job_ptr != tmp) && |
| IS_JOB_PENDING(job_ptr) && |
| (bb_g_job_test_stage_in( |
| job_ptr, false) == 1)) |
| goto next_task; |
| } |
| continue; |
| } |
| } else if (job_ptr->het_job_id != 0) { |
| uint32_t max_time_limit; |
| max_time_limit =_get_job_max_tl(job_ptr, now, |
| node_space); |
| comp_time_limit = MIN(comp_time_limit, max_time_limit); |
| job_ptr->node_cnt_wag = |
| MAX(bit_set_count(avail_bitmap), 1); |
| _het_job_start_set(job_ptr, job_ptr->start_time, |
| comp_time_limit); |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| if (bf_hetjob_immediate && |
| (!max_backfill_jobs_start || |
| (job_start_cnt < max_backfill_jobs_start))) |
| _het_job_start_test(node_space, |
| job_ptr->het_job_id, |
| nodes_used, |
| nodes_used_list); |
| } |
| |
| if ((job_ptr->start_time > now) && (job_no_reserve != 0)) { |
| if ((orig_start_time != 0) && |
| (orig_start_time < job_ptr->start_time)) { |
| /* Can start earlier in different partition */ |
| job_ptr->start_time = orig_start_time; |
| } else { |
| log_flag(BACKFILL, "%pJ StartTime set but no backfill reservation created.", |
| job_ptr); |
| } |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| continue; |
| } |
| |
| if (later_start && (job_ptr->start_time > later_start)) { |
| /* Try later when some nodes currently reserved for |
| * pending jobs are free */ |
| log_flag(BACKFILL, "Try later %pJ later_start %ld", |
| job_ptr, later_start); |
| job_ptr->start_time = 0; |
| goto TRY_LATER; |
| } |
| |
| if (!overlap_tested) { |
| 			/* Job start deferred from now */ |
| _set_slot_time(job_ptr, time_limit, boot_time, |
| &start_time, &end_reserve); |
| } |
| |
| if (job_ptr->start_time > (sched_start + backfill_window)) { |
| /* Starts too far in the future to worry about */ |
| end_reserve = job_ptr->start_time + boot_time + |
| (time_limit * 60); |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) |
| _dump_job_sched(job_ptr, end_reserve, |
| avail_bitmap); |
| if ((orig_start_time != 0) && |
| (orig_start_time < job_ptr->start_time)) { |
| /* Can start earlier in different partition */ |
| job_ptr->start_time = orig_start_time; |
| } else { |
| log_flag(BACKFILL, "%pJ StartTime set to time after current backfill window. No reservation created", |
| job_ptr); |
| } |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| continue; |
| } |
| |
| if (!overlap_tested && |
| (job_ptr->state_reason != WAIT_BURST_BUFFER_RESOURCE) && |
| (job_ptr->state_reason != WAIT_BURST_BUFFER_STAGING) && |
| _test_resv_overlap(node_space, avail_bitmap, job_ptr, |
| start_time, end_reserve)) { |
| 			/* This job overlaps with an existing reservation |
| 			 * for a job to be backfill scheduled, which the |
| 			 * sched plugin does not know about. Try again |
| 			 * later. */ |
| later_start = job_ptr->start_time; |
| job_ptr->start_time = 0; |
| log_flag(BACKFILL, "%pJ after defer overlaps with existing reservation start_time=%u end_reserve=%u boot_time=%u later_start %ld", |
| job_ptr, start_time, end_reserve, boot_time, |
| later_start); |
| goto TRY_LATER; |
| } |
| |
| if (_het_job_deadlock_test(job_ptr)) { |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| continue; |
| } |
| |
| /* |
| * Add reservation to scheduling table if appropriate |
| */ |
| if (!assoc_limit_stop) { |
| uint32_t selected_node_cnt; |
| uint64_t tres_req_cnt[slurmctld_tres_cnt]; |
| uint16_t sockets_per_node; |
| assoc_mgr_lock_t locks = { |
| .assoc = READ_LOCK, |
| .qos = WRITE_LOCK, |
| .tres = READ_LOCK, |
| }; |
| |
| selected_node_cnt = bit_set_count(avail_bitmap); |
| memcpy(tres_req_cnt, job_ptr->tres_req_cnt, |
| sizeof(tres_req_cnt)); |
| tres_req_cnt[TRES_ARRAY_CPU] = |
| (uint64_t)(job_ptr->total_cpus ? |
| job_ptr->total_cpus : |
| job_ptr->details->min_cpus); |
| |
| sockets_per_node = job_get_sockets_per_node(job_ptr); |
| tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem( |
| job_ptr->job_resrcs, |
| job_ptr->details->pn_min_memory, |
| tres_req_cnt[TRES_ARRAY_CPU], |
| selected_node_cnt, |
| job_ptr->part_ptr, |
| job_ptr->gres_list_req, |
| (job_ptr->bit_flags & |
| JOB_MEM_SET), sockets_per_node, |
| job_ptr->details->num_tasks); |
| |
| tres_req_cnt[TRES_ARRAY_NODE] = |
| (uint64_t)selected_node_cnt; |
| |
| assoc_mgr_lock(&locks); |
| gres_stepmgr_set_job_tres_cnt(job_ptr->gres_list_req, |
| selected_node_cnt, |
| tres_req_cnt, |
| true); |
| |
| tres_req_cnt[TRES_ARRAY_BILLING] = |
| assoc_mgr_tres_weighted( |
| tres_req_cnt, |
| job_ptr->part_ptr->billing_weights, |
| slurm_conf.priority_flags, true); |
| |
| if (!acct_policy_job_runnable_post_select(job_ptr, |
| tres_req_cnt, true)) { |
| assoc_mgr_unlock(&locks); |
| log_flag(BACKFILL, "adding reservation for %pJ blocked by acct_policy_job_runnable_post_select", |
| job_ptr); |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| continue; |
| } |
| assoc_mgr_unlock(&locks); |
| } |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) |
| _dump_job_sched(job_ptr, end_reserve, avail_bitmap); |
| if (qos_flags & QOS_FLAG_NO_RESERVE) { |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| continue; |
| } |
| |
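| /* Limit how many jobs per partition may have backfill reservations |
| * created during this scheduling cycle */ |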
| if (bf_job_part_count_reserve) { |
| if (_check_bf_usage( |
| job_ptr->part_ptr->bf_data->resv_usage, |
| bf_job_part_count_reserve, |
| orig_sched_start)) { |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| continue; |
| } |
| job_ptr->part_ptr->bf_data->resv_usage->count++; |
| } |
| |
| /* Clear assumed rejected array status */ |
| reject_array_job = NULL; |
| reject_array_part = NULL; |
| reject_array_qos = NULL; |
| reject_array_resv = NULL; |
| |
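| /* Skip reservation creation if bf_one_resv_per_job already gave this |
| * job an earlier reservation in another partition, or if the job is |
| * magnetic and magnetic slots may not hold backfill reservations */ |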
| if ((!bf_one_resv_per_job || !orig_start_time) && |
| (!(job_ptr->bit_flags & JOB_MAGNETIC) || |
| bf_allow_magnetic_slot)) { |
| if (node_space_recs >= bf_node_space_size) { |
| log_flag(BACKFILL, "table size limit of %u reached", |
| bf_node_space_size); |
| if ((max_backfill_job_per_part != 0) && |
| (max_backfill_job_per_part >= |
| (bf_node_space_size / 2))) { |
| error("bf_max_job_part >= bf_node_space_size / 2 (%u >= %u)", |
| max_backfill_job_per_part, |
| (bf_node_space_size / 2)); |
| } else if ((max_backfill_job_per_user != 0) && |
| (max_backfill_job_per_user > |
| (bf_node_space_size / 2))) { |
| warning("bf_max_job_user > bf_node_space_size / 2 (%u > %u)", |
| max_backfill_job_per_user, |
| (bf_node_space_size / 2)); |
| } else if ((max_backfill_job_per_assoc != 0) && |
| (max_backfill_job_per_assoc > |
| (bf_node_space_size / 2))) { |
| warning("bf_max_job_assoc > bf_node_space_size / 2 (%u > %u)", |
| max_backfill_job_per_assoc, |
| (bf_node_space_size / 2)); |
| } |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| _set_bf_exit(BF_EXIT_TABLE_LIMIT); |
| break; |
| } |
| _add_reservation(start_time, end_reserve, avail_bitmap, |
| job_ptr, node_space, &node_space_recs, |
| orig_start_time); |
| } |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL_MAP) |
| _dump_node_space_table(node_space); |
| if ((orig_start_time != 0) && |
| (orig_start_time < job_ptr->start_time)) { |
| /* Can start earlier in different partition */ |
| job_ptr->start_time = orig_start_time; |
| } |
| _set_job_time_limit(job_ptr, orig_time_limit); |
| if (job_ptr->array_recs) { |
| /* Try making reservation for next task of job array */ |
| if (test_array_job_id != job_ptr->array_job_id) { |
| test_array_job_id = job_ptr->array_job_id; |
| test_array_count = 1; |
| array_start_time = job_ptr->start_time; |
| } else { |
| test_array_count++; |
| array_start_time = MIN(array_start_time, |
| job_ptr->start_time); |
| } |
| |
| /* |
| * Don't consider the next task if it would exceed the |
| * maximum number of runnable tasks. If max_run_tasks is |
| * 0, then it wasn't set, so ignore it. |
| */ |
| if ((test_array_count < bf_max_job_array_resv) && |
| (test_array_count < |
| job_ptr->array_recs->task_cnt) && |
| (!job_ptr->array_recs->max_run_tasks || |
| ((MAX(job_ptr->array_recs->pend_run_tasks, |
| test_array_count) + |
| job_ptr->array_recs->tot_run_tasks) < |
| job_ptr->array_recs->max_run_tasks))) |
| goto next_task; |
| } |
| } |
| |
| if (!nodes_planned) |
| _handle_planned(true); |
| |
| xfree(job_queue_rec); |
| |
| if (job_ptr) { |
| /* Restore preemption state if needed. */ |
| _restore_preempt_state(job_ptr, &tmp_preempt_start_time, |
| &tmp_preempt_in_progress); |
| job_resv_clear_magnetic_flag(job_ptr); |
| |
| if (job_ptr->array_recs && array_start_time) |
| job_ptr->start_time = array_start_time; |
| } |
| |
| _het_job_deadlock_fini(); |
| if (!bf_hetjob_immediate && !state_changed_break && |
| (!max_backfill_jobs_start || |
| (job_start_cnt < max_backfill_jobs_start))) |
| _het_job_start_test(node_space, 0, NULL, NULL); |
| |
| FREE_NULL_BITMAP(avail_bitmap); |
| FREE_NULL_BITMAP(excluded_topo_bitmap); |
| reservation_delete_resv_exc_parts(&resv_exc); |
| FREE_NULL_BITMAP(resv_bitmap); |
| FREE_NULL_BITMAP(tmp_bitmap); |
| FREE_NULL_BITMAP(next_bitmap); |
| FREE_NULL_BITMAP(current_bitmap); |
| |
| for (i = 0; ; ) { |
| FREE_NULL_BITMAP(node_space[i].avail_bitmap); |
| FREE_NULL_BF_LICENSES(node_space[i].licenses); |
| if ((i = node_space[i].next) == 0) |
| break; |
| } |
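| /* Also free any spare bitmaps parked beyond the active list by the |
| * record merging in _add_reservation() */ |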
| for (i = node_space_recs; i <= bf_node_space_size; i++) { |
| if (!node_space[i].avail_bitmap) |
| break; |
| FREE_NULL_BITMAP(node_space[i].avail_bitmap); |
| } |
| xfree(node_space); |
| |
| FREE_NULL_LIST(job_queue); |
| FREE_NULL_LIST(nodes_used_list); |
| xfree(nodes_used); |
| |
| if (bf_topopt_enable) |
| fini_oracle(); |
| |
| gettimeofday(&bf_time2, NULL); |
| _do_diag_stats(&bf_time1, &bf_time2, node_space_recs); |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) { |
| END_TIMER; |
| info("completed testing %u(%d) jobs, %s", |
| slurmctld_diag_stats.bf_last_depth, |
| job_test_count, TIME_STR); |
| } |
| |
| slurm_mutex_lock(&slurmctld_config.thread_count_lock); |
| if (slurmctld_config.server_thread_count >= 150) { |
| info("%d pending RPCs at cycle end, consider " |
| "configuring max_rpc_cnt", |
| slurmctld_config.server_thread_count); |
| } |
| slurm_mutex_unlock(&slurmctld_config.thread_count_lock); |
| |
| return; |
| } |
| |
| /* Try to start the job on any non-reserved nodes */ |
| static int _start_job(job_record_t *job_ptr, bitstr_t *resv_bitmap) |
| { |
| int rc; |
| bitstr_t *orig_exc_nodes = NULL; |
| bool is_job_array_head = false; |
| static uint32_t fail_jobid = 0; |
| job_node_select_t job_node_select = { |
| .job_ptr = job_ptr, |
| }; |
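| /* |
| * Backfill constrains placement indirectly: all reserved nodes are |
| * temporarily added to the job's excluded-node bitmap around the |
| * select_nodes() call, then the original exclusion list is restored. |
| */ |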
| if (job_ptr->details->exc_node_bitmap) { |
| orig_exc_nodes = bit_copy(job_ptr->details->exc_node_bitmap); |
| bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap); |
| } else |
| job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap); |
| if (job_ptr->array_recs) |
| is_job_array_head = true; |
| rc = select_nodes(&job_node_select, false, false, |
| SLURMDB_JOB_FLAG_BACKFILL); |
| |
| if (is_job_array_head && job_ptr->details) { |
| job_record_t *base_job_ptr; |
| base_job_ptr = find_job_record(job_ptr->array_job_id); |
| if (base_job_ptr && base_job_ptr != job_ptr |
| && base_job_ptr->array_recs) { |
| FREE_NULL_BITMAP( |
| base_job_ptr->details->exc_node_bitmap); |
| if (orig_exc_nodes) |
| base_job_ptr->details->exc_node_bitmap = |
| bit_copy(orig_exc_nodes); |
| } |
| } |
| if (job_ptr->details) { /* select_nodes() might reset exc_node_bitmap */ |
| FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); |
| job_ptr->details->exc_node_bitmap = orig_exc_nodes; |
| } else |
| FREE_NULL_BITMAP(orig_exc_nodes); |
| if (rc == SLURM_SUCCESS) { |
| /* job initiated */ |
| last_job_update = time(NULL); |
| info("Started %pJ in %s on %s", |
| job_ptr, job_ptr->part_ptr->name, job_ptr->nodes); |
| if (job_ptr->batch_flag == 0) |
| srun_allocate(job_ptr); |
| else if (!IS_JOB_CONFIGURING(job_ptr)) |
| launch_job(job_ptr); |
| slurmctld_diag_stats.backfilled_jobs++; |
| slurmctld_diag_stats.last_backfilled_jobs++; |
| if (job_ptr->het_job_id) |
| slurmctld_diag_stats.backfilled_het_jobs++; |
| log_flag(BACKFILL, "Jobs backfilled since boot: %u", |
| slurmctld_diag_stats.backfilled_jobs); |
| } else if ((job_ptr->job_id != fail_jobid) && |
| (rc != ESLURM_ACCOUNTING_POLICY)) { |
| char *node_list; |
| bit_not(resv_bitmap); |
| node_list = bitmap2node_name(resv_bitmap); |
| /* This happens when a job has sharing disabled and |
| * a selected node is still completing some job, |
| * which should be a temporary situation. */ |
| verbose("Failed to start %pJ with %s avail: %s", |
| job_ptr, node_list, slurm_strerror(rc)); |
| xfree(node_list); |
| fail_jobid = job_ptr->job_id; |
| } else { |
| debug3("Failed to start %pJ: %s", |
| job_ptr, slurm_strerror(rc)); |
| } |
| |
| return rc; |
| } |
| |
| /* |
| * Compute a job's maximum time based upon conflicts in resources |
| * planned for use by other jobs and that job's min/max time limit. |
| * Return NO_VAL if no restriction |
| */ |
| static uint32_t _get_job_max_tl(job_record_t *job_ptr, time_t now, |
| node_space_map_t *node_space) |
| { |
| int32_t j; |
| time_t comp_time = 0; |
| uint32_t max_tl = NO_VAL; |
| |
| if (job_ptr->time_min == 0) |
| return max_tl; |
| |
| for (j = 0; ; ) { |
| if ((node_space[j].begin_time != now) && /* no current conflict */ |
| (node_space[j].begin_time < job_ptr->end_time) && |
| (!bit_super_set(job_ptr->node_bitmap, |
| node_space[j].avail_bitmap) || |
| !bf_licenses_avail(node_space[j].licenses, job_ptr, |
| job_ptr->node_bitmap))) { |
| /* Job overlaps pending job's resource reservation */ |
| if ((comp_time == 0) || |
| (comp_time > node_space[j].begin_time)) |
| comp_time = node_space[j].begin_time; |
| } |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| |
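| /* Convert the interval until the earliest conflict into a time |
| * limit, rounding up to whole minutes */ |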
| if (comp_time != 0) |
| max_tl = (comp_time - now + 59) / 60; |
| |
| return max_tl; |
| } |
| |
| /* |
| * Reset a job's time limit (and end_time) as high as possible |
| * within the range job_ptr->time_min and job_ptr->time_limit. |
| * Avoid using resources reserved for pending jobs or in resource |
| * reservations |
| */ |
| static void _reset_job_time_limit(job_record_t *job_ptr, time_t now, |
| node_space_map_t *node_space) |
| { |
| int32_t j, resv_delay; |
| uint32_t orig_time_limit = job_ptr->time_limit; |
| uint32_t new_time_limit; |
| |
| for (j = 0; ; ) { |
| if ((node_space[j].begin_time != now) && /* no current conflict */ |
| (node_space[j].begin_time < job_ptr->end_time) && |
| (!bit_super_set(job_ptr->node_bitmap, |
| node_space[j].avail_bitmap))) { |
| /* Job overlaps pending job's resource reservation */ |
| resv_delay = difftime(node_space[j].begin_time, now); |
| resv_delay /= 60; /* seconds to minutes */ |
| if (resv_delay < job_ptr->time_limit) |
| job_ptr->time_limit = resv_delay; |
| } |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| new_time_limit = MAX(job_ptr->time_min, job_ptr->time_limit); |
| acct_policy_alter_job(job_ptr, new_time_limit); |
| job_ptr->time_limit = new_time_limit; |
| job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60); |
| |
| job_time_adj_resv(job_ptr); |
| |
| if (orig_time_limit != job_ptr->time_limit) { |
| info("%pJ time limit changed from %u to %u", |
| job_ptr, orig_time_limit, job_ptr->time_limit); |
| } |
| } |
| |
| /* |
| * Report if any changes occurred to job, node, reservation |
| * or partition information |
| */ |
| static bool _more_work(time_t last_backfill_time) |
| { |
| bool rc = false; |
| |
| if ((last_job_update >= last_backfill_time) || |
| (last_node_update >= last_backfill_time) || |
| (last_part_update >= last_backfill_time) || |
| (last_resv_update >= last_backfill_time)) { |
| rc = true; |
| } |
| |
| return rc; |
| } |
| |
| /* Create a reservation for a job in the future */ |
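| /* |
| * node_space is a linked list threaded through an array: each record |
| * covers one time slot and the list ends when node_space[j].next == 0. |
| * Illustrative example: given slots [now, T1) and [T1, end), adding a |
| * reservation spanning [S, E) with T1 < S < E < end splits the map into |
| * [now, T1), [T1, S), [S, E) and [E, end), then removes the job's nodes |
| * from the avail_bitmap of every slot overlapping [S, E). |
| */ |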
| static void _add_reservation(time_t start_time, time_t end_reserve, |
| bitstr_t *res_bitmap, job_record_t *job_ptr, |
| node_space_map_t *node_space, int *node_space_recs, |
| time_t orig_start_time) |
| { |
| bool placed = false; |
| int i, j, one_before = 0, one_after = -1; |
| bitstr_t *res_bitmap_orig = res_bitmap; |
| bitstr_t *res_bitmap_efctv = NULL; |
| |
| #if 0 |
| info("add job start:%u end:%u", start_time, end_reserve); |
| for (j = 0; ; ) { |
| info("node start:%u end:%u", |
| (uint32_t) node_space[j].begin_time, |
| (uint32_t) node_space[j].end_time); |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| #endif |
| if (res_bitmap) { |
| if (IS_JOB_WHOLE_TOPO(job_ptr)) { |
| res_bitmap_efctv = bit_copy(res_bitmap); |
| topology_g_whole_topo(res_bitmap_efctv, |
| job_ptr->part_ptr->topology_idx); |
| res_bitmap = res_bitmap_efctv; |
| } |
| |
| if (!IS_JOB_RUNNING(job_ptr) && |
| ((orig_start_time == 0) || |
| (job_ptr->start_time < orig_start_time))) { |
| /* Can't start earlier in different partition. */ |
| xfree(job_ptr->sched_nodes); |
| job_ptr->sched_nodes = bitmap2node_name(res_bitmap); |
| /* |
| * These nodes are planned. We will set the state |
| * afterwards. |
| */ |
| bit_or(planned_bitmap, res_bitmap); |
| } |
| } |
| |
| start_time = MAX(start_time, node_space[0].begin_time); |
| /* |
| * Ensure that the job always occupies at least one bf_resolution |
| * slot within the map. This also fixes potential issues when |
| * running with bf_running_job_reserve if jobs have run past |
| * their timelimit but have not yet been terminated. |
| */ |
| if (end_reserve < (start_time + backfill_resolution)) |
| end_reserve = start_time + backfill_resolution; |
| for (j = 0; ; ) { |
| if (node_space[j].end_time > start_time) { |
| /* insert start entry record */ |
| i = *node_space_recs; |
| node_space[i].begin_time = start_time; |
| node_space[i].end_time = node_space[j].end_time; |
| node_space[j].end_time = start_time; |
| COPY_BITMAP(node_space[i].avail_bitmap, |
| node_space[j].avail_bitmap); |
| node_space[i].licenses = |
| bf_licenses_copy(node_space[j].licenses); |
| node_space[i].fragmentation = |
| node_space[j].fragmentation; |
| node_space[i].next = node_space[j].next; |
| node_space[j].next = i; |
| (*node_space_recs)++; |
| placed = true; |
| break; |
| } |
| if (node_space[j].end_time == start_time) { |
| /* no need to insert new start entry record */ |
| placed = true; |
| break; |
| } |
| one_before = j; |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| |
| while (placed && (j = node_space[j].next)) { |
| if (end_reserve < node_space[j].end_time) { |
| /* insert end entry record */ |
| i = *node_space_recs; |
| node_space[i].begin_time = end_reserve; |
| node_space[i].end_time = node_space[j].end_time; |
| node_space[j].end_time = end_reserve; |
| COPY_BITMAP(node_space[i].avail_bitmap, |
| node_space[j].avail_bitmap); |
| node_space[i].licenses = |
| bf_licenses_copy(node_space[j].licenses); |
| node_space[i].fragmentation = |
| node_space[j].fragmentation; |
| node_space[i].next = node_space[j].next; |
| node_space[j].next = i; |
| (*node_space_recs)++; |
| } |
| |
| /* merge in new usage with this record */ |
| if (res_bitmap) { |
| bitstr_t *node_bitmap_orig = job_ptr->node_bitmap; |
| bit_and_not(node_space[j].avail_bitmap, res_bitmap); |
| if (!IS_JOB_RUNNING(job_ptr)) |
| job_ptr->node_bitmap = res_bitmap_orig; |
| bf_licenses_deduct(node_space[j].licenses, job_ptr); |
| if (!IS_JOB_RUNNING(job_ptr)) |
| job_ptr->node_bitmap = node_bitmap_orig; |
| if (bf_topopt_enable) { |
| node_space[j].fragmentation = |
| topology_g_get_fragmentation( |
| node_space[j].avail_bitmap); |
| } |
| } else { |
| /* setting up reservation licenses */ |
| bf_licenses_transfer(node_space[j].licenses, job_ptr); |
| } |
| |
| if (end_reserve == node_space[j].end_time) { |
| if (node_space[j].next) |
| one_after = node_space[j].next; |
| break; |
| } |
| } |
| |
| /* Merge adjacent records with identical bitmaps and licenses (at most |
| * one merge per call). This can significantly improve performance of |
| * the backfill tests. */ |
| for (i = one_before; i != one_after; ) { |
| if ((j = node_space[i].next) == 0) |
| break; |
| if (!bf_licenses_equal(node_space[i].licenses, |
| node_space[j].licenses)) { |
| i = j; |
| continue; |
| } |
| if (!bit_equal(node_space[i].avail_bitmap, |
| node_space[j].avail_bitmap)) { |
| i = j; |
| continue; |
| } |
| node_space[i].end_time = node_space[j].end_time; |
| node_space[i].next = node_space[j].next; |
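| /* Park the merged record's bitmap in an unused slot beyond the |
| * active list so a later insertion can reuse it */ |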
| if (node_space[j].avail_bitmap) { |
| for (i = *node_space_recs; |
| i <= bf_node_space_size; i++) { |
| if (!node_space[i].avail_bitmap) { |
| node_space[i].avail_bitmap = |
| node_space[j].avail_bitmap; |
| node_space[j].avail_bitmap = NULL; |
| break; |
| } |
| } |
| } |
| FREE_NULL_BITMAP(node_space[j].avail_bitmap); |
| FREE_NULL_BF_LICENSES(node_space[j].licenses); |
| break; |
| } |
| FREE_NULL_BITMAP(res_bitmap_efctv); |
| } |
| |
| /* |
| * Determine if the resource specification for a new job overlaps with a |
| * reservation that the backfill scheduler has made for a job to be |
| * started in the future. |
| * IN node_space - map of resources reserved through time |
| * IN use_bitmap - nodes to be allocated |
| * IN job_ptr - used for license and reservation info |
| * IN start_time - start time of job |
| * IN end_reserve - end time of job |
| */ |
| static bool _test_resv_overlap(node_space_map_t *node_space, |
| bitstr_t *use_bitmap, job_record_t *job_ptr, |
| uint32_t start_time, uint32_t end_reserve) |
| { |
| bool overlap = false; |
| int j = 0; |
| bitstr_t *use_bitmap_efctv = NULL; |
| bitstr_t *use_bitmap_orig = use_bitmap; |
| |
| if (IS_JOB_WHOLE_TOPO(job_ptr)) { |
| use_bitmap_efctv = bit_copy(use_bitmap); |
| topology_g_whole_topo(use_bitmap_efctv, |
| job_ptr->part_ptr->topology_idx); |
| use_bitmap = use_bitmap_efctv; |
| } |
| |
| while (true) { |
| if ((node_space[j].end_time > start_time) && |
| (node_space[j].begin_time < end_reserve)) { |
| /* |
| * Jobs will run concurrently. |
| * Do they conflict for resources? |
| */ |
| if (!bit_super_set(use_bitmap, |
| node_space[j].avail_bitmap)) { |
| overlap = true; |
| break; |
| } |
| if (!bf_licenses_avail(node_space[j].licenses, job_ptr, |
| use_bitmap_orig)) { |
| overlap = true; |
| break; |
| } |
| } |
| |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| FREE_NULL_BITMAP(use_bitmap_efctv); |
| return overlap; |
| } |
| |
| /* |
| * Delete het_job_map_t record from het_job_list |
| */ |
| static void _het_job_map_del(void *x) |
| { |
| het_job_map_t *map = (het_job_map_t *) x; |
| FREE_NULL_LIST(map->het_job_rec_list); |
| xfree(map); |
| } |
| |
| /* |
| * Return 1 if a het_job_map_t record with a specific het_job_id is found. |
| * Always return 1 if "key" is NULL. |
| */ |
| static int _het_job_find_map(void *x, void *key) |
| { |
| het_job_map_t *map = (het_job_map_t *) x; |
| uint32_t *het_job_id = (uint32_t *) key; |
| |
| if ((het_job_id == NULL) || |
| (map->het_job_id == *het_job_id)) |
| return 1; |
| return 0; |
| } |
| |
| /* |
| * Return 1 if a het_job_rec_t record with a specific job_id is found. |
| */ |
| static int _het_job_find_rec(void *x, void *key) |
| { |
| het_job_rec_t *rec = (het_job_rec_t *) x; |
| uint32_t *job_id = (uint32_t *) key; |
| |
| if (rec->job_id == *job_id) |
| return 1; |
| return 0; |
| } |
| |
| /* |
| * Remove vestigial elements from het_job_list. For each still-active |
| * element, clear the previously computed start time. This is used to |
| * periodically clear history so that heterogeneous jobs do not keep |
| * getting deferred based upon old system state. |
| */ |
| static void _het_job_start_clear(void) |
| { |
| het_job_map_t *map; |
| list_itr_t *iter; |
| |
| iter = list_iterator_create(het_job_list); |
| while ((map = list_next(iter))) { |
| if (map->prev_start == 0) { |
| list_delete_item(iter); |
| } else { |
| map->prev_start = 0; |
| list_flush(map->het_job_rec_list); |
| } |
| } |
| list_iterator_destroy(iter); |
| } |
| |
| /* |
| * For a given het_job_map_t record, determine the earliest that it can start, |
| * which is the time at which its latest-starting component begins. The |
| * "exclude_job_id" is used to exclude a hetjob component currently being |
| * tested to start, presumably in a different partition. |
| */ |
| static time_t _het_job_start_compute(het_job_map_t *map, |
| uint32_t exclude_job_id) |
| { |
| list_itr_t *iter; |
| het_job_rec_t *rec; |
| time_t latest_start = map->prev_start; |
| |
| iter = list_iterator_create(map->het_job_rec_list); |
| while ((rec = list_next(iter))) { |
| if (rec->job_id == exclude_job_id) |
| continue; |
| latest_start = MAX(latest_start, rec->latest_start); |
| } |
| list_iterator_destroy(iter); |
| |
| return latest_start; |
| } |
| |
| /* |
| * Return the earliest that a job can start based upon _other_ components of |
| * that same heterogeneous job. Return 0 if no limitation. |
| */ |
| static time_t _het_job_start_find(job_record_t *job_ptr) |
| { |
| het_job_map_t *map; |
| time_t latest_start = (time_t) 0; |
| |
| if (job_ptr->het_job_id) { |
| map = list_find_first(het_job_list, _het_job_find_map, |
| &job_ptr->het_job_id); |
| if (map) { |
| latest_start = _het_job_start_compute(map, |
| job_ptr->job_id); |
| } |
| |
| log_flag(HETJOB, "%pJ in partition %s expected to start in %ld secs", |
| job_ptr, job_ptr->part_ptr->name, |
| MAX(0, latest_start - time(NULL))); |
| } |
| |
| return latest_start; |
| } |
| |
| /* |
| * Record the earliest that a hetjob component can start. If it can be |
| * started in multiple partitions, record only the earliest start time |
| * for the job across all partitions and reservations. |
| */ |
| static void _het_job_start_set(job_record_t *job_ptr, time_t latest_start, |
| uint32_t comp_time_limit) |
| { |
| het_job_map_t *map; |
| het_job_rec_t *rec; |
| |
| if (comp_time_limit == NO_VAL) |
| comp_time_limit = job_ptr->time_limit; |
| if (job_ptr->het_job_id) { |
| map = list_find_first(het_job_list, _het_job_find_map, |
| &job_ptr->het_job_id); |
| if (map) { |
| if (!map->comp_time_limit) { |
| map->comp_time_limit = comp_time_limit; |
| } else { |
| map->comp_time_limit = MIN(map->comp_time_limit, |
| comp_time_limit); |
| } |
| rec = list_find_first(map->het_job_rec_list, |
| _het_job_find_rec, |
| &job_ptr->job_id); |
| if (rec && (rec->latest_start <= latest_start)) { |
| /* |
| * This job can start an earlier time in |
| * some other partition, so ignore new info |
| */ |
| } else if (rec) { |
| rec->latest_start = latest_start; |
| rec->part_ptr = job_ptr->part_ptr; |
| rec->resv_ptr = job_ptr->resv_ptr; |
| } else { |
| rec = xmalloc(sizeof(het_job_rec_t)); |
| rec->job_id = job_ptr->job_id; |
| rec->job_ptr = job_ptr; |
| rec->latest_start = latest_start; |
| rec->part_ptr = job_ptr->part_ptr; |
| rec->resv_ptr = job_ptr->resv_ptr; |
| list_append(map->het_job_rec_list, rec); |
| } |
| } else { |
| rec = xmalloc(sizeof(het_job_rec_t)); |
| rec->job_id = job_ptr->job_id; |
| rec->job_ptr = job_ptr; |
| rec->latest_start = latest_start; |
| rec->part_ptr = job_ptr->part_ptr; |
| rec->resv_ptr = job_ptr->resv_ptr; |
| |
| map = xmalloc(sizeof(het_job_map_t)); |
| map->comp_time_limit = comp_time_limit; |
| map->het_job_id = job_ptr->het_job_id; |
| map->het_job_rec_list = list_create(xfree_ptr); |
| list_append(map->het_job_rec_list, rec); |
| list_append(het_job_list, map); |
| } |
| |
| log_flag(HETJOB, "%pJ in partition %s set to start in %ld secs", |
| job_ptr, job_ptr->part_ptr->name, |
| MAX(0, _het_job_start_compute(map, 0) - time(NULL))); |
| } |
| } |
| |
| /* |
| * Return TRUE if we have expected start times for all components of a hetjob |
| * and all components are valid and runnable. |
| * |
| * NOTE: This should never happen, but we will also start the job if all of |
| * the other components are already running. |
| */ |
| static bool _het_job_full(het_job_map_t *map) |
| { |
| job_record_t *het_job_ptr, *job_ptr; |
| list_itr_t *iter; |
| bool rc = true; |
| |
| het_job_ptr = find_job_record(map->het_job_id); |
| if (!het_job_ptr || !het_job_ptr->het_job_list || |
| (!IS_JOB_RUNNING(het_job_ptr) && |
| !_job_runnable_now(het_job_ptr))) { |
| return false; |
| } |
| |
| iter = list_iterator_create(het_job_ptr->het_job_list); |
| while ((job_ptr = list_next(iter))) { |
| if ((job_ptr->magic != JOB_MAGIC) || |
| (job_ptr->het_job_id != map->het_job_id)) { |
| rc = false; /* bad job pointer */ |
| break; |
| } |
| if (IS_JOB_RUNNING(job_ptr)) |
| continue; |
| if (!list_find_first(map->het_job_rec_list, _het_job_find_rec, |
| &job_ptr->job_id) || |
| !_job_runnable_now(job_ptr)) { |
| rc = false; |
| break; |
| } |
| } |
| list_iterator_destroy(iter); |
| |
| return rc; |
| } |
| |
| /* |
| * Determine if all components of a hetjob can be started now or are |
| * prevented from doing so because of association or QOS limits. |
| * Return true if they can all start. |
| * |
| * NOTE: A hetjob passing this test is not guaranteed to run. For example, |
| * this test assumes resource allocation at the CPU level. If each task is |
| * allocated one core with two CPUs, then the CPU limit test would not be |
| * accurate. |
| */ |
| static bool _het_job_limit_check(het_job_map_t *map, time_t now) |
| { |
| job_record_t *job_ptr; |
| het_job_rec_t *rec; |
| list_itr_t *iter; |
| int begun_jobs = 0, fini_jobs = 0, slurmctld_tres_size; |
| bool runnable = true; |
| uint32_t selected_node_cnt; |
| uint64_t tres_req_cnt[slurmctld_tres_cnt]; |
| uint64_t **tres_alloc_save = NULL; |
| |
| tres_alloc_save = xcalloc(list_count(map->het_job_rec_list), |
| sizeof(uint64_t *)); |
| slurmctld_tres_size = sizeof(uint64_t) * slurmctld_tres_cnt; |
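| /* |
| * First pass: tentatively "begin" each component against association |
| * and QOS limits. A second pass below rolls each one back by |
| * simulating completion and restoring the saved TRES allocations. |
| */ |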
| iter = list_iterator_create(map->het_job_rec_list); |
| while ((rec = list_next(iter))) { |
| uint16_t sockets_per_node; |
| assoc_mgr_lock_t locks = { |
| .assoc = READ_LOCK, |
| .qos = WRITE_LOCK, |
| .tres = READ_LOCK, |
| }; |
| |
| job_ptr = rec->job_ptr; |
| job_ptr->part_ptr = rec->part_ptr; |
| if (rec->resv_ptr) { |
| job_ptr->resv_ptr = rec->resv_ptr; |
| job_ptr->resv_id = job_ptr->resv_ptr->resv_id; |
| } |
| selected_node_cnt = job_ptr->node_cnt_wag; |
| memcpy(tres_req_cnt, job_ptr->tres_req_cnt, |
| slurmctld_tres_size); |
| tres_req_cnt[TRES_ARRAY_CPU] = (uint64_t)(job_ptr->total_cpus ? |
| job_ptr->total_cpus : |
| job_ptr->details->min_cpus); |
| sockets_per_node = job_get_sockets_per_node(job_ptr); |
| tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem( |
| job_ptr->job_resrcs, |
| job_ptr->details->pn_min_memory, |
| tres_req_cnt[TRES_ARRAY_CPU], |
| selected_node_cnt, |
| job_ptr->part_ptr, |
| job_ptr->gres_list_req, |
| (job_ptr->bit_flags & |
| JOB_MEM_SET), sockets_per_node, |
| job_ptr->details->num_tasks); |
| tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)selected_node_cnt; |
| |
| assoc_mgr_lock(&locks); |
| gres_stepmgr_set_job_tres_cnt(job_ptr->gres_list_req, |
| selected_node_cnt, |
| tres_req_cnt, true); |
| |
| tres_req_cnt[TRES_ARRAY_BILLING] = |
| assoc_mgr_tres_weighted( |
| tres_req_cnt, |
| job_ptr->part_ptr->billing_weights, |
| slurm_conf.priority_flags, true); |
| |
| if (acct_policy_job_runnable_pre_select(job_ptr, true) && |
| acct_policy_job_runnable_post_select(job_ptr, |
| tres_req_cnt, true)) { |
| assoc_mgr_unlock(&locks); |
| tres_alloc_save[begun_jobs++] = job_ptr->tres_alloc_cnt; |
| job_ptr->tres_alloc_cnt = xmalloc(slurmctld_tres_size); |
| memcpy(job_ptr->tres_alloc_cnt, tres_req_cnt, |
| slurmctld_tres_size); |
| acct_policy_job_begin(job_ptr, false); |
| |
| } else { |
| assoc_mgr_unlock(&locks); |
| runnable = false; |
| break; |
| } |
| } |
| |
| list_iterator_reset(iter); |
| while ((rec = list_next(iter))) { |
| job_ptr = rec->job_ptr; |
| if (begun_jobs > fini_jobs) { |
| time_t end_time_exp = job_ptr->end_time_exp; |
| time_t end_time = job_ptr->end_time; |
| uint32_t job_state = job_ptr->job_state; |
| /* Simulate normal job completion */ |
| job_ptr->end_time_exp = now; |
| job_ptr->end_time = job_ptr->start_time; |
| job_state_set(job_ptr, (JOB_COMPLETE | JOB_COMPLETING)); |
| acct_policy_job_fini(job_ptr, false); |
| job_ptr->end_time_exp = end_time_exp; |
| job_ptr->end_time = end_time; |
| job_state_set(job_ptr, job_state); |
| xfree(job_ptr->tres_alloc_cnt); |
| job_ptr->tres_alloc_cnt = tres_alloc_save[fini_jobs++]; |
| } |
| } |
| list_iterator_destroy(iter); |
| xfree(tres_alloc_save); |
| |
| return runnable; |
| } |
| |
| /* |
| * Start all components of a hetjob now |
| */ |
| static int _het_job_start_now(het_job_map_t *map, node_space_map_t *node_space) |
| { |
| job_record_t *job_ptr; |
| bitstr_t *avail_bitmap = NULL; |
| bitstr_t *resv_bitmap = NULL, *used_bitmap = NULL; |
| het_job_rec_t *rec; |
| list_itr_t *iter; |
| int rc = SLURM_SUCCESS; |
| bool resv_overlap = false; |
| time_t now = time(NULL), start_res; |
| uint32_t hard_limit; |
| resv_exc_t resv_exc = { 0 }; |
| |
| iter = list_iterator_create(map->het_job_rec_list); |
| while ((rec = list_next(iter))) { |
| bool reset_time = false; |
| job_ptr = rec->job_ptr; |
| job_ptr->part_ptr = rec->part_ptr; |
| if (rec->resv_ptr) { |
| job_ptr->resv_ptr = rec->resv_ptr; |
| job_ptr->resv_id = job_ptr->resv_ptr->resv_id; |
| } |
| |
| /* |
| * Identify the nodes which this job can use |
| */ |
| start_res = now; |
| rc = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, |
| &resv_exc, &resv_overlap, false); |
| reservation_delete_resv_exc_parts(&resv_exc); |
| if (rc != SLURM_SUCCESS) { |
| error("%pJ failed to start due to reservation", |
| job_ptr); |
| FREE_NULL_BITMAP(avail_bitmap); |
| break; |
| } |
| bit_and(avail_bitmap, job_ptr->part_ptr->node_bitmap); |
| bit_and(avail_bitmap, up_node_bitmap); |
| if (used_bitmap) |
| bit_and_not(avail_bitmap, used_bitmap); |
| if (job_ptr->details->exc_node_bitmap) { |
| bit_and_not(avail_bitmap, |
| job_ptr->details->exc_node_bitmap); |
| } |
| |
| if (fed_mgr_job_lock(job_ptr)) { |
| error("%pJ failed to start due to fed job lock", |
| job_ptr); |
| FREE_NULL_BITMAP(avail_bitmap); |
| continue; |
| } |
| |
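| /* _start_job() expects the nodes the job must avoid, so invert |
| * the available-node bitmap to build the exclusion set */ |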
| resv_bitmap = avail_bitmap; |
| avail_bitmap = NULL; |
| bit_not(resv_bitmap); |
| rc = _start_job(job_ptr, resv_bitmap); |
| FREE_NULL_BITMAP(resv_bitmap); |
| if (rc == SLURM_SUCCESS) { |
| /* |
| * If the following fails because of network |
| * connectivity, the origin cluster should ask |
| * when it comes back up if the cluster_lock |
| * cluster actually started the job |
| */ |
| fed_mgr_job_start(job_ptr, job_ptr->start_time); |
| log_flag(HETJOB, "%pJ started", job_ptr); |
| if (!used_bitmap && job_ptr->node_bitmap) |
| used_bitmap = bit_copy(job_ptr->node_bitmap); |
| else if (job_ptr->node_bitmap) |
| bit_or(used_bitmap, job_ptr->node_bitmap); |
| } else { |
| fed_mgr_job_unlock(job_ptr); |
| break; |
| } |
| if (job_ptr->time_min) { |
| /* Set time limit as high as possible */ |
| acct_policy_alter_job(job_ptr, map->comp_time_limit); |
| job_ptr->time_limit = map->comp_time_limit; |
| reset_time = true; |
| } |
| if (job_ptr->start_time) { |
| if (job_ptr->time_limit == INFINITE) |
| hard_limit = YEAR_SECONDS; |
| else |
| hard_limit = job_ptr->time_limit * 60; |
| job_ptr->end_time = job_ptr->start_time + hard_limit; |
| /* |
| * Only reset the time limit if start_time is set. end_time |
| * must be set beforehand for _reset_job_time_limit(). |
| */ |
| if (reset_time) |
| _reset_job_time_limit(job_ptr, now, node_space); |
| } |
| if (reset_time) |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| } |
| list_iterator_destroy(iter); |
| FREE_NULL_BITMAP(used_bitmap); |
| |
| return rc; |
| } |
| |
| /* |
| * Deallocate all components if the hetjob failed to start |
| */ |
| static void _het_job_kill_now(het_job_map_t *map) |
| { |
| job_record_t *job_ptr; |
| het_job_rec_t *rec; |
| list_itr_t *iter; |
| time_t now = time(NULL); |
| int cred_lifetime = 1200; |
| uint32_t save_bitflags; |
| |
| cred_lifetime = cred_expiration(); |
| iter = list_iterator_create(map->het_job_rec_list); |
| while ((rec = list_next(iter))) { |
| job_ptr = rec->job_ptr; |
| if (IS_JOB_PENDING(job_ptr)) |
| continue; |
| info("Deallocate %pJ due to hetjob start failure", |
| job_ptr); |
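| /* Hold the requeued job until any launch credentials already |
| * issued for it have expired */ |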
| job_ptr->details->begin_time = now + cred_lifetime + 1; |
| job_ptr->end_time = now; |
| job_state_set(job_ptr, (JOB_PENDING | JOB_COMPLETING)); |
| last_job_update = now; |
| build_cg_bitmap(job_ptr); |
| job_completion_logger(job_ptr, false); |
| deallocate_nodes(job_ptr, false, false, false); |
| /* |
| * Since the job_completion_logger() removes the submit, |
| * we need to add it again, but don't stage-out burst buffer |
| */ |
| save_bitflags = job_ptr->bit_flags; |
| job_ptr->bit_flags |= JOB_KILL_HURRY; |
| acct_policy_add_job_submit(job_ptr, false); |
| job_ptr->bit_flags = save_bitflags; |
| if (!job_ptr->node_bitmap_cg || |
| (bit_set_count(job_ptr->node_bitmap_cg) == 0)) |
| batch_requeue_fini(job_ptr); |
| } |
| list_iterator_destroy(iter); |
| } |
| |
| /* |
| * If all components of a heterogeneous job can start now, then do so |
| * node_space IN - map of available resources through time |
| * map IN - info about this heterogeneous job |
| * single IN - true if testing a single heterogeneous job |
| * Return true if the heterogeneous job was started |
| */ |
| static bool _het_job_start_test_single(node_space_map_t *node_space, |
| het_job_map_t *map, bool single) |
| { |
| time_t now = time(NULL); |
| int rc; |
| |
| if (!map) |
| return false; |
| |
| if (!_het_job_full(map)) { |
| log_flag(HETJOB, "Hetjob %u has indefinite start time", |
| map->het_job_id); |
| if (!single) |
| map->prev_start = now + YEAR_SECONDS; |
| return false; |
| } |
| |
| map->prev_start = _het_job_start_compute(map, 0); |
| if (map->prev_start > now) { |
| log_flag(HETJOB, "Hetjob %u should be able to start in %u seconds", |
| map->het_job_id, (uint32_t) (map->prev_start - now)); |
| return false; |
| } |
| |
| if (!_het_job_limit_check(map, now)) { |
| log_flag(HETJOB, "Hetjob %u prevented from starting by account/QOS limit", |
| map->het_job_id); |
| |
| map->prev_start = now + YEAR_SECONDS; |
| return false; |
| } |
| |
| log_flag(HETJOB, "Attempting to start hetjob %u", map->het_job_id); |
| |
| rc = _het_job_start_now(map, node_space); |
| if (rc != SLURM_SUCCESS) { |
| log_flag(HETJOB, "Failed to start hetjob %u", map->het_job_id); |
| _het_job_kill_now(map); |
| } else { |
| job_start_cnt += list_count(map->het_job_rec_list); |
| if (max_backfill_jobs_start && |
| (job_start_cnt >= max_backfill_jobs_start)) { |
| log_flag(BACKFILL, "bf_max_job_start limit of %d reached", |
| max_backfill_jobs_start); |
| } |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static int _het_job_start_test_list(void *map, void *node_space) |
| { |
| if (!max_backfill_jobs_start || |
| (job_start_cnt < max_backfill_jobs_start)) |
| _het_job_start_test_single(node_space, map, false); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _foreach_add_job_to_nodes_used(void *x, void *arg) |
| { |
| het_job_rec_t *het_rec = x; |
| node_used_t *nodes_used = arg; |
| |
| if (_mark_nodes_usage(het_rec->job_ptr, nodes_used)) |
| nodes_used->needs_sorting = true; |
| |
| return 0; |
| } |
| |
| /* |
| * If all components of a heterogeneous job can start now, then do so |
| * node_space IN - map of available resources through time |
| * het_job_id IN - the ID of the heterogeneous job to evaluate, |
| * if zero, then evaluate all heterogeneous jobs and |
| * nodes_used/nodes_used_list are not updated |
| * nodes_used IN/OUT - array of node usage used for exclusive filtering |
| * nodes_used_list IN/OUT - list of node usage used for exclusive filtering |
| */ |
| static void _het_job_start_test(node_space_map_t *node_space, |
| uint32_t het_job_id, node_used_t *nodes_used, |
| list_t *nodes_used_list) |
| { |
| het_job_map_t *map = NULL; |
| |
| if (!het_job_id) { |
| /* Test all maps. */ |
| (void)list_for_each(het_job_list, |
| _het_job_start_test_list, node_space); |
| } else { |
| /* Test single map. */ |
| map = list_find_first(het_job_list, _het_job_find_map, |
| &het_job_id); |
| if (_het_job_start_test_single(node_space, map, true)) { |
| nodes_used->needs_sorting = false; |
| (void) list_for_each(map->het_job_rec_list, |
| _foreach_add_job_to_nodes_used, |
| nodes_used); |
| if (nodes_used->needs_sorting) { |
| nodes_used->needs_sorting = false; |
| list_sort(nodes_used_list, _cmp_last_job_end); |
| } |
| } |
| } |
| } |
| |
| static void _deadlock_global_list_del(void *x) |
| { |
| deadlock_part_struct_t *dl_part_ptr = (deadlock_part_struct_t *) x; |
| FREE_NULL_LIST(dl_part_ptr->deadlock_job_list); |
| xfree(dl_part_ptr); |
| } |
| |
| static int _deadlock_part_list_srch(void *x, void *key) |
| { |
| deadlock_job_struct_t *dl_job = (deadlock_job_struct_t *) x; |
| job_record_t *job_ptr = (job_record_t *) key; |
| if (dl_job->het_job_id == job_ptr->het_job_id) |
| return 1; |
| return 0; |
| } |
| |
| static int _deadlock_part_list_srch2(void *x, void *key) |
| { |
| deadlock_job_struct_t *dl_job = (deadlock_job_struct_t *) x; |
| deadlock_job_struct_t *dl_job2 = (deadlock_job_struct_t *) key; |
| if (dl_job->het_job_id == dl_job2->het_job_id) |
| return 1; |
| return 0; |
| } |
| |
| static int _deadlock_global_list_srch(void *x, void *key) |
| { |
| deadlock_part_struct_t *dl_part = (deadlock_part_struct_t *) x; |
| if (dl_part->part_ptr == (part_record_t *) key) |
| return 1; |
| return 0; |
| } |
| |
| static int _deadlock_job_list_sort(void *x, void *y) |
| { |
| deadlock_job_struct_t *dl_job_ptr1 = *(deadlock_job_struct_t **) x; |
| deadlock_job_struct_t *dl_job_ptr2 = *(deadlock_job_struct_t **) y; |
| if (dl_job_ptr1->start_time > dl_job_ptr2->start_time) |
| return -1; |
| else if (dl_job_ptr1->start_time < dl_job_ptr2->start_time) |
| return 1; |
| return 0; |
| } |
| |
| /* |
| * Call at end of backfill execution to release memory allocated by |
| * _het_job_deadlock_test() |
| */ |
| static void _het_job_deadlock_fini(void) |
| { |
| FREE_NULL_LIST(deadlock_global_list); |
| } |
| |
| /* |
| * Determine if the job can run at its "start_time" or later. |
| * job_ptr IN - job to test, set reason to "HET_JOB_DEADLOCK" if it will deadlock |
| * RET true if the job cannot run due to a possible deadlock with another hetjob |
| * |
| * NOTE: If there are a large number of hetjobs this will be painfully slow |
| * as the algorithm is O(n^2) |
| */ |
| static bool _het_job_deadlock_test(job_record_t *job_ptr) |
| { |
| deadlock_job_struct_t *dl_job_ptr = NULL, *dl_job_ptr2 = NULL; |
| deadlock_job_struct_t *dl_job_ptr3 = NULL; |
| deadlock_part_struct_t *dl_part_ptr = NULL, *dl_part_ptr2 = NULL; |
| list_itr_t *job_iter, *part_iter; |
| bool have_deadlock = false; |
| |
| if (!job_ptr->het_job_id || !job_ptr->part_ptr) |
| return false; |
| |
| /* |
| * Find the list representing the ordering of jobs in this specific |
| * partition and add this job in the list, sorted by job start time |
| */ |
| if (!deadlock_global_list) { |
| deadlock_global_list = list_create(_deadlock_global_list_del); |
| } else { |
| dl_part_ptr = list_find_first(deadlock_global_list, |
| _deadlock_global_list_srch, |
| job_ptr->part_ptr); |
| } |
| if (!dl_part_ptr) { |
| dl_part_ptr = xmalloc(sizeof(deadlock_part_struct_t)); |
| dl_part_ptr->deadlock_job_list = list_create(xfree_ptr); |
| dl_part_ptr->part_ptr = job_ptr->part_ptr; |
| list_append(deadlock_global_list, dl_part_ptr); |
| } else { |
| dl_job_ptr = list_find_first(dl_part_ptr->deadlock_job_list, |
| _deadlock_part_list_srch, |
| job_ptr); |
| } |
| if (!dl_job_ptr) { |
| dl_job_ptr = xmalloc(sizeof(deadlock_job_struct_t)); |
| dl_job_ptr->het_job_id = job_ptr->het_job_id; |
| dl_job_ptr->start_time = job_ptr->start_time; |
| list_append(dl_part_ptr->deadlock_job_list, dl_job_ptr); |
| } else if (dl_job_ptr->start_time < job_ptr->start_time) { |
| dl_job_ptr->start_time = job_ptr->start_time; |
| } |
| list_sort(dl_part_ptr->deadlock_job_list, _deadlock_job_list_sort); |
| |
| /* |
| * Log current table of hetjob start times by partition |
| */ |
| if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) { |
| part_iter = list_iterator_create(deadlock_global_list); |
| while ((dl_part_ptr2 = list_next(part_iter))){ |
| info("Partition %s Hetjobs:", |
| dl_part_ptr2->part_ptr->name); |
| job_iter = list_iterator_create(dl_part_ptr2-> |
| deadlock_job_list); |
| while ((dl_job_ptr2 = list_next(job_iter))) { |
| info(" Hetjob %u to start at %"PRIu64, |
| dl_job_ptr2->het_job_id, |
| (uint64_t) dl_job_ptr2->start_time); |
| } |
| list_iterator_destroy(job_iter); |
| } |
| list_iterator_destroy(part_iter); |
| } |
| |
| /* |
| * Determine if any hetjobs scheduled to start earlier than this job |
| * in this partition are scheduled to start after it in some other |
| * partition |
| */ |
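| /* |
| * Illustrative case: hetjob 100 is ahead of hetjob 200 in partition |
| * "a" but behind it in partition "b". Whichever starts first would |
| * hold resources while waiting forever on its other components. |
| */ |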
| part_iter = list_iterator_create(deadlock_global_list); |
| while ((dl_part_ptr2 = list_next(part_iter))){ |
| if (dl_part_ptr2 == dl_part_ptr) /* Current partition, skip it */ |
| continue; |
| dl_job_ptr2 = list_find_first(dl_part_ptr2->deadlock_job_list, |
| _deadlock_part_list_srch, |
| job_ptr); |
| if (!dl_job_ptr2) /* Hetjob not in this partition, no check */ |
| continue; |
| job_iter = list_iterator_create(dl_part_ptr->deadlock_job_list); |
| while ((dl_job_ptr2 = list_next(job_iter))) { |
| if (dl_job_ptr2->het_job_id == dl_job_ptr->het_job_id) |
| break; /* Self */ |
| dl_job_ptr3 = list_find_first( |
| dl_part_ptr2->deadlock_job_list, |
| _deadlock_part_list_srch2, |
| dl_job_ptr2); |
| if (dl_job_ptr3 && |
| (dl_job_ptr3->start_time < dl_job_ptr->start_time)){ |
| have_deadlock = true; |
| break; |
| } |
| } |
| list_iterator_destroy(job_iter); |
| |
| if (have_deadlock) |
| log_flag(HETJOB, "Hetjob %u in partition %s would deadlock with hetjob %u in partition %s, skipping it", |
| dl_job_ptr->het_job_id, |
| dl_part_ptr->part_ptr->name, |
| dl_job_ptr3->het_job_id, |
| dl_part_ptr2->part_ptr->name); |
| if (have_deadlock) |
| break; |
| } |
| list_iterator_destroy(part_iter); |
| |
| return have_deadlock; |
| } |
| |
| static void _set_bf_exit(bf_exit_t code) |
| { |
| xassert(code < BF_EXIT_COUNT); |
| |
| slurmctld_diag_stats.bf_exit[code]++; |
| } |