|  | /*****************************************************************************\ | 
|  | *  backfill.c - simple backfill scheduler plugin. | 
|  | * | 
|  | *  If a partition does not have root only access and nodes are not shared | 
|  | *  then raise the priority of pending jobs if doing so does not adversely | 
|  | *  affect the expected initiation of any higher priority job. We do not alter | 
|  | *  a job's required or excluded node list, so this is a conservative | 
|  | *  algorithm. | 
|  | * | 
|  | *  For example, consider a cluster "lx[01-08]" with one job executing on | 
|  | *  nodes "lx[01-04]". The highest priority pending job requires five nodes | 
|  | *  including "lx05". The next highest priority pending job requires any | 
|  | *  three nodes. Without explicitly forcing the second job to use nodes | 
|  | *  "lx[06-08]", we can't start it without possibly delaying the higher | 
|  | *  priority job. | 
|  | ***************************************************************************** | 
|  | *  Copyright (C) 2003-2007 The Regents of the University of California. | 
|  | *  Copyright (C) 2008-2010 Lawrence Livermore National Security. | 
|  | *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | 
|  | *  Written by Morris Jette <jette1@llnl.gov> | 
|  | *  CODE-OCEC-09-009. All rights reserved. | 
|  | * | 
|  | *  This file is part of Slurm, a resource management program. | 
|  | *  For details, see <https://slurm.schedmd.com/>. | 
|  | *  Please also read the included file: DISCLAIMER. | 
|  | * | 
|  | *  Slurm is free software; you can redistribute it and/or modify it under | 
|  | *  the terms of the GNU General Public License as published by the Free | 
|  | *  Software Foundation; either version 2 of the License, or (at your option) | 
|  | *  any later version. | 
|  | * | 
|  | *  In addition, as a special exception, the copyright holders give permission | 
|  | *  to link the code of portions of this program with the OpenSSL library under | 
|  | *  certain conditions as described in each individual source file, and | 
|  | *  distribute linked combinations including the two. You must obey the GNU | 
|  | *  General Public License in all respects for all of the code used other than | 
|  | *  OpenSSL. If you modify file(s) with this exception, you may extend this | 
|  | *  exception to your version of the file(s), but you are not obligated to do | 
|  | *  so. If you do not wish to do so, delete this exception statement from your | 
|  | *  version.  If you delete this exception statement from all source files in | 
|  | *  the program, then also delete it here. | 
|  | * | 
|  | *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | *  details. | 
|  | * | 
|  | *  You should have received a copy of the GNU General Public License along | 
|  | *  with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | \*****************************************************************************/ | 
|  |  | 
|  | #include "config.h" | 
|  |  | 
|  | #if HAVE_SYS_PRCTL_H | 
|  | #  include <sys/prctl.h> | 
|  | #endif | 
|  |  | 
|  | #include <pthread.h> | 
|  | #include <stdio.h> | 
|  | #include <stdlib.h> | 
|  | #include <string.h> | 
|  | #include <time.h> | 
|  | #include <unistd.h> | 
|  |  | 
|  | #include "slurm/slurm.h" | 
|  | #include "slurm/slurmdb.h" | 
|  | #include "slurm/slurm_errno.h" | 
|  |  | 
|  | #include "src/common/assoc_mgr.h" | 
|  | #include "src/common/job_features.h" | 
|  | #include "src/common/list.h" | 
|  | #include "src/common/macros.h" | 
|  | #include "src/common/parse_time.h" | 
|  | #include "src/common/read_config.h" | 
|  | #include "src/common/slurm_protocol_api.h" | 
|  | #include "src/common/xmalloc.h" | 
|  | #include "src/common/xstring.h" | 
|  |  | 
|  | #include "src/interfaces/accounting_storage.h" | 
|  | #include "src/interfaces/burst_buffer.h" | 
|  | #include "src/interfaces/gres.h" | 
|  | #include "src/interfaces/node_features.h" | 
|  | #include "src/interfaces/mcs.h" | 
|  | #include "src/interfaces/preempt.h" | 
|  | #include "src/interfaces/select.h" | 
|  | #include "src/interfaces/topology.h" | 
|  |  | 
|  |  | 
|  | #include "src/slurmctld/acct_policy.h" | 
|  | #include "src/slurmctld/fed_mgr.h" | 
|  | #include "src/slurmctld/job_scheduler.h" | 
|  | #include "src/slurmctld/licenses.h" | 
|  | #include "src/slurmctld/locks.h" | 
|  | #include "src/slurmctld/node_scheduler.h" | 
|  | #include "src/slurmctld/proc_req.h" | 
|  | #include "src/slurmctld/reservation.h" | 
|  | #include "src/slurmctld/slurmctld.h" | 
|  |  | 
|  | #include "src/stepmgr/gres_stepmgr.h" | 
|  | #include "src/stepmgr/srun_comm.h" | 
|  |  | 
|  | #include "backfill.h" | 
|  | #include "oracle.h" | 
|  |  | 
/* Built-in defaults, used when SchedulerParameters does not override them */
#define BACKFILL_INTERVAL	30
#define BACKFILL_RESOLUTION	60
#define BACKFILL_WINDOW		(24 * 60 * 60)
#define BF_MAX_JOB_ARRAY_RESV	20

#define YIELD_INTERVAL		2000000	/* time in micro-seconds */
/*
 * NOTE: no trailing semicolon here. The macro must expand to a plain
 * expression so it can be used in initializers and arithmetic alike.
 */
#define YIELD_SLEEP		500000	/* time in micro-seconds */

/* Upper bounds accepted when validating configured values */
#define MAX_BACKFILL_INTERVAL          10800 /* 3 hours */
#define MAX_BACKFILL_RESOLUTION        3600 /* 1 hour */
#define MAX_BACKFILL_WINDOW            (30 * 24 * 60 * 60) /* 30 days */
#define MAX_BF_JOB_PART_COUNT_RESERVE  100000
#define MAX_BF_MAX_JOB_ARRAY_RESV      1000
#define MAX_BF_MAX_JOB_START           10000
#define DEF_BF_MAX_JOB_TEST            500
#define MAX_BF_MAX_JOB_TEST            1000000
#define MAX_BF_MAX_TIME                3600
#define MAX_BF_MIN_AGE_RESERVE         (30 * 24 * 60 * 60) /* 30 days */
#define MAX_BF_MIN_PRIO_RESERVE        INFINITE
#define MAX_BF_YIELD_INTERVAL          10000000 /* 10 seconds in usec */
#define MAX_MAX_RPC_CNT                1000
#define MAX_YIELD_RPC_CNT              200
#define MAX_YIELD_SLEEP                10000000 /* 10 seconds in usec */

/* The per-entity job limits all share the bf_max_job_test ceiling */
#define MAX_BF_MAX_JOB_ASSOC           MAX_BF_MAX_JOB_TEST
#define MAX_BF_MAX_JOB_USER            MAX_BF_MAX_JOB_TEST
#define MAX_BF_MAX_JOB_USER_PART       MAX_BF_MAX_JOB_TEST
#define MAX_BF_MAX_JOB_PART            MAX_BF_MAX_JOB_TEST
|  |  | 
|  | typedef struct { | 
|  | node_space_map_t *node_space; | 
|  | int *node_space_recs; | 
|  | } node_space_handler_t; | 
|  |  | 
|  | /* | 
|  | * HetJob scheduling structures | 
|  | * NOTE: An individual hetjob component can be submitted to multiple | 
|  | *       partitions and have different start times in each | 
|  | */ | 
|  | typedef struct { | 
|  | uint32_t job_id; | 
|  | job_record_t *job_ptr; | 
|  | time_t latest_start;		/* Time when expected to start */ | 
|  | part_record_t *part_ptr; | 
|  | slurmctld_resv_t *resv_ptr; | 
|  | } het_job_rec_t; | 
|  |  | 
|  | typedef struct { | 
|  | uint32_t comp_time_limit;	/* Time limit for hetjob */ | 
|  | uint32_t het_job_id; | 
|  | list_t *het_job_rec_list;	/* list of het_job_rec_t */ | 
|  | time_t prev_start;		/* Expected start time from last test */ | 
|  | } het_job_map_t; | 
|  |  | 
/*
 * A hetjob id paired with a start time.
 * NOTE(review): presumably the element type of
 * deadlock_part_struct_t.deadlock_job_list - confirm.
 */
typedef struct {
	uint32_t het_job_id;
	time_t start_time;
} deadlock_job_struct_t;
|  |  | 
|  | typedef struct { | 
|  | list_t *deadlock_job_list; | 
|  | part_record_t *part_ptr; | 
|  | } deadlock_part_struct_t; | 
|  |  | 
|  | /* Diagnostic  statistics */ | 
|  | extern diag_stats_t slurmctld_diag_stats; | 
|  | uint32_t bf_sleep_usec = 0; | 
|  |  | 
|  | typedef struct { | 
|  | slurmdb_bf_usage_t bf_usage; | 
|  | uid_t uid; | 
|  | } bf_user_usage_t; | 
|  |  | 
/* Per-node usage state tracked by the backfill scheduler */
typedef struct {
	bool allocated;		/* A job is running on this node */
	time_t last_job_end;	/* Last end time of running job on node */
	char *mcs_label;
	bool mixed_user;	/* Multiple users running on node */
	bool needs_sorting;	/* After adding to the mix sort related
				 * nodes_used_list */
	uint32_t node_index;
	bool owned;		/* Node has exclusive=user job */
	uint32_t uid;		/* User id of a job running on the node */
} node_used_t;
|  |  | 
|  | typedef struct { | 
|  | bool delay_start; | 
|  | bool is_exclusive_user; | 
|  | uint32_t job_user; | 
|  | time_t *later_start; | 
|  | char *mcs_label; | 
|  | uint32_t min_nodes; | 
|  | bitstr_t *node_bitmap; | 
|  | int node_cnt; | 
|  | time_t prev_time; | 
|  | bitstr_t *req_nodes; | 
|  | bool set_later_start; | 
|  | time_t start_time; | 
|  | } filter_exclusive_args_t; | 
|  |  | 
|  | /*********************** local variables *********************/ | 
|  | static bool stop_backfill = false; | 
|  | static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER; | 
|  | static pthread_cond_t  term_cond = PTHREAD_COND_INITIALIZER; | 
|  | static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER; | 
|  | static bool config_flag = false; | 
|  | static int backfill_interval = BACKFILL_INTERVAL; | 
|  | static int bf_max_time = BACKFILL_INTERVAL; | 
|  | static int backfill_resolution = BACKFILL_RESOLUTION; | 
|  | static int backfill_window = BACKFILL_WINDOW; | 
|  | static int bf_job_part_count_reserve = 0; | 
|  | static int bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV; | 
|  | static int bf_min_age_reserve = 0; | 
|  | static int bf_node_space_size = 0; | 
|  | static bool bf_running_job_reserve = false; | 
|  | static bool bf_licenses = false; | 
|  | static uint32_t bf_min_prio_reserve = 0; | 
|  | static list_t *deadlock_global_list = NULL; | 
|  | static bool bf_hetjob_immediate = false; | 
|  | static uint16_t bf_hetjob_prio = 0; | 
|  | static bool bf_one_resv_per_job = false; | 
|  | static bool bf_allow_magnetic_slot = false; | 
|  | static bool bf_topopt_enable = false; | 
|  | static uint32_t job_start_cnt = 0; | 
|  | static uint32_t job_test_cnt = 0; | 
|  | static int max_backfill_job_cnt = DEF_BF_MAX_JOB_TEST; | 
|  | static int max_backfill_job_per_assoc = 0; | 
|  | static int max_backfill_job_per_part = 0; | 
|  | static int max_backfill_job_per_user = 0; | 
|  | static int max_backfill_job_per_user_part = 0; | 
|  | static int max_backfill_jobs_start = 0; | 
|  | static bool backfill_continue = false; | 
|  | static bool assoc_limit_stop = false; | 
|  | static int max_rpc_cnt = 0; | 
|  | static int yield_rpc_cnt = 0; | 
|  | static int yield_interval = YIELD_INTERVAL; | 
|  | static int yield_sleep   = YIELD_SLEEP; | 
|  | static list_t *het_job_list = NULL; | 
|  | static xhash_t *user_usage_map = NULL; /* look up user usage when no assoc */ | 
|  | static bitstr_t *planned_bitmap = NULL; | 
|  | static bool soft_time_limit = false; | 
|  |  | 
|  | /*********************** local functions *********************/ | 
|  | static void _add_reservation(time_t start_time, time_t end_reserve, | 
|  | bitstr_t *res_bitmap, job_record_t *job_ptr, | 
|  | node_space_map_t *node_space, int *node_space_recs, | 
|  | time_t orig_start_time); | 
|  | static void _adjust_hetjob_prio(uint32_t *prio, uint32_t val); | 
|  | static void _attempt_backfill(void); | 
|  | static int  _clear_job_estimates(void *x, void *arg); | 
|  | static int  _clear_qos_blocked_times(void *x, void *arg); | 
|  | static void _do_diag_stats(struct timeval *tv1, struct timeval *tv2, | 
|  | int node_space_recs); | 
|  | static uint32_t _get_job_max_tl(job_record_t *job_ptr, time_t now, | 
|  | node_space_map_t *node_space); | 
|  | static bool _hetjob_any_resv(job_record_t *het_leader); | 
|  | static uint32_t _hetjob_calc_prio(job_record_t *het_leader); | 
|  | static uint32_t _hetjob_calc_prio_tier(job_record_t *het_leader); | 
|  | static void _het_job_deadlock_fini(void); | 
|  | static bool _het_job_deadlock_test(job_record_t *job_ptr); | 
|  | static bool _job_part_valid(job_record_t *job_ptr, part_record_t *part_ptr); | 
|  | static void _load_config(void); | 
|  | static bool _many_pending_rpcs(void); | 
|  | static bool _more_work(time_t last_backfill_time); | 
|  | static uint32_t _my_sleep(int64_t usec); | 
|  | static int  _num_feature_count(job_record_t *job_ptr, bool *has_xand, | 
|  | bool *has_mor); | 
|  | static int  _het_job_find_map(void *x, void *key); | 
|  | static void _het_job_map_del(void *x); | 
|  | static void _het_job_start_clear(void); | 
|  | static time_t _het_job_start_find(job_record_t *job_ptr); | 
|  | static void _het_job_start_set(job_record_t *job_ptr, time_t latest_start, | 
|  | uint32_t comp_time_limit); | 
|  | static bool _het_job_start_test_single(node_space_map_t *node_space, | 
|  | het_job_map_t *map, bool single); | 
|  | static int  _het_job_start_test_list(void *map, void *node_space); | 
|  | static void _het_job_start_test(node_space_map_t *node_space, | 
|  | uint32_t het_job_id, node_used_t *nodes_used, | 
|  | list_t *nodes_used_list); | 
|  | static void _reset_job_time_limit(job_record_t *job_ptr, time_t now, | 
|  | node_space_map_t *node_space); | 
|  | static void _set_bf_exit(bf_exit_t code); | 
|  | static int  _set_hetjob_details(void *x, void *arg); | 
|  | static int  _start_job(job_record_t *job_ptr, bitstr_t *avail_bitmap); | 
|  | static bool _test_resv_overlap(node_space_map_t *node_space, | 
|  | bitstr_t *use_bitmap, job_record_t *job_ptr, | 
|  | uint32_t start_time, uint32_t end_reserve); | 
|  | static int  _try_sched(job_record_t *job_ptr, bitstr_t **avail_bitmap, | 
|  | uint32_t min_nodes, uint32_t max_nodes, | 
|  | uint32_t req_nodes, resv_exc_t *resv_exc_ptr, | 
|  | will_run_data_t *will_run); | 
|  | static int  _yield_locks(int64_t usec); | 
|  | static void _bf_map_key_id(void *item, const char **key, uint32_t *key_len); | 
|  | static void _bf_map_free(void *item); | 
|  |  | 
|  | /* Log resources to be allocated to a pending job */ | 
|  | static void _dump_job_sched(job_record_t *job_ptr, time_t end_time, | 
|  | bitstr_t *avail_bitmap) | 
|  | { | 
|  | char begin_buf[256], end_buf[256], *node_list; | 
|  |  | 
|  | slurm_make_time_str(&job_ptr->start_time, begin_buf, sizeof(begin_buf)); | 
|  | slurm_make_time_str(&end_time, end_buf, sizeof(end_buf)); | 
|  | node_list = bitmap2node_name(avail_bitmap); | 
|  | log_flag(BACKFILL, "%pJ to start at %s, end at %s on nodes %s in partition %s", | 
|  | job_ptr, begin_buf, end_buf, node_list, | 
|  | job_ptr->part_ptr->name); | 
|  | xfree(node_list); | 
|  | } | 
|  |  | 
|  | static void _dump_job_test(job_record_t *job_ptr, bitstr_t *avail_bitmap, | 
|  | time_t start_time, time_t later_start) | 
|  | { | 
|  | char begin_buf[256], *node_list; | 
|  | char end_buf[256]; | 
|  | char later_buf[256]; | 
|  |  | 
|  | if (start_time == 0) | 
|  | strcpy(begin_buf, "NOW"); | 
|  | else | 
|  | slurm_make_time_str(&start_time, begin_buf, sizeof(begin_buf)); | 
|  | if (later_start == 0) | 
|  | strcpy(later_buf, "NO"); | 
|  | else | 
|  | slurm_make_time_str(&later_start, later_buf, sizeof(later_buf)); | 
|  | if (later_start) | 
|  | later_start += job_ptr->time_limit * 60; | 
|  | slurm_make_time_str(&later_start, end_buf, sizeof(end_buf)); | 
|  |  | 
|  | node_list = bitmap2node_name(avail_bitmap); | 
|  | log_flag(BACKFILL, "Test %pJ at %s to %s (later_start: %s) on %s", | 
|  | job_ptr, begin_buf, end_buf, later_buf, node_list); | 
|  | xfree(node_list); | 
|  | } | 
|  |  | 
|  | /* Log resource allocate table */ | 
|  | static void _dump_node_space_table(node_space_map_t *node_space_ptr) | 
|  | { | 
|  | int i = 0; | 
|  | char begin_buf[256], end_buf[256], *node_list, *licenses; | 
|  |  | 
|  | log_flag(BACKFILL, "========================================="); | 
|  | while (1) { | 
|  | slurm_make_time_str(&node_space_ptr[i].begin_time, | 
|  | begin_buf, sizeof(begin_buf)); | 
|  | slurm_make_time_str(&node_space_ptr[i].end_time, | 
|  | end_buf, sizeof(end_buf)); | 
|  | node_list = bitmap2node_name(node_space_ptr[i].avail_bitmap); | 
|  | licenses = bf_licenses_to_string(node_space_ptr[i].licenses); | 
|  | log_flag(BACKFILL, "Begin:%s End:%s Nodes:%s Licenses:%s Fragmentation:%u", | 
|  | begin_buf, end_buf, node_list, licenses, | 
|  | node_space_ptr[i].fragmentation); | 
|  | xfree(node_list); | 
|  | xfree(licenses); | 
|  | if ((i = node_space_ptr[i].next) == 0) | 
|  | break; | 
|  | } | 
|  | log_flag(BACKFILL, "========================================="); | 
|  | } | 
|  |  | 
|  | static void _set_job_time_limit(job_record_t *job_ptr, uint32_t new_limit) | 
|  | { | 
|  | job_ptr->time_limit = new_limit; | 
|  | /* reset flag if we have a NO_VAL time_limit */ | 
|  | if (job_ptr->time_limit == NO_VAL) | 
|  | job_ptr->limit_set.time = 0; | 
|  |  | 
|  | } | 
|  |  | 
|  | /* | 
|  | * _many_pending_rpcs - Determine if slurmctld is busy with many active RPCs | 
|  | * RET - True if slurmctld currently has more than max_rpc_cnt active RPCs | 
|  | */ | 
|  | static bool _many_pending_rpcs(void) | 
|  | { | 
|  | bool many_pending_rpcs = false; | 
|  |  | 
|  | slurm_mutex_lock(&slurmctld_config.thread_count_lock); | 
|  | //info("thread_count = %u", slurmctld_config.server_thread_count); | 
|  | if ((max_rpc_cnt > 0) && | 
|  | (slurmctld_config.server_thread_count >= max_rpc_cnt)) | 
|  | many_pending_rpcs = true; | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  |  | 
|  | return many_pending_rpcs; | 
|  |  | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Report summary of job's feature specification | 
|  | * IN job_ptr - job to schedule | 
|  | * OUT has_xand - true if features are XANDed together | 
|  | * OUT has_mor - true if features are MORed together | 
|  | * RET Total count for ALL job features, even counts with XAND separator | 
|  | */ | 
|  | static int _num_feature_count(job_record_t *job_ptr, bool *has_xand, | 
|  | bool *has_mor) | 
|  | { | 
|  | job_details_t *detail_ptr = job_ptr->details; | 
|  | int rc = 0; | 
|  | list_itr_t *feat_iter; | 
|  | job_feature_t *feat_ptr; | 
|  |  | 
|  | *has_xand = false; | 
|  | *has_mor = false; | 
|  | if (detail_ptr->feature_list_use == NULL)	/* no constraints */ | 
|  | return rc; | 
|  |  | 
|  | feat_iter = list_iterator_create(detail_ptr->feature_list_use); | 
|  | while ((feat_ptr = list_next(feat_iter))) { | 
|  | if (feat_ptr->count) | 
|  | rc++; | 
|  | if (feat_ptr->op_code == FEATURE_OP_XAND) | 
|  | *has_xand = true; | 
|  | if (feat_ptr->op_code == FEATURE_OP_MOR) | 
|  | *has_mor = true; | 
|  | } | 
|  | list_iterator_destroy(feat_iter); | 
|  |  | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | static int _clear_qos_blocked_times(void *x, void *arg) | 
|  | { | 
|  | slurmdb_qos_rec_t *qos_ptr = (slurmdb_qos_rec_t *) x; | 
|  | qos_ptr->blocked_until = 0; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Attempt to schedule a specific job on specific available nodes | 
|  | * IN job_ptr - job to schedule | 
|  | * IN/OUT avail_bitmap - nodes available/selected to use | 
|  | * IN resv_exc_ptr - Various TRES which can not be used | 
|  | * RET SLURM_SUCCESS on success, otherwise an error code | 
|  | */ | 
|  | static int  _try_sched(job_record_t *job_ptr, bitstr_t **avail_bitmap, | 
|  | uint32_t min_nodes, uint32_t max_nodes, | 
|  | uint32_t req_nodes, resv_exc_t *resv_exc_ptr, | 
|  | will_run_data_t *will_run) | 
|  | { | 
|  | bitstr_t *low_bitmap = NULL, *tmp_bitmap = NULL; | 
|  | int rc = SLURM_SUCCESS; | 
|  | bool has_xand = false, has_mor = false; | 
|  | int feat_cnt = _num_feature_count(job_ptr, &has_xand, &has_mor); | 
|  | job_details_t *detail_ptr = job_ptr->details; | 
|  | list_t *feature_cache = detail_ptr->feature_list_use; | 
|  | list_t *preemptee_candidates = NULL; | 
|  | list_itr_t *feat_iter; | 
|  | job_feature_t *feat_ptr; | 
|  | job_feature_t *feature_base; | 
|  |  | 
|  | if (has_xand || feat_cnt) { | 
|  | /* | 
|  | * Cache the feature information and test the individual | 
|  | * features (or sets of features in parenthesis), one at a time | 
|  | */ | 
|  | time_t high_start = 0; | 
|  | uint32_t feat_min_node; | 
|  | uint32_t feat_node_cnt; | 
|  |  | 
|  | tmp_bitmap = bit_copy(*avail_bitmap); | 
|  | preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); | 
|  | feat_iter = list_iterator_create(feature_cache); | 
|  | while ((feat_ptr = list_next(feat_iter)) && | 
|  | (rc == SLURM_SUCCESS)) { | 
|  | detail_ptr->feature_list_use = | 
|  | list_create(feature_list_delete); | 
|  | feature_base = xmalloc(sizeof(job_feature_t)); | 
|  | feature_base->name = xstrdup(feat_ptr->name); | 
|  | feature_base->op_code = feat_ptr->op_code; | 
|  | list_append(detail_ptr->feature_list_use, feature_base); | 
|  | feat_min_node = feat_ptr->count; | 
|  | while ((feat_ptr->paren > 0) && | 
|  | ((feat_ptr = list_next(feat_iter)))) { | 
|  | feature_base = xmalloc(sizeof(job_feature_t)); | 
|  | feature_base->name = xstrdup(feat_ptr->name); | 
|  | feature_base->op_code = feat_ptr->op_code; | 
|  | feat_min_node = feat_ptr->count; | 
|  | list_append(detail_ptr->feature_list_use, | 
|  | feature_base); | 
|  | } | 
|  | feature_base->op_code = FEATURE_OP_END; | 
|  | feat_min_node = MAX(1, feat_min_node); | 
|  |  | 
|  | if ((job_req_node_filter(job_ptr, *avail_bitmap, true) | 
|  | == SLURM_SUCCESS) && | 
|  | (bit_set_count(*avail_bitmap) >= feat_min_node)) { | 
|  | rc = select_g_job_test(job_ptr, *avail_bitmap, | 
|  | feat_min_node, max_nodes, | 
|  | feat_min_node, | 
|  | SELECT_MODE_WILL_RUN, | 
|  | preemptee_candidates, | 
|  | NULL, | 
|  | resv_exc_ptr, | 
|  | will_run); | 
|  | if (rc == SLURM_SUCCESS) { | 
|  | if ((high_start == 0) || | 
|  | (high_start < job_ptr->start_time)) | 
|  | high_start = | 
|  | job_ptr->start_time; | 
|  |  | 
|  | if (low_bitmap) { | 
|  | bit_or(low_bitmap, | 
|  | *avail_bitmap); | 
|  | } else { | 
|  | low_bitmap = *avail_bitmap; | 
|  | *avail_bitmap = NULL; | 
|  | } | 
|  | } | 
|  | } else { | 
|  | rc = ESLURM_NODES_BUSY; | 
|  | } | 
|  | FREE_NULL_BITMAP(*avail_bitmap); | 
|  | *avail_bitmap = bit_copy(tmp_bitmap); | 
|  | if (low_bitmap) | 
|  | bit_and_not(*avail_bitmap, low_bitmap); | 
|  | FREE_NULL_LIST(detail_ptr->feature_list_use); | 
|  | } | 
|  | list_iterator_destroy(feat_iter); | 
|  |  | 
|  | if (low_bitmap) | 
|  | feat_node_cnt = bit_set_count(low_bitmap); | 
|  | else | 
|  | feat_node_cnt = 0; | 
|  | if (feat_node_cnt < req_nodes) { | 
|  | detail_ptr->feature_list_use = NULL; | 
|  | rc = select_g_job_test(job_ptr, *avail_bitmap, | 
|  | min_nodes - feat_node_cnt, | 
|  | max_nodes - feat_node_cnt, | 
|  | req_nodes - feat_node_cnt, | 
|  | SELECT_MODE_WILL_RUN, | 
|  | preemptee_candidates, | 
|  | NULL, | 
|  | resv_exc_ptr, | 
|  | will_run); | 
|  |  | 
|  | if (low_bitmap) { | 
|  | bit_or(low_bitmap, *avail_bitmap); | 
|  | } else { | 
|  | low_bitmap = *avail_bitmap; | 
|  | *avail_bitmap = NULL; | 
|  | } | 
|  | } | 
|  | FREE_NULL_LIST(preemptee_candidates); | 
|  | FREE_NULL_BITMAP(tmp_bitmap); | 
|  | if (high_start && rc == SLURM_SUCCESS) { | 
|  | job_ptr->start_time = high_start; | 
|  | FREE_NULL_BITMAP(*avail_bitmap); | 
|  | *avail_bitmap = low_bitmap; | 
|  | } else { | 
|  | rc = ESLURM_NODES_BUSY; | 
|  | job_ptr->start_time = 0; | 
|  | FREE_NULL_BITMAP(*avail_bitmap); | 
|  | FREE_NULL_BITMAP(low_bitmap); | 
|  | } | 
|  |  | 
|  | /* Restore the original feature information */ | 
|  | detail_ptr->feature_list_use = feature_cache; | 
|  | } else if (has_mor) { | 
|  | /* | 
|  | * Cache the feature information and test the individual | 
|  | * features (or sets of features in parenthesis), one at a time | 
|  | */ | 
|  | time_t low_start = 0; | 
|  |  | 
|  | tmp_bitmap = bit_copy(*avail_bitmap); | 
|  | preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); | 
|  | feat_iter = list_iterator_create(feature_cache); | 
|  | while ((feat_ptr = list_next(feat_iter))) { | 
|  | detail_ptr->feature_list_use = | 
|  | list_create(feature_list_delete); | 
|  | feature_base = xmalloc(sizeof(job_feature_t)); | 
|  | feature_base->name = xstrdup(feat_ptr->name); | 
|  | feature_base->op_code = feat_ptr->op_code; | 
|  | list_append(detail_ptr->feature_list_use, feature_base); | 
|  | while ((feat_ptr->paren > 0) && | 
|  | ((feat_ptr = list_next(feat_iter)))) { | 
|  | feature_base = xmalloc(sizeof(job_feature_t)); | 
|  | feature_base->name = xstrdup(feat_ptr->name); | 
|  | feature_base->op_code = feat_ptr->op_code; | 
|  | list_append(detail_ptr->feature_list_use, | 
|  | feature_base); | 
|  | } | 
|  | feature_base->op_code = FEATURE_OP_END; | 
|  |  | 
|  | if ((job_req_node_filter(job_ptr, *avail_bitmap, true) | 
|  | == SLURM_SUCCESS) && | 
|  | (bit_set_count(*avail_bitmap) >= min_nodes)) { | 
|  | rc = select_g_job_test(job_ptr, *avail_bitmap, | 
|  | min_nodes, max_nodes, | 
|  | req_nodes, | 
|  | SELECT_MODE_WILL_RUN, | 
|  | preemptee_candidates, | 
|  | NULL, | 
|  | resv_exc_ptr, | 
|  | will_run); | 
|  | if ((rc == SLURM_SUCCESS) && | 
|  | ((low_start == 0) || | 
|  | (low_start > job_ptr->start_time))) { | 
|  | low_start = job_ptr->start_time; | 
|  | low_bitmap = *avail_bitmap; | 
|  | *avail_bitmap = NULL; | 
|  | } | 
|  | } | 
|  | FREE_NULL_BITMAP(*avail_bitmap); | 
|  | *avail_bitmap = bit_copy(tmp_bitmap); | 
|  | FREE_NULL_LIST(detail_ptr->feature_list_use); | 
|  | } | 
|  | list_iterator_destroy(feat_iter); | 
|  | FREE_NULL_LIST(preemptee_candidates); | 
|  | FREE_NULL_BITMAP(tmp_bitmap); | 
|  | if (low_start) { | 
|  | job_ptr->start_time = low_start; | 
|  | rc = SLURM_SUCCESS; | 
|  | FREE_NULL_BITMAP(*avail_bitmap); | 
|  | *avail_bitmap = low_bitmap; | 
|  | } else { | 
|  | rc = ESLURM_NODES_BUSY; | 
|  | FREE_NULL_BITMAP(low_bitmap); | 
|  | } | 
|  |  | 
|  | /* Restore the original feature information */ | 
|  | detail_ptr->feature_list_use = feature_cache; | 
|  | } else if (detail_ptr->feature_list_use) { | 
|  | if ((job_req_node_filter(job_ptr, *avail_bitmap, true) != | 
|  | SLURM_SUCCESS) || | 
|  | (bit_set_count(*avail_bitmap) < min_nodes)) { | 
|  | rc = ESLURM_NODES_BUSY; | 
|  | } else { | 
|  | preemptee_candidates = | 
|  | slurm_find_preemptable_jobs(job_ptr); | 
|  | rc = select_g_job_test(job_ptr, *avail_bitmap, | 
|  | min_nodes, max_nodes, req_nodes, | 
|  | SELECT_MODE_WILL_RUN, | 
|  | preemptee_candidates, | 
|  | NULL, | 
|  | resv_exc_ptr, | 
|  | will_run); | 
|  | } | 
|  | } else { | 
|  | /* Try to schedule the job. First on dedicated nodes | 
|  | * then on shared nodes (if so configured). */ | 
|  | uint16_t orig_shared; | 
|  | time_t now = time(NULL); | 
|  | char str[100]; | 
|  |  | 
|  | preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); | 
|  | orig_shared = job_ptr->details->share_res; | 
|  | job_ptr->details->share_res = 0; | 
|  | tmp_bitmap = bit_copy(*avail_bitmap); | 
|  |  | 
|  | if (resv_exc_ptr && resv_exc_ptr->core_bitmap) { | 
|  | bit_fmt(str, (sizeof(str) - 1), | 
|  | resv_exc_ptr->core_bitmap); | 
|  | debug2("exclude core bitmap: %s", str); | 
|  | } | 
|  |  | 
|  | rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, | 
|  | max_nodes, req_nodes, | 
|  | SELECT_MODE_WILL_RUN, | 
|  | preemptee_candidates, | 
|  | NULL, | 
|  | resv_exc_ptr, | 
|  | will_run); | 
|  |  | 
|  | job_ptr->details->share_res = orig_shared; | 
|  |  | 
|  | if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) && | 
|  | (orig_shared != 0)) { | 
|  | FREE_NULL_BITMAP(*avail_bitmap); | 
|  | *avail_bitmap = tmp_bitmap; | 
|  | rc = select_g_job_test(job_ptr, *avail_bitmap, | 
|  | min_nodes, max_nodes, req_nodes, | 
|  | SELECT_MODE_WILL_RUN, | 
|  | preemptee_candidates, | 
|  | NULL, | 
|  | resv_exc_ptr, | 
|  | will_run); | 
|  | } else | 
|  | FREE_NULL_BITMAP(tmp_bitmap); | 
|  | } | 
|  |  | 
|  | FREE_NULL_LIST(preemptee_candidates); | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | /* Terminate backfill_agent */ | 
|  | extern void stop_backfill_agent(void) | 
|  | { | 
|  | slurm_mutex_lock(&term_lock); | 
|  | stop_backfill = true; | 
|  | slurm_cond_signal(&term_cond); | 
|  | slurm_mutex_unlock(&term_lock); | 
|  | } | 
|  |  | 
|  | /* Sleep for at least specified time, returns actual sleep time in usec */ | 
|  | static uint32_t _my_sleep(int64_t usec) | 
|  | { | 
|  | int64_t nsec; | 
|  | uint32_t sleep_time = 0; | 
|  | struct timespec ts = {0, 0}; | 
|  | struct timeval  tv1 = {0, 0}, tv2 = {0, 0}; | 
|  |  | 
|  | if (gettimeofday(&tv1, NULL)) {		/* Some error */ | 
|  | sleep(1); | 
|  | return 1000000; | 
|  | } | 
|  |  | 
|  | nsec  = tv1.tv_usec + usec; | 
|  | nsec *= 1000; | 
|  | ts.tv_sec  = tv1.tv_sec + (nsec / 1000000000); | 
|  | ts.tv_nsec = nsec % 1000000000; | 
|  | slurm_mutex_lock(&term_lock); | 
|  | if (!stop_backfill) | 
|  | slurm_cond_timedwait(&term_cond, &term_lock, &ts); | 
|  | slurm_mutex_unlock(&term_lock); | 
|  | if (gettimeofday(&tv2, NULL)) | 
|  | return usec; | 
|  | sleep_time = (tv2.tv_sec - tv1.tv_sec) * 1000000; | 
|  | sleep_time += tv2.tv_usec; | 
|  | sleep_time -= tv1.tv_usec; | 
|  | return sleep_time; | 
|  | } | 
|  |  | 
|  | static void _load_config(void) | 
|  | { | 
|  | char *sched_params = slurm_conf.sched_params, *tmp_ptr; | 
|  | long tmp_val = 0; | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_interval="))) { | 
|  | backfill_interval = atoi(tmp_ptr + 12); | 
|  | if (((backfill_interval != -1) && (backfill_interval < 1)) || | 
|  | backfill_interval > MAX_BACKFILL_INTERVAL) { | 
|  | error("Invalid SchedulerParameters bf_interval: %d", | 
|  | backfill_interval); | 
|  | backfill_interval = BACKFILL_INTERVAL; | 
|  | } | 
|  | } else { | 
|  | backfill_interval = BACKFILL_INTERVAL; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_time="))) { | 
|  | bf_max_time = atoi(tmp_ptr + 12); | 
|  | if (bf_max_time < 1  || bf_max_time > MAX_BF_MAX_TIME) { | 
|  | error("Invalid SchedulerParameters bf_max_time:" | 
|  | " %d", bf_max_time); | 
|  | bf_max_time = backfill_interval; | 
|  | } | 
|  | } else { | 
|  | bf_max_time = backfill_interval; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_window="))) { | 
|  | backfill_window = atoi(tmp_ptr + 10) * 60;  /* mins to secs */ | 
|  | if (backfill_window < 1 || | 
|  | backfill_window > MAX_BACKFILL_WINDOW) { | 
|  | error("Invalid SchedulerParameters bf_window: %d", | 
|  | backfill_window); | 
|  | backfill_window = BACKFILL_WINDOW; | 
|  | } | 
|  | } else { | 
|  | backfill_window = BACKFILL_WINDOW; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_test="))) | 
|  | max_backfill_job_cnt = atoi(tmp_ptr + 16); | 
|  | else if ((tmp_ptr = xstrcasestr(sched_params, "max_job_bf="))) { | 
|  | fatal("Invalid parameter max_job_bf. The option is no longer supported, please use bf_max_job_test instead."); | 
|  | } | 
|  | else | 
|  | max_backfill_job_cnt = DEF_BF_MAX_JOB_TEST; | 
|  |  | 
|  | if (max_backfill_job_cnt < 1 || | 
|  | max_backfill_job_cnt > MAX_BF_MAX_JOB_TEST) { | 
|  | error("Invalid SchedulerParameters bf_max_job_test: %d", | 
|  | max_backfill_job_cnt); | 
|  | max_backfill_job_cnt = DEF_BF_MAX_JOB_TEST; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_node_space_size="))) | 
|  | bf_node_space_size = atoi(tmp_ptr + 19); | 
|  | else | 
|  | bf_node_space_size = max_backfill_job_cnt; | 
|  |  | 
|  | if (bf_node_space_size < 2 || | 
|  | bf_node_space_size > 2 * MAX_BF_MAX_JOB_TEST) { | 
|  | error("Invalid SchedulerParameters bf_node_space_size: %d", | 
|  | bf_node_space_size); | 
|  | bf_node_space_size = max_backfill_job_cnt; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_resolution="))) { | 
|  | backfill_resolution = atoi(tmp_ptr + 14); | 
|  | if (backfill_resolution < 1 || | 
|  | backfill_resolution > MAX_BACKFILL_RESOLUTION) { | 
|  | error("Invalid SchedulerParameters bf_resolution: %d", | 
|  | backfill_resolution); | 
|  | backfill_resolution = BACKFILL_RESOLUTION; | 
|  | } | 
|  | } else { | 
|  | backfill_resolution = BACKFILL_RESOLUTION; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_array_resv="))) { | 
|  | bf_max_job_array_resv = atoi(tmp_ptr + 22); | 
|  | if (bf_max_job_array_resv < 0 || | 
|  | bf_max_job_array_resv > MAX_BF_MAX_JOB_ARRAY_RESV) { | 
|  | error("Invalid SchedulerParameters bf_max_job_array_resv: %d", | 
|  | bf_max_job_array_resv); | 
|  | bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV; | 
|  | } | 
|  | } else { | 
|  | bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_part="))) { | 
|  | max_backfill_job_per_part = atoi(tmp_ptr + 16); | 
|  | if (max_backfill_job_per_part < 0) { | 
|  | error("Invalid SchedulerParameters bf_max_job_part: %d", | 
|  | max_backfill_job_per_part); | 
|  | max_backfill_job_per_part = 0; | 
|  | } | 
|  | } else { | 
|  | max_backfill_job_per_part = 0; | 
|  | } | 
|  | if ((max_backfill_job_per_part != 0) && | 
|  | (max_backfill_job_per_part >= max_backfill_job_cnt)) { | 
|  | error("bf_max_job_part >= bf_max_job_test (%u >= %u)", | 
|  | max_backfill_job_per_part, max_backfill_job_cnt); | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_start="))) { | 
|  | max_backfill_jobs_start = atoi(tmp_ptr + 17); | 
|  | if (max_backfill_jobs_start < 0 || | 
|  | max_backfill_jobs_start > MAX_BF_MAX_JOB_START) { | 
|  | error("Invalid SchedulerParameters bf_max_job_start: %d", | 
|  | max_backfill_jobs_start); | 
|  | max_backfill_jobs_start = 0; | 
|  | } | 
|  | } else { | 
|  | max_backfill_jobs_start = 0; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_user="))) { | 
|  | max_backfill_job_per_user = atoi(tmp_ptr + 16); | 
|  | if (max_backfill_job_per_user < 0) { | 
|  | error("Invalid SchedulerParameters bf_max_job_user: %d", | 
|  | max_backfill_job_per_user); | 
|  | max_backfill_job_per_user = 0; | 
|  | } | 
|  | } else { | 
|  | max_backfill_job_per_user = 0; | 
|  | } | 
|  | if ((max_backfill_job_per_user != 0) && | 
|  | (max_backfill_job_per_user > max_backfill_job_cnt)) { | 
|  | warning("bf_max_job_user > bf_max_job_test (%u > %u)", | 
|  | max_backfill_job_per_user, max_backfill_job_cnt); | 
|  | } | 
|  |  | 
|  | bf_job_part_count_reserve = 0; | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_job_part_count_reserve="))) { | 
|  | int job_cnt = atoi(tmp_ptr + 26); | 
|  | if (job_cnt < 0 || job_cnt > MAX_BF_JOB_PART_COUNT_RESERVE) { | 
|  | error("Invalid SchedulerParameters bf_job_part_count_reserve: %d", | 
|  | job_cnt); | 
|  | } else { | 
|  | bf_job_part_count_reserve = job_cnt; | 
|  | } | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_user_part="))) { | 
|  | max_backfill_job_per_user_part = atoi(tmp_ptr + 21); | 
|  | if (max_backfill_job_per_user_part < 0) { | 
|  | error("Invalid SchedulerParameters bf_max_job_user_part: %d", | 
|  | max_backfill_job_per_user_part); | 
|  | max_backfill_job_per_user_part = 0; | 
|  | } | 
|  | } else { | 
|  | max_backfill_job_per_user_part = 0; | 
|  | } | 
|  | if ((max_backfill_job_per_user_part != 0) && | 
|  | (max_backfill_job_per_user_part > max_backfill_job_cnt)) { | 
|  | warning("bf_max_job_user_part > bf_max_job_test (%u > %u)", | 
|  | max_backfill_job_per_user_part, max_backfill_job_cnt); | 
|  | } | 
|  |  | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_assoc="))) { | 
|  | max_backfill_job_per_assoc = atoi(tmp_ptr + 17); | 
|  | if (max_backfill_job_per_assoc < 0) { | 
|  | error("Invalid SchedulerParameters bf_max_job_assoc: %d", | 
|  | max_backfill_job_per_assoc); | 
|  | max_backfill_job_per_assoc = 0; | 
|  | } | 
|  | } else { | 
|  | max_backfill_job_per_assoc = 0; | 
|  | } | 
|  | if ((max_backfill_job_per_assoc != 0) && | 
|  | (max_backfill_job_per_assoc > max_backfill_job_cnt)) { | 
|  | warning("bf_max_job_assoc > bf_max_job_test (%u > %u)", | 
|  | max_backfill_job_per_assoc, max_backfill_job_cnt); | 
|  | } | 
|  | if ((max_backfill_job_per_assoc != 0) && | 
|  | (max_backfill_job_per_user != 0)) { | 
|  | error("Both bf_max_job_user and bf_max_job_assoc are set: " | 
|  | "bf_max_job_assoc taking precedence."); | 
|  | max_backfill_job_per_user = 0; | 
|  | } | 
|  |  | 
|  | bf_min_age_reserve = 0; | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_min_age_reserve="))) { | 
|  | int min_age = atoi(tmp_ptr + 19); | 
|  | if (min_age < 0 || min_age > MAX_BF_MIN_AGE_RESERVE) { | 
|  | error("Invalid SchedulerParameters bf_min_age_reserve: %d", | 
|  | min_age); | 
|  | } else { | 
|  | bf_min_age_reserve = min_age; | 
|  | } | 
|  | } | 
|  |  | 
|  | bf_min_prio_reserve = 0; | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_min_prio_reserve="))) { | 
|  | unsigned long long int min_prio; | 
|  | tmp_ptr += 20; | 
|  | min_prio = strtoull(tmp_ptr, NULL, 10); | 
|  | if (!min_prio || min_prio > MAX_BF_MIN_PRIO_RESERVE) { | 
|  | error("Invalid SchedulerParameters bf_min_prio_reserve: %llu", | 
|  | min_prio); | 
|  | } else { | 
|  | bf_min_prio_reserve = (uint32_t) min_prio; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* bf_continue makes backfill continue where it was if interrupted */ | 
|  | if (xstrcasestr(sched_params, "bf_continue")) { | 
|  | backfill_continue = true; | 
|  | } else { | 
|  | backfill_continue = false; | 
|  | } | 
|  |  | 
|  | if (xstrcasestr(sched_params, "assoc_limit_stop")) { | 
|  | assoc_limit_stop = true; | 
|  | } else { | 
|  | assoc_limit_stop = false; | 
|  | } | 
|  |  | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_interval="))) { | 
|  | yield_interval = atoi(tmp_ptr + 18); | 
|  | if ((yield_interval <= 0) || | 
|  | (yield_interval > MAX_BF_YIELD_INTERVAL)) { | 
|  | error("Invalid backfill scheduler bf_yield_interval: %d", | 
|  | yield_interval); | 
|  | yield_interval = YIELD_INTERVAL; | 
|  | } | 
|  | } else { | 
|  | yield_interval = YIELD_INTERVAL; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_sleep="))) { | 
|  | yield_sleep = (int64_t) atoll(tmp_ptr + 15); | 
|  | if (yield_sleep <= 0 || yield_sleep > MAX_YIELD_SLEEP) { | 
|  | error("Invalid backfill scheduler bf_yield_sleep: %d", | 
|  | yield_sleep); | 
|  | yield_sleep = YIELD_SLEEP; | 
|  | } | 
|  | } else { | 
|  | yield_sleep = YIELD_SLEEP; | 
|  | } | 
|  |  | 
|  | bf_hetjob_prio = 0; | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_hetjob_prio="))) { | 
|  | tmp_ptr += 15; | 
|  | if (!xstrncasecmp(tmp_ptr, "min", 3)) | 
|  | bf_hetjob_prio |= HETJOB_PRIO_MIN; | 
|  | else if (!xstrncasecmp(tmp_ptr, "max", 3)) | 
|  | bf_hetjob_prio |= HETJOB_PRIO_MAX; | 
|  | else if (!xstrncasecmp(tmp_ptr, "avg", 3)) | 
|  | bf_hetjob_prio |= HETJOB_PRIO_AVG; | 
|  | else | 
|  | error("Invalid SchedulerParameters bf_hetjob_prio: %s", | 
|  | tmp_ptr); | 
|  | } | 
|  |  | 
|  | bf_hetjob_immediate = false; | 
|  | if (xstrcasestr(sched_params, "bf_hetjob_immediate")) | 
|  | bf_hetjob_immediate = true; | 
|  |  | 
|  | if (bf_hetjob_immediate && !bf_hetjob_prio) { | 
|  | bf_hetjob_prio |= HETJOB_PRIO_MIN; | 
|  | info("bf_hetjob_immediate automatically sets bf_hetjob_prio=min"); | 
|  | } | 
|  |  | 
|  | if (xstrcasestr(sched_params, "bf_one_resv_per_job")) | 
|  | bf_one_resv_per_job = true; | 
|  | else | 
|  | bf_one_resv_per_job = false; | 
|  |  | 
|  | if (xstrcasestr(sched_params, "bf_allow_magnetic_slot")) | 
|  | bf_allow_magnetic_slot = true; | 
|  | else | 
|  | bf_allow_magnetic_slot = false; | 
|  |  | 
|  | if (xstrcasestr(sched_params, "bf_running_job_reserve")) | 
|  | bf_running_job_reserve = true; | 
|  | else | 
|  | bf_running_job_reserve = false; | 
|  |  | 
|  | if (xstrcasestr(sched_params, "bf_licenses")) { | 
|  | bf_licenses = true; | 
|  | bf_running_job_reserve = true; | 
|  | } else { | 
|  | bf_licenses = false; | 
|  | } | 
|  |  | 
|  | if (xstrcasestr(sched_params, "bf_topopt_enable")) { | 
|  | bf_topopt_enable = true; | 
|  | } else { | 
|  | bf_topopt_enable = false; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_topopt_iterations="))) { | 
|  | bf_topopt_iterations = atoi(tmp_ptr + 21); | 
|  | if ((bf_topopt_iterations <= 1) || | 
|  | (bf_topopt_iterations > MAX_ORACLE_DEPTH)) { | 
|  | error("Invalid backfill scheduler bf_topopt_iterations: %d", | 
|  | bf_topopt_iterations); | 
|  | bf_topopt_iterations = ORACLE_DEPTH; | 
|  | } | 
|  | } else { | 
|  | bf_topopt_iterations = ORACLE_DEPTH; | 
|  | } | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "max_rpc_cnt="))) | 
|  | max_rpc_cnt = atoi(tmp_ptr + 12); | 
|  | else if ((tmp_ptr = xstrcasestr(sched_params, "max_rpc_count="))) | 
|  | max_rpc_cnt = atoi(tmp_ptr + 14); | 
|  | else | 
|  | max_rpc_cnt = 0; | 
|  | if ((max_rpc_cnt < 0) || (max_rpc_cnt > MAX_MAX_RPC_CNT)) { | 
|  | error("Invalid SchedulerParameters max_rpc_cnt: %d", | 
|  | max_rpc_cnt); | 
|  | max_rpc_cnt = 0; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_rpc_cnt="))) | 
|  | tmp_val = strtol(tmp_ptr + 17, NULL, 10); | 
|  | else if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_rpc_count="))) | 
|  | tmp_val = strtol(tmp_ptr + 19, NULL, 10); | 
|  | else | 
|  | tmp_val = MAX((max_rpc_cnt / 10), 20); | 
|  | if ((tmp_val < 0) || (tmp_val > MAX_YIELD_RPC_CNT)) { | 
|  | error("Invalid SchedulerParameters bf_yield_rpc_cnt: %ld", | 
|  | tmp_val); | 
|  | yield_rpc_cnt = MAX((max_rpc_cnt / 10), 20); | 
|  | } else { | 
|  | yield_rpc_cnt = tmp_val; | 
|  | } | 
|  |  | 
|  | if (xstrcasestr(sched_params, "time_min_as_soft_limit")) | 
|  | soft_time_limit = true; | 
|  | } | 
|  |  | 
|  | /* Note that slurm.conf has changed */ | 
|  | extern void backfill_reconfig(void) | 
|  | { | 
|  | slurm_mutex_lock(&config_lock); | 
|  | config_flag = true; | 
|  | slurm_mutex_unlock(&config_lock); | 
|  | } | 
|  |  | 
|  | /* Update backfill scheduling statistics | 
|  | * IN tv1 - start time | 
|  | * IN tv2 - end (current) time | 
|  | * IN node_space_recs - count of records in resources/time table being tested | 
|  | */ | 
|  | static void _do_diag_stats(struct timeval *tv1, struct timeval *tv2, | 
|  | int node_space_recs) | 
|  | { | 
|  | uint32_t delta_t, real_time; | 
|  |  | 
|  | delta_t  = (tv2->tv_sec - tv1->tv_sec) * 1000000; | 
|  | delta_t +=  tv2->tv_usec; | 
|  | delta_t -=  tv1->tv_usec; | 
|  | real_time = delta_t - bf_sleep_usec; | 
|  |  | 
|  | slurmctld_diag_stats.bf_cycle_counter++; | 
|  | slurmctld_diag_stats.bf_cycle_sum += real_time; | 
|  | slurmctld_diag_stats.bf_cycle_last = real_time; | 
|  |  | 
|  | slurmctld_diag_stats.bf_depth_sum += slurmctld_diag_stats.bf_last_depth; | 
|  | slurmctld_diag_stats.bf_depth_try_sum += | 
|  | slurmctld_diag_stats.bf_last_depth_try; | 
|  | if (slurmctld_diag_stats.bf_cycle_last > | 
|  | slurmctld_diag_stats.bf_cycle_max) { | 
|  | slurmctld_diag_stats.bf_cycle_max = slurmctld_diag_stats. | 
|  | bf_cycle_last; | 
|  | } | 
|  | slurmctld_diag_stats.bf_table_size = node_space_recs; | 
|  | slurmctld_diag_stats.bf_table_size_sum += node_space_recs; | 
|  | } | 
|  |  | 
|  | static void _init_planned_bitmap(void) | 
|  | { | 
|  | slurmctld_lock_t read_node_lock = { .node = READ_LOCK }; | 
|  | node_record_t *node_ptr = NULL; | 
|  |  | 
|  | xassert(!planned_bitmap); | 
|  | planned_bitmap = bit_alloc(node_record_count); | 
|  |  | 
|  | /* Sync planned_bitmap with NODE_STATE_PLANNED nodes from state save */ | 
|  | lock_slurmctld(read_node_lock); | 
|  | for (int i = 0; (node_ptr = next_node(&i)); i++) | 
|  | if (IS_NODE_PLANNED(node_ptr)) | 
|  | bit_set(planned_bitmap, i); | 
|  | unlock_slurmctld(read_node_lock); | 
|  | } | 
|  |  | 
|  | extern void __attempt_backfill(void) | 
|  | { | 
|  | _load_config(); | 
|  | het_job_list = list_create(_het_job_map_del); | 
|  | _init_planned_bitmap(); | 
|  | _attempt_backfill(); | 
|  | FREE_NULL_LIST(het_job_list); | 
|  | FREE_NULL_BITMAP(planned_bitmap); | 
|  | } | 
|  |  | 
|  | /* backfill_agent - detached thread periodically attempts to backfill jobs */ | 
|  | extern void *backfill_agent(void *args) | 
|  | { | 
|  | time_t now; | 
|  | double wait_time; | 
|  | static time_t last_backfill_time = 0; | 
|  | /* Read config and partitions; Write jobs and nodes */ | 
|  | slurmctld_lock_t all_locks = { | 
|  | READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; | 
|  | bool load_config; | 
|  | bool short_sleep = false; | 
|  | int backfill_cnt = 0; | 
|  |  | 
|  | #if HAVE_SYS_PRCTL_H | 
|  | if (prctl(PR_SET_NAME, "bckfl", NULL, NULL, NULL) < 0) { | 
|  | error("cannot set my name to %s %m", "backfill"); | 
|  | } | 
|  | #endif | 
|  | _load_config(); | 
|  | last_backfill_time = time(NULL); | 
|  | _init_planned_bitmap(); | 
|  | het_job_list = list_create(_het_job_map_del); | 
|  | while (!stop_backfill) { | 
|  | if (short_sleep) | 
|  | _my_sleep(USEC_IN_SEC); | 
|  | else if (backfill_interval == -1) | 
|  | _my_sleep(BACKFILL_INTERVAL * USEC_IN_SEC); | 
|  | else | 
|  | _my_sleep((int64_t) backfill_interval * USEC_IN_SEC); | 
|  | if (stop_backfill) | 
|  | break; | 
|  |  | 
|  | if (slurmctld_config.scheduling_disabled) | 
|  | continue; | 
|  |  | 
|  | list_flush(het_job_list); | 
|  | slurm_mutex_lock(&config_lock); | 
|  | if (config_flag) { | 
|  | config_flag = false; | 
|  | load_config = true; | 
|  | } else { | 
|  | load_config = false; | 
|  | } | 
|  | slurm_mutex_unlock(&config_lock); | 
|  | if (load_config) | 
|  | _load_config(); | 
|  | if (backfill_interval == -1) { | 
|  | log_flag(BACKFILL, "skipping backfill cycle for %ds", | 
|  | BACKFILL_INTERVAL); | 
|  | continue; | 
|  | } | 
|  | now = time(NULL); | 
|  | wait_time = difftime(now, last_backfill_time); | 
|  | if ((wait_time < backfill_interval) || | 
|  | job_is_completing(NULL) || _many_pending_rpcs() || | 
|  | !_more_work(last_backfill_time)) { | 
|  | short_sleep = true; | 
|  | continue; | 
|  | } | 
|  |  | 
|  | slurm_mutex_lock(&check_bf_running_lock); | 
|  | slurmctld_diag_stats.bf_active = 1; | 
|  | slurm_mutex_unlock(&check_bf_running_lock); | 
|  |  | 
|  | lock_slurmctld(all_locks); | 
|  | validate_all_reservations(true, false); | 
|  | if ((backfill_cnt++ % 2) == 0) | 
|  | _het_job_start_clear(); | 
|  | _attempt_backfill(); | 
|  | last_backfill_time = time(NULL); | 
|  | (void) bb_g_job_try_stage_in(); | 
|  | unlock_slurmctld(all_locks); | 
|  |  | 
|  | slurm_mutex_lock(&check_bf_running_lock); | 
|  | slurmctld_diag_stats.bf_active = 0; | 
|  | slurm_mutex_unlock(&check_bf_running_lock); | 
|  |  | 
|  | short_sleep = false; | 
|  | } | 
|  | FREE_NULL_LIST(het_job_list); | 
|  | xhash_free(user_usage_map); /* May have been init'ed if used */ | 
|  | FREE_NULL_BITMAP(planned_bitmap); | 
|  |  | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Clear the start_time and sched_nodes for all pending jobs. This is used to | 
|  | * ensure that a job which can run in multiple partitions has its start_time and | 
|  | * sched_nodes set to the partition offering the earliest start_time. | 
|  | */ | 
|  | static int _clear_job_estimates(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = (job_record_t *) x; | 
|  | if (IS_JOB_PENDING(job_ptr)) { | 
|  | job_ptr->start_time = 0; | 
|  | xfree(job_ptr->sched_nodes); | 
|  | } | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Return non-zero to break the backfill loop if change in job, node, | 
|  | * reservation or partition state or the backfill scheduler needs to be stopped. | 
|  | */ | 
|  | static int _yield_locks(int64_t usec) | 
|  | { | 
|  | slurmctld_lock_t all_locks = { | 
|  | READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; | 
|  | time_t job_update, node_update, part_update, config_update, resv_update; | 
|  | bool load_config = false; | 
|  |  | 
|  | job_update  = last_job_update; | 
|  | node_update = last_node_update; | 
|  | part_update = last_part_update; | 
|  | config_update = slurm_conf.last_update; | 
|  | resv_update = last_resv_update; | 
|  |  | 
|  | unlock_slurmctld(all_locks); | 
|  | while (!stop_backfill) { | 
|  | bf_sleep_usec += _my_sleep(usec); | 
|  | slurm_mutex_lock(&slurmctld_config.thread_count_lock); | 
|  | if ((max_rpc_cnt == 0) || | 
|  | (slurmctld_config.server_thread_count <= yield_rpc_cnt)) { | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  | break; | 
|  | } | 
|  | verbose("continuing to yield locks, %d RPCs pending", | 
|  | slurmctld_config.server_thread_count); | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  | } | 
|  | lock_slurmctld(all_locks); | 
|  | slurm_mutex_lock(&config_lock); | 
|  | if (config_flag) | 
|  | load_config = true; | 
|  | slurm_mutex_unlock(&config_lock); | 
|  |  | 
|  | if (((!backfill_continue) && ((last_job_update != job_update) || | 
|  | (last_node_update != node_update))) || | 
|  | (last_part_update != part_update) || | 
|  | (slurm_conf.last_update != config_update) || | 
|  | (validate_resv_cnt != 0) || | 
|  | (last_resv_update != resv_update) || | 
|  | stop_backfill || load_config) | 
|  | return 1; | 
|  | else | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* Test if this job still has access to the specified partition. The job's | 
|  | * available partitions may have changed when locks were released */ | 
|  | static bool _job_part_valid(job_record_t *job_ptr, part_record_t *part_ptr) | 
|  | { | 
|  | part_record_t *avail_part_ptr; | 
|  | list_itr_t *part_iterator; | 
|  | bool rc = false; | 
|  |  | 
|  | if (job_ptr->part_ptr_list) { | 
|  | part_iterator = list_iterator_create(job_ptr->part_ptr_list); | 
|  | while ((avail_part_ptr = list_next(part_iterator))) { | 
|  | if (avail_part_ptr == part_ptr) { | 
|  | rc = true; | 
|  | break; | 
|  | } | 
|  | } | 
|  | list_iterator_destroy(part_iterator); | 
|  | } else if (job_ptr->part_ptr == part_ptr) { | 
|  | rc = true; | 
|  | } | 
|  |  | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | /* Determine if job in the backfill queue is still runnable. | 
|  | * Job state could change when lock are periodically released */ | 
|  | static bool _job_runnable_now(job_record_t *job_ptr) | 
|  | { | 
|  | if (IS_JOB_REVOKED(job_ptr)) { | 
|  | log_flag(BACKFILL, "%pJ revoked during bf yield", job_ptr); | 
|  | return false; | 
|  | } | 
|  | if (!IS_JOB_PENDING(job_ptr)) {	/* Started in other partition */ | 
|  | log_flag(BACKFILL, "%pJ started in other partition during bf yield", | 
|  | job_ptr); | 
|  | return false; | 
|  | } | 
|  | if (job_ptr->priority == 0) {	/* Job has been held */ | 
|  | log_flag(BACKFILL, "%pJ job held during bf yield", job_ptr); | 
|  | return false; | 
|  | } | 
|  | if (IS_JOB_COMPLETING(job_ptr)) { /* Started, requeue and completing */ | 
|  | log_flag(BACKFILL, "%pJ job started during bf yield", job_ptr); | 
|  | return false; | 
|  | } | 
|  | /* | 
|  | * Already reserved resources for either bf_max_job_array_resv or | 
|  | * max_run_tasks number of jobs in the array. If max_run_tasks is 0, it | 
|  | * wasn't set, so ignore it. | 
|  | */ | 
|  | if (job_ptr->array_recs && | 
|  | ((job_ptr->array_recs->pend_run_tasks >= bf_max_job_array_resv) || | 
|  | (job_ptr->array_recs->max_run_tasks && | 
|  | ((job_ptr->array_recs->pend_run_tasks + | 
|  | job_ptr->array_recs->tot_run_tasks) >= | 
|  | job_ptr->array_recs->max_run_tasks)))) | 
|  | return false; | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
|  | static void _restore_preempt_state(job_record_t *job_ptr, | 
|  | time_t *tmp_preempt_start_time, | 
|  | bool *tmp_preempt_in_progress) | 
|  | { | 
|  | if ((*tmp_preempt_start_time != 0) | 
|  | && (job_ptr->details->preempt_start_time == 0)) { | 
|  | job_ptr->details->preempt_start_time = | 
|  | *tmp_preempt_start_time; | 
|  | job_ptr->preempt_in_progress = *tmp_preempt_in_progress; | 
|  | } | 
|  |  | 
|  | *tmp_preempt_start_time = 0; | 
|  | *tmp_preempt_in_progress = false; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * IN/OUT: prio to be adjusted | 
|  | * IN: value from current component partition | 
|  | */ | 
|  | static void _adjust_hetjob_prio(uint32_t *prio, uint32_t val) | 
|  | { | 
|  | if (!*prio) | 
|  | *prio = val; | 
|  | else if (bf_hetjob_prio & HETJOB_PRIO_MIN) | 
|  | *prio = MIN(*prio, val); | 
|  | else if (bf_hetjob_prio & HETJOB_PRIO_MAX) | 
|  | *prio = MAX(*prio, val); | 
|  | else if (bf_hetjob_prio & HETJOB_PRIO_AVG) | 
|  | *prio += val; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * IN: job_record pointer of a hetjob leader (caller responsible) | 
|  | * RET: [min|max|avg] Priority of all components from same hetjob | 
|  | */ | 
|  | static uint32_t _hetjob_calc_prio(job_record_t *het_leader) | 
|  | { | 
|  | job_record_t *het_comp = NULL; | 
|  | uint32_t prio = 0, tmp = 0, cnt = 0, i = 0, nparts = 0; | 
|  | list_itr_t *iter = NULL; | 
|  |  | 
|  | if (bf_hetjob_prio & HETJOB_PRIO_MIN) | 
|  | prio = INFINITE; | 
|  |  | 
|  | iter = list_iterator_create(het_leader->het_job_list); | 
|  | while ((het_comp = list_next(iter))) { | 
|  | if (het_comp->part_ptr_list && | 
|  | het_comp->prio_mult && | 
|  | het_comp->prio_mult->priority_array && | 
|  | (nparts = list_count(het_comp->part_ptr_list))) { | 
|  | for (i = 0; i < nparts; i++) { | 
|  | tmp = het_comp->prio_mult->priority_array[i]; | 
|  | if (tmp == 0) { /* job held */ | 
|  | prio = 0; | 
|  | break; | 
|  | } | 
|  | _adjust_hetjob_prio(&prio, tmp); | 
|  | cnt++; | 
|  | } | 
|  | if (prio == 0) /* job held */ | 
|  | break; | 
|  | } else { | 
|  | tmp = het_comp->priority; | 
|  | if (tmp == 0) { /* job held */ | 
|  | prio = 0; | 
|  | break; | 
|  | } | 
|  | _adjust_hetjob_prio(&prio, tmp); | 
|  | cnt++; | 
|  | } | 
|  | if ((bf_hetjob_prio & HETJOB_PRIO_MIN) && (prio == 1)) | 
|  | break; /* Can not get lower */ | 
|  | } | 
|  | list_iterator_destroy(iter); | 
|  | if (prio && cnt && (bf_hetjob_prio & HETJOB_PRIO_AVG)) | 
|  | prio /= cnt; | 
|  |  | 
|  | return prio; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * IN: job_record pointer of a hetjob leader (caller responsible) | 
|  | * RET: [min|max|avg] PriorityTier of all components from same hetjob | 
|  | */ | 
|  | static uint32_t _hetjob_calc_prio_tier(job_record_t *het_leader) | 
|  | { | 
|  | job_record_t *het_comp = NULL; | 
|  | part_record_t *part_ptr = NULL; | 
|  | uint32_t prio_tier = 0, tmp = 0, cnt = 0; | 
|  | list_itr_t *iter = NULL, *iter2 = NULL; | 
|  |  | 
|  | if (bf_hetjob_prio & HETJOB_PRIO_MIN) | 
|  | prio_tier = NO_VAL16 - 1; | 
|  |  | 
|  | iter = list_iterator_create(het_leader->het_job_list); | 
|  | while ((het_comp = list_next(iter))) { | 
|  | if (het_comp->part_ptr_list && | 
|  | list_count(het_comp->part_ptr_list)) { | 
|  | iter2 = list_iterator_create(het_comp->part_ptr_list); | 
|  | while ((part_ptr = list_next(iter2))) { | 
|  | tmp = part_ptr->priority_tier; | 
|  | _adjust_hetjob_prio(&prio_tier, tmp); | 
|  | cnt++; | 
|  | } | 
|  | list_iterator_destroy(iter2); | 
|  | } else { | 
|  | tmp = het_comp->part_ptr->priority_tier; | 
|  | _adjust_hetjob_prio(&prio_tier, tmp); | 
|  | cnt++; | 
|  | } | 
|  | if ((bf_hetjob_prio & HETJOB_PRIO_MIN) && (prio_tier == 0)) | 
|  | break; /* Minimum found. */ | 
|  | if ((bf_hetjob_prio & HETJOB_PRIO_MAX) && | 
|  | (prio_tier == (NO_VAL16 - 1))) | 
|  | break; /* Maximum found. */ | 
|  | } | 
|  | list_iterator_destroy(iter); | 
|  | if (prio_tier && cnt && (bf_hetjob_prio & HETJOB_PRIO_AVG)) | 
|  | prio_tier /= cnt; | 
|  |  | 
|  | return prio_tier; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * IN: job_record pointer of a hetjob leader (caller responsible) | 
|  | * RET: true if any component from same hetjob has a reservation | 
|  | */ | 
|  | static bool _hetjob_any_resv(job_record_t *het_leader) | 
|  | { | 
|  | job_record_t *het_comp = NULL; | 
|  | list_itr_t *iter = NULL; | 
|  | bool any_resv = false; | 
|  |  | 
|  | iter = list_iterator_create(het_leader->het_job_list); | 
|  | while (!any_resv && (het_comp = list_next(iter))) { | 
|  | if (het_comp->resv_id != 0) | 
|  | any_resv = true; | 
|  | } | 
|  | list_iterator_destroy(iter); | 
|  |  | 
|  | return any_resv; | 
|  | } | 
|  |  | 
|  | static int _foreach_het_job_details(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = (job_record_t *) x; | 
|  | job_ptr->het_details = (het_job_details_t *)arg; | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | static int _bf_reserve_resv_licenses(void *x, void *arg) | 
|  | { | 
|  | slurmctld_resv_t *resv_ptr = x; | 
|  | node_space_handler_t *ns_h = arg; | 
|  | node_space_map_t *node_space = ns_h->node_space; | 
|  | int *ns_recs_ptr = ns_h->node_space_recs; | 
|  | time_t start_time, end_time; | 
|  | job_record_t fake_job = { | 
|  | .license_list = resv_ptr->license_list, | 
|  | .resv_ptr = resv_ptr, | 
|  | }; | 
|  |  | 
|  | if (!resv_ptr->license_list) | 
|  | return 0; | 
|  |  | 
|  | if (resv_ptr->end_time < node_space[0].begin_time) | 
|  | return 0; | 
|  |  | 
|  | /* treat flex reservations as always active */ | 
|  | if (resv_ptr->flags & RESERVE_FLAG_FLEX) { | 
|  | start_time = 0; | 
|  | end_time = INFINITE; | 
|  | } else { | 
|  | /* align to resolution */ | 
|  |  | 
|  | start_time = resv_ptr->start_time / backfill_resolution; | 
|  | start_time *= backfill_resolution; | 
|  | end_time = ROUNDUP(resv_ptr->end_time, backfill_resolution); | 
|  | end_time *= backfill_resolution; | 
|  | } | 
|  |  | 
|  | _add_reservation(start_time, end_time, NULL, &fake_job, node_space, | 
|  | ns_recs_ptr, 0); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _bf_reserve_running(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = (job_record_t *) x; | 
|  | node_space_handler_t *ns_h = (node_space_handler_t *) arg; | 
|  | node_space_map_t *node_space = ns_h->node_space; | 
|  | int *ns_recs_ptr = ns_h->node_space_recs; | 
|  | time_t end_time = job_ptr->end_time; | 
|  | bool licenses, whole, preemptable; | 
|  | bitstr_t *tmp_bitmap; | 
|  |  | 
|  | if (!job_ptr || !IS_JOB_RUNNING(job_ptr) || !job_ptr->job_resrcs) | 
|  | return SLURM_SUCCESS; | 
|  |  | 
|  | whole = (job_ptr->job_resrcs->whole_node & WHOLE_NODE_REQUIRED) || | 
|  | (IS_JOB_WHOLE_TOPO(job_ptr)); | 
|  |  | 
|  | licenses = (job_ptr->license_list); | 
|  |  | 
|  | if (!whole && !licenses) | 
|  | return SLURM_SUCCESS; | 
|  |  | 
|  | preemptable = (slurm_job_preempt_mode(job_ptr) != PREEMPT_MODE_OFF); | 
|  |  | 
|  | if (preemptable && !licenses) | 
|  | return SLURM_SUCCESS; | 
|  |  | 
|  | if (*ns_recs_ptr >= bf_node_space_size) | 
|  | return SLURM_ERROR; | 
|  |  | 
|  | if (soft_time_limit && job_ptr->time_min) { | 
|  | time_t now = time(NULL); | 
|  | time_t soft_end = job_ptr->start_time + job_ptr->time_min * 60; | 
|  | /* | 
|  | * If over the soft limit, assume the job will use half of the | 
|  | * remaining time until the hard limit. | 
|  | */ | 
|  | if (soft_end < now) | 
|  | soft_end = now + (end_time - now) / 2; | 
|  | end_time = soft_end; | 
|  | } | 
|  |  | 
|  | end_time = ROUNDUP(end_time, backfill_resolution) * backfill_resolution; | 
|  |  | 
|  | if (preemptable || !whole) { | 
|  | /* Reservation only needed for licenses. */ | 
|  | tmp_bitmap = bit_alloc(node_record_count); | 
|  | } else { | 
|  | tmp_bitmap = bit_copy(job_ptr->node_bitmap); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Ensure reservation start time is aligned to the start of the | 
|  | * backfill map by sending 0 in instead of the actual start time. | 
|  | * A long-running backfill cycle could lead to a skew of a few | 
|  | * seconds - or significantly longer with bf_continue set - which | 
|  | * would fragment the start of the backfill map. | 
|  | */ | 
|  | _add_reservation(0, end_time, tmp_bitmap, job_ptr, node_space, | 
|  | ns_recs_ptr, 0); | 
|  |  | 
|  | FREE_NULL_BITMAP(tmp_bitmap); | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | static int _set_hetjob_details(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = (job_record_t *) x; | 
|  | het_job_details_t *details = NULL; | 
|  |  | 
|  | if (IS_JOB_PENDING(job_ptr) && job_ptr->het_job_id && | 
|  | !job_ptr->het_job_offset && job_ptr->het_job_list) { | 
|  | /* | 
|  | * Pending hetjob leader component. Do calculations only once | 
|  | * for whole hetjob. xmalloc memory for 1 het_details struct, | 
|  | * but make the pointer accessible in all hetjob components. | 
|  | */ | 
|  | if (!job_ptr->het_details) | 
|  | job_ptr->het_details = | 
|  | xmalloc(sizeof(het_job_details_t)); | 
|  |  | 
|  | details = job_ptr->het_details; | 
|  | details->any_resv = _hetjob_any_resv(job_ptr); | 
|  | details->priority_tier = _hetjob_calc_prio_tier(job_ptr); | 
|  | details->priority = _hetjob_calc_prio(job_ptr); | 
|  |  | 
|  | list_for_each(job_ptr->het_job_list, | 
|  | _foreach_het_job_details, details); | 
|  | } | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | /* Fetch key from xhash_t item. Called from function ptr */ | 
|  | static void _bf_map_key_id(void *item, const char **key, uint32_t *key_len) | 
|  | { | 
|  | bf_user_usage_t *user = (bf_user_usage_t *)item; | 
|  |  | 
|  | xassert(user); | 
|  |  | 
|  | *key = (char *)&user->uid; | 
|  | *key_len = sizeof(uid_t); | 
|  | } | 
|  |  | 
|  | /* Free item from xhash_t. Called from function ptr */ | 
|  | static void _bf_map_free(void *item) | 
|  | { | 
|  | bf_user_usage_t *user = (bf_user_usage_t *)item; | 
|  |  | 
|  | if (!user) | 
|  | return; | 
|  |  | 
|  | slurmdb_destroy_bf_usage_members(&user->bf_usage); | 
|  | xfree(user); | 
|  | } | 
|  |  | 
|  | /* Allocate new user and add to xhash_t map */ | 
|  | static bf_user_usage_t *_bf_map_add_user(xhash_t *map, uid_t uid) | 
|  | { | 
|  | bf_user_usage_t *user = xmalloc(sizeof(bf_user_usage_t)); | 
|  | user->uid = uid; | 
|  | xhash_add(map, user); | 
|  | return user; | 
|  | } | 
|  |  | 
|  | /* Find user usage from uid. Add new empty entry to map if not found */ | 
|  | static slurmdb_bf_usage_t *_bf_map_find_add(xhash_t* map, uid_t uid) | 
|  | { | 
|  | bf_user_usage_t *user; | 
|  | xassert(map != NULL); | 
|  |  | 
|  | if (!(user = xhash_get(map, (char *)&uid, sizeof(uid_t)))) | 
|  | user = _bf_map_add_user(map, uid); | 
|  | return &user->bf_usage; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Check if limit exceeded. Reset usage if usage time is before current | 
|  | * scheduling iteration time | 
|  | */ | 
|  | static bool _check_bf_usage( | 
|  | slurmdb_bf_usage_t *usage, int limit, time_t sched_time) | 
|  | { | 
|  | if (usage->last_sched < sched_time) { | 
|  | usage->last_sched = sched_time; | 
|  | usage->count = 0; | 
|  | return false; | 
|  | } | 
|  | return usage->count >= limit; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Check if job exceeds configured count limits | 
|  | * returns true if count exceeded | 
|  | */ | 
|  | static bool _job_exceeds_max_bf_param(job_record_t *job_ptr, | 
|  | time_t sched_start) | 
|  | { | 
|  | slurmdb_bf_usage_t *part_usage = NULL, *user_usage = NULL, | 
|  | *assoc_usage = NULL, *user_part_usage = NULL; | 
|  |  | 
|  | slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr; | 
|  | part_record_t *part_ptr = job_ptr->part_ptr; | 
|  |  | 
|  | if (max_backfill_job_per_user_part) { | 
|  | xassert(part_ptr->bf_data); | 
|  | user_part_usage = _bf_map_find_add( | 
|  | part_ptr->bf_data->user_usage, | 
|  | job_ptr->user_id); | 
|  | if (_check_bf_usage(user_part_usage, | 
|  | max_backfill_job_per_user_part, | 
|  | sched_start)) { | 
|  | log_flag(BACKFILL, "have already checked %u jobs for user %u on partition %s; skipping job %u, %pJ", | 
|  | max_backfill_job_per_user_part, | 
|  | job_ptr->user_id, job_ptr->part_ptr->name, | 
|  | job_ptr->job_id, job_ptr); | 
|  | return true; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (max_backfill_job_per_part) { | 
|  | xassert(part_ptr->bf_data); | 
|  | part_usage = part_ptr->bf_data->job_usage; | 
|  | if (_check_bf_usage(part_usage, max_backfill_job_per_part, | 
|  | sched_start)) { | 
|  | log_flag(BACKFILL, "have already checked %u jobs for partition %s; skipping %pJ", | 
|  | max_backfill_job_per_part, | 
|  | job_ptr->part_ptr->name, job_ptr); | 
|  | return true; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (max_backfill_job_per_assoc) { | 
|  | if (assoc_ptr) { | 
|  | if (!assoc_ptr->bf_usage) | 
|  | assoc_ptr->bf_usage = | 
|  | xmalloc(sizeof(slurmdb_bf_usage_t)); | 
|  | assoc_usage = assoc_ptr->bf_usage; | 
|  |  | 
|  | if (_check_bf_usage(assoc_usage, | 
|  | max_backfill_job_per_assoc, | 
|  | sched_start)) { | 
|  | log_flag(BACKFILL, "have already checked %u jobs for user %u, assoc %u; skipping %pJ", | 
|  | max_backfill_job_per_assoc, | 
|  | job_ptr->user_id, job_ptr->assoc_id, | 
|  | job_ptr); | 
|  | return true; | 
|  | } | 
|  | } else { | 
|  | /* Null assoc_ptr indicates no database */ | 
|  | log_flag(BACKFILL, "no assoc for job %u, required for parameter bf_max_job_per_assoc", | 
|  | job_ptr->job_id); | 
|  | assoc_usage = NULL; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (max_backfill_job_per_user) { | 
|  | if (assoc_ptr && assoc_ptr->user_rec) { | 
|  | if (!assoc_ptr->user_rec->bf_usage) | 
|  | assoc_ptr->user_rec->bf_usage = | 
|  | xmalloc(sizeof(slurmdb_bf_usage_t)); | 
|  | user_usage = assoc_ptr->user_rec->bf_usage; | 
|  | } else { | 
|  | /* No database, or user rec missing from assoc */ | 
|  | if (!user_usage_map) | 
|  | user_usage_map = xhash_init(_bf_map_key_id, | 
|  | _bf_map_free); | 
|  | user_usage = _bf_map_find_add(user_usage_map, | 
|  | job_ptr->user_id); | 
|  | } | 
|  |  | 
|  | if (_check_bf_usage(user_usage, max_backfill_job_per_user, | 
|  | sched_start)) { | 
|  | log_flag(BACKFILL, "have already checked %u jobs for user %u; skipping %pJ", | 
|  | max_backfill_job_per_user, job_ptr->user_id, | 
|  | job_ptr); | 
|  | return true; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Don't count queue records for magnetic reservation against | 
|  | * backfill limits. | 
|  | */ | 
|  | if ((job_ptr->bit_flags & JOB_MAGNETIC) && !bf_allow_magnetic_slot) | 
|  | return false; | 
|  |  | 
|  | /* Increment our user/partition limit counters as needed */ | 
|  | if (user_part_usage) | 
|  | user_part_usage->count++; | 
|  | if (part_usage) | 
|  | part_usage->count++; | 
|  | if (user_usage) | 
|  | user_usage->count++; | 
|  | if (assoc_usage) | 
|  | assoc_usage->count++; | 
|  | return false; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Handle the planned list. | 
|  | * set - If true we are setting states, else we clear them. | 
|  | */ | 
|  | static void _handle_planned(bool set) | 
|  | { | 
|  | node_record_t *node_ptr; | 
|  | bool node_update = false, select_synced = false; | 
|  |  | 
|  | if (!planned_bitmap) | 
|  | return; | 
|  |  | 
|  | for (int n = 0; (n = bit_ffs_from_bit(planned_bitmap, n)) >= 0; n++) { | 
|  | if (!(node_ptr = node_record_table_ptr[n])) { | 
|  | /* Node could have been deleted while planned */ | 
|  | bit_clear(planned_bitmap, n); | 
|  | continue; | 
|  | } | 
|  | if (set) { | 
|  | /* | 
|  | * If the node is fully allocated ignore this flag. | 
|  | * This only really matters for IDLE and MIXED. | 
|  | */ | 
|  | if (IS_NODE_ALLOCATED(node_ptr)) { | 
|  | uint16_t idle_cpus = 0; | 
|  |  | 
|  | if (!select_synced) { | 
|  | select_g_select_nodeinfo_set_all(); | 
|  | select_synced = true; | 
|  | } | 
|  |  | 
|  | idle_cpus = node_ptr->cpus_efctv - | 
|  | node_ptr->alloc_cpus; | 
|  | if (idle_cpus && | 
|  | (idle_cpus < node_ptr->cpus_efctv)) | 
|  | /* Mixed node as planned */ | 
|  | goto mixed; | 
|  |  | 
|  | /* | 
|  | * Node fully allocated. Remove from planned. | 
|  | * This is happening when a mixed node gets | 
|  | * fully allocated while looping in | 
|  | * _attempt_backfill (BF sched loop) | 
|  | */ | 
|  | node_ptr->node_state &= ~NODE_STATE_PLANNED; | 
|  | node_update = true; | 
|  | bit_clear(planned_bitmap, n); | 
|  | } else { | 
|  | /* Idle node as planned */ | 
|  | mixed: | 
|  | node_ptr->node_state |= NODE_STATE_PLANNED; | 
|  | node_update = true; | 
|  | } | 
|  | } else { | 
|  | /* Reset planned state for all nodes */ | 
|  | node_ptr->node_state &= ~NODE_STATE_PLANNED; | 
|  | node_update = true; | 
|  | bit_clear(planned_bitmap, n); | 
|  | } | 
|  |  | 
|  | log_flag(BACKFILL, "%s: %s state is %s", | 
|  | set ? "set" : "cleared", | 
|  | node_ptr->name, | 
|  | node_state_string(node_ptr->node_state)); | 
|  | } | 
|  |  | 
|  | if (node_update) | 
|  | last_node_update = time(NULL); | 
|  | } | 
|  | static void _set_slot_time(job_record_t *job_ptr, uint32_t time_limit, | 
|  | uint32_t boot_time, uint32_t *start, uint32_t *end) | 
|  | { | 
|  | *start  = job_ptr->start_time; | 
|  | *end = *start + boot_time + (time_limit * 60) + backfill_resolution - 1; | 
|  |  | 
|  | *start  = (*start / backfill_resolution) * backfill_resolution; | 
|  | *end = (*end / backfill_resolution) * backfill_resolution; | 
|  | } | 
|  |  | 
|  |  | 
|  | /* | 
|  | * Marks nodes' user status and  last job end time | 
|  | * Return positive if a node's last_job_end was updated else return 0 | 
|  | */ | 
|  | static int _mark_nodes_usage(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = x; | 
|  | node_used_t *nodes_used = arg; | 
|  | bool last_job_end_updated = false; | 
|  | bool owned; | 
|  |  | 
|  | int i; | 
|  |  | 
|  | xassert(job_ptr); | 
|  | xassert(nodes_used); | 
|  |  | 
|  | if (IS_JOB_PENDING(job_ptr) || IS_JOB_COMPLETED(job_ptr) || | 
|  | !job_ptr->node_bitmap) | 
|  | return last_job_end_updated; | 
|  |  | 
|  | owned = ((job_ptr->details->whole_node & WHOLE_NODE_USER) || | 
|  | (job_ptr->part_ptr && | 
|  | (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER))); | 
|  |  | 
|  | for (i = 0; (i = bit_ffs_from_bit(job_ptr->node_bitmap, i)) >= 0; i++) { | 
|  | if (!nodes_used[i].allocated) { | 
|  | nodes_used[i].allocated = true; | 
|  | nodes_used[i].uid = job_ptr->user_id; | 
|  | nodes_used[i].node_index = i; | 
|  | nodes_used[i].owned = owned; | 
|  | } else if (!nodes_used[i].owned && !nodes_used[i].mixed_user) { | 
|  | nodes_used[i].mixed_user = | 
|  | nodes_used[i].uid != job_ptr->user_id; | 
|  | nodes_used[i].owned = owned; | 
|  | } | 
|  |  | 
|  | if (!nodes_used[i].mcs_label && job_ptr->mcs_label && | 
|  | slurm_mcs_get_select(job_ptr) == 1) { | 
|  | /* | 
|  | * We do not need to copy mcs_label, jobs are not purged | 
|  | * during backfill, so this memory should always be | 
|  | * valid. | 
|  | */ | 
|  | nodes_used[i].mcs_label = job_ptr->mcs_label; | 
|  | } | 
|  |  | 
|  | if (nodes_used[i].last_job_end < job_ptr->end_time) { | 
|  | nodes_used[i].last_job_end = job_ptr->end_time; | 
|  | last_job_end_updated = true; | 
|  | } | 
|  | } | 
|  |  | 
|  |  | 
|  | return last_job_end_updated; | 
|  | } | 
|  |  | 
|  | static int _cmp_last_job_end(void *x, void *y) | 
|  | { | 
|  | node_used_t *node1 = *(node_used_t **) x; | 
|  | node_used_t *node2 = *(node_used_t **) y; | 
|  | if (node1->last_job_end < node2->last_job_end) | 
|  | return 1; | 
|  | else if (node1->last_job_end > node2->last_job_end) | 
|  | return -1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* For each node find if they have multiple users and the latest job end time */ | 
|  | static void _init_node_used_array_and_list(node_used_t **nodes_used, | 
|  | list_t **nodes_used_list) | 
|  | { | 
|  | xassert(nodes_used && !*nodes_used); | 
|  | xassert(nodes_used_list && !*nodes_used_list); | 
|  |  | 
|  | *nodes_used = xcalloc(node_record_count, sizeof(**nodes_used)); | 
|  | *nodes_used_list = list_create(NULL); /* NULL to avoid double free */ | 
|  |  | 
|  | list_for_each(job_list, _mark_nodes_usage, *nodes_used); | 
|  |  | 
|  | for (int i = 0; i < node_record_count; i++) | 
|  | list_append(*nodes_used_list, &(*nodes_used)[i]); | 
|  | /* Sort list in descending order of last_job_end */ | 
|  | list_sort(*nodes_used_list, _cmp_last_job_end); | 
|  | } | 
|  |  | 
|  | static bool _user_conflicts(bool is_exclusive_user, bool job_user_on_node, | 
|  | node_used_t *node) | 
|  | { | 
|  | if (is_exclusive_user && !node->mixed_user && job_user_on_node) | 
|  | return false; /* user alone on node */ | 
|  | if (!is_exclusive_user && (!node->owned || job_user_on_node)) | 
|  | return false;	/* node not owned or the user owns the node */ | 
|  | return true; /* can't use node due to user conflict */ | 
|  | } | 
|  |  | 
/*
 * Decide whether a node's mcs_label rules out a job.
 *
 * Returns false when the labels are compatible: either the job has a label
 * and the node carries the same one, or neither has a label. Returns true
 * in every other case.
 */
static bool _mcs_label_conflicts(char *job_mcs_label, char *node_mcs_label)
{
	if (!job_mcs_label) {
		/* Job can't have mcs_label: node must not have one either */
		return (node_mcs_label != NULL);
	}

	/* Job needs its label: node must already carry the same one */
	return (xstrcmp(node_mcs_label, job_mcs_label) != 0);
}
|  |  | 
|  | /* | 
|  | * Check if a node can be used, if not remove it. If the node can't be remove | 
|  | * delay the start time. | 
|  | * Return true if the start was delayed (or can't be delayed) | 
|  | */ | 
|  | static int _rm_node_or_delay_start(void *x, void *arg) | 
|  | { | 
|  | node_used_t *node = x; | 
|  | filter_exclusive_args_t *args = arg; | 
|  | bool job_user_on_node = node->uid == args->job_user; | 
|  |  | 
|  | if (!node->allocated) | 
|  | return true; /* following nodes are idle */ | 
|  | if (node->last_job_end <= args->start_time) | 
|  | return true; /* following nodes will be idle by start_time */ | 
|  | if (!bit_test(args->node_bitmap, node->node_index)) | 
|  | return false; /* not available to start with */ | 
|  | if (!_user_conflicts(args->is_exclusive_user, job_user_on_node, node) && | 
|  | !_mcs_label_conflicts(args->mcs_label, node->mcs_label)) | 
|  | return false; /* job user and mcs don't conflict with node's */ | 
|  |  | 
|  | /* can't use this node */ | 
|  | *(args->later_start) = node->last_job_end; | 
|  |  | 
|  | if ((args->node_cnt > args->min_nodes) && | 
|  | (!args->req_nodes || | 
|  | !bit_test(args->req_nodes, node->node_index))) { | 
|  | /* able to remove the node*/ | 
|  | bit_clear(args->node_bitmap, node->node_index); | 
|  | args->node_cnt--; | 
|  | return false; | 
|  | } | 
|  |  | 
|  | /* can't remove the node, delay job start */ | 
|  | args->delay_start = true; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | /* Return true if start_time was delayed */ | 
|  | static bool _filter_exclusive_user_mcs_nodes(job_record_t *job_ptr, | 
|  | int mcs_select, | 
|  | uint32_t min_nodes, | 
|  | list_t *nodes_used_list, | 
|  | time_t start_time, | 
|  | time_t *later_filter_start, | 
|  | bitstr_t *node_bitmap) | 
|  | { | 
|  | *later_filter_start = 0; | 
|  | filter_exclusive_args_t args = { | 
|  | .min_nodes = min_nodes, | 
|  | .job_user = job_ptr->user_id, | 
|  | .node_bitmap = node_bitmap, | 
|  | .req_nodes = job_ptr->details->req_node_bitmap, | 
|  | .node_cnt = bit_set_count(node_bitmap), | 
|  | .later_start = later_filter_start, | 
|  | .start_time = start_time, | 
|  | }; | 
|  |  | 
|  | /* | 
|  | * Filter out any nodes used by other users, is_exclusive_user = true, | 
|  | * or filter out nodes owned by other users, is_exclusive_user = false | 
|  | */ | 
|  | if ((job_ptr->details->whole_node & WHOLE_NODE_USER) || | 
|  | (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)) | 
|  | args.is_exclusive_user = true; | 
|  |  | 
|  | /* Need to filter out any nodes allocated with other mcs */ | 
|  | args.mcs_label = (mcs_select == 1) ? job_ptr->mcs_label : NULL; | 
|  |  | 
|  | /* Note that nodes_used_list is sorted in descending order of job end */ | 
|  | list_find_first(nodes_used_list, _rm_node_or_delay_start, &args); | 
|  |  | 
|  | return args.delay_start; | 
|  | } | 
|  |  | 
/*
 * This is for use in _attempt_backfill() only.
 *
 * Calls _set_job_time_limit() with orig_time_limit, then leaves the current
 * loop iteration in one of two ways:
 *  - later_start is set and job_no_reserve is not: zero the job's start_time
 *    and jump to the TRY_LATER label of the invoking function;
 *  - otherwise: put orig_start_time back and "continue" the enclosing loop.
 *
 * The expansion must stay a bare { } block. Wrapping it in
 * do { } while (0) would make the "continue" bind to that wrapper instead
 * of the loop around the macro invocation.
 *
 * Arguments are parenthesized so that expressions can be passed safely, and
 * later_start (a time_t) is cast to long to match the "%ld" conversion.
 */
#define SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, later_start,	\
				orig_time_limit, orig_start_time)	\
{									\
	_set_job_time_limit((job_ptr), (orig_time_limit));		\
	if ((later_start) && !(job_no_reserve)) {			\
		log_flag(BACKFILL, "Try later %pJ later_start %ld",	\
			 (job_ptr), (long) (later_start));		\
		(job_ptr)->start_time = 0;				\
		goto TRY_LATER;						\
	}								\
	/*								\
	 * Job can not start until too far in the future.		\
	 * Use orig_start_time if job can't				\
	 * start in different partition it will be 0			\
	 */								\
	log_flag(BACKFILL, "Can't schedule %pJ in partition %s",	\
		 (job_ptr), (job_ptr)->part_ptr->name);			\
	(job_ptr)->start_time = (orig_start_time);			\
	continue;	/* not runnable in this partition */		\
}
|  |  | 
|  | static void _attempt_backfill(void) | 
|  | { | 
|  | DEF_TIMERS; | 
|  | list_t *job_queue = NULL; | 
|  | job_queue_rec_t *job_queue_rec = NULL; | 
|  | int bb, i, j, node_space_recs, mcs_select = 0; | 
|  | slurmdb_qos_rec_t *qos_ptr = NULL; | 
|  | job_record_t *job_ptr = NULL; | 
|  | part_record_t *part_ptr; | 
|  | uint32_t end_time, end_reserve, deadline_time_limit, boot_time; | 
|  | uint32_t orig_end_time; | 
|  | uint32_t time_limit, comp_time_limit, orig_time_limit = 0, part_time_limit; | 
|  | uint32_t min_nodes, max_nodes, req_nodes; | 
|  | bitstr_t *active_bitmap = NULL, *avail_bitmap = NULL; | 
|  | bitstr_t *resv_bitmap = NULL, *excluded_topo_bitmap = NULL; | 
|  | time_t now, sched_start, later_start, start_res, resv_end, window_end; | 
|  | time_t het_job_time, orig_sched_start, orig_start_time = (time_t) 0; | 
|  | time_t later_filter_start; | 
|  | node_space_map_t *node_space; | 
|  | node_used_t *nodes_used = NULL; | 
|  | list_t *nodes_used_list = NULL; | 
|  | struct timeval bf_time1, bf_time2; | 
|  | int error_code; | 
|  | int job_test_count = 0, test_time_count = 0, pend_time; | 
|  | bool already_counted, many_rpcs = false; | 
|  | job_record_t *reject_array_job = NULL; | 
|  | part_record_t *reject_array_part = NULL; | 
|  | slurmdb_qos_rec_t *reject_array_qos = NULL; | 
|  | slurmctld_resv_t *reject_array_resv = NULL; | 
|  | bool reject_array_use_prefer = false; | 
|  | uint32_t start_time, array_start_time = 0; | 
|  | struct timeval start_tv; | 
|  | uint32_t test_array_job_id = 0; | 
|  | uint32_t test_array_count = 0; | 
|  | uint32_t job_no_reserve; | 
|  | bool is_job_array_head, resv_overlap = false; | 
|  | uint8_t save_share_res = 0, save_whole_node = 0; | 
|  | int test_fini; | 
|  | uint32_t qos_flags = 0; | 
|  | time_t qos_blocked_until = 0, qos_part_blocked_until = 0; | 
|  | time_t tmp_preempt_start_time = 0; | 
|  | bool tmp_preempt_in_progress = false; | 
|  | bitstr_t *tmp_bitmap = NULL; | 
|  | bool state_changed_break = false, nodes_planned = false; | 
|  | bitstr_t *next_bitmap = NULL, *current_bitmap = NULL; | 
|  | resv_exc_t resv_exc = { 0 }; | 
|  | will_run_data_t will_run_data = { 0 }; | 
|  | bool overlap_tested = false; | 
|  | /* QOS Read lock */ | 
|  | assoc_mgr_lock_t qos_read_lock = { | 
|  | .qos = READ_LOCK, | 
|  | }; | 
|  |  | 
|  | bf_sleep_usec = 0; | 
|  | job_start_cnt = 0; | 
|  | job_test_cnt = 0; | 
|  |  | 
|  | if (!fed_mgr_sibs_synced()) { | 
|  | info("returning, federation siblings not synced yet"); | 
|  | return; | 
|  | } | 
|  |  | 
|  | (void) bb_g_load_state(false); | 
|  |  | 
|  | START_TIMER; | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) | 
|  | info("beginning"); | 
|  | else | 
|  | debug("beginning"); | 
|  | sched_start = orig_sched_start = now = time(NULL); | 
|  | gettimeofday(&start_tv, NULL); | 
|  |  | 
|  | _handle_planned(nodes_planned); | 
|  |  | 
|  | job_queue = build_job_queue(true, true); | 
|  | job_test_count = list_count(job_queue); | 
|  | if (job_test_count == 0) { | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) | 
|  | info("no jobs to backfill"); | 
|  | else | 
|  | debug("no jobs to backfill"); | 
|  | FREE_NULL_LIST(job_queue); | 
|  | return; | 
|  | } else | 
|  | debug("%u jobs to backfill", job_test_count); | 
|  |  | 
|  | list_for_each(job_list, _clear_job_estimates, NULL); | 
|  |  | 
|  | if (bf_hetjob_prio) | 
|  | list_for_each(job_list, _set_hetjob_details, NULL); | 
|  |  | 
|  | gettimeofday(&bf_time1, NULL); | 
|  |  | 
|  | slurmctld_diag_stats.bf_queue_len = job_test_count; | 
|  | slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats. | 
|  | bf_queue_len; | 
|  | job_test_count = 0; | 
|  |  | 
|  | slurmctld_diag_stats.bf_last_depth = 0; | 
|  | slurmctld_diag_stats.bf_last_depth_try = 0; | 
|  | slurmctld_diag_stats.bf_when_last_cycle = now; | 
|  |  | 
|  | node_space = xcalloc((bf_node_space_size + 1), | 
|  | sizeof(node_space_map_t)); | 
|  | node_space[0].begin_time = sched_start / backfill_resolution; | 
|  | node_space[0].begin_time *= backfill_resolution; | 
|  | window_end = (sched_start + backfill_window) / backfill_resolution; | 
|  | window_end *= backfill_resolution; | 
|  | node_space[0].end_time = window_end; | 
|  |  | 
|  | node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); | 
|  | /* Make "resuming" nodes available to be scheduled in backfill */ | 
|  | bit_or(node_space[0].avail_bitmap, rs_node_bitmap); | 
|  |  | 
|  | if (bf_licenses) | 
|  | node_space[0].licenses = | 
|  | bf_licenses_initial(bf_running_job_reserve); | 
|  |  | 
|  | if (bf_topopt_enable) { | 
|  | node_space[0].fragmentation = topology_g_get_fragmentation( | 
|  | node_space[0].avail_bitmap); | 
|  | } | 
|  |  | 
|  | node_space[0].next = 0; | 
|  | node_space_recs = 1; | 
|  |  | 
|  | if (bf_running_job_reserve) { | 
|  | node_space_handler_t node_space_handler; | 
|  | node_space_handler.node_space = node_space; | 
|  | node_space_handler.node_space_recs = &node_space_recs; | 
|  |  | 
|  | if (bf_licenses) | 
|  | list_for_each(resv_list, _bf_reserve_resv_licenses, | 
|  | &node_space_handler); | 
|  |  | 
|  | list_for_each(job_list, _bf_reserve_running, | 
|  | &node_space_handler); | 
|  | } | 
|  |  | 
|  | _init_node_used_array_and_list(&nodes_used, &nodes_used_list); | 
|  |  | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL_MAP) | 
|  | _dump_node_space_table(node_space); | 
|  |  | 
|  | if (assoc_limit_stop) { | 
|  | assoc_mgr_lock(&qos_read_lock); | 
|  | list_for_each(assoc_mgr_qos_list, | 
|  | _clear_qos_blocked_times, NULL); | 
|  | assoc_mgr_unlock(&qos_read_lock); | 
|  | } | 
|  |  | 
|  | sort_job_queue(job_queue); | 
|  |  | 
|  | /* Ignore nodes that have been set as available during this cycle. */ | 
|  | bit_clear_all(bf_ignore_node_bitmap); | 
|  |  | 
|  | if (bf_topopt_enable) | 
|  | init_oracle(); | 
|  |  | 
|  | while (1) { | 
|  | uint32_t bf_job_priority, prio_reserve; | 
|  | bool get_boot_time = false; | 
|  | bool licenses_unavail; | 
|  | bool use_prefer = false; | 
|  | slurmctld_resv_t *resv_ptr = NULL; | 
|  |  | 
|  | /* Run some final guaranteed logic after each job iteration */ | 
|  | if (job_ptr) { | 
|  | job_resv_clear_magnetic_flag(job_ptr); | 
|  | fill_array_reasons(job_ptr, reject_array_job); | 
|  |  | 
|  | /* Restore preemption state if needed. */ | 
|  | _restore_preempt_state(job_ptr, &tmp_preempt_start_time, | 
|  | &tmp_preempt_in_progress); | 
|  |  | 
|  | /* | 
|  | * Restore the original time limit in every corner case | 
|  | * we didn't have done yet, like when we are looping | 
|  | * through array tasks. | 
|  | */ | 
|  | if ((qos_flags & QOS_FLAG_NO_RESERVE) && | 
|  | slurm_conf.preempt_mode && orig_time_limit && | 
|  | (orig_time_limit != job_ptr->time_limit)) | 
|  | job_ptr->time_limit = orig_time_limit; | 
|  |  | 
|  | /* | 
|  | * An array job with pending tasks should take on the | 
|  | * start_time of the earliest pending task in the | 
|  | * array. | 
|  | */ | 
|  | if (job_ptr->array_recs && array_start_time) | 
|  | job_ptr->start_time = array_start_time; | 
|  | } | 
|  | array_start_time = 0; | 
|  | xfree(job_queue_rec); | 
|  | job_queue_rec = list_pop(job_queue); | 
|  | if (!job_queue_rec) { | 
|  | log_flag(BACKFILL, "reached end of job queue"); | 
|  | _set_bf_exit(BF_EXIT_END); | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (job_test_cnt >= | 
|  | max_backfill_job_cnt) { | 
|  | log_flag(BACKFILL, "bf_max_job_test: limit of %d reached", | 
|  | max_backfill_job_cnt); | 
|  | _set_bf_exit(BF_EXIT_MAX_JOB_TEST); | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (window_end < now) { | 
|  | log_flag(BACKFILL, "Now after current backfill window"); | 
|  | _set_bf_exit(BF_EXIT_TIMEOUT); | 
|  | break; | 
|  | } | 
|  | job_ptr          = job_queue_rec->job_ptr; | 
|  | part_ptr         = job_queue_rec->part_ptr; | 
|  | bf_job_priority  = job_queue_rec->priority; | 
|  | qos_ptr = job_queue_rec->qos_ptr; | 
|  | use_prefer = job_queue_rec->use_prefer; | 
|  |  | 
|  | if (job_ptr->array_recs && | 
|  | (job_queue_rec->array_task_id == NO_VAL)) | 
|  | is_job_array_head = true; | 
|  | else | 
|  | is_job_array_head = false; | 
|  |  | 
|  | if (slurmctld_config.shutdown_time || | 
|  | (difftime(time(NULL),orig_sched_start) >= bf_max_time)){ | 
|  | _set_bf_exit(BF_EXIT_TIMEOUT); | 
|  | break; | 
|  | } | 
|  |  | 
|  | many_rpcs = false; | 
|  | slurm_mutex_lock(&slurmctld_config.thread_count_lock); | 
|  | if ((max_rpc_cnt > 0) && | 
|  | (slurmctld_config.server_thread_count >= max_rpc_cnt)) | 
|  | many_rpcs = true; | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  |  | 
|  | if (many_rpcs || (slurm_delta_tv(&start_tv) >= yield_interval)) { | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) { | 
|  | END_TIMER; | 
|  | info("yielding locks after testing %u(%d) jobs, %s", | 
|  | slurmctld_diag_stats.bf_last_depth, | 
|  | job_test_count, TIME_STR); | 
|  | } | 
|  | /* Sync planned nodes before yielding locks */ | 
|  | nodes_planned = true; | 
|  | _handle_planned(nodes_planned); | 
|  | if (_yield_locks(yield_sleep)) { | 
|  | log_flag(BACKFILL, "system state changed, breaking out after testing %u(%d) jobs", | 
|  | slurmctld_diag_stats.bf_last_depth, | 
|  | job_test_count); | 
|  | state_changed_break = true; | 
|  | _set_bf_exit(BF_EXIT_STATE_CHANGED); | 
|  | break; | 
|  | } | 
|  | /* Reset backfill scheduling timers, resume testing */ | 
|  | sched_start = time(NULL); | 
|  | gettimeofday(&start_tv, NULL); | 
|  | job_test_count = 0; | 
|  | test_time_count = 0; | 
|  | nodes_planned = false; | 
|  | START_TIMER; | 
|  | } | 
|  |  | 
|  | if (is_job_array_head && | 
|  | (job_ptr->array_task_id != NO_VAL)) { | 
|  | /* Job array element started in other partition, | 
|  | * reset pointer to "master" job array record */ | 
|  | log_flag(BACKFILL, "%pJ array scheduled during bf yield, try master", | 
|  | job_ptr); | 
|  | job_ptr = find_job_record(job_ptr->array_job_id); | 
|  | if (!job_ptr)	/* All task array elements started */ | 
|  | continue; | 
|  | job_queue_rec->job_ptr = job_ptr; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Establish baseline (worst case) start time for hetjob | 
|  | * Update time once start time estimate established | 
|  | */ | 
|  | _het_job_start_set(job_ptr, (now + YEAR_SECONDS), NO_VAL); | 
|  |  | 
|  | if (job_ptr->het_job_id && | 
|  | (job_ptr->state_reason == WAIT_NO_REASON)) { | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->state_reason = WAIT_RESOURCES; | 
|  | } | 
|  |  | 
|  | if (!_job_runnable_now(job_ptr)) | 
|  | continue; | 
|  | if (!part_ptr) | 
|  | continue; | 
|  | if (!_job_part_valid(job_ptr, part_ptr)) | 
|  | continue;	/* Partition change during lock yield */ | 
|  |  | 
|  | if (job_ptr->resv_list) | 
|  | job_queue_rec_resv_list(job_queue_rec); | 
|  | else | 
|  | job_queue_rec_magnetic_resv(job_queue_rec); | 
|  | resv_ptr = job_ptr->resv_ptr; | 
|  | xfree(job_queue_rec); | 
|  |  | 
|  | job_ptr->bit_flags |= BACKFILL_SCHED; | 
|  | job_ptr->last_sched_eval = now; | 
|  | job_ptr->part_ptr = part_ptr; | 
|  | job_ptr->priority = bf_job_priority; | 
|  | job_ptr->qos_ptr = qos_ptr; | 
|  |  | 
|  | mcs_select = slurm_mcs_get_select(job_ptr); | 
|  | het_job_time = _het_job_start_find(job_ptr); | 
|  | if (het_job_time > (now + backfill_window)) | 
|  | continue; | 
|  |  | 
|  | if (job_ptr->qos_ptr) { | 
|  | assoc_mgr_lock_t locks = { | 
|  | .assoc = READ_LOCK, | 
|  | .qos = READ_LOCK, | 
|  | }; | 
|  |  | 
|  | assoc_mgr_lock(&locks); | 
|  | if (job_ptr->assoc_ptr | 
|  | && (accounting_enforce & ACCOUNTING_ENFORCE_QOS) | 
|  | && ((job_ptr->qos_ptr->id >= g_qos_count) || | 
|  | !job_ptr->assoc_ptr->usage || | 
|  | !job_ptr->assoc_ptr->usage->valid_qos || | 
|  | !bit_test(job_ptr->assoc_ptr->usage->valid_qos, | 
|  | job_ptr->qos_ptr->id)) | 
|  | && !job_ptr->limit_set.qos) { | 
|  | debug("%pJ has invalid QOS", | 
|  | job_ptr); | 
|  | assoc_mgr_unlock(&locks); | 
|  | job_fail_qos(job_ptr, __func__, false); | 
|  | last_job_update = now; | 
|  | continue; | 
|  | } else if (job_ptr->state_reason == FAIL_QOS) { | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->state_reason = WAIT_NO_REASON; | 
|  | last_job_update = now; | 
|  | } | 
|  | assoc_mgr_unlock(&locks); | 
|  | } | 
|  |  | 
|  | assoc_mgr_lock(&qos_read_lock); | 
|  | if (job_ptr->qos_ptr) { | 
|  | qos_flags = job_ptr->qos_ptr->flags; | 
|  | qos_blocked_until = job_ptr->qos_ptr->blocked_until; | 
|  | } else { | 
|  | qos_flags = 0; | 
|  | qos_blocked_until = 0; | 
|  | } | 
|  |  | 
|  | if (job_ptr->part_ptr->qos_ptr) | 
|  | qos_part_blocked_until = | 
|  | job_ptr->part_ptr->qos_ptr->blocked_until; | 
|  | else | 
|  | qos_part_blocked_until = 0; | 
|  |  | 
|  | if (part_policy_valid_qos(job_ptr->part_ptr, job_ptr->qos_ptr, | 
|  | job_ptr->user_id, job_ptr) != | 
|  | SLURM_SUCCESS) { | 
|  | assoc_mgr_unlock(&qos_read_lock); | 
|  | continue; | 
|  | } | 
|  | assoc_mgr_unlock(&qos_read_lock); | 
|  |  | 
|  | if (!assoc_limit_stop && | 
|  | !acct_policy_job_runnable_pre_select(job_ptr, false)) { | 
|  | continue; | 
|  | } | 
|  |  | 
|  | if (!(prio_reserve = acct_policy_get_prio_thresh( | 
|  | job_ptr, false))) | 
|  | prio_reserve = bf_min_prio_reserve; | 
|  |  | 
|  | if (prio_reserve) | 
|  | log_flag(BACKFILL, "%pJ has a prio_reserve of %u", | 
|  | job_ptr, prio_reserve); | 
|  |  | 
|  | job_no_reserve = 0; | 
|  | if (prio_reserve && | 
|  | (job_ptr->priority < prio_reserve)) { | 
|  | job_no_reserve = TEST_NOW_ONLY; | 
|  | } else if (bf_min_age_reserve && job_ptr->details->begin_time) { | 
|  | pend_time = difftime(time(NULL), | 
|  | job_ptr->details->begin_time); | 
|  | if (pend_time < bf_min_age_reserve) | 
|  | job_no_reserve = TEST_NOW_ONLY; | 
|  | } | 
|  |  | 
|  | if (bf_one_resv_per_job && job_ptr->start_time) { | 
|  | log_flag(BACKFILL, "%pJ already added a backfill reservation. Test immediate start only for partition %s", | 
|  | job_ptr, job_ptr->part_ptr->name); | 
|  | job_no_reserve = TEST_NOW_ONLY; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * If we are trying to schedule preferred features don't | 
|  | * reserve. | 
|  | */ | 
|  | if (use_prefer) | 
|  | job_no_reserve = TEST_NOW_ONLY; | 
|  |  | 
|  | /* If partition data is needed and not yet initialized, do so */ | 
|  | if (!job_ptr->part_ptr->bf_data && | 
|  | (bf_job_part_count_reserve || | 
|  | max_backfill_job_per_user_part || | 
|  | max_backfill_job_per_part)) { | 
|  | bf_part_data_t *part_data = | 
|  | xmalloc(sizeof(bf_part_data_t)); | 
|  | part_data->job_usage = | 
|  | xmalloc(sizeof(slurmdb_bf_usage_t)); | 
|  | part_data->resv_usage = | 
|  | xmalloc(sizeof(slurmdb_bf_usage_t)); | 
|  | part_data->user_usage = xhash_init(_bf_map_key_id, | 
|  | _bf_map_free); | 
|  | job_ptr->part_ptr->bf_data = part_data; | 
|  | } | 
|  |  | 
|  | if ((job_no_reserve == 0) && bf_job_part_count_reserve) { | 
|  | if (_check_bf_usage( | 
|  | job_ptr->part_ptr->bf_data->resv_usage, | 
|  | bf_job_part_count_reserve, | 
|  | orig_sched_start)) | 
|  | job_no_reserve = TEST_NOW_ONLY; | 
|  | } | 
|  |  | 
|  | if (job_ptr->preempt_in_progress) | 
|  | continue; 	/* scheduled in another partition */ | 
|  |  | 
|  | orig_start_time = job_ptr->start_time; | 
|  | orig_time_limit = job_ptr->time_limit; | 
|  |  | 
|  | next_task: | 
|  | /* | 
|  | * Restore time_limit for array tasks, just in case it has been | 
|  | * overridden. This is no-op for the rest of cases. | 
|  | */ | 
|  | job_ptr->time_limit = orig_time_limit; | 
|  |  | 
|  | /* | 
|  | * Save the current preemption state. Reset preemption state | 
|  | * in the job_ptr so a job array can preempt multiple jobs. | 
|  | */ | 
|  | if (job_ptr->preempt_in_progress) { | 
|  | tmp_preempt_in_progress = job_ptr->preempt_in_progress; | 
|  | tmp_preempt_start_time = job_ptr->details->preempt_start_time; | 
|  | job_ptr->details->preempt_start_time = 0; | 
|  | job_ptr->preempt_in_progress = false; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Don't count queue records for magnetic reservation against | 
|  | * backfill limits. | 
|  | */ | 
|  | if ((job_ptr->bit_flags & JOB_MAGNETIC) && | 
|  | !bf_allow_magnetic_slot) { | 
|  | already_counted = true; | 
|  | } else { | 
|  | job_test_count++; | 
|  | slurmctld_diag_stats.bf_last_depth++; | 
|  | already_counted = false; | 
|  | } | 
|  |  | 
|  | if (!IS_JOB_PENDING(job_ptr) ||	/* Started in other partition */ | 
|  | (job_ptr->priority == 0))	/* Job has been held */ | 
|  | continue; | 
|  | if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) { | 
|  | if (reject_array_job && | 
|  | (reject_array_job->array_job_id == | 
|  | job_ptr->array_job_id) && | 
|  | (reject_array_part == part_ptr) && | 
|  | (reject_array_qos == qos_ptr) && | 
|  | (reject_array_resv == resv_ptr) && | 
|  | (reject_array_use_prefer == use_prefer)) | 
|  | continue;  /* already rejected array element */ | 
|  |  | 
|  | /* assume reject whole array for now, clear if OK */ | 
|  | reject_array_job = job_ptr; | 
|  | reject_array_part = part_ptr; | 
|  | reject_array_qos = qos_ptr; | 
|  | reject_array_resv = resv_ptr; | 
|  | reject_array_use_prefer = use_prefer; | 
|  |  | 
|  | if (!job_array_start_test(job_ptr)) | 
|  | continue; | 
|  | } | 
|  | /* | 
|  | * If we are on a different task (see goto next_task) set it up | 
|  | * the same way as we did it before. | 
|  | */ | 
|  | job_ptr->part_ptr = part_ptr; | 
|  | job_ptr->qos_ptr = qos_ptr; | 
|  | job_ptr->resv_ptr = resv_ptr; | 
|  | if (resv_ptr) | 
|  | job_ptr->resv_id = resv_ptr->resv_id; | 
|  |  | 
|  | if (job_limits_check(&job_ptr, true) != WAIT_NO_REASON) { | 
|  | /* should never happen */ | 
|  | continue; | 
|  | } | 
|  |  | 
|  | log_flag(BACKFILL, "test for %pJ Prio=%u Partition=%s Reservation=%s", | 
|  | job_ptr, job_ptr->priority, job_ptr->part_ptr->name, | 
|  | job_ptr->resv_ptr ? job_ptr->resv_ptr->name : "NONE"); | 
|  |  | 
|  | /* Test to see if we've exceeded any per user/partition limit */ | 
|  | if (_job_exceeds_max_bf_param(job_ptr, orig_sched_start)) | 
|  | continue; | 
|  |  | 
|  | if (((part_ptr->state_up & PARTITION_SCHED) == 0) || | 
|  | (part_ptr->node_bitmap == NULL)) { | 
|  | log_flag(BACKFILL, "partition %s not usable", | 
|  | job_ptr->part_ptr->name); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | if (!bf_licenses && | 
|  | license_job_test(job_ptr, time(NULL), true)) { | 
|  | log_flag(BACKFILL, "%pJ not runable now due to licenses", | 
|  | job_ptr); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | if (!job_independent(job_ptr)) { | 
|  | log_flag(BACKFILL, "%pJ not runable now", | 
|  | job_ptr); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | /* Determine minimum and maximum node counts */ | 
|  | error_code = get_node_cnts(job_ptr, qos_flags, part_ptr, | 
|  | &min_nodes, &req_nodes, &max_nodes); | 
|  |  | 
|  | if (error_code == ESLURM_ACCOUNTING_POLICY) { | 
|  | log_flag(BACKFILL, "%pJ acct policy node limit", | 
|  | job_ptr); | 
|  | continue; | 
|  | } else if (error_code == | 
|  | ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) { | 
|  | log_flag(BACKFILL, "%pJ node count too high", | 
|  | job_ptr); | 
|  | continue; | 
|  | } else if (error_code != SLURM_SUCCESS) { | 
|  | log_flag(BACKFILL, "error setting nodes for %pJ: %s", | 
|  | job_ptr, slurm_strerror(error_code)); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | /* test of deadline */ | 
|  | now = time(NULL); | 
|  | deadline_time_limit = 0; | 
|  | if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) { | 
|  | if (!deadline_ok(job_ptr, __func__)) | 
|  | continue; | 
|  |  | 
|  | deadline_time_limit = (job_ptr->deadline - now) / 60; | 
|  | } | 
|  |  | 
|  | /* Determine job's expected completion time */ | 
|  | if (part_ptr->max_time == INFINITE) | 
|  | part_time_limit = YEAR_MINUTES; | 
|  | else | 
|  | part_time_limit = part_ptr->max_time; | 
|  | if ((job_ptr->time_limit == NO_VAL) || | 
|  | (job_ptr->time_limit == INFINITE)) { | 
|  | time_limit = part_time_limit; | 
|  | job_ptr->limit_set.time = 1; | 
|  | } else { | 
|  | if (part_ptr->max_time == INFINITE) | 
|  | time_limit = job_ptr->time_limit; | 
|  | else | 
|  | time_limit = MIN(job_ptr->time_limit, | 
|  | part_time_limit); | 
|  | } | 
|  | if (deadline_time_limit) | 
|  | comp_time_limit = MIN(time_limit, deadline_time_limit); | 
|  | else if (job_ptr->time_min && | 
|  | (job_ptr->time_min < time_limit)) { | 
|  | comp_time_limit = time_limit; | 
|  | time_limit = job_ptr->time_limit = job_ptr->time_min; | 
|  | } else | 
|  | comp_time_limit = time_limit; | 
|  | if ((qos_flags & QOS_FLAG_NO_RESERVE) && | 
|  | slurm_conf.preempt_mode) | 
|  | time_limit = job_ptr->time_limit = 1; | 
|  |  | 
|  | later_start = now; | 
|  | used_slots = 0; | 
|  |  | 
|  | if (assoc_limit_stop) { | 
|  | if (qos_blocked_until > later_start) { | 
|  | later_start = qos_blocked_until; | 
|  | log_flag(BACKFILL, "QOS blocked_until move start_res to %ld", | 
|  | later_start); | 
|  | } | 
|  | if (qos_part_blocked_until > later_start) { | 
|  | later_start = qos_part_blocked_until; | 
|  | log_flag(BACKFILL, "Part QOS blocked_until move start_res to %ld", | 
|  | later_start); | 
|  | } | 
|  | } | 
|  |  | 
|  | TRY_LATER: | 
|  | if (slurmctld_config.shutdown_time || | 
|  | (difftime(time(NULL), orig_sched_start) >= | 
|  | bf_max_time)) { | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | _set_bf_exit(BF_EXIT_TIMEOUT); | 
|  | break; | 
|  | } | 
|  | test_time_count++; | 
|  |  | 
|  | many_rpcs = false; | 
|  | slurm_mutex_lock(&slurmctld_config.thread_count_lock); | 
|  | if ((max_rpc_cnt > 0) && | 
|  | (slurmctld_config.server_thread_count >= max_rpc_cnt)) | 
|  | many_rpcs = true; | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  |  | 
|  | if (many_rpcs || (slurm_delta_tv(&start_tv) >= yield_interval)) { | 
|  | uint32_t save_time_limit = job_ptr->time_limit; | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) { | 
|  | END_TIMER; | 
|  | log_flag(BACKFILL, "yielding locks after testing %u(%d) jobs tested, %u time slots, %s", | 
|  | slurmctld_diag_stats.bf_last_depth, | 
|  | job_test_count, test_time_count, | 
|  | TIME_STR); | 
|  | } | 
|  | /* Sync planned nodes before yielding locks */ | 
|  | nodes_planned = true; | 
|  | _handle_planned(nodes_planned); | 
|  | if (_yield_locks(yield_sleep)) { | 
|  | log_flag(BACKFILL, "system state changed, breaking out after testing %u(%d) jobs", | 
|  | slurmctld_diag_stats.bf_last_depth, | 
|  | job_test_count); | 
|  | state_changed_break = true; | 
|  | _set_bf_exit(BF_EXIT_STATE_CHANGED); | 
|  | break; | 
|  | } | 
|  |  | 
|  | /* Reset backfill scheduling timers, resume testing */ | 
|  | sched_start = time(NULL); | 
|  | gettimeofday(&start_tv, NULL); | 
|  | job_test_count = 1; | 
|  | test_time_count = 0; | 
|  | nodes_planned = false; | 
|  | START_TIMER; | 
|  |  | 
|  | if (is_job_array_head && | 
|  | (job_ptr->array_task_id != NO_VAL)) { | 
|  | /* | 
|  | * Job array element started in other partition, | 
|  | * reset pointer to "master" job array record | 
|  | */ | 
|  | log_flag(BACKFILL, "%pJ array scheduled during bf yield, try master", | 
|  | job_ptr); | 
|  | job_ptr = find_job_record( | 
|  | job_ptr->array_job_id); | 
|  | if (!job_ptr) | 
|  | /* All task array elements started */ | 
|  | continue; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * With bf_continue configured, the original job could | 
|  | * have been scheduled. Revalidate the job record here. | 
|  | */ | 
|  | if (!_job_runnable_now(job_ptr)) | 
|  | continue; | 
|  |  | 
|  | /* | 
|  | * If the job wasn't scheduled while we didn't have the | 
|  | * locks restore the pointers we were last on just in | 
|  | * case the main scheduler changed them. | 
|  | */ | 
|  | job_ptr->resv_ptr = resv_ptr; | 
|  | if (resv_ptr) | 
|  | job_ptr->resv_id = resv_ptr->resv_id; | 
|  | if (!_job_part_valid(job_ptr, part_ptr)) | 
|  | continue;	/* Partition change during lock yield */ | 
|  | if (!job_independent(job_ptr)) { | 
|  | log_flag(BACKFILL, "%pJ no longer independent after bf yield", | 
|  | job_ptr); | 
|  | /* No longer independent | 
|  | * (e.g. another singleton started) */ | 
|  | continue; | 
|  | } | 
|  |  | 
|  | job_ptr->time_limit = save_time_limit; | 
|  | job_ptr->part_ptr = part_ptr; | 
|  | job_ptr->qos_ptr = qos_ptr; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * feature_list_use is a temporary variable and should | 
|  | * be reset before each use. | 
|  | * Do this after bf_yield to ensure the pointers are valid even | 
|  | * if the job was updated during the bf_yield. | 
|  | */ | 
|  | if (use_prefer) { | 
|  | /* | 
|  | * Prefer was removed from the job since the | 
|  | * job_queue_rec was created (during bf_yield). | 
|  | * This is a separate queue record for prefer. Skip it. | 
|  | */ | 
|  | if (!job_ptr->details->prefer) | 
|  | continue; | 
|  | job_ptr->details->features_use = | 
|  | job_ptr->details->prefer; | 
|  | job_ptr->details->feature_list_use = | 
|  | job_ptr->details->prefer_list; | 
|  | } else { | 
|  | job_ptr->details->features_use = | 
|  | job_ptr->details->features; | 
|  | job_ptr->details->feature_list_use = | 
|  | job_ptr->details->feature_list; | 
|  | } | 
|  |  | 
|  | FREE_NULL_BITMAP(avail_bitmap); | 
|  | reservation_delete_resv_exc_parts(&resv_exc); | 
|  | start_res = MAX(later_start, het_job_time); | 
|  | resv_end = 0; | 
|  | later_start = 0; | 
|  | licenses_unavail = false; | 
|  | /* | 
|  | * Restore the original time limit before checking against | 
|  | * reservations, and revert it after. | 
|  | */ | 
|  | if ((qos_flags & QOS_FLAG_NO_RESERVE) && | 
|  | slurm_conf.preempt_mode) | 
|  | job_ptr->time_limit = orig_time_limit; | 
|  | /* Determine impact of any advance reservations */ | 
|  | j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, | 
|  | &resv_exc, &resv_overlap, false); | 
|  | if (j != SLURM_SUCCESS) { | 
|  | log_flag(BACKFILL, "%pJ reservation defer", | 
|  | job_ptr); | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | continue; | 
|  | } else if ((qos_flags & QOS_FLAG_NO_RESERVE) && | 
|  | slurm_conf.preempt_mode) | 
|  | job_ptr->time_limit = time_limit; | 
|  |  | 
|  | if (window_end < start_res) { | 
|  | log_flag(BACKFILL, "%pJ start_res after current backfill window", | 
|  | job_ptr); | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | if (start_res > now) | 
|  | end_time = (time_limit * 60) + start_res; | 
|  | else | 
|  | end_time = (time_limit * 60) + now; | 
|  | if (end_time < now)	/* Overflow 32-bits */ | 
|  | end_time = INFINITE; | 
|  | if (resv_overlap) | 
|  | resv_end = find_resv_end(start_res, | 
|  | backfill_resolution); | 
|  | /* Identify usable nodes for this job */ | 
|  | bit_and(avail_bitmap, part_ptr->node_bitmap); | 
|  | bit_and(avail_bitmap, up_node_bitmap); | 
|  | bit_and_not(avail_bitmap, bf_ignore_node_bitmap); | 
|  |  | 
|  | if (job_ptr->details->exc_node_bitmap) { | 
|  | bit_and_not(avail_bitmap, | 
|  | job_ptr->details->exc_node_bitmap); | 
|  | } | 
|  |  | 
|  | if (_filter_exclusive_user_mcs_nodes(job_ptr, mcs_select, | 
|  | min_nodes, nodes_used_list, | 
|  | start_res, | 
|  | &later_filter_start, | 
|  | avail_bitmap)) { | 
|  | /* start_res delayed must check resv times again */ | 
|  | later_start = later_filter_start; | 
|  | SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, | 
|  | later_start, orig_time_limit, | 
|  | orig_start_time); | 
|  | } | 
|  |  | 
|  | if (IS_JOB_WHOLE_TOPO(job_ptr)) { | 
|  | if (excluded_topo_bitmap) | 
|  | bit_clear_all(excluded_topo_bitmap); | 
|  | else | 
|  | excluded_topo_bitmap = | 
|  | bit_alloc(node_record_count); | 
|  | } | 
|  |  | 
|  | COPY_BITMAP(tmp_bitmap, avail_bitmap); | 
|  | for (j = 0; ; ) { | 
|  | if ((node_space[j].end_time > start_res) && | 
|  | node_space[j].next && (later_start == 0)) { | 
|  | int tmp = node_space[j].next; | 
|  |  | 
|  | if (job_ptr->license_list && | 
|  | !bf_licenses_equal(node_space[tmp].licenses, | 
|  | node_space[j] | 
|  | .licenses)) { | 
|  | later_start = node_space[j].end_time; | 
|  | goto later_start_set; | 
|  | } | 
|  |  | 
|  | COPY_BITMAP(next_bitmap, tmp_bitmap); | 
|  | COPY_BITMAP(current_bitmap, avail_bitmap); | 
|  | bit_and(next_bitmap, | 
|  | node_space[tmp].avail_bitmap); | 
|  | bit_and(current_bitmap, | 
|  | node_space[j].avail_bitmap); | 
|  | /* | 
|  | * Normally later_start is set at the end of the | 
|  | * first backfill reservation when the select | 
|  | * plugin predicts start time after later_start. | 
|  | * Then it goes to TRY_LATER and tries again on | 
|  | * a new set of nodes to check if the job can | 
|  | * start earlier. But if the next set of nodes | 
|  | * is a subset of the currently tested ones then | 
|  | * calling _try_sched (expensive function) would | 
|  | * be useless and would impact performance. | 
|  | */ | 
|  | if (!bit_super_set(next_bitmap, current_bitmap)) | 
|  | later_start = node_space[j].end_time; | 
|  | } | 
|  | later_start_set: | 
|  | if (node_space[j].end_time <= start_res) | 
|  | ; | 
|  | else if (node_space[j].begin_time <= end_time) { | 
|  | bit_and(avail_bitmap, | 
|  | node_space[j].avail_bitmap); | 
|  | bf_hres_filter(job_ptr, avail_bitmap, | 
|  | node_space[j].licenses); | 
|  | if (!bf_licenses_avail(node_space[j].licenses, | 
|  | job_ptr, NULL)) { | 
|  | licenses_unavail = true; | 
|  | later_start = node_space[j].end_time; | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->state_reason = WAIT_LICENSES; | 
|  | break; | 
|  | } | 
|  | if (IS_JOB_WHOLE_TOPO(job_ptr)) { | 
|  | bit_or_not(excluded_topo_bitmap, | 
|  | node_space[j].avail_bitmap); | 
|  | } | 
|  | } else { | 
|  | int next = node_space[j].next; | 
|  | if ((later_start == 0) && next && | 
|  | node_space[next].next) | 
|  | later_start = node_space[next].end_time; | 
|  | break; | 
|  | } | 
|  | if ((j = node_space[j].next) == 0) | 
|  | break; | 
|  | } | 
|  | if (resv_end && (++resv_end < window_end) && | 
|  | ((later_start == 0) || (resv_end < later_start))) { | 
|  | later_start = resv_end; | 
|  | } | 
|  |  | 
|  | if (IS_JOB_WHOLE_TOPO(job_ptr)) { | 
|  | bit_and(excluded_topo_bitmap, | 
|  | node_space[0].avail_bitmap); | 
|  | topology_g_whole_topo(excluded_topo_bitmap, | 
|  | job_ptr->part_ptr->topology_idx); | 
|  | bit_and_not(avail_bitmap, excluded_topo_bitmap); | 
|  | } | 
|  |  | 
|  | /* Test if licenses are unavailable OR | 
|  | *	required nodes missing OR | 
|  | *	nodes lack features OR | 
|  | *	no change since previously tested nodes (only changes | 
|  | *	in other partition nodes) */ | 
|  | if (licenses_unavail || | 
|  | ((job_ptr->details->req_node_bitmap) && | 
|  | (!bit_super_set(job_ptr->details->req_node_bitmap, | 
|  | avail_bitmap))) || | 
|  | (job_req_node_filter(job_ptr, avail_bitmap, true))) { | 
|  | SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, | 
|  | later_start, orig_time_limit, | 
|  | orig_start_time); | 
|  | } | 
|  |  | 
|  | if (!later_start && later_filter_start) | 
|  | later_start = later_filter_start; /* filter out fewer */ | 
|  |  | 
|  | /* Test if insufficient nodes remain */ | 
|  | if (bit_set_count(avail_bitmap) < min_nodes) { | 
|  | SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, | 
|  | later_start, orig_time_limit, | 
|  | orig_start_time); | 
|  | } | 
|  |  | 
|  | /* Identify nodes which are definitely off limits */ | 
|  | FREE_NULL_BITMAP(resv_bitmap); | 
|  | resv_bitmap = bit_copy(avail_bitmap); | 
|  | bit_not(resv_bitmap); | 
|  |  | 
|  | /* this is the time consuming operation */ | 
|  | debug2("entering _try_sched for %pJ.", | 
|  | job_ptr); | 
|  |  | 
|  | if (!already_counted) { | 
|  | slurmctld_diag_stats.bf_last_depth_try++; | 
|  | job_test_cnt++; | 
|  | already_counted = true; | 
|  | } | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL_MAP) | 
|  | _dump_job_test(job_ptr, avail_bitmap, start_res, | 
|  | later_start); | 
|  | test_fini = -1; | 
|  | build_active_feature_bitmap(job_ptr, avail_bitmap, | 
|  | &active_bitmap); | 
|  | job_ptr->bit_flags |= BACKFILL_TEST; | 
|  | job_ptr->bit_flags |= job_no_reserve;	/* 0 or TEST_NOW_ONLY */ | 
|  |  | 
|  | if (active_bitmap) { | 
|  | will_run_data.start = start_res; | 
|  | will_run_data.end = later_start; | 
|  | j = _try_sched(job_ptr, &active_bitmap, min_nodes, | 
|  | max_nodes, req_nodes, &resv_exc, | 
|  | &will_run_data); | 
|  | if (j == SLURM_SUCCESS) { | 
|  | FREE_NULL_BITMAP(avail_bitmap); | 
|  | avail_bitmap = active_bitmap; | 
|  | active_bitmap = NULL; | 
|  | test_fini = 1; | 
|  | } else { | 
|  | if (node_features_g_overlap(active_bitmap)) | 
|  | get_boot_time = true; | 
|  | FREE_NULL_BITMAP(active_bitmap); | 
|  | save_share_res  = job_ptr->details->share_res; | 
|  | save_whole_node = job_ptr->details->whole_node; | 
|  | job_ptr->details->share_res = 0; | 
|  | job_ptr->details->whole_node |= | 
|  | WHOLE_NODE_REQUIRED; | 
|  | if (!save_whole_node) | 
|  | job_ptr->bit_flags |= BF_WHOLE_NODE_TEST; | 
|  | test_fini = 0; | 
|  | } | 
|  | } | 
|  | boot_time = 0; | 
|  | if (test_fini == 0) { | 
|  | /* Unable to start job using currently active features, | 
|  | * need to try using features which can be made | 
|  | * available after node reboot */ | 
|  | resv_exc_t tmp_resv_exc = { 0 }; | 
|  | bitstr_t *tmp_node_bitmap = NULL; | 
|  | debug2("entering _try_sched for %pJ. Need to use features which can be made available after node reboot", | 
|  | job_ptr); | 
|  | /* | 
|  | * Restore the original time limit before checking against | 
|  | * reservations, and revert it after. | 
|  | */ | 
|  | if ((qos_flags & QOS_FLAG_NO_RESERVE) && | 
|  | slurm_conf.preempt_mode) | 
|  | job_ptr->time_limit = orig_time_limit; | 
|  | /* Determine impact of any advance reservations */ | 
|  | resv_end = 0; | 
|  | j = job_test_resv(job_ptr, &start_res, false, | 
|  | &tmp_node_bitmap, &tmp_resv_exc, | 
|  | &resv_overlap, true); | 
|  | if ((qos_flags & QOS_FLAG_NO_RESERVE) && | 
|  | slurm_conf.preempt_mode) | 
|  | job_ptr->time_limit = time_limit; | 
|  | if (resv_overlap) | 
|  | resv_end = find_resv_end(start_res, | 
|  | backfill_resolution); | 
|  |  | 
|  | if (resv_end && (++resv_end < window_end) && | 
|  | ((later_start == 0) || (resv_end < later_start))) { | 
|  | later_start = resv_end; | 
|  | } | 
|  | if (j == SLURM_SUCCESS) { | 
|  | reservation_delete_resv_exc_parts(&resv_exc); | 
|  | memcpy(&resv_exc, &tmp_resv_exc, | 
|  | sizeof(resv_exc)); | 
|  | bit_and(avail_bitmap, tmp_node_bitmap); | 
|  | FREE_NULL_BITMAP(tmp_node_bitmap); | 
|  | } | 
|  | if (get_boot_time) | 
|  | boot_time = node_features_g_boot_time(); | 
|  | orig_end_time = end_time; | 
|  | end_time += boot_time; | 
|  |  | 
|  | for (j = 0; ; ) { | 
|  | if (node_space[j].end_time <= start_res) | 
|  | ; | 
|  | else if (node_space[j].begin_time <= end_time) { | 
|  | if (node_space[j].begin_time > | 
|  | orig_end_time) | 
|  | bit_and(avail_bitmap, | 
|  | node_space[j].avail_bitmap); | 
|  | } else | 
|  | break; | 
|  | if ((j = node_space[j].next) == 0) | 
|  | break; | 
|  | } | 
|  | } | 
|  | if (test_fini != 1) { | 
|  | /* Either active_bitmap was NULL or not usable by the | 
|  | * job. Test using avail_bitmap instead */ | 
|  | will_run_data.start = start_res; | 
|  | will_run_data.end = later_start; | 
|  | j = _try_sched(job_ptr, &avail_bitmap, min_nodes, | 
|  | max_nodes, req_nodes, &resv_exc, | 
|  | &will_run_data); | 
|  | if (test_fini == 0) { | 
|  | job_ptr->details->share_res = save_share_res; | 
|  | job_ptr->details->whole_node = save_whole_node; | 
|  | } | 
|  | } | 
|  | job_ptr->bit_flags &= ~BACKFILL_TEST; | 
|  | job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST; | 
|  | job_ptr->bit_flags &= ~TEST_NOW_ONLY; | 
|  |  | 
|  | now = time(NULL); | 
|  | if (j != SLURM_SUCCESS) { | 
|  | SKIP_SCHED_OR_TRY_LATER(job_ptr, job_no_reserve, | 
|  | later_start, orig_time_limit, | 
|  | orig_start_time); | 
|  | } | 
|  |  | 
|  | if (start_res > job_ptr->start_time) { | 
|  | job_ptr->start_time = start_res; | 
|  | last_job_update = now; | 
|  | } | 
|  |  | 
|  | if (job_ptr->start_time > now) { | 
|  | _set_slot_time(job_ptr, time_limit, boot_time, | 
|  | &start_time, &end_reserve); | 
|  |  | 
|  | if (_test_resv_overlap(node_space, avail_bitmap, | 
|  | job_ptr, start_time, | 
|  | end_reserve)) { | 
|  | later_start = job_ptr->start_time; | 
|  |  | 
|  | if (start_res == job_ptr->start_time) { | 
|  | later_start += backfill_resolution; | 
|  | log_flag(BACKFILL, "%pJ inf loop detect", job_ptr); | 
|  | } | 
|  |  | 
|  | job_ptr->start_time = 0; | 
|  | log_flag(BACKFILL, "%pJ overlaps with existing reservation start_time=%u end_reserve=%u boot_time=%u later_start %ld", | 
|  | job_ptr, start_time, end_reserve, | 
|  | boot_time, later_start); | 
|  | goto TRY_LATER; | 
|  | } | 
|  | overlap_tested = true; | 
|  | } else | 
|  | overlap_tested = false; | 
|  |  | 
|  | if (!job_no_reserve && bf_topopt_enable) { | 
|  | if (oracle(job_ptr, avail_bitmap, later_start, | 
|  | &time_limit, &boot_time, node_space)) { | 
|  | log_flag(BACKFILL, "%pJ used_slots:%u later_start %ld", | 
|  | job_ptr, used_slots, later_start); | 
|  | goto TRY_LATER; | 
|  | } | 
|  | _set_slot_time(job_ptr, time_limit, boot_time, | 
|  | &start_time, &end_reserve); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * avail_bitmap at this point contains a bitmap of nodes | 
|  | * selected for this job to be allocated | 
|  | */ | 
|  | if ((job_ptr->start_time <= now) && | 
|  | (bit_overlap_any(avail_bitmap, cg_node_bitmap) || | 
|  | bit_overlap_any(avail_bitmap, rs_node_bitmap))) { | 
|  | /* Need to wait for in-progress completion/epilog */ | 
|  | job_ptr->start_time = now + 1; | 
|  | later_start = 0; | 
|  | } | 
|  | if ((job_ptr->start_time <= now) && | 
|  | ((bb = bb_g_job_test_stage_in(job_ptr, true)) != 1)) { | 
|  | if (job_ptr->state_reason != WAIT_NO_REASON) { | 
|  | /* | 
|  | * Don't change state_reason if it was already | 
|  | * set. | 
|  | */ | 
|  | ; | 
|  | } else if (bb == -1) { | 
|  | /* | 
|  | * Set reason now instead of in if (bb == -1) | 
|  | * below for the sched_debug3() | 
|  | */ | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->state_reason = | 
|  | WAIT_BURST_BUFFER_RESOURCE; | 
|  | } else {	/* bb == 0 */ | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->state_reason=WAIT_BURST_BUFFER_STAGING; | 
|  | /* | 
|  | * Cannot start now, set start time in the | 
|  | * future. | 
|  | */ | 
|  | job_ptr->start_time = now + 1; | 
|  | } | 
|  | sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u.", | 
|  | job_ptr, | 
|  | job_state_string(job_ptr->job_state), | 
|  | job_state_reason_string( | 
|  | job_ptr->state_reason), | 
|  | job_ptr->priority); | 
|  | last_job_update = now; | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | later_start = 0; | 
|  | if (bb == -1) { | 
|  | /* | 
|  | * bb == -1 means that burst buffer stage-in | 
|  | * hasn't started yet. Set an estimated start | 
|  | * time so stage-in can start. | 
|  | * | 
|  | * Clear reject_array_job; otherwise we'll skip | 
|  | * looking at other jobs in this array (if this | 
|  | * is a job array), therefore we won't set | 
|  | * estimated start times, therefore we won't be | 
|  | * able to start stage-in for any other jobs in | 
|  | * this array. | 
|  | */ | 
|  | job_ptr->start_time = | 
|  | bb_g_job_get_est_start(job_ptr); | 
|  | reject_array_job = NULL; | 
|  | reject_array_part = NULL; | 
|  | reject_array_qos = NULL; | 
|  | reject_array_resv = NULL; | 
|  | continue; | 
|  | } | 
|  | } else if ((job_ptr->het_job_id == 0) && | 
|  | (job_ptr->start_time <= now)) { /* Can start now */ | 
|  | uint32_t save_time_limit = job_ptr->time_limit; | 
|  | uint32_t hard_limit; | 
|  | bool reset_time = false; | 
|  | int rc; | 
|  |  | 
|  | /* get fed job lock from origin cluster */ | 
|  | if (fed_mgr_job_lock(job_ptr)) { | 
|  | log_flag(BACKFILL, "%pJ can't get fed job lock from origin cluster to backfill job", | 
|  | job_ptr); | 
|  | rc = ESLURM_FED_JOB_LOCK; | 
|  | goto skip_start; | 
|  | } | 
|  |  | 
|  | rc = _start_job(job_ptr, resv_bitmap); | 
|  |  | 
|  | if (rc == SLURM_SUCCESS) { | 
|  | /* | 
|  | * If the following fails because of network | 
|  | * connectivity, the origin cluster should ask | 
|  | * when it comes back up if the cluster_lock | 
|  | * cluster actually started the job | 
|  | */ | 
|  | fed_mgr_job_start(job_ptr, job_ptr->start_time); | 
|  | } else { | 
|  | fed_mgr_job_unlock(job_ptr); | 
|  | } | 
|  |  | 
|  | skip_start: | 
|  | if (qos_flags & QOS_FLAG_NO_RESERVE) { | 
|  | if (orig_time_limit == NO_VAL) { | 
|  | acct_policy_alter_job( | 
|  | job_ptr, comp_time_limit); | 
|  | job_ptr->time_limit = comp_time_limit; | 
|  | job_ptr->limit_set.time = 1; | 
|  | } else { | 
|  | acct_policy_alter_job( | 
|  | job_ptr, orig_time_limit); | 
|  | _set_job_time_limit(job_ptr, | 
|  | orig_time_limit); | 
|  | } | 
|  | } else if ((rc == SLURM_SUCCESS) && soft_time_limit && | 
|  | job_ptr->time_min) { | 
|  | acct_policy_alter_job(job_ptr, orig_time_limit); | 
|  | job_ptr->time_limit = orig_time_limit; | 
|  | } else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { | 
|  | /* Set time limit as high as possible */ | 
|  | acct_policy_alter_job(job_ptr, comp_time_limit); | 
|  | job_ptr->time_limit = comp_time_limit; | 
|  | reset_time = true; | 
|  | } else if (orig_time_limit == NO_VAL) { | 
|  | acct_policy_alter_job(job_ptr, comp_time_limit); | 
|  | job_ptr->time_limit = comp_time_limit; | 
|  | job_ptr->limit_set.time = 1; | 
|  | } else if (deadline_time_limit && | 
|  | (rc == SLURM_SUCCESS)) { | 
|  | acct_policy_alter_job(job_ptr, comp_time_limit); | 
|  | job_ptr->time_limit = comp_time_limit; | 
|  | reset_time = true; | 
|  | } else { | 
|  | acct_policy_alter_job(job_ptr, orig_time_limit); | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | } | 
|  | /* | 
|  | * Only set end_time if start_time is set, | 
|  | * or else end_time will be small (ie. 1969). | 
|  | */ | 
|  | if (IS_JOB_FINISHED(job_ptr)) { | 
|  | /* Zero size or killed on startup */ | 
|  | } else if (job_ptr->start_time) { | 
|  | node_space_handler_t ns_handler = { | 
|  | .node_space = node_space, | 
|  | .node_space_recs = &node_space_recs, | 
|  | }; | 
|  |  | 
|  | if (job_ptr->time_limit == INFINITE) | 
|  | hard_limit = YEAR_SECONDS; | 
|  | else | 
|  | hard_limit = job_ptr->time_limit * 60; | 
|  | job_ptr->end_time = job_ptr->start_time + | 
|  | hard_limit; | 
|  | /* | 
|  | * Only set if start_time. end_time must be set | 
|  | * beforehand for _reset_job_time_limit. | 
|  | */ | 
|  | if (reset_time) { | 
|  | _reset_job_time_limit(job_ptr, now, | 
|  | node_space); | 
|  | time_limit = job_ptr->time_limit; | 
|  | } | 
|  |  | 
|  | _bf_reserve_running(job_ptr, &ns_handler); | 
|  | } else if (rc == SLURM_SUCCESS) { | 
|  | error("start_time of 0 on successful backfill. This shouldn't happen. :)"); | 
|  | } | 
|  |  | 
|  | if ((rc == ESLURM_RESERVATION_BUSY) || | 
|  | (rc == ESLURM_ACCOUNTING_POLICY && | 
|  | !assoc_limit_stop) || | 
|  | ((rc == ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && | 
|  | job_ptr->extra_constraints)) { | 
|  | /* Unknown future start time, just skip job */ | 
|  | job_ptr->start_time = orig_start_time; | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | continue; | 
|  | } else if (rc == ESLURM_ACCOUNTING_POLICY) { | 
|  | /* Unknown future start time. Determining | 
|  | * when it can start with certainty requires | 
|  | * knowing when every running and pending job starts | 
|  | * and ends and tracking all of their resources. | 
|  | * That requires very high overhead, which we | 
|  | * don't want to add. Estimate that it can start | 
|  | * after the next job ends (or in 500 seconds if | 
|  | * we don't have that information yet). */ | 
|  | if (later_start) | 
|  | job_ptr->start_time = later_start; | 
|  | else | 
|  | job_ptr->start_time = now + 500; | 
|  | if (job_ptr->qos_blocking_ptr && | 
|  | job_state_reason_check( | 
|  | job_ptr->state_reason, | 
|  | JSR_QOS_GRP)) { | 
|  | assoc_mgr_lock(&qos_read_lock); | 
|  | qos_ptr = job_ptr->qos_blocking_ptr; | 
|  | if (qos_ptr->blocked_until < | 
|  | job_ptr->start_time) { | 
|  | qos_ptr->blocked_until = | 
|  | job_ptr->start_time; | 
|  | } | 
|  | assoc_mgr_unlock(&qos_read_lock); | 
|  | } | 
|  | } else if (rc != SLURM_SUCCESS) { | 
|  | log_flag(BACKFILL, "planned start of %pJ failed: %s", | 
|  | job_ptr, slurm_strerror(rc)); | 
|  | /* Drop through and reserve these resources. | 
|  | * Likely due to state changes during sleep. | 
|  | * Make best-effort based upon original state */ | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | later_start = 0; | 
|  | } else { | 
|  | /* Started this job, move to next one */ | 
|  |  | 
|  | /* Clear assumed rejected array status */ | 
|  | reject_array_job = NULL; | 
|  | reject_array_part = NULL; | 
|  | reject_array_qos = NULL; | 
|  | reject_array_resv = NULL; | 
|  |  | 
|  | /* Update the database if job time limit | 
|  | * changed and move to next job */ | 
|  | if (save_time_limit != job_ptr->time_limit) | 
|  | jobacct_storage_g_job_start( | 
|  | acct_db_conn, job_ptr); | 
|  | job_start_cnt++; | 
|  | if (max_backfill_jobs_start && | 
|  | (job_start_cnt >= max_backfill_jobs_start)){ | 
|  | log_flag(BACKFILL, "bf_max_job_start limit of %d reached", | 
|  | max_backfill_jobs_start); | 
|  | _set_bf_exit(BF_EXIT_MAX_JOB_START); | 
|  | break; | 
|  | } | 
|  | if (job_test_cnt >= max_backfill_job_cnt) { | 
|  | log_flag(BACKFILL, "bf_max_job_test: limit of %d reached", | 
|  | max_backfill_job_cnt); | 
|  | _set_bf_exit(BF_EXIT_MAX_JOB_TEST); | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (_mark_nodes_usage(job_ptr, nodes_used)) | 
|  | list_sort(nodes_used_list, | 
|  | _cmp_last_job_end); | 
|  |  | 
|  | if (is_job_array_head && | 
|  | (job_ptr->array_task_id != NO_VAL)) { | 
|  | /* Try starting next task of job array */ | 
|  | job_record_t *tmp = job_ptr; | 
|  | job_ptr = find_job_record(job_ptr-> | 
|  | array_job_id); | 
|  | if (job_ptr && (job_ptr != tmp) && | 
|  | IS_JOB_PENDING(job_ptr) && | 
|  | (bb_g_job_test_stage_in( | 
|  | job_ptr, false) == 1)) | 
|  | goto next_task; | 
|  | } | 
|  | continue; | 
|  | } | 
|  | } else if (job_ptr->het_job_id != 0) { | 
|  | uint32_t max_time_limit; | 
|  | max_time_limit =_get_job_max_tl(job_ptr, now, | 
|  | node_space); | 
|  | comp_time_limit = MIN(comp_time_limit, max_time_limit); | 
|  | job_ptr->node_cnt_wag = | 
|  | MAX(bit_set_count(avail_bitmap), 1); | 
|  | _het_job_start_set(job_ptr, job_ptr->start_time, | 
|  | comp_time_limit); | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | if (bf_hetjob_immediate && | 
|  | (!max_backfill_jobs_start || | 
|  | (job_start_cnt < max_backfill_jobs_start))) | 
|  | _het_job_start_test(node_space, | 
|  | job_ptr->het_job_id, | 
|  | nodes_used, | 
|  | nodes_used_list); | 
|  | } | 
|  |  | 
|  | if ((job_ptr->start_time > now) && (job_no_reserve != 0)) { | 
|  | if ((orig_start_time != 0) && | 
|  | (orig_start_time < job_ptr->start_time)) { | 
|  | /* Can start earlier in different partition */ | 
|  | job_ptr->start_time = orig_start_time; | 
|  | } else { | 
|  | log_flag(BACKFILL, "%pJ StartTime set but no backfill reservation created.", | 
|  | job_ptr); | 
|  | } | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | if (later_start && (job_ptr->start_time > later_start)) { | 
|  | /* Try later when some nodes currently reserved for | 
|  | * pending jobs are free */ | 
|  | log_flag(BACKFILL, "Try later %pJ later_start %ld", | 
|  | job_ptr, later_start); | 
|  | job_ptr->start_time = 0; | 
|  | goto TRY_LATER; | 
|  | } | 
|  |  | 
|  | if (!overlap_tested) { | 
|  | /* Job start deferred from now*/ | 
|  | _set_slot_time(job_ptr, time_limit, boot_time, | 
|  | &start_time, &end_reserve); | 
|  | } | 
|  |  | 
|  | if (job_ptr->start_time > (sched_start + backfill_window)) { | 
|  | /* Starts too far in the future to worry about */ | 
|  | end_reserve = job_ptr->start_time + boot_time + | 
|  | (time_limit * 60); | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) | 
|  | _dump_job_sched(job_ptr, end_reserve, | 
|  | avail_bitmap); | 
|  | if ((orig_start_time != 0) && | 
|  | (orig_start_time < job_ptr->start_time)) { | 
|  | /* Can start earlier in different partition */ | 
|  | job_ptr->start_time = orig_start_time; | 
|  | } else { | 
|  | log_flag(BACKFILL, "%pJ StartTime set to time after current backfill window. No reservation created", | 
|  | job_ptr); | 
|  | } | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | if (!overlap_tested && | 
|  | (job_ptr->state_reason != WAIT_BURST_BUFFER_RESOURCE) && | 
|  | (job_ptr->state_reason != WAIT_BURST_BUFFER_STAGING) && | 
|  | _test_resv_overlap(node_space, avail_bitmap, job_ptr, | 
|  | start_time, end_reserve)) { | 
|  | /* This job overlaps with an existing reservation for | 
|  | * job to be backfill scheduled, which the sched | 
|  | * plugin does not know about. Try again later. */ | 
|  | later_start = job_ptr->start_time; | 
|  | job_ptr->start_time = 0; | 
|  | log_flag(BACKFILL, "%pJ after defer overlaps with existing reservation start_time=%u end_reserve=%u boot_time=%u later_start %ld", | 
|  | job_ptr, start_time, end_reserve, boot_time, | 
|  | later_start); | 
|  | goto TRY_LATER; | 
|  | } | 
|  |  | 
|  | if (_het_job_deadlock_test(job_ptr)) { | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Add reservation to scheduling table if appropriate | 
|  | */ | 
|  | if (!assoc_limit_stop) { | 
|  | uint32_t selected_node_cnt; | 
|  | uint64_t tres_req_cnt[slurmctld_tres_cnt]; | 
|  | uint16_t sockets_per_node; | 
|  | assoc_mgr_lock_t locks = { | 
|  | .assoc = READ_LOCK, | 
|  | .qos = WRITE_LOCK, | 
|  | .tres = READ_LOCK, | 
|  | }; | 
|  |  | 
|  | selected_node_cnt = bit_set_count(avail_bitmap); | 
|  | memcpy(tres_req_cnt, job_ptr->tres_req_cnt, | 
|  | sizeof(tres_req_cnt)); | 
|  | tres_req_cnt[TRES_ARRAY_CPU] = | 
|  | (uint64_t)(job_ptr->total_cpus ? | 
|  | job_ptr->total_cpus : | 
|  | job_ptr->details->min_cpus); | 
|  |  | 
|  | sockets_per_node = job_get_sockets_per_node(job_ptr); | 
|  | tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem( | 
|  | job_ptr->job_resrcs, | 
|  | job_ptr->details->pn_min_memory, | 
|  | tres_req_cnt[TRES_ARRAY_CPU], | 
|  | selected_node_cnt, | 
|  | job_ptr->part_ptr, | 
|  | job_ptr->gres_list_req, | 
|  | (job_ptr->bit_flags & | 
|  | JOB_MEM_SET), sockets_per_node, | 
|  | job_ptr->details->num_tasks); | 
|  |  | 
|  | tres_req_cnt[TRES_ARRAY_NODE] = | 
|  | (uint64_t)selected_node_cnt; | 
|  |  | 
|  | assoc_mgr_lock(&locks); | 
|  | gres_stepmgr_set_job_tres_cnt(job_ptr->gres_list_req, | 
|  | selected_node_cnt, | 
|  | tres_req_cnt, | 
|  | true); | 
|  |  | 
|  | tres_req_cnt[TRES_ARRAY_BILLING] = | 
|  | assoc_mgr_tres_weighted( | 
|  | tres_req_cnt, | 
|  | job_ptr->part_ptr->billing_weights, | 
|  | slurm_conf.priority_flags, true); | 
|  |  | 
|  | if (!acct_policy_job_runnable_post_select(job_ptr, | 
|  | tres_req_cnt, true)) { | 
|  | assoc_mgr_unlock(&locks); | 
|  | log_flag(BACKFILL, "adding reservation for %pJ blocked by acct_policy_job_runnable_post_select", | 
|  | job_ptr); | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | continue; | 
|  | } | 
|  | assoc_mgr_unlock(&locks); | 
|  | } | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) | 
|  | _dump_job_sched(job_ptr, end_reserve, avail_bitmap); | 
|  | if (qos_flags & QOS_FLAG_NO_RESERVE) { | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | if (bf_job_part_count_reserve) { | 
|  | if (_check_bf_usage( | 
|  | job_ptr->part_ptr->bf_data->resv_usage, | 
|  | bf_job_part_count_reserve, | 
|  | orig_sched_start)) { | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | continue; | 
|  | } | 
|  | job_ptr->part_ptr->bf_data->resv_usage->count++; | 
|  | } | 
|  |  | 
|  | /* Clear assumed rejected array status */ | 
|  | reject_array_job = NULL; | 
|  | reject_array_part = NULL; | 
|  | reject_array_qos = NULL; | 
|  | reject_array_resv = NULL; | 
|  |  | 
|  | if ((!bf_one_resv_per_job || !orig_start_time) && | 
|  | (!(job_ptr->bit_flags & JOB_MAGNETIC) || | 
|  | bf_allow_magnetic_slot)) { | 
|  | if (node_space_recs >= bf_node_space_size) { | 
|  | log_flag(BACKFILL, "table size limit of %u reached", | 
|  | bf_node_space_size); | 
|  | if ((max_backfill_job_per_part != 0) && | 
|  | (max_backfill_job_per_part >= | 
|  | (bf_node_space_size / 2))) { | 
|  | error("bf_max_job_part >= bf_node_space_size / 2 (%u >= %u)", | 
|  | max_backfill_job_per_part, | 
|  | (bf_node_space_size / 2)); | 
|  | } else if ((max_backfill_job_per_user != 0) && | 
|  | (max_backfill_job_per_user > | 
|  | (bf_node_space_size / 2))) { | 
|  | warning("bf_max_job_user > bf_node_space_size / 2 (%u > %u)", | 
|  | max_backfill_job_per_user, | 
|  | (bf_node_space_size / 2)); | 
|  | } else if  ((max_backfill_job_per_assoc != 0) && | 
|  | (max_backfill_job_per_assoc > | 
|  | (bf_node_space_size / 2))) { | 
|  | warning("bf_max_job_assoc > bf_node_space_size / 2 (%u > %u)", | 
|  | max_backfill_job_per_assoc, | 
|  | (bf_node_space_size / 2)); | 
|  | } | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | _set_bf_exit(BF_EXIT_TABLE_LIMIT); | 
|  | break; | 
|  | } | 
|  | _add_reservation(start_time, end_reserve, avail_bitmap, | 
|  | job_ptr, node_space, &node_space_recs, | 
|  | orig_start_time); | 
|  | } | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL_MAP) | 
|  | _dump_node_space_table(node_space); | 
|  | if ((orig_start_time != 0) && | 
|  | (orig_start_time < job_ptr->start_time)) { | 
|  | /* Can start earlier in different partition */ | 
|  | job_ptr->start_time = orig_start_time; | 
|  | } | 
|  | _set_job_time_limit(job_ptr, orig_time_limit); | 
|  | if (job_ptr->array_recs) { | 
|  | /* Try making reservation for next task of job array */ | 
|  | if (test_array_job_id != job_ptr->array_job_id) { | 
|  | test_array_job_id = job_ptr->array_job_id; | 
|  | test_array_count = 1; | 
|  | array_start_time = job_ptr->start_time; | 
|  | } else { | 
|  | test_array_count++; | 
|  | array_start_time = MIN(array_start_time, | 
|  | job_ptr->start_time); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Don't consider the next task if it would exceed the | 
|  | * maximum number of runnable tasks. If max_run_tasks is | 
|  | * 0, then it wasn't set, so ignore it. | 
|  | */ | 
|  | if ((test_array_count < bf_max_job_array_resv) && | 
|  | (test_array_count < | 
|  | job_ptr->array_recs->task_cnt) && | 
|  | (!job_ptr->array_recs->max_run_tasks || | 
|  | ((MAX(job_ptr->array_recs->pend_run_tasks, | 
|  | test_array_count) + | 
|  | job_ptr->array_recs->tot_run_tasks) < | 
|  | job_ptr->array_recs->max_run_tasks))) | 
|  | goto next_task; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (!nodes_planned) | 
|  | _handle_planned(true); | 
|  |  | 
|  | xfree(job_queue_rec); | 
|  |  | 
|  | if (job_ptr) { | 
|  | /* Restore preemption state if needed. */ | 
|  | _restore_preempt_state(job_ptr, &tmp_preempt_start_time, | 
|  | &tmp_preempt_in_progress); | 
|  | job_resv_clear_magnetic_flag(job_ptr); | 
|  |  | 
|  | if (job_ptr->array_recs && array_start_time) | 
|  | job_ptr->start_time = array_start_time; | 
|  | } | 
|  |  | 
|  | _het_job_deadlock_fini(); | 
|  | if (!bf_hetjob_immediate && !state_changed_break && | 
|  | (!max_backfill_jobs_start || | 
|  | (job_start_cnt < max_backfill_jobs_start))) | 
|  | _het_job_start_test(node_space, 0, NULL, NULL); | 
|  |  | 
|  | FREE_NULL_BITMAP(avail_bitmap); | 
|  | FREE_NULL_BITMAP(excluded_topo_bitmap); | 
|  | reservation_delete_resv_exc_parts(&resv_exc); | 
|  | FREE_NULL_BITMAP(resv_bitmap); | 
|  | FREE_NULL_BITMAP(tmp_bitmap); | 
|  | FREE_NULL_BITMAP(next_bitmap); | 
|  | FREE_NULL_BITMAP(current_bitmap); | 
|  |  | 
|  | for (i = 0; ; ) { | 
|  | FREE_NULL_BITMAP(node_space[i].avail_bitmap); | 
|  | FREE_NULL_BF_LICENSES(node_space[i].licenses); | 
|  | if ((i = node_space[i].next) == 0) | 
|  | break; | 
|  | } | 
|  | for (i = node_space_recs; i <= bf_node_space_size; i++) { | 
|  | if (!node_space[i].avail_bitmap) | 
|  | break; | 
|  | FREE_NULL_BITMAP(node_space[i].avail_bitmap); | 
|  | } | 
|  | xfree(node_space); | 
|  |  | 
|  | FREE_NULL_LIST(job_queue); | 
|  | FREE_NULL_LIST(nodes_used_list); | 
|  | xfree(nodes_used); | 
|  |  | 
|  | if (bf_topopt_enable) | 
|  | fini_oracle(); | 
|  |  | 
|  | gettimeofday(&bf_time2, NULL); | 
|  | _do_diag_stats(&bf_time1, &bf_time2, node_space_recs); | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) { | 
|  | END_TIMER; | 
|  | info("completed testing %u(%d) jobs, %s", | 
|  | slurmctld_diag_stats.bf_last_depth, | 
|  | job_test_count, TIME_STR); | 
|  | } | 
|  |  | 
|  | slurm_mutex_lock(&slurmctld_config.thread_count_lock); | 
|  | if (slurmctld_config.server_thread_count >= 150) { | 
|  | info("%d pending RPCs at cycle end, consider " | 
|  | "configuring max_rpc_cnt", | 
|  | slurmctld_config.server_thread_count); | 
|  | } | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  |  | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* Try to start the job on any non-reserved nodes */ | 
|  | static int _start_job(job_record_t *job_ptr, bitstr_t *resv_bitmap) | 
|  | { | 
|  | int rc; | 
|  | bitstr_t *orig_exc_nodes = NULL; | 
|  | bool is_job_array_head = false; | 
|  | static uint32_t fail_jobid = 0; | 
|  | job_node_select_t job_node_select = { | 
|  | .job_ptr = job_ptr, | 
|  | }; | 
|  | if (job_ptr->details->exc_node_bitmap) { | 
|  | orig_exc_nodes = bit_copy(job_ptr->details->exc_node_bitmap); | 
|  | bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap); | 
|  | } else | 
|  | job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap); | 
|  | if (job_ptr->array_recs) | 
|  | is_job_array_head = true; | 
|  | rc = select_nodes(&job_node_select, false, false, | 
|  | SLURMDB_JOB_FLAG_BACKFILL); | 
|  |  | 
|  | if (is_job_array_head && job_ptr->details) { | 
|  | job_record_t *base_job_ptr; | 
|  | base_job_ptr = find_job_record(job_ptr->array_job_id); | 
|  | if (base_job_ptr && base_job_ptr != job_ptr | 
|  | && base_job_ptr->array_recs) { | 
|  | FREE_NULL_BITMAP( | 
|  | base_job_ptr->details->exc_node_bitmap); | 
|  | if (orig_exc_nodes) | 
|  | base_job_ptr->details->exc_node_bitmap = | 
|  | bit_copy(orig_exc_nodes); | 
|  | } | 
|  | } | 
|  | if (job_ptr->details) { /* select_nodes() might reset exc_node_bitmap */ | 
|  | FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); | 
|  | job_ptr->details->exc_node_bitmap = orig_exc_nodes; | 
|  | } else | 
|  | FREE_NULL_BITMAP(orig_exc_nodes); | 
|  | if (rc == SLURM_SUCCESS) { | 
|  | /* job initiated */ | 
|  | last_job_update = time(NULL); | 
|  | info("Started %pJ in %s on %s", | 
|  | job_ptr, job_ptr->part_ptr->name, job_ptr->nodes); | 
|  | if (job_ptr->batch_flag == 0) | 
|  | srun_allocate(job_ptr); | 
|  | else if (!IS_JOB_CONFIGURING(job_ptr)) | 
|  | launch_job(job_ptr); | 
|  | slurmctld_diag_stats.backfilled_jobs++; | 
|  | slurmctld_diag_stats.last_backfilled_jobs++; | 
|  | if (job_ptr->het_job_id) | 
|  | slurmctld_diag_stats.backfilled_het_jobs++; | 
|  | log_flag(BACKFILL, "Jobs backfilled since boot: %u", | 
|  | slurmctld_diag_stats.backfilled_jobs); | 
|  | } else if ((job_ptr->job_id != fail_jobid) && | 
|  | (rc != ESLURM_ACCOUNTING_POLICY)) { | 
|  | char *node_list; | 
|  | bit_not(resv_bitmap); | 
|  | node_list = bitmap2node_name(resv_bitmap); | 
|  | /* This happens when a job has sharing disabled and | 
|  | * a selected node is still completing some job, | 
|  | * which should be a temporary situation. */ | 
|  | verbose("Failed to start %pJ with %s avail: %s", | 
|  | job_ptr, node_list, slurm_strerror(rc)); | 
|  | xfree(node_list); | 
|  | fail_jobid = job_ptr->job_id; | 
|  | } else { | 
|  | debug3("Failed to start %pJ: %s", | 
|  | job_ptr, slurm_strerror(rc)); | 
|  | } | 
|  |  | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Compute a job's maximum time based upon conflicts in resources | 
|  | * planned for use by other jobs and that job's min/max time limit | 
|  | * Return NO_VAL if no restriction | 
|  | */ | 
|  | static uint32_t _get_job_max_tl(job_record_t *job_ptr, time_t now, | 
|  | node_space_map_t *node_space) | 
|  | { | 
|  | int32_t j; | 
|  | time_t comp_time = 0; | 
|  | uint32_t max_tl = NO_VAL; | 
|  |  | 
|  | if (job_ptr->time_min == 0) | 
|  | return max_tl; | 
|  |  | 
|  | for (j = 0; ; ) { | 
|  | if ((node_space[j].begin_time != now) && // No current conflicts | 
|  | (node_space[j].begin_time < job_ptr->end_time) && | 
|  | (!bit_super_set(job_ptr->node_bitmap, | 
|  | node_space[j].avail_bitmap) || | 
|  | !bf_licenses_avail(node_space[j].licenses, job_ptr, | 
|  | job_ptr->node_bitmap))) { | 
|  | /* Job overlaps pending job's resource reservation */ | 
|  | if ((comp_time == 0) || | 
|  | (comp_time > node_space[j].begin_time)) | 
|  | comp_time = node_space[j].begin_time; | 
|  | } | 
|  | if ((j = node_space[j].next) == 0) | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (comp_time != 0) | 
|  | max_tl = (comp_time - now + 59) / 60; | 
|  |  | 
|  | return max_tl; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Reset a job's time limit (and end_time) as high as possible | 
|  | *	within the range job_ptr->time_min and job_ptr->time_limit. | 
|  | *	Avoid using resources reserved for pending jobs or in resource | 
|  | *	reservations | 
|  | */ | 
|  | static void _reset_job_time_limit(job_record_t *job_ptr, time_t now, | 
|  | node_space_map_t *node_space) | 
|  | { | 
|  | int32_t j, resv_delay; | 
|  | uint32_t orig_time_limit = job_ptr->time_limit; | 
|  | uint32_t new_time_limit; | 
|  |  | 
|  | for (j = 0; ; ) { | 
|  | if ((node_space[j].begin_time != now) && // No current conflicts | 
|  | (node_space[j].begin_time < job_ptr->end_time) && | 
|  | (!bit_super_set(job_ptr->node_bitmap, | 
|  | node_space[j].avail_bitmap))) { | 
|  | /* Job overlaps pending job's resource reservation */ | 
|  | resv_delay = difftime(node_space[j].begin_time, now); | 
|  | resv_delay /= 60;	/* seconds to minutes */ | 
|  | if (resv_delay < job_ptr->time_limit) | 
|  | job_ptr->time_limit = resv_delay; | 
|  | } | 
|  | if ((j = node_space[j].next) == 0) | 
|  | break; | 
|  | } | 
|  | new_time_limit = MAX(job_ptr->time_min, job_ptr->time_limit); | 
|  | acct_policy_alter_job(job_ptr, new_time_limit); | 
|  | job_ptr->time_limit = new_time_limit; | 
|  | job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60); | 
|  |  | 
|  | job_time_adj_resv(job_ptr); | 
|  |  | 
|  | if (orig_time_limit != job_ptr->time_limit) { | 
|  | info("%pJ time limit changed from %u to %u", | 
|  | job_ptr, orig_time_limit, job_ptr->time_limit); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Report if any changes occurred to job, node, reservation | 
|  | * or partition information | 
|  | */ | 
|  | static bool _more_work(time_t last_backfill_time) | 
|  | { | 
|  | bool rc = false; | 
|  |  | 
|  | if ((last_job_update  >= last_backfill_time) || | 
|  | (last_node_update >= last_backfill_time) || | 
|  | (last_part_update >= last_backfill_time) || | 
|  | (last_resv_update >= last_backfill_time)) { | 
|  | rc = true; | 
|  | } | 
|  |  | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | /* Create a reservation for a job in the future */ | 
|  | static void _add_reservation(time_t start_time, time_t end_reserve, | 
|  | bitstr_t *res_bitmap, job_record_t *job_ptr, | 
|  | node_space_map_t *node_space, int *node_space_recs, | 
|  | time_t orig_start_time) | 
|  | { | 
|  | bool placed = false; | 
|  | int i, j, one_before = 0, one_after = -1; | 
|  | bitstr_t *res_bitmap_orig = res_bitmap; | 
|  | bitstr_t *res_bitmap_efctv = NULL; | 
|  |  | 
|  | #if 0 | 
|  | info("add job start:%u end:%u", start_time, end_reserve); | 
|  | for (j = 0; ; ) { | 
|  | info("node start:%u end:%u", | 
|  | (uint32_t) node_space[j].begin_time, | 
|  | (uint32_t) node_space[j].end_time); | 
|  | if ((j = node_space[j].next) == 0) | 
|  | break; | 
|  | } | 
|  | #endif | 
|  | if (res_bitmap) { | 
|  | if (IS_JOB_WHOLE_TOPO(job_ptr)) { | 
|  | res_bitmap_efctv = bit_copy(res_bitmap); | 
|  | topology_g_whole_topo(res_bitmap_efctv, | 
|  | job_ptr->part_ptr->topology_idx); | 
|  | res_bitmap = res_bitmap_efctv; | 
|  | } | 
|  |  | 
|  | if (!IS_JOB_RUNNING(job_ptr) && | 
|  | ((orig_start_time == 0) || | 
|  | (job_ptr->start_time < orig_start_time))) { | 
|  | /* Can't start earlier in different partition. */ | 
|  | xfree(job_ptr->sched_nodes); | 
|  | job_ptr->sched_nodes = bitmap2node_name(res_bitmap); | 
|  | /* | 
|  | * These nodes are planned.  We will set the state | 
|  | * afterwards. | 
|  | */ | 
|  | bit_or(planned_bitmap, res_bitmap); | 
|  | } | 
|  | } | 
|  |  | 
|  | start_time = MAX(start_time, node_space[0].begin_time); | 
|  | /* | 
|  | * Ensure that the job always occupies at least one bf_resolution | 
|  | * slot within the map. This also fixes potential issues when | 
|  | * running with bf_running_job_reserve if jobs have run past | 
|  | * their timelimit but have not yet been terminated. | 
|  | */ | 
|  | if (end_reserve < (start_time + backfill_resolution)) | 
|  | end_reserve = start_time + backfill_resolution; | 
|  | for (j = 0; ; ) { | 
|  | if (node_space[j].end_time > start_time) { | 
|  | /* insert start entry record */ | 
|  | i = *node_space_recs; | 
|  | node_space[i].begin_time = start_time; | 
|  | node_space[i].end_time = node_space[j].end_time; | 
|  | node_space[j].end_time = start_time; | 
|  | COPY_BITMAP(node_space[i].avail_bitmap, | 
|  | node_space[j].avail_bitmap); | 
|  | node_space[i].licenses = | 
|  | bf_licenses_copy(node_space[j].licenses); | 
|  | node_space[i].fragmentation = | 
|  | node_space[j].fragmentation; | 
|  | node_space[i].next = node_space[j].next; | 
|  | node_space[j].next = i; | 
|  | (*node_space_recs)++; | 
|  | placed = true; | 
|  | break; | 
|  | } | 
|  | if (node_space[j].end_time == start_time) { | 
|  | /* no need to insert new start entry record */ | 
|  | placed = true; | 
|  | break; | 
|  | } | 
|  | one_before = j; | 
|  | if ((j = node_space[j].next) == 0) | 
|  | break; | 
|  | } | 
|  |  | 
|  | while (placed && (j = node_space[j].next)) { | 
|  | if (end_reserve < node_space[j].end_time) { | 
|  | /* insert end entry record */ | 
|  | i = *node_space_recs; | 
|  | node_space[i].begin_time = end_reserve; | 
|  | node_space[i].end_time = node_space[j].end_time; | 
|  | node_space[j].end_time = end_reserve; | 
|  | COPY_BITMAP(node_space[i].avail_bitmap, | 
|  | node_space[j].avail_bitmap); | 
|  | node_space[i].licenses = | 
|  | bf_licenses_copy(node_space[j].licenses); | 
|  | node_space[i].fragmentation = | 
|  | node_space[j].fragmentation; | 
|  | node_space[i].next = node_space[j].next; | 
|  | node_space[j].next = i; | 
|  | (*node_space_recs)++; | 
|  | } | 
|  |  | 
|  | /* merge in new usage with this record */ | 
|  | if (res_bitmap) { | 
|  | bitstr_t *node_bitmap_orig = job_ptr->node_bitmap; | 
|  | bit_and_not(node_space[j].avail_bitmap, res_bitmap); | 
|  | if (!IS_JOB_RUNNING(job_ptr)) | 
|  | job_ptr->node_bitmap = res_bitmap_orig; | 
|  | bf_licenses_deduct(node_space[j].licenses, job_ptr); | 
|  | if (!IS_JOB_RUNNING(job_ptr)) | 
|  | job_ptr->node_bitmap = node_bitmap_orig; | 
|  | if (bf_topopt_enable) { | 
|  | node_space[j].fragmentation = | 
|  | topology_g_get_fragmentation( | 
|  | node_space[j].avail_bitmap); | 
|  | } | 
|  | } else { | 
|  | /* setting up reservation licenses */ | 
|  | bf_licenses_transfer(node_space[j].licenses, job_ptr); | 
|  | } | 
|  |  | 
|  | if (end_reserve == node_space[j].end_time) { | 
|  | if (node_space[j].next) | 
|  | one_after = node_space[j].next; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Drop records with identical bitmaps (up to one record). | 
|  | * This can significantly improve performance of the backfill tests. */ | 
|  | for (i = one_before; i != one_after; ) { | 
|  | if ((j = node_space[i].next) == 0) | 
|  | break; | 
|  | if (!bf_licenses_equal(node_space[i].licenses, | 
|  | node_space[j].licenses)) { | 
|  | i = j; | 
|  | continue; | 
|  | } | 
|  | if (!bit_equal(node_space[i].avail_bitmap, | 
|  | node_space[j].avail_bitmap)) { | 
|  | i = j; | 
|  | continue; | 
|  | } | 
|  | node_space[i].end_time = node_space[j].end_time; | 
|  | node_space[i].next = node_space[j].next; | 
|  | if (node_space[j].avail_bitmap) { | 
|  | for (i = *node_space_recs; | 
|  | i <= bf_node_space_size; i++) { | 
|  | if (!node_space[i].avail_bitmap) { | 
|  | node_space[i].avail_bitmap = | 
|  | node_space[j].avail_bitmap; | 
|  | node_space[j].avail_bitmap = NULL; | 
|  | break; | 
|  | } | 
|  | } | 
|  | } | 
|  | FREE_NULL_BITMAP(node_space[j].avail_bitmap); | 
|  | FREE_NULL_BF_LICENSES(node_space[j].licenses); | 
|  | break; | 
|  | } | 
|  | FREE_NULL_BITMAP(res_bitmap_efctv); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Determine if the resource specification for a new job overlaps with a | 
|  | *	reservation that the backfill scheduler has made for a job to be | 
|  | *	started in the future. | 
|  | * IN use_bitmap - nodes to be allocated | 
|  | * IN job_ptr - used for license and reservation info | 
|  | * IN start_time - start time of job | 
|  | * IN end_reserve - end time of job | 
|  | */ | 
|  | static bool _test_resv_overlap(node_space_map_t *node_space, | 
|  | bitstr_t *use_bitmap, job_record_t *job_ptr, | 
|  | uint32_t start_time, uint32_t end_reserve) | 
|  | { | 
|  | bool overlap = false; | 
|  | int j = 0; | 
|  | bitstr_t *use_bitmap_efctv = NULL; | 
|  | bitstr_t *use_bitmap_orig = use_bitmap; | 
|  |  | 
|  | if (IS_JOB_WHOLE_TOPO(job_ptr)) { | 
|  | use_bitmap_efctv = bit_copy(use_bitmap); | 
|  | topology_g_whole_topo(use_bitmap_efctv, | 
|  | job_ptr->part_ptr->topology_idx); | 
|  | use_bitmap = use_bitmap_efctv; | 
|  | } | 
|  |  | 
|  | while (true) { | 
|  | if ((node_space[j].end_time > start_time) && | 
|  | (node_space[j].begin_time < end_reserve)) { | 
|  | /* | 
|  | * Jobs will run concurrently. | 
|  | * Do they conflict for resources? | 
|  | */ | 
|  | if (!bit_super_set(use_bitmap, | 
|  | node_space[j].avail_bitmap)) { | 
|  | overlap = true; | 
|  | break; | 
|  | } | 
|  | if (!bf_licenses_avail(node_space[j].licenses, job_ptr, | 
|  | use_bitmap_orig)) { | 
|  | overlap = true; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if ((j = node_space[j].next) == 0) | 
|  | break; | 
|  | } | 
|  | FREE_NULL_BITMAP(use_bitmap_efctv); | 
|  | return overlap; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Delete het_job_map_t record from het_job_list | 
|  | */ | 
|  | static void _het_job_map_del(void *x) | 
|  | { | 
|  | het_job_map_t *map = (het_job_map_t *) x; | 
|  | FREE_NULL_LIST(map->het_job_rec_list); | 
|  | xfree(map); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Return 1 if a het_job_map_t record with a specific het_job_id is found. | 
|  | * Always return 1 if "key" is zero. | 
|  | */ | 
|  | static int _het_job_find_map(void *x, void *key) | 
|  | { | 
|  | het_job_map_t *map = (het_job_map_t *) x; | 
|  | uint32_t *het_job_id = (uint32_t *) key; | 
|  |  | 
|  | if ((het_job_id == NULL) || | 
|  | (map->het_job_id == *het_job_id)) | 
|  | return 1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Return 1 if a het_job_rec_t record with a specific job_id is found. | 
|  | */ | 
|  | static int _het_job_find_rec(void *x, void *key) | 
|  | { | 
|  | het_job_rec_t *rec = (het_job_rec_t *) x; | 
|  | uint32_t *job_id = (uint32_t *) key; | 
|  |  | 
|  | if (rec->job_id == *job_id) | 
|  | return 1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Remove vestigial elements from het_job_list. For still active element, | 
|  | * clear the previously computted start time. This is used to periodically clear | 
|  | * history so that heterogeneous jobs do not keep getting deferred based | 
|  | * upon old system state | 
|  | */ | 
|  | static void _het_job_start_clear(void) | 
|  | { | 
|  | het_job_map_t *map; | 
|  | list_itr_t *iter; | 
|  |  | 
|  | iter = list_iterator_create(het_job_list); | 
|  | while ((map = list_next(iter))) { | 
|  | if (map->prev_start == 0) { | 
|  | list_delete_item(iter); | 
|  | } else { | 
|  | map->prev_start = 0; | 
|  | list_flush(map->het_job_rec_list); | 
|  | } | 
|  | } | 
|  | list_iterator_destroy(iter); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * For a given het_job_map_t record, determine the earliest that it can start, | 
|  | * which is the time at which it's latest starting component begins. The | 
|  | * "exclude_job_id" is used to exclude a hetjob component currently being | 
|  | * tested to start, presumably in a different partition. | 
|  | */ | 
|  | static time_t _het_job_start_compute(het_job_map_t *map, | 
|  | uint32_t exclude_job_id) | 
|  | { | 
|  | list_itr_t *iter; | 
|  | het_job_rec_t *rec; | 
|  | time_t latest_start = map->prev_start; | 
|  |  | 
|  | iter = list_iterator_create(map->het_job_rec_list); | 
|  | while ((rec = list_next(iter))) { | 
|  | if (rec->job_id == exclude_job_id) | 
|  | continue; | 
|  | latest_start = MAX(latest_start, rec->latest_start); | 
|  | } | 
|  | list_iterator_destroy(iter); | 
|  |  | 
|  | return latest_start; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Return the earliest that a job can start based upon _other_ components of | 
|  | * that same heterogeneous job. Return 0 if no limitation. | 
|  | * | 
|  | * If the job's state reason is BeginTime (the way all hetjobs start) and that | 
|  | * time is passed, then clear the reason field. | 
|  | */ | 
|  | static time_t _het_job_start_find(job_record_t *job_ptr) | 
|  | { | 
|  | het_job_map_t *map; | 
|  | time_t latest_start = (time_t) 0; | 
|  |  | 
|  | if (job_ptr->het_job_id) { | 
|  | map = list_find_first(het_job_list, _het_job_find_map, | 
|  | &job_ptr->het_job_id); | 
|  | if (map) { | 
|  | latest_start = _het_job_start_compute(map, | 
|  | job_ptr->job_id); | 
|  | } | 
|  |  | 
|  | log_flag(HETJOB, "%pJ in partition %s expected to start in %ld secs", | 
|  | job_ptr, job_ptr->part_ptr->name, | 
|  | MAX(0, latest_start - time(NULL))); | 
|  | } | 
|  |  | 
|  | return latest_start; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Record the earliest that a hetjob component can start. If it can be | 
|  | * started in multiple partitions, we only record the earliest start time | 
|  | * for the job in any partition and reservation. | 
|  | */ | 
|  | static void _het_job_start_set(job_record_t *job_ptr, time_t latest_start, | 
|  | uint32_t comp_time_limit) | 
|  | { | 
|  | het_job_map_t *map; | 
|  | het_job_rec_t *rec; | 
|  |  | 
|  | if (comp_time_limit == NO_VAL) | 
|  | comp_time_limit = job_ptr->time_limit; | 
|  | if (job_ptr->het_job_id) { | 
|  | map = list_find_first(het_job_list, _het_job_find_map, | 
|  | &job_ptr->het_job_id); | 
|  | if (map) { | 
|  | if (!map->comp_time_limit) { | 
|  | map->comp_time_limit = comp_time_limit; | 
|  | } else { | 
|  | map->comp_time_limit = MIN(map->comp_time_limit, | 
|  | comp_time_limit); | 
|  | } | 
|  | rec = list_find_first(map->het_job_rec_list, | 
|  | _het_job_find_rec, | 
|  | &job_ptr->job_id); | 
|  | if (rec && (rec->latest_start <= latest_start)) { | 
|  | /* | 
|  | * This job can start an earlier time in | 
|  | * some other partition, so ignore new info | 
|  | */ | 
|  | } else if (rec) { | 
|  | rec->latest_start = latest_start; | 
|  | rec->part_ptr = job_ptr->part_ptr; | 
|  | rec->resv_ptr = job_ptr->resv_ptr; | 
|  | } else { | 
|  | rec = xmalloc(sizeof(het_job_rec_t)); | 
|  | rec->job_id = job_ptr->job_id; | 
|  | rec->job_ptr = job_ptr; | 
|  | rec->latest_start = latest_start; | 
|  | rec->part_ptr = job_ptr->part_ptr; | 
|  | rec->resv_ptr = job_ptr->resv_ptr; | 
|  | list_append(map->het_job_rec_list, rec); | 
|  | } | 
|  | } else { | 
|  | rec = xmalloc(sizeof(het_job_rec_t)); | 
|  | rec->job_id = job_ptr->job_id; | 
|  | rec->job_ptr = job_ptr; | 
|  | rec->latest_start = latest_start; | 
|  | rec->part_ptr = job_ptr->part_ptr; | 
|  | rec->resv_ptr = job_ptr->resv_ptr; | 
|  |  | 
|  | map = xmalloc(sizeof(het_job_map_t)); | 
|  | map->comp_time_limit = comp_time_limit; | 
|  | map->het_job_id = job_ptr->het_job_id; | 
|  | map->het_job_rec_list = list_create(xfree_ptr); | 
|  | list_append(map->het_job_rec_list, rec); | 
|  | list_append(het_job_list, map); | 
|  | } | 
|  |  | 
|  | log_flag(HETJOB, "%pJ in partition %s set to start in %ld secs", | 
|  | job_ptr, job_ptr->part_ptr->name, | 
|  | MAX(0, _het_job_start_compute(map, 0) - time(NULL))); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Return TRUE if we have expected start times for all components of a hetjob | 
|  | * and all components are valid and runable. | 
|  | * | 
|  | * NOTE: This should never happen, but we will also start the job if all of the | 
|  | * other components are already running, | 
|  | */ | 
|  | static bool _het_job_full(het_job_map_t *map) | 
|  | { | 
|  | job_record_t *het_job_ptr, *job_ptr; | 
|  | list_itr_t *iter; | 
|  | bool rc = true; | 
|  |  | 
|  | het_job_ptr = find_job_record(map->het_job_id); | 
|  | if (!het_job_ptr || !het_job_ptr->het_job_list || | 
|  | (!IS_JOB_RUNNING(het_job_ptr) && | 
|  | !_job_runnable_now(het_job_ptr))) { | 
|  | return false; | 
|  | } | 
|  |  | 
|  | iter = list_iterator_create(het_job_ptr->het_job_list); | 
|  | while ((job_ptr = list_next(iter))) { | 
|  | if ((job_ptr->magic != JOB_MAGIC) || | 
|  | (job_ptr->het_job_id != map->het_job_id)) { | 
|  | rc = false;	/* bad job pointer */ | 
|  | break; | 
|  | } | 
|  | if (IS_JOB_RUNNING(job_ptr)) | 
|  | continue; | 
|  | if (!list_find_first(map->het_job_rec_list, _het_job_find_rec, | 
|  | &job_ptr->job_id) || | 
|  | !_job_runnable_now(job_ptr)) { | 
|  | rc = false; | 
|  | break; | 
|  | } | 
|  | } | 
|  | list_iterator_destroy(iter); | 
|  |  | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Determine if all components of a hetjob can be started now or are | 
|  | * prevented from doing so because of association or QOS limits. | 
|  | * Return true if they can all start. | 
|  | * | 
|  | * NOTE: That a hetjob passes this test does not mean that it will be able | 
|  | * to run. For example, this test assumes resource allocation at the CPU level. | 
|  | * If each task is allocated one core, with 2 CPUs, then the CPU limit test | 
|  | * would not be accurate. | 
|  | */ | 
|  | static bool _het_job_limit_check(het_job_map_t *map, time_t now) | 
|  | { | 
|  | job_record_t *job_ptr; | 
|  | het_job_rec_t *rec; | 
|  | list_itr_t *iter; | 
|  | int begun_jobs = 0, fini_jobs = 0, slurmctld_tres_size; | 
|  | bool runnable = true; | 
|  | uint32_t selected_node_cnt; | 
|  | uint64_t tres_req_cnt[slurmctld_tres_cnt]; | 
|  | uint64_t **tres_alloc_save = NULL; | 
|  |  | 
|  | tres_alloc_save = xcalloc(list_count(map->het_job_rec_list), | 
|  | sizeof(uint64_t *)); | 
|  | slurmctld_tres_size = sizeof(uint64_t) * slurmctld_tres_cnt; | 
|  | iter = list_iterator_create(map->het_job_rec_list); | 
|  | while ((rec = list_next(iter))) { | 
|  | uint16_t sockets_per_node; | 
|  | assoc_mgr_lock_t locks = { | 
|  | .assoc = READ_LOCK, | 
|  | .qos = WRITE_LOCK, | 
|  | .tres = READ_LOCK, | 
|  | }; | 
|  |  | 
|  | job_ptr = rec->job_ptr; | 
|  | job_ptr->part_ptr = rec->part_ptr; | 
|  | if (rec->resv_ptr) { | 
|  | job_ptr->resv_ptr = rec->resv_ptr; | 
|  | job_ptr->resv_id = job_ptr->resv_ptr->resv_id; | 
|  | } | 
|  | selected_node_cnt = job_ptr->node_cnt_wag; | 
|  | memcpy(tres_req_cnt, job_ptr->tres_req_cnt, | 
|  | slurmctld_tres_size); | 
|  | tres_req_cnt[TRES_ARRAY_CPU] = (uint64_t)(job_ptr->total_cpus ? | 
|  | job_ptr->total_cpus : | 
|  | job_ptr->details->min_cpus); | 
|  | sockets_per_node = job_get_sockets_per_node(job_ptr); | 
|  | tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem( | 
|  | job_ptr->job_resrcs, | 
|  | job_ptr->details->pn_min_memory, | 
|  | tres_req_cnt[TRES_ARRAY_CPU], | 
|  | selected_node_cnt, | 
|  | job_ptr->part_ptr, | 
|  | job_ptr->gres_list_req, | 
|  | (job_ptr->bit_flags & | 
|  | JOB_MEM_SET), sockets_per_node, | 
|  | job_ptr->details->num_tasks); | 
|  | tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)selected_node_cnt; | 
|  |  | 
|  | assoc_mgr_lock(&locks); | 
|  | gres_stepmgr_set_job_tres_cnt(job_ptr->gres_list_req, | 
|  | selected_node_cnt, | 
|  | tres_req_cnt, true); | 
|  |  | 
|  | tres_req_cnt[TRES_ARRAY_BILLING] = | 
|  | assoc_mgr_tres_weighted( | 
|  | tres_req_cnt, | 
|  | job_ptr->part_ptr->billing_weights, | 
|  | slurm_conf.priority_flags, true); | 
|  |  | 
|  | if (acct_policy_job_runnable_pre_select(job_ptr, true) && | 
|  | acct_policy_job_runnable_post_select(job_ptr, | 
|  | tres_req_cnt, true)) { | 
|  | assoc_mgr_unlock(&locks); | 
|  | tres_alloc_save[begun_jobs++] = job_ptr->tres_alloc_cnt; | 
|  | job_ptr->tres_alloc_cnt = xmalloc(slurmctld_tres_size); | 
|  | memcpy(job_ptr->tres_alloc_cnt, tres_req_cnt, | 
|  | slurmctld_tres_size); | 
|  | acct_policy_job_begin(job_ptr, false); | 
|  |  | 
|  | } else { | 
|  | assoc_mgr_unlock(&locks); | 
|  | runnable = false; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | list_iterator_reset(iter); | 
|  | while ((rec = list_next(iter))) { | 
|  | job_ptr = rec->job_ptr; | 
|  | if (begun_jobs > fini_jobs) { | 
|  | time_t end_time_exp = job_ptr->end_time_exp; | 
|  | time_t end_time = job_ptr->end_time; | 
|  | uint32_t job_state = job_ptr->job_state; | 
|  | /* Simulate normal job completion */ | 
|  | job_ptr->end_time_exp = now; | 
|  | job_ptr->end_time = job_ptr->start_time; | 
|  | job_state_set(job_ptr, (JOB_COMPLETE | JOB_COMPLETING)); | 
|  | acct_policy_job_fini(job_ptr, false); | 
|  | job_ptr->end_time_exp = end_time_exp; | 
|  | job_ptr->end_time = end_time; | 
|  | job_state_set(job_ptr, job_state); | 
|  | xfree(job_ptr->tres_alloc_cnt); | 
|  | job_ptr->tres_alloc_cnt = tres_alloc_save[fini_jobs++]; | 
|  | } | 
|  | } | 
|  | list_iterator_destroy(iter); | 
|  | xfree(tres_alloc_save); | 
|  |  | 
|  | return runnable; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Start all components of a hetjob now | 
|  | */ | 
|  | static int _het_job_start_now(het_job_map_t *map, node_space_map_t *node_space) | 
|  | { | 
|  | job_record_t *job_ptr; | 
|  | bitstr_t *avail_bitmap = NULL; | 
|  | bitstr_t *resv_bitmap = NULL, *used_bitmap = NULL; | 
|  | het_job_rec_t *rec; | 
|  | list_itr_t *iter; | 
|  | int rc = SLURM_SUCCESS; | 
|  | bool resv_overlap = false; | 
|  | time_t now = time(NULL), start_res; | 
|  | uint32_t hard_limit; | 
|  | resv_exc_t resv_exc = { 0 }; | 
|  |  | 
|  | iter = list_iterator_create(map->het_job_rec_list); | 
|  | while ((rec = list_next(iter))) { | 
|  | bool reset_time = false; | 
|  | job_ptr = rec->job_ptr; | 
|  | job_ptr->part_ptr = rec->part_ptr; | 
|  | if (rec->resv_ptr) { | 
|  | job_ptr->resv_ptr = rec->resv_ptr; | 
|  | job_ptr->resv_id = job_ptr->resv_ptr->resv_id; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Identify the nodes which this job can use | 
|  | */ | 
|  | start_res = now; | 
|  | rc = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, | 
|  | &resv_exc, &resv_overlap, false); | 
|  | reservation_delete_resv_exc_parts(&resv_exc); | 
|  | if (rc != SLURM_SUCCESS) { | 
|  | error("%pJ failed to start due to reservation", | 
|  | job_ptr); | 
|  | FREE_NULL_BITMAP(avail_bitmap); | 
|  | break; | 
|  | } | 
|  | bit_and(avail_bitmap, job_ptr->part_ptr->node_bitmap); | 
|  | bit_and(avail_bitmap, up_node_bitmap); | 
|  | if (used_bitmap) | 
|  | bit_and_not(avail_bitmap, used_bitmap); | 
|  | if (job_ptr->details->exc_node_bitmap) { | 
|  | bit_and_not(avail_bitmap, | 
|  | job_ptr->details->exc_node_bitmap); | 
|  | } | 
|  |  | 
|  | if (fed_mgr_job_lock(job_ptr)) { | 
|  | error("%pJ failed to start due to fed job lock", | 
|  | job_ptr); | 
|  | FREE_NULL_BITMAP(avail_bitmap); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | resv_bitmap = avail_bitmap; | 
|  | avail_bitmap = NULL; | 
|  | bit_not(resv_bitmap); | 
|  | rc = _start_job(job_ptr, resv_bitmap); | 
|  | FREE_NULL_BITMAP(resv_bitmap); | 
|  | if (rc == SLURM_SUCCESS) { | 
|  | /* | 
|  | * If the following fails because of network | 
|  | * connectivity, the origin cluster should ask | 
|  | * when it comes back up if the cluster_lock | 
|  | * cluster actually started the job | 
|  | */ | 
|  | fed_mgr_job_start(job_ptr, job_ptr->start_time); | 
|  | log_flag(HETJOB, "%pJ started", job_ptr); | 
|  | if (!used_bitmap && job_ptr->node_bitmap) | 
|  | used_bitmap = bit_copy(job_ptr->node_bitmap); | 
|  | else if (job_ptr->node_bitmap) | 
|  | bit_or(used_bitmap, job_ptr->node_bitmap); | 
|  | } else { | 
|  | fed_mgr_job_unlock(job_ptr); | 
|  | break; | 
|  | } | 
|  | if (job_ptr->time_min) { | 
|  | /* Set time limit as high as possible */ | 
|  | acct_policy_alter_job(job_ptr, map->comp_time_limit); | 
|  | job_ptr->time_limit = map->comp_time_limit; | 
|  | reset_time = true; | 
|  | } | 
|  | if (job_ptr->start_time) { | 
|  | if (job_ptr->time_limit == INFINITE) | 
|  | hard_limit = YEAR_SECONDS; | 
|  | else | 
|  | hard_limit = job_ptr->time_limit * 60; | 
|  | job_ptr->end_time = job_ptr->start_time + hard_limit; | 
|  | /* | 
|  | * Only set if start_time. end_time must be set | 
|  | * beforehand for _reset_job_time_limit. | 
|  | */ | 
|  | if (reset_time) | 
|  | _reset_job_time_limit(job_ptr, now, node_space); | 
|  | } | 
|  | if (reset_time) | 
|  | jobacct_storage_g_job_start(acct_db_conn, job_ptr); | 
|  | } | 
|  | list_iterator_destroy(iter); | 
|  | FREE_NULL_BITMAP(used_bitmap); | 
|  |  | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Deallocate all components if failed hetjob start | 
|  | */ | 
|  | static void _het_job_kill_now(het_job_map_t *map) | 
|  | { | 
|  | job_record_t *job_ptr; | 
|  | het_job_rec_t *rec; | 
|  | list_itr_t *iter; | 
|  | time_t now = time(NULL); | 
|  | int cred_lifetime = 1200; | 
|  | uint32_t save_bitflags; | 
|  |  | 
|  | cred_lifetime = cred_expiration(); | 
|  | iter = list_iterator_create(map->het_job_rec_list); | 
|  | while ((rec = list_next(iter))) { | 
|  | job_ptr = rec->job_ptr; | 
|  | if (IS_JOB_PENDING(job_ptr)) | 
|  | continue; | 
|  | info("Deallocate %pJ due to hetjob start failure", | 
|  | job_ptr); | 
|  | job_ptr->details->begin_time = now + cred_lifetime + 1; | 
|  | job_ptr->end_time   = now; | 
|  | job_state_set(job_ptr, (JOB_PENDING | JOB_COMPLETING)); | 
|  | last_job_update     = now; | 
|  | build_cg_bitmap(job_ptr); | 
|  | job_completion_logger(job_ptr, false); | 
|  | deallocate_nodes(job_ptr, false, false, false); | 
|  | /* | 
|  | * Since the job_completion_logger() removes the submit, | 
|  | * we need to add it again, but don't stage-out burst buffer | 
|  | */ | 
|  | save_bitflags = job_ptr->bit_flags; | 
|  | job_ptr->bit_flags |= JOB_KILL_HURRY; | 
|  | acct_policy_add_job_submit(job_ptr, false); | 
|  | job_ptr->bit_flags = save_bitflags; | 
|  | if (!job_ptr->node_bitmap_cg || | 
|  | (bit_set_count(job_ptr->node_bitmap_cg) == 0)) | 
|  | batch_requeue_fini(job_ptr); | 
|  | } | 
|  | list_iterator_destroy(iter); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * If all components of a heterogeneous job can start now, then do so | 
|  | * node_space IN - map of available resources through time | 
|  | * map IN - info about this heterogeneous job | 
|  | * single IN - true if testing single heterogeneous jobs | 
|  | * Return true if heterogeneous job can start now | 
|  | */ | 
|  | static bool _het_job_start_test_single(node_space_map_t *node_space, | 
|  | het_job_map_t *map, bool single) | 
|  | { | 
|  | time_t now = time(NULL); | 
|  | int rc; | 
|  |  | 
|  | if (!map) | 
|  | return false; | 
|  |  | 
|  | if (!_het_job_full(map)) { | 
|  | log_flag(HETJOB, "Hetjob %u has indefinite start time", | 
|  | map->het_job_id); | 
|  | if (!single) | 
|  | map->prev_start = now + YEAR_SECONDS; | 
|  | return false; | 
|  | } | 
|  |  | 
|  | map->prev_start = _het_job_start_compute(map, 0); | 
|  | if (map->prev_start > now) { | 
|  | log_flag(HETJOB, "Hetjob %u should be able to start in %u seconds", | 
|  | map->het_job_id, (uint32_t) (map->prev_start - now)); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | if (!_het_job_limit_check(map, now)) { | 
|  | log_flag(HETJOB, "Hetjob %u prevented from starting by account/QOS limit", | 
|  | map->het_job_id); | 
|  |  | 
|  | map->prev_start = now + YEAR_SECONDS; | 
|  | return false; | 
|  | } | 
|  |  | 
|  | log_flag(HETJOB, "Attempting to start hetjob %u", map->het_job_id); | 
|  |  | 
|  | rc = _het_job_start_now(map, node_space); | 
|  | if (rc != SLURM_SUCCESS) { | 
|  | log_flag(HETJOB, "Failed to start hetjob %u", map->het_job_id); | 
|  | _het_job_kill_now(map); | 
|  | } else { | 
|  | job_start_cnt += list_count(map->het_job_rec_list); | 
|  | if (max_backfill_jobs_start && | 
|  | (job_start_cnt >= max_backfill_jobs_start)) { | 
|  | log_flag(BACKFILL, "bf_max_job_start limit of %d reached", | 
|  | max_backfill_jobs_start); | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | static int _het_job_start_test_list(void *map, void *node_space) | 
|  | { | 
|  | if (!max_backfill_jobs_start || | 
|  | (job_start_cnt < max_backfill_jobs_start)) | 
|  | _het_job_start_test_single(node_space, map, false); | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | static int _foreach_add_job_to_nodes_used(void *x, void *arg) | 
|  | { | 
|  | het_job_rec_t *het_rec = x; | 
|  | node_used_t *nodes_used = arg; | 
|  |  | 
|  | if (_mark_nodes_usage(het_rec->job_ptr, nodes_used)) | 
|  | nodes_used->needs_sorting = true; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * If all components of a heterogeneous job can start now, then do so | 
|  | * node_space IN - map of available resources through time | 
|  | * het_job_id IN - the ID of the heterogeneous job to evaluate, | 
|  | *		    if zero then evaluate all heterogeneous jobs and | 
|  | * 		    nodes_used/node_used_list are not updated | 
|  | * nodes_used IN/OUT - array of node usage used for exclusive filtering | 
|  | * nodes_used_list IN/OUT - list of node usage used for exclusive filtering | 
|  | */ | 
|  | static void _het_job_start_test(node_space_map_t *node_space, | 
|  | uint32_t het_job_id, node_used_t *nodes_used, | 
|  | list_t *nodes_used_list) | 
|  | { | 
|  | het_job_map_t *map = NULL; | 
|  |  | 
|  | if (!het_job_id) { | 
|  | /* Test all maps. */ | 
|  | (void)list_for_each(het_job_list, | 
|  | _het_job_start_test_list, node_space); | 
|  | } else { | 
|  | /* Test single map. */ | 
|  | map = list_find_first(het_job_list, _het_job_find_map, | 
|  | &het_job_id); | 
|  | if (_het_job_start_test_single(node_space, map, true)) { | 
|  | nodes_used->needs_sorting = false; | 
|  | (void) list_for_each(map->het_job_rec_list, | 
|  | _foreach_add_job_to_nodes_used, | 
|  | nodes_used); | 
|  | if (nodes_used->needs_sorting) { | 
|  | nodes_used->needs_sorting = false; | 
|  | list_sort(nodes_used_list, _cmp_last_job_end); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | static void _deadlock_global_list_del(void *x) | 
|  | { | 
|  | deadlock_part_struct_t *dl_part_ptr = (deadlock_part_struct_t *) x; | 
|  | FREE_NULL_LIST(dl_part_ptr->deadlock_job_list); | 
|  | xfree(dl_part_ptr); | 
|  | } | 
|  |  | 
|  | static int _deadlock_part_list_srch(void *x, void *key) | 
|  | { | 
|  | deadlock_job_struct_t *dl_job = (deadlock_job_struct_t *) x; | 
|  | job_record_t *job_ptr = (job_record_t *) key; | 
|  | if (dl_job->het_job_id == job_ptr->het_job_id) | 
|  | return 1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _deadlock_part_list_srch2(void *x, void *key) | 
|  | { | 
|  | deadlock_job_struct_t *dl_job = (deadlock_job_struct_t *) x; | 
|  | deadlock_job_struct_t *dl_job2 = (deadlock_job_struct_t *) key; | 
|  | if (dl_job->het_job_id == dl_job2->het_job_id) | 
|  | return 1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _deadlock_global_list_srch(void *x, void *key) | 
|  | { | 
|  | deadlock_part_struct_t *dl_part = (deadlock_part_struct_t *) x; | 
|  | if (dl_part->part_ptr == (part_record_t *) key) | 
|  | return 1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _deadlock_job_list_sort(void *x, void *y) | 
|  | { | 
|  | deadlock_job_struct_t *dl_job_ptr1 = *(deadlock_job_struct_t **) x; | 
|  | deadlock_job_struct_t *dl_job_ptr2 = *(deadlock_job_struct_t **) y; | 
|  | if (dl_job_ptr1->start_time > dl_job_ptr2->start_time) | 
|  | return -1; | 
|  | else if (dl_job_ptr1->start_time < dl_job_ptr2->start_time) | 
|  | return 1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Call at end of backup execution to release memory allocated by | 
|  | * _het_job_deadlock_test() | 
|  | */ | 
|  | static void _het_job_deadlock_fini(void) | 
|  | { | 
|  | FREE_NULL_LIST(deadlock_global_list); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Determine if job can run at it's "start_time" or later. | 
|  | * job_ptr IN - job to test, set reason to "HET_JOB_DEADLOCK" if it will deadlock | 
|  | * RET true if the job can not run due to possible deadlock with other hetjob | 
|  | * | 
|  | * NOTE: If there are a large number of hetjobs this will be painfully slow | 
|  | *       as the algorithm must be order n^2 | 
|  | */ | 
|  | static bool _het_job_deadlock_test(job_record_t *job_ptr) | 
|  | { | 
|  | deadlock_job_struct_t  *dl_job_ptr  = NULL, *dl_job_ptr2 = NULL; | 
|  | deadlock_job_struct_t  *dl_job_ptr3 = NULL; | 
|  | deadlock_part_struct_t *dl_part_ptr = NULL, *dl_part_ptr2 = NULL; | 
|  | list_itr_t *job_iter, *part_iter; | 
|  | bool have_deadlock = false; | 
|  |  | 
|  | if (!job_ptr->het_job_id || !job_ptr->part_ptr) | 
|  | return false; | 
|  |  | 
|  | /* | 
|  | * Find the list representing the ordering of jobs in this specific | 
|  | * partition and add this job in the list, sorted by job start time | 
|  | */ | 
|  | if (!deadlock_global_list) { | 
|  | deadlock_global_list = list_create(_deadlock_global_list_del); | 
|  | } else { | 
|  | dl_part_ptr = list_find_first(deadlock_global_list, | 
|  | _deadlock_global_list_srch, | 
|  | job_ptr->part_ptr); | 
|  | } | 
|  | if (!dl_part_ptr) { | 
|  | dl_part_ptr = xmalloc(sizeof(deadlock_part_struct_t)); | 
|  | dl_part_ptr->deadlock_job_list = list_create(xfree_ptr); | 
|  | dl_part_ptr->part_ptr = job_ptr->part_ptr; | 
|  | list_append(deadlock_global_list, dl_part_ptr); | 
|  | } else { | 
|  | dl_job_ptr = list_find_first(dl_part_ptr->deadlock_job_list, | 
|  | _deadlock_part_list_srch, | 
|  | job_ptr); | 
|  | } | 
|  | if (!dl_job_ptr) { | 
|  | dl_job_ptr = xmalloc(sizeof(deadlock_job_struct_t)); | 
|  | dl_job_ptr->het_job_id = job_ptr->het_job_id; | 
|  | dl_job_ptr->start_time = job_ptr->start_time; | 
|  | list_append(dl_part_ptr->deadlock_job_list, dl_job_ptr); | 
|  | } else if (dl_job_ptr->start_time < job_ptr->start_time) { | 
|  | dl_job_ptr->start_time = job_ptr->start_time; | 
|  | } | 
|  | list_sort(dl_part_ptr->deadlock_job_list, _deadlock_job_list_sort); | 
|  |  | 
|  | /* | 
|  | * Log current table of hetjob start times by partition | 
|  | */ | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_BACKFILL) { | 
|  | part_iter = list_iterator_create(deadlock_global_list); | 
|  | while ((dl_part_ptr2 = list_next(part_iter))){ | 
|  | info("Partition %s Hetjobs:", | 
|  | dl_part_ptr2->part_ptr->name); | 
|  | job_iter = list_iterator_create(dl_part_ptr2-> | 
|  | deadlock_job_list); | 
|  | while ((dl_job_ptr2 = list_next(job_iter))) { | 
|  | info("   Hetjob %u to start at %"PRIu64, | 
|  | dl_job_ptr2->het_job_id, | 
|  | (uint64_t) dl_job_ptr2->start_time); | 
|  | } | 
|  | list_iterator_destroy(job_iter); | 
|  | } | 
|  | list_iterator_destroy(part_iter); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Determine if any hetjobs scheduled to start earlier than this job | 
|  | * in this partition are scheduled to start after it in some other | 
|  | * partition | 
|  | */ | 
|  | part_iter = list_iterator_create(deadlock_global_list); | 
|  | while ((dl_part_ptr2 = list_next(part_iter))){ | 
|  | if (dl_part_ptr2 == dl_part_ptr) /* Current partition, skip it */ | 
|  | continue; | 
|  | dl_job_ptr2 = list_find_first(dl_part_ptr2->deadlock_job_list, | 
|  | _deadlock_part_list_srch, | 
|  | job_ptr); | 
|  | if (!dl_job_ptr2) /* Hetjob not in this partition, no check */ | 
|  | continue; | 
|  | job_iter = list_iterator_create(dl_part_ptr->deadlock_job_list); | 
|  | while ((dl_job_ptr2 = list_next(job_iter))) { | 
|  | if (dl_job_ptr2->het_job_id == dl_job_ptr->het_job_id) | 
|  | break;	/* Self */ | 
|  | dl_job_ptr3 = list_find_first( | 
|  | dl_part_ptr2->deadlock_job_list, | 
|  | _deadlock_part_list_srch2, | 
|  | dl_job_ptr2); | 
|  | if (dl_job_ptr3 && | 
|  | (dl_job_ptr3->start_time < dl_job_ptr->start_time)){ | 
|  | have_deadlock = true; | 
|  | break; | 
|  | } | 
|  | } | 
|  | list_iterator_destroy(job_iter); | 
|  |  | 
|  | if (have_deadlock) | 
|  | log_flag(HETJOB, "Hetjob %u in partition %s would deadlock with hetjob %u in partition %s, skipping it", | 
|  | dl_job_ptr->het_job_id, | 
|  | dl_part_ptr->part_ptr->name, | 
|  | dl_job_ptr3->het_job_id, | 
|  | dl_part_ptr2->part_ptr->name); | 
|  | if (have_deadlock) | 
|  | break; | 
|  | } | 
|  | list_iterator_destroy(part_iter); | 
|  |  | 
|  | return have_deadlock; | 
|  | } | 
|  |  | 
|  | static void _set_bf_exit(bf_exit_t code) | 
|  | { | 
|  | xassert(code < BF_EXIT_COUNT); | 
|  |  | 
|  | slurmctld_diag_stats.bf_exit[code]++; | 
|  | } |