| /*****************************************************************************\ |
| * backfill.c - simple backfill scheduler plugin. |
| * |
| * If a partition does not have root-only access and its nodes are not shared, |
| * then raise the priority of pending jobs if doing so does not adversely |
| * affect the expected initiation of any higher priority job. We do not alter |
| * a job's required or excluded node list, so this is a conservative |
| * algorithm. |
| * |
| * For example, consider a cluster "lx[01-08]" with one job executing on |
| * nodes "lx[01-04]". The highest priority pending job requires five nodes |
| * including "lx05". The next highest priority pending job requires any |
| * three nodes. Without explicitly forcing the second job to use nodes |
| * "lx[06-08]", we can't start it without possibly delaying the higher |
| * priority job. |
| ***************************************************************************** |
| * Copyright (C) 2003-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <https://computing.llnl.gov/linux/slurm/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include <pthread.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <time.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm.h" |
| #include "slurm/slurm_errno.h" |
| |
| #include "src/common/list.h" |
| #include "src/common/macros.h" |
| #include "src/common/node_select.h" |
| #include "src/common/parse_time.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/slurmctld/acct_policy.h" |
| #include "src/slurmctld/job_scheduler.h" |
| #include "src/slurmctld/licenses.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/node_scheduler.h" |
| #include "src/slurmctld/preempt.h" |
| #include "src/slurmctld/reservation.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmctld/srun_comm.h" |
| #include "backfill.h" |
| |
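| /* Default time between backfill scheduling attempts, in seconds; may |
| * be overridden with the "interval=" option in SchedulerParameters */ |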
| #ifndef BACKFILL_INTERVAL |
| # define BACKFILL_INTERVAL 30 |
| #endif |
| |
| /* Do not build job/resource/time records extending more than this |
| * far into the future, in seconds; currently one day */ |
| #ifndef BACKFILL_WINDOW |
| # define BACKFILL_WINDOW (24 * 60 * 60) |
| #endif |
| |
| #define SLURMCTLD_THREAD_LIMIT 5 /* max active RPCs before deferring */ |
| |
| typedef struct node_space_map { |
| time_t begin_time; |
| time_t end_time; |
| bitstr_t *avail_bitmap; |
| int next; /* next record, by time, zero termination */ |
| } node_space_map_t; |
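| /* |
| * Illustrative sketch (assumes a single reservation inside the |
| * backfill window [T0, T0+window]): the records form a time-ordered |
| * list threaded through the array by "next", e.g. |
| * [0] T0..T1, all nodes -> [1] T1..T2, reserved nodes cleared -> |
| * [2] T2..T0+window, all nodes -> next == 0 (end of list) |
| */ |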
| int backfilled_jobs = 0; /* jobs backfilled since slurmctld boot */ |
| |
| /*********************** local variables *********************/ |
| static bool stop_backfill = false; |
| static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER; |
| static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER; |
| static pthread_cond_t term_cond = PTHREAD_COND_INITIALIZER; |
| static bool config_flag = false; |
| static uint32_t debug_flags = 0; |
| static int backfill_interval = BACKFILL_INTERVAL; |
| static int backfill_window = BACKFILL_WINDOW; |
| static int max_backfill_job_cnt = 50; |
| |
| /*********************** local functions *********************/ |
| static void _add_reservation(uint32_t start_time, uint32_t end_reserve, |
| bitstr_t *res_bitmap, |
| node_space_map_t *node_space, |
| int *node_space_recs); |
| static int _attempt_backfill(void); |
| static void _diff_tv_str(struct timeval *tv1, struct timeval *tv2, |
| char *tv_str, int len_tv_str); |
| static bool _job_is_completing(void); |
| static void _load_config(void); |
| static bool _many_pending_rpcs(void); |
| static bool _more_work(time_t last_backfill_time); |
| static void _my_sleep(int secs); |
| static int _num_feature_count(struct job_record *job_ptr); |
| static void _reset_job_time_limit(struct job_record *job_ptr, time_t now, |
| node_space_map_t *node_space); |
| static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap); |
| static bool _test_resv_overlap(node_space_map_t *node_space, |
| bitstr_t *use_bitmap, uint32_t start_time, |
| uint32_t end_reserve); |
| static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, |
| uint32_t min_nodes, uint32_t max_nodes, |
| uint32_t req_nodes); |
| |
| /* Log resource allocation table */ |
| static void _dump_node_space_table(node_space_map_t *node_space_ptr) |
| { |
| int i = 0; |
| char begin_buf[32], end_buf[32], *node_list; |
| |
| info("========================================="); |
| while (1) { |
| slurm_make_time_str(&node_space_ptr[i].begin_time, |
| begin_buf, sizeof(begin_buf)); |
| slurm_make_time_str(&node_space_ptr[i].end_time, |
| end_buf, sizeof(end_buf)); |
| node_list = bitmap2node_name(node_space_ptr[i].avail_bitmap); |
| info("Begin:%s End:%s Nodes:%s", |
| begin_buf, end_buf, node_list); |
| xfree(node_list); |
| if ((i = node_space_ptr[i].next) == 0) |
| break; |
| } |
| info("========================================="); |
| } |
| |
| /* |
| * _diff_tv_str - build a string showing the time difference between two times |
| * IN tv1 - start of event |
| * IN tv2 - end of event |
| * OUT tv_str - place to put delta time in format "usec=%ld" |
| * IN len_tv_str - size of tv_str in bytes |
| */ |
| static void _diff_tv_str(struct timeval *tv1, struct timeval *tv2, |
| char *tv_str, int len_tv_str) |
| { |
| long delta_t; |
| delta_t = (tv2->tv_sec - tv1->tv_sec) * 1000000; |
| delta_t += tv2->tv_usec - tv1->tv_usec; |
| snprintf(tv_str, len_tv_str, "usec=%ld", delta_t); |
| } |
| |
| /* |
| * _job_is_completing - Determine if jobs are in the process of completing. |
| * This is a variant of job_is_completing in slurmctld/job_scheduler.c. |
| * It always gives completing jobs at least 5 secs to complete. |
| * RET - True if any job is in the process of completing |
| */ |
| static bool _job_is_completing(void) |
| { |
| bool completing = false; |
| ListIterator job_iterator; |
| struct job_record *job_ptr = NULL; |
| uint16_t complete_wait = slurm_get_complete_wait(); |
| time_t recent; |
| |
| if (job_list == NULL) |
| return completing; |
| |
| recent = time(NULL) - MAX(complete_wait, 5); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (IS_JOB_COMPLETING(job_ptr) && |
| (job_ptr->end_time >= recent)) { |
| completing = true; |
| break; |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| |
| return completing; |
| } |
| |
| /* |
| * _many_pending_rpcs - Determine if slurmctld is busy with many active RPCs |
| * RET - True if slurmctld currently has more than SLURMCTLD_THREAD_LIMIT |
| * active RPCs |
| */ |
| static bool _many_pending_rpcs(void) |
| { |
| //info("thread_count = %u", slurmctld_config.server_thread_count); |
| if (slurmctld_config.server_thread_count > SLURMCTLD_THREAD_LIMIT) |
| return true; |
| return false; |
| } |
| |
| /* Return the number of features in the job's constraints that |
| * include a count specification */ |
| static int _num_feature_count(struct job_record *job_ptr) |
| { |
| struct job_details *detail_ptr = job_ptr->details; |
| int rc = 0; |
| ListIterator feat_iter; |
| struct feature_record *feat_ptr; |
| |
| if (detail_ptr->feature_list == NULL) /* no constraints */ |
| return rc; |
| |
| feat_iter = list_iterator_create(detail_ptr->feature_list); |
| if (feat_iter == NULL) |
| fatal("list_iterator_create: malloc failure"); |
| while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) { |
| if (feat_ptr->count) |
| rc++; |
| } |
| list_iterator_destroy(feat_iter); |
| |
| return rc; |
| } |
| |
| /* Attempt to schedule a specific job on specific available nodes |
| * IN job_ptr - job to schedule |
| * IN/OUT avail_bitmap - nodes available/selected to use |
| * IN min_nodes - minimum number of nodes the job can use |
| * IN max_nodes - maximum number of nodes the job can use |
| * IN req_nodes - requested (desired) number of nodes |
| * RET SLURM_SUCCESS on success, otherwise an error code |
| */ |
| static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, |
| uint32_t min_nodes, uint32_t max_nodes, |
| uint32_t req_nodes) |
| { |
| bitstr_t *tmp_bitmap; |
| int rc = SLURM_SUCCESS; |
| int feat_cnt = _num_feature_count(job_ptr); |
| List preemptee_candidates = NULL; |
| |
| if (feat_cnt) { |
| /* Ideally schedule the job feature by feature, |
| * but I don't want to add that complexity here |
| * right now, so clear the feature counts and try |
| * to schedule. This will work if there is only |
| * one feature count. It should work fairly well |
| * in cases where there are multiple feature |
| * counts. */ |
| struct job_details *detail_ptr = job_ptr->details; |
| ListIterator feat_iter; |
| struct feature_record *feat_ptr; |
| int i = 0, list_size; |
| uint16_t *feat_cnt_orig = NULL, high_cnt = 0; |
| |
| /* Clear the feature counts */ |
| list_size = list_count(detail_ptr->feature_list); |
| feat_cnt_orig = xmalloc(sizeof(uint16_t) * list_size); |
| feat_iter = list_iterator_create(detail_ptr->feature_list); |
| if (feat_iter == NULL) |
| fatal("list_iterator_create: malloc failure"); |
| while ((feat_ptr = |
| (struct feature_record *) list_next(feat_iter))) { |
| high_cnt = MAX(high_cnt, feat_ptr->count); |
| feat_cnt_orig[i++] = feat_ptr->count; |
| feat_ptr->count = 0; |
| } |
| list_iterator_destroy(feat_iter); |
| |
| if ((job_req_node_filter(job_ptr, *avail_bitmap) != |
| SLURM_SUCCESS) || |
| (bit_set_count(*avail_bitmap) < high_cnt)) { |
| rc = ESLURM_NODES_BUSY; |
| } else { |
| preemptee_candidates = |
| slurm_find_preemptable_jobs(job_ptr); |
| rc = select_g_job_test(job_ptr, *avail_bitmap, |
| high_cnt, max_nodes, req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, NULL); |
| } |
| |
| /* Restore the feature counts */ |
| i = 0; |
| feat_iter = list_iterator_create(detail_ptr->feature_list); |
| while ((feat_ptr = |
| (struct feature_record *) list_next(feat_iter))) { |
| feat_ptr->count = feat_cnt_orig[i++]; |
| } |
| list_iterator_destroy(feat_iter); |
| xfree(feat_cnt_orig); |
| } else { |
| /* Try to schedule the job. First on dedicated nodes |
| * then on shared nodes (if so configured). */ |
| uint16_t orig_shared; |
| time_t now = time(NULL); |
| preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); |
| orig_shared = job_ptr->details->shared; |
| job_ptr->details->shared = 0; |
| tmp_bitmap = bit_copy(*avail_bitmap); |
| rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, |
| max_nodes, req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, NULL); |
| job_ptr->details->shared = orig_shared; |
| if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) && |
| (orig_shared != 0)) { |
| FREE_NULL_BITMAP(*avail_bitmap); |
| *avail_bitmap = tmp_bitmap; |
| rc = select_g_job_test(job_ptr, *avail_bitmap, |
| min_nodes, max_nodes, req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, NULL); |
| } else |
| FREE_NULL_BITMAP(tmp_bitmap); |
| } |
| |
| if (preemptee_candidates) |
| list_destroy(preemptee_candidates); |
| return rc; |
| } |
| |
| /* Terminate backfill_agent */ |
| extern void stop_backfill_agent(void) |
| { |
| pthread_mutex_lock(&term_lock); |
| stop_backfill = true; |
| pthread_cond_signal(&term_cond); |
| pthread_mutex_unlock(&term_lock); |
| } |
| |
| static void _my_sleep(int secs) |
| { |
| struct timespec ts = {0, 0}; |
| |
| /* pthread_cond_timedwait() expects an absolute wakeup time */ |
| ts.tv_sec = time(NULL) + secs; |
| pthread_mutex_lock(&term_lock); |
| if (!stop_backfill) |
| pthread_cond_timedwait(&term_cond, &term_lock, &ts); |
| pthread_mutex_unlock(&term_lock); |
| } |
| |
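| /* |
| * Parse backfill options from SchedulerParameters in slurm.conf. |
| * For example (values illustrative, not defaults): |
| * SchedulerParameters=interval=60,bf_window=2880,max_job_bf=100 |
| * runs the backfill scheduler every 60 seconds, plans no further than |
| * 2880 minutes (48 hours) into the future, and reserves resources for |
| * at most 100 backfill-scheduled jobs. |
| */ |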
| static void _load_config(void) |
| { |
| char *sched_params, *tmp_ptr; |
| |
| sched_params = slurm_get_sched_params(); |
| debug_flags = slurm_get_debug_flags(); |
| |
| if (sched_params && (tmp_ptr=strstr(sched_params, "interval="))) |
| backfill_interval = atoi(tmp_ptr + 9); |
| if (backfill_interval < 1) { |
| fatal("Invalid backfill scheduler interval: %d", |
| backfill_interval); |
| } |
| |
| if (sched_params && (tmp_ptr=strstr(sched_params, "bf_window="))) |
| backfill_window = atoi(tmp_ptr + 10) * 60; /* mins to secs */ |
| if (backfill_window < 1) { |
| fatal("Invalid backfill scheduler window: %d", |
| backfill_window); |
| } |
| if (sched_params && (tmp_ptr=strstr(sched_params, "max_job_bf="))) |
| max_backfill_job_cnt = atoi(tmp_ptr + 11); |
| if (max_backfill_job_cnt < 1) { |
| fatal("Invalid backfill scheduler max_job_bf: %d", |
| max_backfill_job_cnt); |
| } |
| xfree(sched_params); |
| } |
| |
| /* Note that slurm.conf has changed */ |
| extern void backfill_reconfig(void) |
| { |
| config_flag = true; |
| } |
| |
| /* backfill_agent - detached thread periodically attempts to backfill jobs */ |
| extern void *backfill_agent(void *args) |
| { |
| struct timeval tv1, tv2; |
| char tv_str[20]; |
| time_t now; |
| double wait_time; |
| static time_t last_backfill_time = 0; |
| /* Read config and partitions; Write jobs and nodes */ |
| slurmctld_lock_t all_locks = { |
| READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; |
| |
| _load_config(); |
| last_backfill_time = time(NULL); |
| while (!stop_backfill) { |
| _my_sleep(backfill_interval); |
| if (stop_backfill) |
| break; |
| if (config_flag) { |
| config_flag = false; |
| _load_config(); |
| } |
| now = time(NULL); |
| wait_time = difftime(now, last_backfill_time); |
| if ((wait_time < backfill_interval) || |
| _job_is_completing() || _many_pending_rpcs() || |
| !_more_work(last_backfill_time)) |
| continue; |
| |
| gettimeofday(&tv1, NULL); |
| lock_slurmctld(all_locks); |
| while (_attempt_backfill()) ; |
| last_backfill_time = time(NULL); |
| unlock_slurmctld(all_locks); |
| gettimeofday(&tv2, NULL); |
| _diff_tv_str(&tv1, &tv2, tv_str, sizeof(tv_str)); |
| if (debug_flags & DEBUG_FLAG_BACKFILL) |
| info("backfill: completed, %s", tv_str); |
| } |
| return NULL; |
| } |
| |
| /* Return non-zero to break the backfill loop if there has been a change |
| * in job, node, or partition state, or if the backfill scheduler needs |
| * to be stopped. */ |
| static int _yield_locks(void) |
| { |
| slurmctld_lock_t all_locks = { |
| READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; |
| time_t job_update, node_update, part_update; |
| |
| job_update = last_job_update; |
| node_update = last_node_update; |
| part_update = last_part_update; |
| |
| unlock_slurmctld(all_locks); |
| _my_sleep(backfill_interval); |
| lock_slurmctld(all_locks); |
| |
| if ((last_job_update == job_update) && |
| (last_node_update == node_update) && |
| (last_part_update == part_update) && |
| (! stop_backfill) && (! config_flag)) |
| return 0; |
| else |
| return 1; |
| } |
| |
| static int _attempt_backfill(void) |
| { |
| bool filter_root = false; |
| List job_queue; |
| job_queue_rec_t *job_queue_rec; |
| slurmdb_qos_rec_t *qos_ptr = NULL; |
| int i, j, node_space_recs; |
| struct job_record *job_ptr; |
| struct part_record *part_ptr; |
| uint32_t end_time, end_reserve; |
| uint32_t time_limit, comp_time_limit, orig_time_limit; |
| uint32_t min_nodes, max_nodes, req_nodes; |
| bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; |
| time_t now = time(NULL), sched_start, later_start, start_res; |
| node_space_map_t *node_space; |
| static int sched_timeout = 0; |
| int this_sched_timeout = 0, rc = 0; |
| |
| sched_start = now; |
| if (sched_timeout == 0) { |
| /* Yield locks after running for half of the message |
| * timeout, bounded to between 1 and 10 seconds */ |
| sched_timeout = slurm_get_msg_timeout() / 2; |
| sched_timeout = MAX(sched_timeout, 1); |
| sched_timeout = MIN(sched_timeout, 10); |
| } |
| this_sched_timeout = sched_timeout; |
| |
| if (slurm_get_root_filter()) |
| filter_root = true; |
| |
| job_queue = build_job_queue(true); |
| if (list_count(job_queue) <= 1) { |
| debug("backfill: no jobs to backfill"); |
| list_destroy(job_queue); |
| return 0; |
| } |
| |
| node_space = xmalloc(sizeof(node_space_map_t) * |
| (max_backfill_job_cnt + 3)); |
| node_space[0].begin_time = sched_start; |
| node_space[0].end_time = sched_start + backfill_window; |
| node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); |
| node_space[0].next = 0; |
| node_space_recs = 1; |
| if (debug_flags & DEBUG_FLAG_BACKFILL) |
| _dump_node_space_table(node_space); |
| |
| while ((job_queue_rec = (job_queue_rec_t *) |
| list_pop_bottom(job_queue, sort_job_queue2))) { |
| job_ptr = job_queue_rec->job_ptr; |
| part_ptr = job_queue_rec->part_ptr; |
| xfree(job_queue_rec); |
| if (!IS_JOB_PENDING(job_ptr)) |
| continue; /* started in other partition */ |
| job_ptr->part_ptr = part_ptr; |
| /* Set qos_ptr here so the QOS_FLAG_NO_RESERVE tests below |
| * examine this job's QOS rather than the previous job's */ |
| qos_ptr = job_ptr->qos_ptr; |
| |
| if (debug_flags & DEBUG_FLAG_BACKFILL) |
| info("backfill test for job %u", job_ptr->job_id); |
| |
| if ((job_ptr->state_reason == WAIT_ASSOC_JOB_LIMIT) || |
| (job_ptr->state_reason == WAIT_ASSOC_RESOURCE_LIMIT) || |
| (job_ptr->state_reason == WAIT_ASSOC_TIME_LIMIT)) { |
| debug2("backfill: job %u is not allowed to run now. " |
| "Skipping it. State=%s. Reason=%s. Priority=%u", |
| job_ptr->job_id, |
| job_state_string(job_ptr->job_state), |
| job_reason_string(job_ptr->state_reason), |
| job_ptr->priority); |
| continue; |
| } |
| |
| if (((part_ptr->state_up & PARTITION_SCHED) == 0) || |
| (part_ptr->node_bitmap == NULL)) |
| continue; |
| if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root) |
| continue; |
| |
| if ((!job_independent(job_ptr, 0)) || |
| (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) |
| continue; |
| |
| /* Determine minimum and maximum node counts */ |
| min_nodes = MAX(job_ptr->details->min_nodes, |
| part_ptr->min_nodes); |
| if (job_ptr->details->max_nodes == 0) |
| max_nodes = part_ptr->max_nodes; |
| else |
| max_nodes = MIN(job_ptr->details->max_nodes, |
| part_ptr->max_nodes); |
| max_nodes = MIN(max_nodes, 500000); /* prevent overflows */ |
| if (job_ptr->details->max_nodes) |
| req_nodes = max_nodes; |
| else |
| req_nodes = min_nodes; |
| if (min_nodes > max_nodes) { |
| /* job's min_nodes exceeds partition's max_nodes */ |
| continue; |
| } |
| |
| /* Determine job's expected completion time */ |
| if (job_ptr->time_limit == NO_VAL) { |
| if (part_ptr->max_time == INFINITE) |
| time_limit = 365 * 24 * 60; /* one year */ |
| else |
| time_limit = part_ptr->max_time; |
| } else { |
| if (part_ptr->max_time == INFINITE) |
| time_limit = job_ptr->time_limit; |
| else |
| time_limit = MIN(job_ptr->time_limit, |
| part_ptr->max_time); |
| } |
| comp_time_limit = time_limit; |
| orig_time_limit = job_ptr->time_limit; |
| if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) |
| time_limit = job_ptr->time_limit = 1; |
| else if (job_ptr->time_min && (job_ptr->time_min < time_limit)) |
| time_limit = job_ptr->time_limit = job_ptr->time_min; |
| |
| /* Determine impact of any resource reservations */ |
| later_start = now; |
| TRY_LATER: FREE_NULL_BITMAP(avail_bitmap); |
| start_res = later_start; |
| later_start = 0; |
| j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap); |
| if (j != SLURM_SUCCESS) { |
| job_ptr->time_limit = orig_time_limit; |
| continue; |
| } |
| if (start_res > now) |
| end_time = (time_limit * 60) + start_res; |
| else |
| end_time = (time_limit * 60) + now; |
| |
| /* Identify usable nodes for this job */ |
| bit_and(avail_bitmap, part_ptr->node_bitmap); |
| bit_and(avail_bitmap, up_node_bitmap); |
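| /* Scan the node_space list in time order: AND away nodes that |
| * are unavailable in any record overlapping the interval |
| * [start_res, end_time], and remember the first record boundary |
| * after start_res as a candidate later start time */ |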
| for (j=0; ; ) { |
| if ((node_space[j].end_time > start_res) && |
| node_space[j].next && (later_start == 0)) |
| later_start = node_space[j].end_time; |
| if (node_space[j].end_time <= start_res) |
| ; |
| else if (node_space[j].begin_time <= end_time) { |
| bit_and(avail_bitmap, |
| node_space[j].avail_bitmap); |
| } else |
| break; |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| |
| if (job_ptr->details->exc_node_bitmap) { |
| bit_not(job_ptr->details->exc_node_bitmap); |
| bit_and(avail_bitmap, |
| job_ptr->details->exc_node_bitmap); |
| bit_not(job_ptr->details->exc_node_bitmap); |
| } |
| |
| /* Test if insufficient nodes remain OR |
| * required nodes missing OR |
| * nodes lack features */ |
| if ((bit_set_count(avail_bitmap) < min_nodes) || |
| ((job_ptr->details->req_node_bitmap) && |
| (!bit_super_set(job_ptr->details->req_node_bitmap, |
| avail_bitmap))) || |
| (job_req_node_filter(job_ptr, avail_bitmap))) { |
| if (later_start) { |
| job_ptr->start_time = 0; |
| goto TRY_LATER; |
| } |
| job_ptr->time_limit = orig_time_limit; |
| continue; |
| } |
| |
| /* Identify nodes which are definitely off limits */ |
| FREE_NULL_BITMAP(resv_bitmap); |
| resv_bitmap = bit_copy(avail_bitmap); |
| bit_not(resv_bitmap); |
| |
| if ((time(NULL) - sched_start) >= this_sched_timeout) { |
| debug("backfill: loop taking too long, yielding locks"); |
| if (_yield_locks()) { |
| debug("backfill: system state changed, " |
| "breaking out"); |
| rc = 1; |
| break; |
| } else { |
| this_sched_timeout += sched_timeout; |
| } |
| } |
| /* this is the time consuming operation */ |
| debug2("backfill: entering _try_sched for job %u.", |
| job_ptr->job_id); |
| j = _try_sched(job_ptr, &avail_bitmap, |
| min_nodes, max_nodes, req_nodes); |
| debug2("backfill: finished _try_sched for job %u.", |
| job_ptr->job_id); |
| now = time(NULL); |
| if (j != SLURM_SUCCESS) { |
| job_ptr->time_limit = orig_time_limit; |
| job_ptr->start_time = 0; |
| continue; /* not runnable */ |
| } |
| |
| if (start_res > job_ptr->start_time) { |
| job_ptr->start_time = start_res; |
| last_job_update = now; |
| } |
| if (job_ptr->start_time <= now) { |
| int rc = _start_job(job_ptr, resv_bitmap); |
| if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) |
| job_ptr->time_limit = orig_time_limit; |
| else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { |
| /* Set time limit as high as possible */ |
| job_ptr->time_limit = comp_time_limit; |
| job_ptr->end_time = job_ptr->start_time + |
| (comp_time_limit * 60); |
| _reset_job_time_limit(job_ptr, now, |
| node_space); |
| time_limit = job_ptr->time_limit; |
| } else { |
| job_ptr->time_limit = orig_time_limit; |
| } |
| if (rc == ESLURM_ACCOUNTING_POLICY) { |
| /* Unknown future start time, just skip job */ |
| job_ptr->start_time = 0; |
| continue; |
| } else if (rc != SLURM_SUCCESS) { |
| /* Planned to start job, but something bad |
| * happened. */ |
| job_ptr->start_time = 0; |
| break; |
| } else { |
| /* Started this job, move to next one */ |
| continue; |
| } |
| } else |
| job_ptr->time_limit = orig_time_limit; |
| |
| if (later_start && (job_ptr->start_time > later_start)) { |
| /* Try later when some nodes currently reserved for |
| * pending jobs are free */ |
| job_ptr->start_time = 0; |
| goto TRY_LATER; |
| } |
| |
| if (job_ptr->start_time > (sched_start + backfill_window)) { |
| /* Starts too far in the future to worry about */ |
| continue; |
| } |
| |
| if (node_space_recs >= max_backfill_job_cnt) { |
| /* Already have too many jobs to deal with */ |
| break; |
| } |
| |
| end_reserve = job_ptr->start_time + (time_limit * 60); |
| if (_test_resv_overlap(node_space, avail_bitmap, |
| job_ptr->start_time, end_reserve)) { |
| /* This job would overlap a reservation the backfill |
| * scheduler previously made for another pending job. |
| * Try again later. */ |
| later_start = job_ptr->start_time; |
| job_ptr->start_time = 0; |
| goto TRY_LATER; |
| } |
| |
| /* |
| * Add reservation to scheduling table if appropriate |
| */ |
| if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) |
| continue; |
| bit_not(avail_bitmap); /* bits now set for nodes unused by this job */ |
| _add_reservation(job_ptr->start_time, end_reserve, |
| avail_bitmap, node_space, &node_space_recs); |
| if (debug_flags & DEBUG_FLAG_BACKFILL) |
| _dump_node_space_table(node_space); |
| } |
| FREE_NULL_BITMAP(avail_bitmap); |
| FREE_NULL_BITMAP(resv_bitmap); |
| |
| for (i=0; ; ) { |
| FREE_NULL_BITMAP(node_space[i].avail_bitmap); |
| if ((i = node_space[i].next) == 0) |
| break; |
| } |
| xfree(node_space); |
| list_destroy(job_queue); |
| return rc; |
| } |
| |
| /* Try to start the job on any non-reserved nodes */ |
| static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap) |
| { |
| int rc; |
| bitstr_t *orig_exc_nodes = NULL; |
| static uint32_t fail_jobid = 0; |
| |
| if (job_ptr->details->exc_node_bitmap) { |
| orig_exc_nodes = bit_copy(job_ptr->details->exc_node_bitmap); |
| bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap); |
| } else |
| job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap); |
| |
| rc = select_nodes(job_ptr, false, NULL); |
| FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); |
| job_ptr->details->exc_node_bitmap = orig_exc_nodes; |
| if (rc == SLURM_SUCCESS) { |
| /* job initiated */ |
| last_job_update = time(NULL); |
| info("backfill: Started JobId=%u on %s", |
| job_ptr->job_id, job_ptr->nodes); |
| if (job_ptr->batch_flag == 0) |
| srun_allocate(job_ptr->job_id); |
| else if (job_ptr->details->prolog_running == 0) |
| launch_job(job_ptr); |
| backfilled_jobs++; |
| if (debug_flags & DEBUG_FLAG_BACKFILL) { |
| info("backfill: Jobs backfilled since boot: %d", |
| backfilled_jobs); |
| } |
| } else if ((job_ptr->job_id != fail_jobid) && |
| (rc != ESLURM_ACCOUNTING_POLICY)) { |
| char *node_list; |
| bit_not(resv_bitmap); |
| node_list = bitmap2node_name(resv_bitmap); |
| /* This happens when a job has sharing disabled and |
| * a selected node is still completing some job, |
| * which should be a temporary situation. */ |
| verbose("backfill: Failed to start JobId=%u on %s: %s", |
| job_ptr->job_id, node_list, slurm_strerror(rc)); |
| xfree(node_list); |
| fail_jobid = job_ptr->job_id; |
| } else { |
| debug3("backfill: Failed to start JobId=%u: %s", |
| job_ptr->job_id, slurm_strerror(rc)); |
| } |
| |
| return rc; |
| } |
| |
| /* Reset a job's time limit (and end_time) as high as possible |
| * within the range job_ptr->time_min and job_ptr->time_limit. |
| * Avoid using resources reserved for pending jobs or in resource |
| * reservations */ |
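| /* Example (illustrative): a job with time_min=30 and time_limit=120 |
| * whose nodes overlap a reservation beginning 90 minutes from now has |
| * its limit cut to 90 minutes; the limit is never cut below time_min */ |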
| static void _reset_job_time_limit(struct job_record *job_ptr, time_t now, |
| node_space_map_t *node_space) |
| { |
| int32_t j, resv_delay; |
| uint32_t orig_time_limit = job_ptr->time_limit; |
| |
| for (j=0; ; ) { |
| if ((node_space[j].begin_time != now) && |
| (node_space[j].begin_time < job_ptr->end_time) && |
| (!bit_super_set(job_ptr->node_bitmap, |
| node_space[j].avail_bitmap))) { |
| /* Job overlaps pending job's resource reservation */ |
| resv_delay = difftime(node_space[j].begin_time, now); |
| resv_delay /= 60; /* seconds to minutes */ |
| if (resv_delay < job_ptr->time_limit) |
| job_ptr->time_limit = resv_delay; |
| } |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| job_ptr->time_limit = MAX(job_ptr->time_min, job_ptr->time_limit); |
| job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60); |
| |
| job_time_adj_resv(job_ptr); |
| |
| if (orig_time_limit != job_ptr->time_limit) { |
| info("backfill: job %u time limit changed from %u to %u", |
| job_ptr->job_id, orig_time_limit, job_ptr->time_limit); |
| } |
| } |
| |
| /* Report if any changes occurred to job, node or partition information */ |
| static bool _more_work (time_t last_backfill_time) |
| { |
| bool rc = false; |
| |
| pthread_mutex_lock( &thread_flag_mutex ); |
| if ( (last_job_update >= last_backfill_time ) || |
| (last_node_update >= last_backfill_time ) || |
| (last_part_update >= last_backfill_time ) ) { |
| rc = true; |
| } |
| pthread_mutex_unlock( &thread_flag_mutex ); |
| return rc; |
| } |
| |
| /* Create a reservation for a job in the future */ |
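| /* Example of the record splitting below (illustrative): starting from |
| * a single record spanning 10:00-22:00, reserving 12:00-14:00 first |
| * splits the record at 12:00, then at 14:00, leaving 10:00-12:00, |
| * 12:00-14:00 and 14:00-22:00; the reserved nodes are then cleared |
| * from the bitmap of every record inside 12:00-14:00 */ |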
| static void _add_reservation(uint32_t start_time, uint32_t end_reserve, |
| bitstr_t *res_bitmap, |
| node_space_map_t *node_space, |
| int *node_space_recs) |
| { |
| bool placed = false; |
| int i, j; |
| |
| for (j=0; ; ) { |
| if (node_space[j].end_time > start_time) { |
| /* insert start entry record */ |
| i = *node_space_recs; |
| node_space[i].begin_time = start_time; |
| node_space[i].end_time = node_space[j].end_time; |
| node_space[j].end_time = start_time; |
| node_space[i].avail_bitmap = |
| bit_copy(node_space[j].avail_bitmap); |
| node_space[i].next = node_space[j].next; |
| node_space[j].next = i; |
| (*node_space_recs)++; |
| placed = true; |
| } |
| if (node_space[j].end_time == start_time) { |
| /* no need to insert new start entry record */ |
| placed = true; |
| } |
| if (placed == true) { |
| j = node_space[j].next; |
| if (j && (end_reserve < node_space[j].end_time)) { |
| /* insert end entry record */ |
| i = *node_space_recs; |
| node_space[i].begin_time = end_reserve; |
| node_space[i].end_time = node_space[j].end_time; |
| node_space[j].end_time = end_reserve; |
| node_space[i].avail_bitmap = |
| bit_copy(node_space[j].avail_bitmap); |
| node_space[i].next = node_space[j].next; |
| node_space[j].next = i; |
| (*node_space_recs)++; |
| } |
| break; |
| } |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| |
| for (j=0; ; ) { |
| if ((node_space[j].begin_time >= start_time) && |
| (node_space[j].end_time <= end_reserve)) |
| bit_and(node_space[j].avail_bitmap, res_bitmap); |
| if ((node_space[j].begin_time >= end_reserve) || |
| ((j = node_space[j].next) == 0)) |
| break; |
| } |
| } |
| |
| /* |
| * Determine if the resource specification for a new job overlaps with a |
| * reservation that the backfill scheduler has made for a job to be |
| * started in the future. |
| * IN node_space - map of available resources through time |
| * IN use_bitmap - nodes to be allocated |
| * IN start_time - start time of job |
| * IN end_reserve - end time of job |
| * RET true if the job overlaps an existing reservation |
| */ |
| static bool _test_resv_overlap(node_space_map_t *node_space, |
| bitstr_t *use_bitmap, uint32_t start_time, |
| uint32_t end_reserve) |
| { |
| bool overlap = false; |
| int j; |
| |
| for (j=0; ; ) { |
| if ((node_space[j].end_time > start_time) && |
| (node_space[j].begin_time < end_reserve) && |
| (!bit_super_set(use_bitmap, node_space[j].avail_bitmap))) { |
| overlap = true; |
| break; |
| } |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| return overlap; |
| } |