| /*****************************************************************************\ |
| * backfill.c - simple backfill scheduler plugin. |
| * |
| * If a partition does not have root-only access and its nodes are not shared, |
| * then raise the priority of pending jobs if doing so does not adversely |
| * affect the expected initiation of any higher priority job. We do not alter |
| * a job's required or excluded node list, so this is a conservative |
| * algorithm. |
| * |
| * For example, consider a cluster "lx[01-08]" with one job executing on |
| * nodes "lx[01-04]". The highest priority pending job requires five nodes |
| * including "lx05". The next highest priority pending job requires any |
| * three nodes. Without explicitly forcing the second job to use nodes |
| * "lx[06-08]", we can't start it without possibly delaying the higher |
| * priority job. |
| ***************************************************************************** |
| * Copyright (C) 2003-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <https://computing.llnl.gov/linux/slurm/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include <pthread.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <time.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm.h" |
| #include "slurm/slurm_errno.h" |
| |
| #include "src/common/list.h" |
| #include "src/common/macros.h" |
| #include "src/common/node_select.h" |
| #include "src/common/parse_time.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/slurmctld/acct_policy.h" |
| #include "src/slurmctld/job_scheduler.h" |
| #include "src/slurmctld/licenses.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/node_scheduler.h" |
| #include "src/slurmctld/preempt.h" |
| #include "src/slurmctld/reservation.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmctld/srun_comm.h" |
| #include "backfill.h" |
| |
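| /* Default time between backfill scheduling attempts, in seconds; may |
| * be overridden with the "interval=" option in SchedulerParameters */ |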
| #ifndef BACKFILL_INTERVAL |
| # define BACKFILL_INTERVAL 30 |
| #endif |
| |
| /* Do not build job/resource/time records extending more than this |
| * far into the future, in seconds; currently one day */ |
| #ifndef BACKFILL_WINDOW |
| # define BACKFILL_WINDOW (24 * 60 * 60) |
| #endif |
| |
| #define SLURMCTLD_THREAD_LIMIT 5 /* max active RPCs before deferring */ |
| |
| typedef struct node_space_map { |
| time_t begin_time; |
| time_t end_time; |
| bitstr_t *avail_bitmap; |
| int next; /* next record, by time, zero termination */ |
| } node_space_map_t; |
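| /* |
| * Illustrative sketch (assumes a single reservation inside the |
| * backfill window [T0, T0+window]): the records form a time-ordered |
| * list threaded through the array by "next", e.g. |
| * [0] T0..T1, all nodes -> [1] T1..T2, reserved nodes cleared -> |
| * [2] T2..T0+window, all nodes -> next == 0 (end of list) |
| */ |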
| int backfilled_jobs = 0; /* jobs backfilled since slurmctld boot */ |
| |
| /*********************** local variables *********************/ |
| static bool stop_backfill = false; |
| static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER; |
| static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER; |
| static pthread_cond_t term_cond = PTHREAD_COND_INITIALIZER; |
| static bool config_flag = false; |
| static uint32_t debug_flags = 0; |
| static int backfill_interval = BACKFILL_INTERVAL; |
| static int backfill_window = BACKFILL_WINDOW; |
| static int max_backfill_job_cnt = 50; |
| |
| /*********************** local functions *********************/ |
| static void _add_reservation(uint32_t start_time, uint32_t end_reserve, |
| bitstr_t *res_bitmap, |
| node_space_map_t *node_space, |
| int *node_space_recs); |
| static int _attempt_backfill(void); |
| static void _diff_tv_str(struct timeval *tv1, struct timeval *tv2, |
| char *tv_str, int len_tv_str); |
| static bool _job_is_completing(void); |
| static void _load_config(void); |
| static bool _many_pending_rpcs(void); |
| static bool _more_work(time_t last_backfill_time); |
| static void _my_sleep(int secs); |
| static int _num_feature_count(struct job_record *job_ptr); |
| static void _reset_job_time_limit(struct job_record *job_ptr, time_t now, |
| node_space_map_t *node_space); |
| static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap); |
| static bool _test_resv_overlap(node_space_map_t *node_space, |
| bitstr_t *use_bitmap, uint32_t start_time, |
| uint32_t end_reserve); |
| static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, |
| uint32_t min_nodes, uint32_t max_nodes, |
| uint32_t req_nodes); |
| |
| /* Log resource allocation table */ |
| static void _dump_node_space_table(node_space_map_t *node_space_ptr) |
| { |
| int i = 0; |
| char begin_buf[32], end_buf[32], *node_list; |
| |
| info("========================================="); |
| while (1) { |
| slurm_make_time_str(&node_space_ptr[i].begin_time, |
| begin_buf, sizeof(begin_buf)); |
| slurm_make_time_str(&node_space_ptr[i].end_time, |
| end_buf, sizeof(end_buf)); |
| node_list = bitmap2node_name(node_space_ptr[i].avail_bitmap); |
| info("Begin:%s End:%s Nodes:%s", |
| begin_buf, end_buf, node_list); |
| xfree(node_list); |
| if ((i = node_space_ptr[i].next) == 0) |
| break; |
| } |
| info("========================================="); |
| } |
| |
| /* |
| * _diff_tv_str - build a string showing the time difference between two times |
| * IN tv1 - start of event |
| * IN tv2 - end of event |
| * OUT tv_str - place to put delta time in format "usec=%ld" |
| * IN len_tv_str - size of tv_str in bytes |
| */ |
| static void _diff_tv_str(struct timeval *tv1, struct timeval *tv2, |
| char *tv_str, int len_tv_str) |
| { |
| long delta_t; |
| delta_t = (tv2->tv_sec - tv1->tv_sec) * 1000000; |
| delta_t += tv2->tv_usec - tv1->tv_usec; |
| snprintf(tv_str, len_tv_str, "usec=%ld", delta_t); |
| } |
| |
| /* |
| * _job_is_completing - Determine if jobs are in the process of completing. |
| * This is a variant of job_is_completing in slurmctld/job_scheduler.c. |
| * It always gives completing jobs at least 5 secs to complete. |
| * RET - True if any job is in the process of completing |
| */ |
| static bool _job_is_completing(void) |
| { |
| bool completing = false; |
| ListIterator job_iterator; |
| struct job_record *job_ptr = NULL; |
| uint16_t complete_wait = slurm_get_complete_wait(); |
| time_t recent; |
| |
| if (job_list == NULL) |
| return completing; |
| |
| recent = time(NULL) - MAX(complete_wait, 5); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (IS_JOB_COMPLETING(job_ptr) && |
| (job_ptr->end_time >= recent)) { |
| completing = true; |
| break; |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| |
| return completing; |
| } |
| |
| /* |
| * _many_pending_rpcs - Determine if slurmctld is busy with many active RPCs |
| * RET - True if slurmctld currently has more than SLURMCTLD_THREAD_LIMIT |
| * active RPCs |
| */ |
| static bool _many_pending_rpcs(void) |
| { |
| //info("thread_count = %u", slurmctld_config.server_thread_count); |
| if (slurmctld_config.server_thread_count > SLURMCTLD_THREAD_LIMIT) |
| return true; |
| return false; |
| } |
| |
| /* Return the number of features in the job's constraints that |
| * include a count specification */ |
| static int _num_feature_count(struct job_record *job_ptr) |
| { |
| struct job_details *detail_ptr = job_ptr->details; |
| int rc = 0; |
| ListIterator feat_iter; |
| struct feature_record *feat_ptr; |
| |
| if (detail_ptr->feature_list == NULL) /* no constraints */ |
| return rc; |
| |
| feat_iter = list_iterator_create(detail_ptr->feature_list); |
| if (feat_iter == NULL) |
| fatal("list_iterator_create: malloc failure"); |
| while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) { |
| if (feat_ptr->count) |
| rc++; |
| } |
| list_iterator_destroy(feat_iter); |
| |
| return rc; |
| } |
| |
| /* Attempt to schedule a specific job on specific available nodes |
| * IN job_ptr - job to schedule |
| * IN/OUT avail_bitmap - nodes available/selected to use |
| * IN min_nodes - minimum number of nodes the job can use |
| * IN max_nodes - maximum number of nodes the job can use |
| * IN req_nodes - requested (desired) number of nodes |
| * RET SLURM_SUCCESS on success, otherwise an error code |
| */ |
| static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, |
| uint32_t min_nodes, uint32_t max_nodes, |
| uint32_t req_nodes) |
| { |
| bitstr_t *tmp_bitmap; |
| int rc = SLURM_SUCCESS; |
| int feat_cnt = _num_feature_count(job_ptr); |
| List preemptee_candidates = NULL; |
| |
| if (feat_cnt) { |
| /* Ideally schedule the job feature by feature, |
| * but I don't want to add that complexity here |
| * right now, so clear the feature counts and try |
| * to schedule. This will work if there is only |
| * one feature count. It should work fairly well |
| * in cases where there are multiple feature |
| * counts. */ |
| struct job_details *detail_ptr = job_ptr->details; |
| ListIterator feat_iter; |
| struct feature_record *feat_ptr; |
| int i = 0, list_size; |
| uint16_t *feat_cnt_orig = NULL, high_cnt = 0; |
| |
| /* Clear the feature counts */ |
| list_size = list_count(detail_ptr->feature_list); |
| feat_cnt_orig = xmalloc(sizeof(uint16_t) * list_size); |
| feat_iter = list_iterator_create(detail_ptr->feature_list); |
| if (feat_iter == NULL) |
| fatal("list_iterator_create: malloc failure"); |
| while ((feat_ptr = |
| (struct feature_record *) list_next(feat_iter))) { |
| high_cnt = MAX(high_cnt, feat_ptr->count); |
| feat_cnt_orig[i++] = feat_ptr->count; |
| feat_ptr->count = 0; |
| } |
| list_iterator_destroy(feat_iter); |
| |
| if ((job_req_node_filter(job_ptr, *avail_bitmap) != |
| SLURM_SUCCESS) || |
| (bit_set_count(*avail_bitmap) < high_cnt)) { |
| rc = ESLURM_NODES_BUSY; |
| } else { |
| preemptee_candidates = |
| slurm_find_preemptable_jobs(job_ptr); |
| rc = select_g_job_test(job_ptr, *avail_bitmap, |
| high_cnt, max_nodes, req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, NULL); |
| } |
| |
| /* Restore the feature counts */ |
| i = 0; |
| feat_iter = list_iterator_create(detail_ptr->feature_list); |
| while ((feat_ptr = |
| (struct feature_record *) list_next(feat_iter))) { |
| feat_ptr->count = feat_cnt_orig[i++]; |
| } |
| list_iterator_destroy(feat_iter); |
| xfree(feat_cnt_orig); |
| } else { |
| /* Try to schedule the job. First on dedicated nodes |
| * then on shared nodes (if so configured). */ |
| uint16_t orig_shared; |
| time_t now = time(NULL); |
| preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); |
| orig_shared = job_ptr->details->shared; |
| job_ptr->details->shared = 0; |
| tmp_bitmap = bit_copy(*avail_bitmap); |
| rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, |
| max_nodes, req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, NULL); |
| job_ptr->details->shared = orig_shared; |
| if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) && |
| (orig_shared != 0)) { |
| FREE_NULL_BITMAP(*avail_bitmap); |
| *avail_bitmap = tmp_bitmap; |
| rc = select_g_job_test(job_ptr, *avail_bitmap, |
| min_nodes, max_nodes, req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, NULL); |
| } else |
| FREE_NULL_BITMAP(tmp_bitmap); |
| } |
| |
| if (preemptee_candidates) |
| list_destroy(preemptee_candidates); |
| return rc; |
| } |
| |
| /* Terminate backfill_agent */ |
| extern void stop_backfill_agent(void) |
| { |
| pthread_mutex_lock(&term_lock); |
| stop_backfill = true; |
| pthread_cond_signal(&term_cond); |
| pthread_mutex_unlock(&term_lock); |
| } |
| |
| static void _my_sleep(int secs) |
| { |
| struct timespec ts = {0, 0}; |
| |
| /* pthread_cond_timedwait() expects an absolute wakeup time */ |
| ts.tv_sec = time(NULL) + secs; |
| pthread_mutex_lock(&term_lock); |
| if (!stop_backfill) |
| pthread_cond_timedwait(&term_cond, &term_lock, &ts); |
| pthread_mutex_unlock(&term_lock); |
| } |
| |
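| /* |
| * Parse backfill options from SchedulerParameters in slurm.conf. |
| * For example (values illustrative, not defaults): |
| * SchedulerParameters=interval=60,bf_window=2880,max_job_bf=100 |
| * runs the backfill scheduler every 60 seconds, plans no further than |
| * 2880 minutes (48 hours) into the future, and reserves resources for |
| * at most 100 backfill-scheduled jobs. |
| */ |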
| static void _load_config(void) |
| { |
| char *sched_params, *tmp_ptr; |
| |
| sched_params = slurm_get_sched_params(); |
| debug_flags = slurm_get_debug_flags(); |
| |
| if (sched_params && (tmp_ptr=strstr(sched_params, "interval="))) |
| backfill_interval = atoi(tmp_ptr + 9); |
| if (backfill_interval < 1) { |
| fatal("Invalid backfill scheduler interval: %d", |
| backfill_interval); |
| } |
| |
| if (sched_params && (tmp_ptr=strstr(sched_params, "bf_window="))) |
| backfill_window = atoi(tmp_ptr + 10) * 60; /* mins to secs */ |
| if (backfill_window < 1) { |
| fatal("Invalid backfill scheduler window: %d", |
| backfill_window); |
| } |
| if (sched_params && (tmp_ptr=strstr(sched_params, "max_job_bf="))) |
| max_backfill_job_cnt = atoi(tmp_ptr + 11); |
| if (max_backfill_job_cnt < 1) { |
| fatal("Invalid backfill scheduler max_job_bf: %d", |
| max_backfill_job_cnt); |
| } |
| xfree(sched_params); |
| } |
| |
| /* Note that slurm.conf has changed */ |
| extern void backfill_reconfig(void) |
| { |
| config_flag = true; |
| } |
| |
| /* backfill_agent - detached thread periodically attempts to backfill jobs */ |
| extern void *backfill_agent(void *args) |
| { |
| struct timeval tv1, tv2; |
| char tv_str[20]; |
| time_t now; |
| double wait_time; |
| static time_t last_backfill_time = 0; |
| /* Read config and partitions; Write jobs and nodes */ |
| slurmctld_lock_t all_locks = { |
| READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; |
| |
| _load_config(); |
| last_backfill_time = time(NULL); |
| while (!stop_backfill) { |
| _my_sleep(backfill_interval); |
| if (stop_backfill) |
| break; |
| if (config_flag) { |
| config_flag = false; |
| _load_config(); |
| } |
| now = time(NULL); |
| wait_time = difftime(now, last_backfill_time); |
| if ((wait_time < backfill_interval) || |
| _job_is_completing() || _many_pending_rpcs() || |
| !_more_work(last_backfill_time)) |
| continue; |
| |
| gettimeofday(&tv1, NULL); |
| lock_slurmctld(all_locks); |
| while (_attempt_backfill()) ; |
| last_backfill_time = time(NULL); |
| unlock_slurmctld(all_locks); |
| gettimeofday(&tv2, NULL); |
| _diff_tv_str(&tv1, &tv2, tv_str, sizeof(tv_str)); |
| if (debug_flags & DEBUG_FLAG_BACKFILL) |
| info("backfill: completed, %s", tv_str); |
| } |
| return NULL; |
| } |
| |
| /* Return non-zero to break the backfill loop if there has been a change |
| * in job, node, or partition state, or if the backfill scheduler needs |
| * to be stopped. */ |
| static int _yield_locks(void) |
| { |
| slurmctld_lock_t all_locks = { |
| READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; |
| time_t job_update, node_update, part_update; |
| |
| job_update = last_job_update; |
| node_update = last_node_update; |
| part_update = last_part_update; |
| |
| unlock_slurmctld(all_locks); |
| _my_sleep(backfill_interval); |
| lock_slurmctld(all_locks); |
| |
| if ((last_job_update == job_update) && |
| (last_node_update == node_update) && |
| (last_part_update == part_update) && |
| (! stop_backfill) && (! config_flag)) |
| return 0; |
| else |
| return 1; |
| } |
| |
| static int _attempt_backfill(void) |
| { |
| bool filter_root = false; |
| List job_queue; |
| job_queue_rec_t *job_queue_rec; |
| slurmdb_qos_rec_t *qos_ptr = NULL; |
| int i, j, node_space_recs; |
| struct job_record *job_ptr; |
| struct part_record *part_ptr; |
| uint32_t end_time, end_reserve; |
| uint32_t time_limit, comp_time_limit, orig_time_limit; |
| uint32_t min_nodes, max_nodes, req_nodes; |
| bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; |
| time_t now = time(NULL), sched_start, later_start, start_res; |
| node_space_map_t *node_space; |
| static int sched_timeout = 0; |
| int this_sched_timeout = 0, rc = 0; |
| |
| sched_start = now; |
| if (sched_timeout == 0) { |
| /* Yield locks after running for half of the message |
| * timeout, bounded to between 1 and 10 seconds */ |
| sched_timeout = slurm_get_msg_timeout() / 2; |
| sched_timeout = MAX(sched_timeout, 1); |
| sched_timeout = MIN(sched_timeout, 10); |
| } |
| this_sched_timeout = sched_timeout; |
| |
| if (slurm_get_root_filter()) |
| filter_root = true; |
| |
| job_queue = build_job_queue(true); |
| if (list_count(job_queue) <= 1) { |
| debug("backfill: no jobs to backfill"); |
| list_destroy(job_queue); |
| return 0; |
| } |
| |
| node_space = xmalloc(sizeof(node_space_map_t) * |
| (max_backfill_job_cnt + 3)); |
| node_space[0].begin_time = sched_start; |
| node_space[0].end_time = sched_start + backfill_window; |
| node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); |
| node_space[0].next = 0; |
| node_space_recs = 1; |
| if (debug_flags & DEBUG_FLAG_BACKFILL) |
| _dump_node_space_table(node_space); |
| |
| while ((job_queue_rec = (job_queue_rec_t *) |
| list_pop_bottom(job_queue, sort_job_queue2))) { |
| job_ptr = job_queue_rec->job_ptr; |
| part_ptr = job_queue_rec->part_ptr; |
| xfree(job_queue_rec); |
| if (!IS_JOB_PENDING(job_ptr)) |
| continue; /* started in other partition */ |
| job_ptr->part_ptr = part_ptr; |
| /* Set qos_ptr here so the QOS_FLAG_NO_RESERVE tests below |
| * examine this job's QOS rather than the previous job's */ |
| qos_ptr = job_ptr->qos_ptr; |
| |
| if (debug_flags & DEBUG_FLAG_BACKFILL) |
| info("backfill test for job %u", job_ptr->job_id); |
| |
| if ((job_ptr->state_reason == WAIT_ASSOC_JOB_LIMIT) || |
| (job_ptr->state_reason == WAIT_ASSOC_RESOURCE_LIMIT) || |
| (job_ptr->state_reason == WAIT_ASSOC_TIME_LIMIT)) { |
| debug2("backfill: job %u is not allowed to run now. " |
| "Skipping it. State=%s. Reason=%s. Priority=%u", |
| job_ptr->job_id, |
| job_state_string(job_ptr->job_state), |
| job_reason_string(job_ptr->state_reason), |
| job_ptr->priority); |
| continue; |
| } |
| |
| if (((part_ptr->state_up & PARTITION_SCHED) == 0) || |
| (part_ptr->node_bitmap == NULL)) |
| continue; |
| if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root) |
| continue; |
| |
| if ((!job_independent(job_ptr, 0)) || |
| (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) |
| continue; |
| |
| /* Determine minimum and maximum node counts */ |
| min_nodes = MAX(job_ptr->details->min_nodes, |
| part_ptr->min_nodes); |
| if (job_ptr->details->max_nodes == 0) |
| max_nodes = part_ptr->max_nodes; |
| else |
| max_nodes = MIN(job_ptr->details->max_nodes, |
| part_ptr->max_nodes); |
| max_nodes = MIN(max_nodes, 500000); /* prevent overflows */ |
| if (job_ptr->details->max_nodes) |
| req_nodes = max_nodes; |
| else |
| req_nodes = min_nodes; |
| if (min_nodes > max_nodes) { |
| /* job's min_nodes exceeds partition's max_nodes */ |
| continue; |
| } |
| |
| /* Determine job's expected completion time */ |
| if (job_ptr->time_limit == NO_VAL) { |
| if (part_ptr->max_time == INFINITE) |
| time_limit = 365 * 24 * 60; /* one year */ |
| else |
| time_limit = part_ptr->max_time; |
| } else { |
| if (part_ptr->max_time == INFINITE) |
| time_limit = job_ptr->time_limit; |
| else |
| time_limit = MIN(job_ptr->time_limit, |
| part_ptr->max_time); |
| } |
| comp_time_limit = time_limit; |
| orig_time_limit = job_ptr->time_limit; |
| if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) |
| time_limit = job_ptr->time_limit = 1; |
| else if (job_ptr->time_min && (job_ptr->time_min < time_limit)) |
| time_limit = job_ptr->time_limit = job_ptr->time_min; |
| |
| /* Determine impact of any resource reservations */ |
| later_start = now; |
| TRY_LATER: FREE_NULL_BITMAP(avail_bitmap); |
| start_res = later_start; |
| later_start = 0; |
| j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap); |
| if (j != SLURM_SUCCESS) { |
| job_ptr->time_limit = orig_time_limit; |
| continue; |
| } |
| if (start_res > now) |
| end_time = (time_limit * 60) + start_res; |
| else |
| end_time = (time_limit * 60) + now; |
| |
| /* Identify usable nodes for this job */ |
| bit_and(avail_bitmap, part_ptr->node_bitmap); |
| bit_and(avail_bitmap, up_node_bitmap); |
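| /* Scan the node_space list in time order: AND away nodes that |
| * are unavailable in any record overlapping the interval |
| * [start_res, end_time], and remember the first record boundary |
| * after start_res as a candidate later start time */ |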
| for (j=0; ; ) { |
| if ((node_space[j].end_time > start_res) && |
| node_space[j].next && (later_start == 0)) |
| later_start = node_space[j].end_time; |
| if (node_space[j].end_time <= start_res) |
| ; |
| else if (node_space[j].begin_time <= end_time) { |
| bit_and(avail_bitmap, |
| node_space[j].avail_bitmap); |
| } else |
| break; |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| |
| if (job_ptr->details->exc_node_bitmap) { |
| bit_not(job_ptr->details->exc_node_bitmap); |
| bit_and(avail_bitmap, |
| job_ptr->details->exc_node_bitmap); |
| bit_not(job_ptr->details->exc_node_bitmap); |
| } |
| |
| /* Test if insufficient nodes remain OR |
| * required nodes missing OR |
| * nodes lack features */ |
| if ((bit_set_count(avail_bitmap) < min_nodes) || |
| ((job_ptr->details->req_node_bitmap) && |
| (!bit_super_set(job_ptr->details->req_node_bitmap, |
| avail_bitmap))) || |
| (job_req_node_filter(job_ptr, avail_bitmap))) { |
| if (later_start) { |
| job_ptr->start_time = 0; |
| goto TRY_LATER; |
| } |
| job_ptr->time_limit = orig_time_limit; |
| continue; |
| } |
| |
| /* Identify nodes which are definitely off limits */ |
| FREE_NULL_BITMAP(resv_bitmap); |
| resv_bitmap = bit_copy(avail_bitmap); |
| bit_not(resv_bitmap); |
| |
| if ((time(NULL) - sched_start) >= this_sched_timeout) { |
| debug("backfill: loop taking too long, yielding locks"); |
| if (_yield_locks()) { |
| debug("backfill: system state changed, " |
| "breaking out"); |
| rc = 1; |
| break; |
| } else { |
| this_sched_timeout += sched_timeout; |
| } |
| } |
| /* this is the time consuming operation */ |
| debug2("backfill: entering _try_sched for job %u.", |
| job_ptr->job_id); |
| j = _try_sched(job_ptr, &avail_bitmap, |
| min_nodes, max_nodes, req_nodes); |
| debug2("backfill: finished _try_sched for job %u.", |
| job_ptr->job_id); |
| now = time(NULL); |
| if (j != SLURM_SUCCESS) { |
| job_ptr->time_limit = orig_time_limit; |
| job_ptr->start_time = 0; |
| continue; /* not runnable */ |
| } |
| |
| if (start_res > job_ptr->start_time) { |
| job_ptr->start_time = start_res; |
| last_job_update = now; |
| } |
| if (job_ptr->start_time <= now) { |
| int rc = _start_job(job_ptr, resv_bitmap); |
| if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) |
| job_ptr->time_limit = orig_time_limit; |
| else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { |
| /* Set time limit as high as possible */ |
| job_ptr->time_limit = comp_time_limit; |
| job_ptr->end_time = job_ptr->start_time + |
| (comp_time_limit * 60); |
| _reset_job_time_limit(job_ptr, now, |
| node_space); |
| time_limit = job_ptr->time_limit; |
| } else { |
| job_ptr->time_limit = orig_time_limit; |
| } |
| if (rc == ESLURM_ACCOUNTING_POLICY) { |
| /* Unknown future start time, just skip job */ |
| job_ptr->start_time = 0; |
| continue; |
| } else if (rc != SLURM_SUCCESS) { |
| /* Planned to start job, but something bad |
| * happened. */ |
| job_ptr->start_time = 0; |
| break; |
| } else { |
| /* Started this job, move to next one */ |
| continue; |
| } |
| } else |
| job_ptr->time_limit = orig_time_limit; |
| |
| if (later_start && (job_ptr->start_time > later_start)) { |
| /* Try later when some nodes currently reserved for |
| * pending jobs are free */ |
| job_ptr->start_time = 0; |
| goto TRY_LATER; |
| } |
| |
| if (job_ptr->start_time > (sched_start + backfill_window)) { |
| /* Starts too far in the future to worry about */ |
| continue; |
| } |
| |
| if (node_space_recs >= max_backfill_job_cnt) { |
| /* Already have too many jobs to deal with */ |
| break; |
| } |
| |
| end_reserve = job_ptr->start_time + (time_limit * 60); |
| if (_test_resv_overlap(node_space, avail_bitmap, |
| job_ptr->start_time, end_reserve)) { |
| /* This job would overlap a reservation the backfill |
| * scheduler previously made for another pending job. |
| * Try again later. */ |
| later_start = job_ptr->start_time; |
| job_ptr->start_time = 0; |
| goto TRY_LATER; |
| } |
| |
| /* |
| * Add reservation to scheduling table if appropriate |
| */ |
| if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) |
| continue; |
| bit_not(avail_bitmap); /* bits now set for nodes unused by this job */ |
| _add_reservation(job_ptr->start_time, end_reserve, |
| avail_bitmap, node_space, &node_space_recs); |
| if (debug_flags & DEBUG_FLAG_BACKFILL) |
| _dump_node_space_table(node_space); |
| } |
| FREE_NULL_BITMAP(avail_bitmap); |
| FREE_NULL_BITMAP(resv_bitmap); |
| |
| for (i=0; ; ) { |
| FREE_NULL_BITMAP(node_space[i].avail_bitmap); |
| if ((i = node_space[i].next) == 0) |
| break; |
| } |
| xfree(node_space); |
| list_destroy(job_queue); |
| return rc; |
| } |
| |
| /* Try to start the job on any non-reserved nodes */ |
| static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap) |
| { |
| int rc; |
| bitstr_t *orig_exc_nodes = NULL; |
| static uint32_t fail_jobid = 0; |
| |
| if (job_ptr->details->exc_node_bitmap) { |
| orig_exc_nodes = bit_copy(job_ptr->details->exc_node_bitmap); |
| bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap); |
| } else |
| job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap); |
| |
| rc = select_nodes(job_ptr, false, NULL); |
| FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); |
| job_ptr->details->exc_node_bitmap = orig_exc_nodes; |
| if (rc == SLURM_SUCCESS) { |
| /* job initiated */ |
| last_job_update = time(NULL); |
| info("backfill: Started JobId=%u on %s", |
| job_ptr->job_id, job_ptr->nodes); |
| if (job_ptr->batch_flag == 0) |
| srun_allocate(job_ptr->job_id); |
| else if (job_ptr->details->prolog_running == 0) |
| launch_job(job_ptr); |
| backfilled_jobs++; |
| if (debug_flags & DEBUG_FLAG_BACKFILL) { |
| info("backfill: Jobs backfilled since boot: %d", |
| backfilled_jobs); |
| } |
| } else if ((job_ptr->job_id != fail_jobid) && |
| (rc != ESLURM_ACCOUNTING_POLICY)) { |
| char *node_list; |
| bit_not(resv_bitmap); |
| node_list = bitmap2node_name(resv_bitmap); |
| /* This happens when a job has sharing disabled and |
| * a selected node is still completing some job, |
| * which should be a temporary situation. */ |
| verbose("backfill: Failed to start JobId=%u on %s: %s", |
| job_ptr->job_id, node_list, slurm_strerror(rc)); |
| xfree(node_list); |
| fail_jobid = job_ptr->job_id; |
| } else { |
| debug3("backfill: Failed to start JobId=%u: %s", |
| job_ptr->job_id, slurm_strerror(rc)); |
| } |
| |
| return rc; |
| } |
| |
| /* Reset a job's time limit (and end_time) as high as possible |
| * within the range job_ptr->time_min and job_ptr->time_limit. |
| * Avoid using resources reserved for pending jobs or in resource |
| * reservations */ |
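| /* Example (illustrative): a job with time_min=30 and time_limit=120 |
| * whose nodes overlap a reservation beginning 90 minutes from now has |
| * its limit cut to 90 minutes; the limit is never cut below time_min */ |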
| static void _reset_job_time_limit(struct job_record *job_ptr, time_t now, |
| node_space_map_t *node_space) |
| { |
| int32_t j, resv_delay; |
| uint32_t orig_time_limit = job_ptr->time_limit; |
| |
| for (j=0; ; ) { |
| if ((node_space[j].begin_time != now) && |
| (node_space[j].begin_time < job_ptr->end_time) && |
| (!bit_super_set(job_ptr->node_bitmap, |
| node_space[j].avail_bitmap))) { |
| /* Job overlaps pending job's resource reservation */ |
| resv_delay = difftime(node_space[j].begin_time, now); |
| resv_delay /= 60; /* seconds to minutes */ |
| if (resv_delay < job_ptr->time_limit) |
| job_ptr->time_limit = resv_delay; |
| } |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| job_ptr->time_limit = MAX(job_ptr->time_min, job_ptr->time_limit); |
| job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60); |
| |
| job_time_adj_resv(job_ptr); |
| |
| if (orig_time_limit != job_ptr->time_limit) { |
| info("backfill: job %u time limit changed from %u to %u", |
| job_ptr->job_id, orig_time_limit, job_ptr->time_limit); |
| } |
| } |
| |
| /* Report if any changes occurred to job, node or partition information */ |
| static bool _more_work (time_t last_backfill_time) |
| { |
| bool rc = false; |
| |
| pthread_mutex_lock( &thread_flag_mutex ); |
| if ( (last_job_update >= last_backfill_time ) || |
| (last_node_update >= last_backfill_time ) || |
| (last_part_update >= last_backfill_time ) ) { |
| rc = true; |
| } |
| pthread_mutex_unlock( &thread_flag_mutex ); |
| return rc; |
| } |
| |
| /* Create a reservation for a job in the future */ |
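| /* Example of the record splitting below (illustrative): starting from |
| * a single record spanning 10:00-22:00, reserving 12:00-14:00 first |
| * splits the record at 12:00, then at 14:00, leaving 10:00-12:00, |
| * 12:00-14:00 and 14:00-22:00; the reserved nodes are then cleared |
| * from the bitmap of every record inside 12:00-14:00 */ |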
| static void _add_reservation(uint32_t start_time, uint32_t end_reserve, |
| bitstr_t *res_bitmap, |
| node_space_map_t *node_space, |
| int *node_space_recs) |
| { |
| bool placed = false; |
| int i, j; |
| |
| for (j=0; ; ) { |
| if (node_space[j].end_time > start_time) { |
| /* insert start entry record */ |
| i = *node_space_recs; |
| node_space[i].begin_time = start_time; |
| node_space[i].end_time = node_space[j].end_time; |
| node_space[j].end_time = start_time; |
| node_space[i].avail_bitmap = |
| bit_copy(node_space[j].avail_bitmap); |
| node_space[i].next = node_space[j].next; |
| node_space[j].next = i; |
| (*node_space_recs)++; |
| placed = true; |
| } |
| if (node_space[j].end_time == start_time) { |
| /* no need to insert new start entry record */ |
| placed = true; |
| } |
| if (placed == true) { |
| j = node_space[j].next; |
| if (j && (end_reserve < node_space[j].end_time)) { |
| /* insert end entry record */ |
| i = *node_space_recs; |
| node_space[i].begin_time = end_reserve; |
| node_space[i].end_time = node_space[j].end_time; |
| node_space[j].end_time = end_reserve; |
| node_space[i].avail_bitmap = |
| bit_copy(node_space[j].avail_bitmap); |
| node_space[i].next = node_space[j].next; |
| node_space[j].next = i; |
| (*node_space_recs)++; |
| } |
| break; |
| } |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| |
| for (j=0; ; ) { |
| if ((node_space[j].begin_time >= start_time) && |
| (node_space[j].end_time <= end_reserve)) |
| bit_and(node_space[j].avail_bitmap, res_bitmap); |
| if ((node_space[j].begin_time >= end_reserve) || |
| ((j = node_space[j].next) == 0)) |
| break; |
| } |
| } |
| |
| /* |
| * Determine if the resource specification for a new job overlaps with a |
| * reservation that the backfill scheduler has made for a job to be |
| * started in the future. |
| * IN node_space - map of available resources through time |
| * IN use_bitmap - nodes to be allocated |
| * IN start_time - start time of job |
| * IN end_reserve - end time of job |
| * RET true if the job overlaps an existing reservation |
| */ |
| static bool _test_resv_overlap(node_space_map_t *node_space, |
| bitstr_t *use_bitmap, uint32_t start_time, |
| uint32_t end_reserve) |
| { |
| bool overlap = false; |
| int j; |
| |
| for (j=0; ; ) { |
| if ((node_space[j].end_time > start_time) && |
| (node_space[j].begin_time < end_reserve) && |
| (!bit_super_set(use_bitmap, node_space[j].avail_bitmap))) { |
| overlap = true; |
| break; |
| } |
| if ((j = node_space[j].next) == 0) |
| break; |
| } |
| return overlap; |
| } |