|  | /*****************************************************************************\ | 
|  | * job_scheduler.c - manage the scheduling of pending jobs in priority order | 
|  | *	Note there is a global job list (job_list) | 
|  | ***************************************************************************** | 
|  | *  Copyright (C) 2002-2007 The Regents of the University of California. | 
|  | *  Copyright (C) 2008-2010 Lawrence Livermore National Security. | 
|  | *  Copyright (C) SchedMD LLC. | 
|  | *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | 
|  | *  Written by Morris Jette <jette1@llnl.gov> | 
|  | *  CODE-OCEC-09-009. All rights reserved. | 
|  | * | 
|  | *  This file is part of Slurm, a resource management program. | 
|  | *  For details, see <https://slurm.schedmd.com/>. | 
|  | *  Please also read the included file: DISCLAIMER. | 
|  | * | 
|  | *  Slurm is free software; you can redistribute it and/or modify it under | 
|  | *  the terms of the GNU General Public License as published by the Free | 
|  | *  Software Foundation; either version 2 of the License, or (at your option) | 
|  | *  any later version. | 
|  | * | 
|  | *  In addition, as a special exception, the copyright holders give permission | 
|  | *  to link the code of portions of this program with the OpenSSL library under | 
|  | *  certain conditions as described in each individual source file, and | 
|  | *  distribute linked combinations including the two. You must obey the GNU | 
|  | *  General Public License in all respects for all of the code used other than | 
|  | *  OpenSSL. If you modify file(s) with this exception, you may extend this | 
|  | *  exception to your version of the file(s), but you are not obligated to do | 
|  | *  so. If you do not wish to do so, delete this exception statement from your | 
|  | *  version.  If you delete this exception statement from all source files in | 
|  | *  the program, then also delete it here. | 
|  | * | 
|  | *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | *  details. | 
|  | * | 
|  | *  You should have received a copy of the GNU General Public License along | 
|  | *  with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | \*****************************************************************************/ | 
|  |  | 
|  | #include "config.h" | 
|  |  | 
|  | #include <ctype.h> | 
|  | #include <errno.h> | 
|  | #include <poll.h> | 
|  | #include <signal.h> /* for SIGKILL */ | 
|  | #include <stdio.h> | 
|  | #include <stdlib.h> | 
|  | #include <string.h> | 
|  | #include <unistd.h> | 
|  |  | 
|  | #if HAVE_SYS_PRCTL_H | 
|  | #  include <sys/prctl.h> | 
|  | #endif | 
|  |  | 
|  | #include "src/common/assoc_mgr.h" | 
|  | #include "src/common/cpu_frequency.h" | 
|  | #include "src/common/env.h" | 
|  | #include "src/common/group_cache.h" | 
|  | #include "src/common/job_features.h" | 
|  | #include "src/common/list.h" | 
|  | #include "src/common/macros.h" | 
|  | #include "src/common/node_features.h" | 
|  | #include "src/common/parse_time.h" | 
|  | #include "src/common/strlcpy.h" | 
|  | #include "src/common/timers.h" | 
|  | #include "src/common/track_script.h" | 
|  | #include "src/common/uid.h" | 
|  | #include "src/common/xassert.h" | 
|  | #include "src/common/xstring.h" | 
|  |  | 
|  | #include "src/interfaces/accounting_storage.h" | 
|  | #include "src/interfaces/acct_gather.h" | 
|  | #include "src/interfaces/burst_buffer.h" | 
|  | #include "src/interfaces/gres.h" | 
|  | #include "src/interfaces/node_features.h" | 
|  | #include "src/interfaces/preempt.h" | 
|  | #include "src/interfaces/prep.h" | 
|  | #include "src/interfaces/select.h" | 
|  |  | 
|  | #include "src/slurmctld/acct_policy.h" | 
|  | #include "src/slurmctld/agent.h" | 
|  | #include "src/slurmctld/fed_mgr.h" | 
|  | #include "src/slurmctld/gang.h" | 
|  | #include "src/slurmctld/locks.h" | 
|  | #include "src/slurmctld/job_scheduler.h" | 
|  | #include "src/slurmctld/licenses.h" | 
|  | #include "src/slurmctld/locks.h" | 
|  | #include "src/slurmctld/node_scheduler.h" | 
|  | #include "src/slurmctld/power_save.h" | 
|  | #include "src/slurmctld/proc_req.h" | 
|  | #include "src/slurmctld/reservation.h" | 
|  | #include "src/slurmctld/slurmctld.h" | 
|  | #include "src/slurmctld/state_save.h" | 
|  |  | 
|  | #include "src/stepmgr/gres_stepmgr.h" | 
|  | #include "src/stepmgr/srun_comm.h" | 
|  | #include "src/stepmgr/stepmgr.h" | 
|  |  | 
/*
 * Initial value of correspond_after_task_cnt. A build may pre-define
 * CORRESPOND_ARRAY_TASK_CNT to override the value used here.
 */
#if !defined(CORRESPOND_ARRAY_TASK_CNT)
#  define CORRESPOND_ARRAY_TASK_CNT 10
#endif

/* Max build_job_queue() run time in usec */
#define BUILD_TIMEOUT 2000000
|  |  | 
/* Identifies which feature caused a job array record to be split */
typedef enum {
	/* Array job has a burst buffer specification */
	ARRAY_SPLIT_BURST_BUFFER = 0,
	/* Array job has a SLURM_DEPEND_AFTER_CORRESPOND dependency */
	ARRAY_SPLIT_AFTER_CORR = 1,
} array_split_type_t;
|  |  | 
|  | typedef struct { | 
|  | list_t *job_list; | 
|  | int pend_cnt_limit; | 
|  | char *reason_msg; | 
|  | array_split_type_t type; | 
|  | } split_job_t; | 
|  |  | 
|  | typedef struct { | 
|  | bool backfill; | 
|  | bool clear_start; | 
|  | int job_prio_pairs; | 
|  | job_record_t *job_ptr; | 
|  | list_t *job_queue; | 
|  | time_t *last_log_time; | 
|  | time_t now; | 
|  | int prio_inx; | 
|  | struct timeval start_tv; | 
|  | int tested_jobs; | 
|  | } build_job_queue_for_part_t; | 
|  |  | 
|  | typedef struct { | 
|  | bool completing; | 
|  | bitstr_t *eff_cg_bitmap; | 
|  | time_t recent; | 
|  | } job_is_comp_t; | 
|  |  | 
/*
 * Priority value plus a "has been set" marker.
 * NOTE(review): the users of this type are outside this part of the file;
 * confirm the exact meaning of the fields there.
 */
typedef struct {
	uint32_t prio;
	bool set;
} part_prios_same_t;
|  |  | 
|  | typedef struct { | 
|  | char *cg_part_str; | 
|  | char *cg_part_str_pos; | 
|  | bitstr_t *eff_cg_bitmap; | 
|  | } part_reduce_frag_t; | 
|  |  | 
|  | typedef struct { | 
|  | job_record_t *het_job; | 
|  | job_record_t *het_job_leader; | 
|  | job_record_t *job_ptr; | 
|  | } het_job_ready_t; | 
|  |  | 
|  | typedef struct { | 
|  | job_record_t *het_job_leader; | 
|  | int het_job_offset; | 
|  | batch_job_launch_msg_t *launch_msg_ptr; | 
|  | } het_job_env_t; | 
|  |  | 
|  | typedef struct { | 
|  | job_record_t *job_ptr; | 
|  | char *sep; | 
|  | bool set_or_flag; | 
|  | } depend_str_t; | 
|  |  | 
|  | typedef struct { | 
|  | bool and_failed; | 
|  | bool changed; | 
|  | bool has_local_depend; | 
|  | bool has_unfulfilled; | 
|  | job_record_t *job_ptr; | 
|  | bool or_flag; | 
|  | bool or_satisfied; | 
|  | } test_job_dep_t; | 
|  |  | 
|  | typedef struct { | 
|  | uint64_t cume_space_time; | 
|  | job_record_t *job_ptr; | 
|  | uint32_t part_cpus_per_node; | 
|  | } delay_start_t; | 
|  |  | 
|  | typedef struct { | 
|  | job_record_t *job_ptr; | 
|  | time_t now; | 
|  | int rc; | 
|  | will_run_response_msg_t **resp; | 
|  | } job_start_data_t; | 
|  |  | 
|  | typedef struct { | 
|  | int bracket; | 
|  | bool can_reboot; | 
|  | char *debug_str; | 
|  | char *features; | 
|  | list_t *feature_list; | 
|  | bool has_xand; | 
|  | bool has_mor; | 
|  | int paren; | 
|  | int rc; | 
|  | bool skip_validation; | 
|  | } valid_feature_t; | 
|  |  | 
|  | static batch_job_launch_msg_t *_build_launch_job_msg(job_record_t *job_ptr, | 
|  | uint16_t protocol_version); | 
|  | static bool	_job_runnable_test1(job_record_t *job_ptr, bool clear_start); | 
|  | static bool	_job_runnable_test2(job_record_t *job_ptr, time_t now, | 
|  | bool check_min_time); | 
|  | static bool _scan_depend(list_t *dependency_list, job_record_t *job_ptr); | 
|  | static void *	_sched_agent(void *args); | 
|  | static void _set_schedule_exit(schedule_exit_t code); | 
|  | static int	_schedule(bool full_queue); | 
|  | static int	_valid_batch_features(job_record_t *job_ptr, bool can_reboot); | 
|  | static int _valid_feature_list(job_record_t *job_ptr, | 
|  | valid_feature_t *valid_feature, | 
|  | bool is_reservation); | 
|  | static int	_valid_node_feature(char *feature, bool can_reboot); | 
|  | static int	build_queue_timeout = BUILD_TIMEOUT; | 
|  | static int	correspond_after_task_cnt = CORRESPOND_ARRAY_TASK_CNT; | 
|  |  | 
|  | static pthread_mutex_t sched_mutex = PTHREAD_MUTEX_INITIALIZER; | 
|  | static pthread_cond_t  sched_cond = PTHREAD_COND_INITIALIZER; | 
|  | static pthread_t thread_id_sched = 0; | 
|  | static bool sched_full_queue = false; | 
|  | static int sched_requests = 0; | 
|  | static struct timeval sched_last = {0, 0}; | 
|  |  | 
|  | static uint32_t max_array_size = NO_VAL; | 
|  | static bool bf_hetjob_immediate = false; | 
|  | static uint16_t bf_hetjob_prio = 0; | 
|  | static int sched_min_interval = 2; | 
|  |  | 
|  | static int bb_array_stage_cnt = 10; | 
|  | extern diag_stats_t slurmctld_diag_stats; | 
|  |  | 
|  | static int _find_singleton_job (void *x, void *key) | 
|  | { | 
|  | job_record_t *qjob_ptr = (job_record_t *) x; | 
|  | job_record_t *job_ptr = (job_record_t *) key; | 
|  |  | 
|  | xassert (qjob_ptr->magic == JOB_MAGIC); | 
|  |  | 
|  | /* | 
|  | * get user jobs with the same user and name | 
|  | */ | 
|  | if (qjob_ptr->user_id != job_ptr->user_id) | 
|  | return 0; | 
|  | if (qjob_ptr->name && job_ptr->name && | 
|  | xstrcmp(qjob_ptr->name, job_ptr->name)) | 
|  | return 0; | 
|  | /* | 
|  | * already running/suspended job or previously | 
|  | * submitted pending job | 
|  | * and not a het job, or not part of the same het job | 
|  | */ | 
|  | if ((IS_JOB_RUNNING(qjob_ptr) || IS_JOB_SUSPENDED(qjob_ptr) || | 
|  | (IS_JOB_PENDING(qjob_ptr) && | 
|  | (qjob_ptr->job_id < job_ptr->job_id))) && | 
|  | (!job_ptr->het_job_id || | 
|  | (job_ptr->het_job_id != qjob_ptr->het_job_id))) { | 
|  | return 1; | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _queue_resv_list(void *x, void *key) | 
|  | { | 
|  | job_queue_req_t *job_queue_req = (job_queue_req_t *) key; | 
|  |  | 
|  | job_queue_req->resv_ptr = (slurmctld_resv_t *) x; | 
|  |  | 
|  | if ((job_queue_req->job_ptr->bit_flags & JOB_PART_ASSIGNED) && | 
|  | job_queue_req->resv_ptr->part_ptr) | 
|  | job_queue_req->part_ptr = job_queue_req->resv_ptr->part_ptr; | 
|  |  | 
|  | job_queue_append_internal(job_queue_req); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static void _job_queue_append(list_t *job_queue, job_record_t *job_ptr, | 
|  | uint32_t prio) | 
|  | { | 
|  | job_queue_req_t job_queue_req = { .job_ptr = job_ptr, | 
|  | .job_queue = job_queue, | 
|  | .part_ptr = job_ptr->part_ptr, | 
|  | .prio = prio }; | 
|  |  | 
|  | /* We have multiple reservations, process and end here */ | 
|  | if (job_ptr->resv_list) { | 
|  | list_for_each(job_ptr->resv_list, _queue_resv_list, | 
|  | &job_queue_req); | 
|  | return; | 
|  | } | 
|  |  | 
|  | job_queue_append_internal(&job_queue_req); | 
|  |  | 
|  | /* | 
|  | * This means we requested a specific reservation, don't do any magnetic | 
|  | * ones | 
|  | */ | 
|  | if (job_ptr->resv_name) | 
|  | return; | 
|  |  | 
|  | /* | 
|  | * For het jobs, backfill makes a plan for each component; however, | 
|  | * backfill doesn't track magnetic reservations in the plan, so backfill | 
|  | * can't start hetjobs in a magnetic reservation unless the het job | 
|  | * explicitly requests the magnetic reservation. | 
|  | * | 
|  | * Also, if there is a magnetic reservation that starts in the future, | 
|  | * backfill will not be able to start the het job if there is a separate | 
|  | * magnetic reservation queue record for the component. So, don't create | 
|  | * a separate magnetic reservation queue record for het jobs. | 
|  | */ | 
|  | if (job_ptr->het_job_id) | 
|  | return; | 
|  |  | 
|  | job_resv_append_magnetic(&job_queue_req); | 
|  | } | 
|  |  | 
|  | /* Job test for ability to run now, excludes partition specific tests */ | 
|  | static bool _job_runnable_test1(job_record_t *job_ptr, bool sched_plugin) | 
|  | { | 
|  | bool job_indepen = false; | 
|  | time_t now = time(NULL); | 
|  |  | 
|  | xassert(job_ptr->magic == JOB_MAGIC); | 
|  | if (!IS_JOB_PENDING(job_ptr) || IS_JOB_COMPLETING(job_ptr)) | 
|  | return false; | 
|  |  | 
|  | if (IS_JOB_REVOKED(job_ptr)) | 
|  | return false; | 
|  |  | 
|  | if ((job_ptr->details && job_ptr->details->prolog_running) || | 
|  | (job_ptr->step_list && list_count(job_ptr->step_list))) { | 
|  | /* Job's been requeued and the | 
|  | * previous run hasn't finished yet */ | 
|  | job_ptr->state_reason = WAIT_CLEANING; | 
|  | xfree(job_ptr->state_desc); | 
|  | last_job_update = now; | 
|  | sched_debug3("%pJ. State=PENDING. Reason=Cleaning.", job_ptr); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | job_indepen = job_independent(job_ptr); | 
|  | if (sched_plugin) | 
|  | job_ptr->start_time = (time_t) 0; | 
|  | if (job_ptr->priority == 0)	{ /* held */ | 
|  | if ((job_ptr->state_reason != FAIL_BAD_CONSTRAINTS) && | 
|  | (job_ptr->state_reason != FAIL_BURST_BUFFER_OP) && | 
|  | (job_ptr->state_reason != WAIT_HELD) && | 
|  | (job_ptr->state_reason != WAIT_HELD_USER) && | 
|  | (job_ptr->state_reason != WAIT_MAX_REQUEUE) && | 
|  | (job_ptr->state_reason != WAIT_RESV_INVALID) && | 
|  | (job_ptr->state_reason != WAIT_RESV_DELETED)) { | 
|  | job_ptr->state_reason = WAIT_HELD; | 
|  | xfree(job_ptr->state_desc); | 
|  | last_job_update = now; | 
|  | } | 
|  | sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u.", | 
|  | job_ptr, | 
|  | job_state_string(job_ptr->job_state), | 
|  | job_state_reason_string(job_ptr->state_reason), | 
|  | job_ptr->priority); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | if (!job_indepen && | 
|  | ((job_ptr->state_reason == WAIT_HELD) || | 
|  | (job_ptr->state_reason == WAIT_HELD_USER))) { | 
|  | /* released behind active dependency? */ | 
|  | job_ptr->state_reason = WAIT_DEPENDENCY; | 
|  | xfree(job_ptr->state_desc); | 
|  | last_job_update = now; | 
|  | } | 
|  |  | 
|  | if (!job_indepen)	/* can not run now */ | 
|  | return false; | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Job and partition tests for ability to run now | 
|  | * IN job_ptr - job to test | 
|  | * IN now - update time | 
|  | * IN check_min_time - If set, test job's minimum time limit | 
|  | *		otherwise test maximum time limit | 
|  | */ | 
|  | static bool _job_runnable_test2(job_record_t *job_ptr, time_t now, | 
|  | bool check_min_time) | 
|  | { | 
|  | int reason; | 
|  |  | 
|  | reason = job_limits_check(&job_ptr, check_min_time); | 
|  | if ((reason != job_ptr->state_reason) && | 
|  | ((reason != WAIT_NO_REASON) || | 
|  | (job_state_reason_check(job_ptr->state_reason, JSR_PART)))) { | 
|  | job_ptr->state_reason = reason; | 
|  | xfree(job_ptr->state_desc); | 
|  | last_job_update = now; | 
|  | } | 
|  | if (reason != WAIT_NO_REASON) | 
|  | return false; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Job, reservation and partition tests for ability to run now. | 
|  | * If a job is submitted to multiple partitions, don't consider partitions | 
|  | * on which the job would not fit given the current set of nodes in the | 
|  | * reservation. | 
|  | * IN job_ptr - job to test | 
|  | * IN part_ptr - partition to test | 
|  | */ | 
|  | static bool _job_runnable_test3(job_record_t *job_ptr, part_record_t *part_ptr) | 
|  | { | 
|  | if (job_ptr->resv_ptr && job_ptr->resv_ptr->node_bitmap && | 
|  | !(job_ptr->resv_ptr->flags & RESERVE_FLAG_FLEX) && | 
|  | part_ptr && part_ptr->node_bitmap && | 
|  | (bit_overlap(job_ptr->resv_ptr->node_bitmap, part_ptr->node_bitmap) | 
|  | < job_ptr->node_cnt_wag)) | 
|  | return false; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | static int _find_depend_after_corr(void *x, void *arg) | 
|  | { | 
|  | depend_spec_t *dep_ptr = x; | 
|  |  | 
|  | if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_CORRESPOND) | 
|  | return 1; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static job_record_t *_split_job_on_schedule_recurse( | 
|  | job_record_t *job_ptr, split_job_t *split_job) | 
|  | { | 
|  | job_record_t *new_job_ptr; | 
|  | int array_task_id; | 
|  |  | 
|  | if (num_pending_job_array_tasks(job_ptr->array_job_id) >= | 
|  | split_job->pend_cnt_limit) | 
|  | return job_ptr; | 
|  |  | 
|  | if (job_ptr->array_recs->task_cnt < 1) | 
|  | return job_ptr; | 
|  |  | 
|  | array_task_id = bit_ffs(job_ptr->array_recs->task_id_bitmap); | 
|  | if (array_task_id < 0) | 
|  | return job_ptr; | 
|  |  | 
|  | if (job_ptr->array_recs->task_cnt == 1) { | 
|  | job_ptr->array_task_id = array_task_id; | 
|  | new_job_ptr = job_array_post_sched(job_ptr, false); | 
|  | if (new_job_ptr != job_ptr) { | 
|  | if (!split_job->job_list) | 
|  | split_job->job_list = list_create(NULL); | 
|  | list_append(split_job->job_list, new_job_ptr); | 
|  | } | 
|  | if (job_ptr->details && | 
|  | job_ptr->details->dependency && | 
|  | job_ptr->details->depend_list) | 
|  | fed_mgr_submit_remote_dependencies(job_ptr, | 
|  | false, | 
|  | false); | 
|  | return new_job_ptr; | 
|  | } | 
|  |  | 
|  | job_ptr->array_task_id = array_task_id; | 
|  | new_job_ptr = job_array_split(job_ptr, false); | 
|  | debug("%s: Split out %pJ for %s use", | 
|  | __func__, job_ptr, split_job->reason_msg); | 
|  | job_state_set(new_job_ptr, JOB_PENDING); | 
|  | new_job_ptr->start_time = (time_t) 0; | 
|  |  | 
|  | if (!split_job->job_list) | 
|  | split_job->job_list = list_create(NULL); | 
|  | list_append(split_job->job_list, new_job_ptr); | 
|  |  | 
|  | /* | 
|  | * Do NOT clear db_index here, it is handled when task_id_str | 
|  | * is created elsewhere. | 
|  | */ | 
|  |  | 
|  | if (split_job->type == ARRAY_SPLIT_BURST_BUFFER) | 
|  | (void) bb_g_job_validate2(new_job_ptr, NULL); | 
|  |  | 
|  | /* | 
|  | * See if we need to spawn off any more since the new_job_ptr now has | 
|  | * ->array_recs. | 
|  | */ | 
|  | return _split_job_on_schedule_recurse(new_job_ptr, split_job); | 
|  | } | 
|  |  | 
|  | static int _split_job_on_schedule(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = x; | 
|  | split_job_t *split_job = arg; | 
|  |  | 
|  | if (!IS_JOB_PENDING(job_ptr) || | 
|  | !job_ptr->array_recs || | 
|  | !job_ptr->array_recs->task_id_bitmap || | 
|  | (job_ptr->array_task_id != NO_VAL)) | 
|  | return 0; | 
|  | /* | 
|  | * Create individual job records for job arrays that need burst buffer | 
|  | * staging | 
|  | */ | 
|  | if (job_ptr->burst_buffer) { | 
|  | split_job->pend_cnt_limit = bb_array_stage_cnt; | 
|  | split_job->reason_msg = "burst buffer"; | 
|  | split_job->type = ARRAY_SPLIT_BURST_BUFFER; | 
|  | job_ptr = _split_job_on_schedule_recurse(job_ptr, split_job); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Create individual job records for job arrays with | 
|  | * depend_type == SLURM_DEPEND_AFTER_CORRESPOND | 
|  | */ | 
|  | if (job_ptr->details && | 
|  | job_ptr->details->depend_list && | 
|  | list_count(job_ptr->details->depend_list) && | 
|  | list_find_first(job_ptr->details->depend_list, | 
|  | _find_depend_after_corr, | 
|  | NULL)) { | 
|  | split_job->pend_cnt_limit = correspond_after_task_cnt; | 
|  | split_job->reason_msg = "SLURM_DEPEND_AFTER_CORRESPOND"; | 
|  | split_job->type = ARRAY_SPLIT_AFTER_CORR; | 
|  | /* If another thing is added after this set job_ptr as above */ | 
|  | (void) _split_job_on_schedule_recurse(job_ptr, split_job); | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _transfer_job_list(void *x, void *arg) | 
|  | { | 
|  | list_append(job_list, x); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _build_job_queue_for_qos(void *x, void *arg) | 
|  | { | 
|  | build_job_queue_for_part_t *setup_job = arg; | 
|  | job_record_t *job_ptr = setup_job->job_ptr; | 
|  |  | 
|  | job_ptr->qos_ptr = x; | 
|  |  | 
|  | /* | 
|  | * priority_array index matches part_ptr_list * qos_list | 
|  | * position: increment inx | 
|  | */ | 
|  | setup_job->prio_inx++; | 
|  |  | 
|  | if (!_job_runnable_test2(job_ptr, setup_job->now, setup_job->backfill)) | 
|  | return 0; | 
|  |  | 
|  | setup_job->job_prio_pairs++; | 
|  | if (job_ptr->prio_mult && job_ptr->prio_mult->priority_array) { | 
|  | _job_queue_append(setup_job->job_queue, job_ptr, | 
|  | job_ptr->prio_mult-> | 
|  | priority_array[setup_job->prio_inx]); | 
|  | } else { | 
|  | _job_queue_append(setup_job->job_queue, job_ptr, | 
|  | job_ptr->priority); | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _build_job_queue_for_part(void *x, void *arg) | 
|  | { | 
|  | build_job_queue_for_part_t *setup_job = arg; | 
|  | job_record_t *job_ptr = setup_job->job_ptr; | 
|  |  | 
|  | job_ptr->part_ptr = x; | 
|  |  | 
|  | if (job_ptr->qos_list) { | 
|  | (void) list_for_each(job_ptr->qos_list, | 
|  | _build_job_queue_for_qos, | 
|  | setup_job); | 
|  | } else { | 
|  | (void) _build_job_queue_for_qos(job_ptr->qos_ptr, setup_job); | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _foreach_job_is_completing(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = x; | 
|  | job_is_comp_t *job_is_comp = arg; | 
|  |  | 
|  | if (IS_JOB_COMPLETING(job_ptr) && | 
|  | (job_ptr->end_time >= job_is_comp->recent)) { | 
|  | job_is_comp->completing = true; | 
|  |  | 
|  | /* | 
|  | * Can return after finding first completing job so long | 
|  | * as a map of nodes in partitions affected by | 
|  | * completing jobs is not required. | 
|  | */ | 
|  | if (!job_is_comp->eff_cg_bitmap) | 
|  | return -1; | 
|  | else if (job_ptr->part_ptr) | 
|  | bit_or(job_is_comp->eff_cg_bitmap, | 
|  | job_ptr->part_ptr->node_bitmap); | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _foreach_part_reduce_frag(void *x, void *arg) | 
|  | { | 
|  | part_record_t *part_ptr = x; | 
|  | part_reduce_frag_t *part_reduce_frag = arg; | 
|  |  | 
|  | if (bit_overlap_any(part_reduce_frag->eff_cg_bitmap, | 
|  | part_ptr->node_bitmap) && | 
|  | (part_ptr->state_up & PARTITION_SCHED)) { | 
|  | part_ptr->flags |= PART_FLAG_SCHED_FAILED; | 
|  | if (slurm_conf.slurmctld_debug >= LOG_LEVEL_DEBUG) { | 
|  | xstrfmtcatat(part_reduce_frag->cg_part_str, | 
|  | &part_reduce_frag->cg_part_str_pos, | 
|  | "%s%s", | 
|  | part_reduce_frag->cg_part_str ? "," : "", | 
|  | part_ptr->name); | 
|  | } | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _foreach_setup_part_sched(void *x, void *arg) | 
|  | { | 
|  | part_record_t *part_ptr = x; | 
|  |  | 
|  | part_ptr->num_sched_jobs = 0; | 
|  | part_ptr->flags &= ~PART_FLAG_SCHED_FAILED; | 
|  | part_ptr->flags &= ~PART_FLAG_SCHED_CLEARED; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _foreach_setup_resv_sched(void *x, void *arg) | 
|  | { | 
|  | slurmctld_resv_t *resv_ptr = x; | 
|  |  | 
|  | resv_ptr->flags &= ~RESERVE_FLAG_SCHED_FAILED; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _foreach_build_job_queue(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = x; | 
|  | build_job_queue_for_part_t *setup_job = arg; | 
|  |  | 
|  | setup_job->job_ptr = job_ptr; | 
|  |  | 
|  | if (IS_JOB_PENDING(job_ptr)) { | 
|  | /* Remove backfill flag */ | 
|  | job_ptr->bit_flags &= ~BACKFILL_SCHED; | 
|  | set_job_failed_assoc_qos_ptr(job_ptr); | 
|  | acct_policy_handle_accrue_time(job_ptr, false); | 
|  | if ((job_ptr->state_reason != WAIT_NO_REASON) && | 
|  | (job_ptr->state_reason != WAIT_PRIORITY) && | 
|  | (job_ptr->state_reason != WAIT_RESOURCES) && | 
|  | (job_ptr->state_reason != job_ptr->state_reason_prev_db)) { | 
|  | job_ptr->state_reason_prev_db = job_ptr->state_reason; | 
|  | last_job_update = setup_job->now; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (((setup_job->tested_jobs % 100) == 0) && | 
|  | (slurm_delta_tv(&setup_job->start_tv) >= build_queue_timeout)) { | 
|  | if (difftime(setup_job->now, *setup_job->last_log_time) > 600) { | 
|  | /* Log at most once every 10 minutes */ | 
|  | info("%s has run for %d usec, exiting with %d of %d jobs tested, %d job-partition-qos pairs added", | 
|  | __func__, build_queue_timeout, | 
|  | setup_job->tested_jobs, | 
|  | list_count(job_list), | 
|  | setup_job->job_prio_pairs); | 
|  | *setup_job->last_log_time = setup_job->now; | 
|  | } | 
|  | return -1; | 
|  | } | 
|  | setup_job->tested_jobs++; | 
|  | job_ptr->preempt_in_progress = false; /* initialize */ | 
|  | if (job_ptr->array_recs && setup_job->backfill) | 
|  | job_ptr->array_recs->pend_run_tasks = 0; | 
|  | if (job_ptr->resv_list) | 
|  | job_ptr->resv_ptr = NULL; | 
|  | if (!_job_runnable_test1(job_ptr, setup_job->clear_start)) | 
|  | return 0; | 
|  |  | 
|  | setup_job->prio_inx = -1; | 
|  | if (job_ptr->part_ptr_list) { | 
|  | (void) list_for_each(job_ptr->part_ptr_list, | 
|  | _build_job_queue_for_part, | 
|  | setup_job); | 
|  | } else { | 
|  | if (job_ptr->part_ptr == NULL) { | 
|  | part_record_t *part_ptr = | 
|  | find_part_record(job_ptr->partition); | 
|  | if (!part_ptr) { | 
|  | error("Could not find partition %s for %pJ", | 
|  | job_ptr->partition, job_ptr); | 
|  | return 0; | 
|  | } | 
|  | job_ptr->part_ptr = part_ptr; | 
|  | error("partition pointer reset for %pJ, part %s", | 
|  | job_ptr, job_ptr->partition); | 
|  | job_ptr->bit_flags |= JOB_PART_ASSIGNED; | 
|  | } | 
|  | (void) _build_job_queue_for_part(job_ptr->part_ptr, setup_job); | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _foreach_set_job_elig(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = x; | 
|  | time_t now = *(time_t *) arg; | 
|  | part_record_t *part_ptr = job_ptr->part_ptr; | 
|  |  | 
|  | if (!IS_JOB_PENDING(job_ptr)) | 
|  | return 0; | 
|  | if (!part_ptr) | 
|  | return 0; | 
|  | if (!job_ptr->details || | 
|  | (job_ptr->details->begin_time > now)) | 
|  | return 0; | 
|  | if (!(part_ptr->state_up & PARTITION_SCHED)) | 
|  | return 0; | 
|  | if ((job_ptr->time_limit != NO_VAL) && | 
|  | (job_ptr->time_limit > part_ptr->max_time)) | 
|  | return 0; | 
|  | if (job_ptr->details->max_nodes && | 
|  | ((job_ptr->details->max_nodes < part_ptr->min_nodes) || | 
|  | (job_ptr->details->min_nodes > part_ptr->max_nodes))) | 
|  | return 0; | 
|  | /* Job's eligible time is set in job_independent() */ | 
|  | (void) job_independent(job_ptr); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | extern void job_queue_rec_magnetic_resv(job_queue_rec_t *job_queue_rec) | 
|  | { | 
|  | job_record_t *job_ptr; | 
|  |  | 
|  | if (!job_queue_rec->resv_ptr) | 
|  | return; | 
|  |  | 
|  | xassert(job_queue_rec->job_ptr); | 
|  | xassert(!job_queue_rec->job_ptr->resv_name); | 
|  |  | 
|  | job_ptr = job_queue_rec->job_ptr; | 
|  | job_ptr->resv_ptr = job_queue_rec->resv_ptr; | 
|  | job_ptr->resv_name = xstrdup(job_ptr->resv_ptr->name); | 
|  | job_ptr->resv_id = job_ptr->resv_ptr->resv_id; | 
|  | job_queue_rec->job_ptr->bit_flags |= JOB_MAGNETIC; | 
|  | } | 
|  |  | 
|  | extern void job_queue_rec_resv_list(job_queue_rec_t *job_queue_rec) | 
|  | { | 
|  | job_record_t *job_ptr; | 
|  |  | 
|  | if (!job_queue_rec->resv_ptr) | 
|  | return; | 
|  |  | 
|  | xassert(job_queue_rec->job_ptr); | 
|  |  | 
|  | job_ptr = job_queue_rec->job_ptr; | 
|  | job_ptr->resv_ptr = job_queue_rec->resv_ptr; | 
|  | /* | 
|  | * Do not set the name since we have multiple and we don't want to | 
|  | * overwrite it. | 
|  | */ | 
|  | job_ptr->resv_id = job_ptr->resv_ptr->resv_id; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * build_job_queue - build (non-priority ordered) list of pending jobs | 
|  | * IN clear_start - if set then clear the start_time for pending jobs, | 
|  | *		    true when called from sched/backfill or sched/builtin | 
|  | * IN backfill - true if running backfill scheduler, enforce min time limit | 
|  | * RET the job queue | 
|  | * NOTE: the caller must call FREE_NULL_LIST() on RET value to free memory | 
|  | */ | 
|  | extern list_t *build_job_queue(bool clear_start, bool backfill) | 
|  | { | 
|  | static time_t last_log_time = 0; | 
|  | split_job_t split_job = { 0 }; | 
|  | build_job_queue_for_part_t setup_job = { | 
|  | .backfill = backfill, | 
|  | .clear_start = clear_start, | 
|  | .last_log_time = &last_log_time, | 
|  | .now = time(NULL), | 
|  | .start_tv = { 0, 0 }, | 
|  | }; | 
|  | /* init the timer */ | 
|  | (void) slurm_delta_tv(&setup_job.start_tv); | 
|  | setup_job.job_queue = list_create(xfree_ptr); | 
|  |  | 
|  | (void) list_for_each(job_list, _split_job_on_schedule, &split_job); | 
|  |  | 
|  | if (split_job.job_list) { | 
|  | /* | 
|  | * We can't use list_transfer() because we don't have the same | 
|  | * destroy function. | 
|  | */ | 
|  | (void) list_for_each(split_job.job_list, | 
|  | _transfer_job_list, NULL); | 
|  | FREE_NULL_LIST(split_job.job_list); | 
|  | } | 
|  |  | 
|  | (void) list_for_each(job_list, _foreach_build_job_queue, &setup_job); | 
|  |  | 
|  | return setup_job.job_queue; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * job_is_completing - Determine if jobs are in the process of completing. | 
|  | * IN/OUT  eff_cg_bitmap - optional bitmap of all relevant completing nodes, | 
|  | *                         relevenace determined by filtering via CompleteWait | 
|  | *                         if NULL, function will terminate at first completing | 
|  | *                         job | 
|  | * RET - True of any job is in the process of completing AND | 
|  | *	 CompleteWait is configured non-zero | 
|  | * NOTE: This function can reduce resource fragmentation, which is a | 
|  | * critical issue on Elan interconnect based systems. | 
|  | */ | 
|  | extern bool job_is_completing(bitstr_t *eff_cg_bitmap) | 
|  | { | 
|  | job_is_comp_t job_is_comp = { | 
|  | .eff_cg_bitmap = eff_cg_bitmap, | 
|  | }; | 
|  |  | 
|  | if ((job_list == NULL) || (slurm_conf.complete_wait == 0)) | 
|  | return false; | 
|  |  | 
|  | job_is_comp.recent = time(NULL) - slurm_conf.complete_wait; | 
|  |  | 
|  | (void) list_for_each(job_list, _foreach_job_is_completing, | 
|  | &job_is_comp); | 
|  |  | 
|  | return job_is_comp.completing; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * set_job_elig_time - set the eligible time for pending jobs once their | 
|  | *      dependencies are lifted (in job->details->begin_time) | 
|  | */ | 
|  | extern void set_job_elig_time(void) | 
|  | { | 
|  | slurmctld_lock_t job_write_lock = | 
|  | { READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; | 
|  | time_t now = time(NULL); | 
|  |  | 
|  | lock_slurmctld(job_write_lock); | 
|  | (void) list_for_each(job_list, _foreach_set_job_elig, &now); | 
|  | unlock_slurmctld(job_write_lock); | 
|  | } | 
|  |  | 
|  | static void _do_diag_stats(long delta_t) | 
|  | { | 
|  | if (delta_t > slurmctld_diag_stats.schedule_cycle_max) | 
|  | slurmctld_diag_stats.schedule_cycle_max = delta_t; | 
|  |  | 
|  | slurmctld_diag_stats.schedule_cycle_sum += delta_t; | 
|  | slurmctld_diag_stats.schedule_cycle_last = delta_t; | 
|  | slurmctld_diag_stats.schedule_cycle_counter++; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Queue requests of job scheduler | 
|  | */ | 
|  | extern void schedule(bool full_queue) | 
|  | { | 
|  |  | 
|  | if (slurmctld_config.scheduling_disabled) | 
|  | return; | 
|  |  | 
|  | slurm_mutex_lock(&sched_mutex); | 
|  | sched_full_queue |= full_queue; | 
|  | slurm_cond_broadcast(&sched_cond); | 
|  | sched_requests++; | 
|  | slurm_mutex_unlock(&sched_mutex); | 
|  | } | 
|  |  | 
|  | /* detached thread periodically attempts to schedule jobs */ | 
|  | static void *_sched_agent(void *args) | 
|  | { | 
|  | long delta_t; | 
|  | struct timeval now; | 
|  | int job_cnt; | 
|  | bool full_queue; | 
|  |  | 
|  | #if HAVE_SYS_PRCTL_H | 
|  | if (prctl(PR_SET_NAME, "sched_agent", NULL, NULL, NULL) < 0) { | 
|  | error("cannot set my name to _sched_agent %m"); | 
|  | } | 
|  | #endif | 
|  |  | 
|  | while (true) { | 
|  | slurm_mutex_lock(&sched_mutex); | 
|  | while (true) { | 
|  | if (slurmctld_config.shutdown_time) { | 
|  | slurm_mutex_unlock(&sched_mutex); | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | gettimeofday(&now, NULL); | 
|  | delta_t  = (now.tv_sec  - sched_last.tv_sec) * | 
|  | USEC_IN_SEC; | 
|  | delta_t +=  now.tv_usec - sched_last.tv_usec; | 
|  |  | 
|  | if (sched_requests && delta_t > sched_min_interval ) { | 
|  | break; | 
|  | } else if (sched_requests) { | 
|  | struct timespec ts = {0, 0}; | 
|  | int64_t nsec; | 
|  |  | 
|  | nsec = sched_min_interval + sched_last.tv_usec; | 
|  | nsec *= NSEC_IN_USEC; | 
|  | nsec += NSEC_IN_USEC; | 
|  | ts.tv_sec = sched_last.tv_sec + | 
|  | (nsec / NSEC_IN_SEC); | 
|  | ts.tv_nsec = nsec % NSEC_IN_SEC; | 
|  | slurm_cond_timedwait(&sched_cond, | 
|  | &sched_mutex, &ts); | 
|  | } else { | 
|  | slurm_cond_wait(&sched_cond, &sched_mutex); | 
|  | } | 
|  | } | 
|  |  | 
|  | full_queue = sched_full_queue; | 
|  | sched_full_queue = false; | 
|  | sched_requests = 0; | 
|  | slurm_mutex_unlock(&sched_mutex); | 
|  |  | 
|  | job_cnt = _schedule(full_queue); | 
|  | gettimeofday(&now, NULL); | 
|  | sched_last.tv_sec  = now.tv_sec; | 
|  | sched_last.tv_usec = now.tv_usec; | 
|  | if (job_cnt) { | 
|  | /* jobs were started, save state */ | 
|  | schedule_node_save();		/* Has own locking */ | 
|  | schedule_job_save();		/* Has own locking */ | 
|  | } | 
|  | } | 
|  |  | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | /* Determine if job's deadline specification is still valid, kill job if not | 
|  | * job_ptr IN - Job to test | 
|  | * func IN - function name used for logging | 
|  | * RET - true of valid, false if invalid and job cancelled | 
|  | */ | 
|  | extern bool deadline_ok(job_record_t *job_ptr, const char *func) | 
|  | { | 
|  | time_t now; | 
|  | char time_str_deadline[256]; | 
|  | bool fail_job = false; | 
|  | time_t inter; | 
|  |  | 
|  | now = time(NULL); | 
|  | if ((job_ptr->time_min) && (job_ptr->time_min != NO_VAL)) { | 
|  | inter = now + job_ptr->time_min * 60; | 
|  | if (job_ptr->deadline < inter) { | 
|  | slurm_make_time_str(&job_ptr->deadline, | 
|  | time_str_deadline, | 
|  | sizeof(time_str_deadline)); | 
|  | info("%s: %pJ with time_min %u exceeded deadline %s and cancelled", | 
|  | func, job_ptr, job_ptr->time_min, | 
|  | time_str_deadline); | 
|  | fail_job = true; | 
|  | } | 
|  | } else if ((job_ptr->time_limit != NO_VAL) && | 
|  | (job_ptr->time_limit != INFINITE)) { | 
|  | inter = now + job_ptr->time_limit * 60; | 
|  | if (job_ptr->deadline < inter) { | 
|  | slurm_make_time_str(&job_ptr->deadline, | 
|  | time_str_deadline, | 
|  | sizeof(time_str_deadline)); | 
|  | info("%s: %pJ with time_limit %u exceeded deadline %s and cancelled", | 
|  | func, job_ptr, job_ptr->time_limit, | 
|  | time_str_deadline); | 
|  | fail_job = true; | 
|  | } | 
|  | } | 
|  | if (fail_job) { | 
|  | last_job_update = now; | 
|  | job_state_set(job_ptr, JOB_DEADLINE); | 
|  | job_ptr->exit_code = 1; | 
|  | job_ptr->state_reason = FAIL_DEADLINE; | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->start_time = now; | 
|  | job_ptr->end_time = now; | 
|  | srun_allocate_abort(job_ptr); | 
|  | job_completion_logger(job_ptr, false); | 
|  | return false; | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * When an array job is rejected for some reason, the remaining array tasks will | 
|  | * get skipped by both the main scheduler and the backfill scheduler (it's an | 
|  | * optimization). Hence, their reasons should match the reason of the first job. | 
|  | * This function sets those reasons. | 
|  | * | 
|  | * job_ptr		(IN) The current job being evaluated, after it has gone | 
|  | * 			through the scheduling loop. | 
|  | * reject_array_job	(IN) A pointer to the first job (array task) in the most | 
|  | * 			recently rejected array job. If job_ptr belongs to the | 
|  | * 			same array as reject_array_job, then set job_ptr's | 
|  | * 			reason to match reject_array_job. | 
|  | */ | 
|  | extern void fill_array_reasons(job_record_t *job_ptr, | 
|  | job_record_t *reject_array_job) | 
|  | { | 
|  | if (!reject_array_job || !reject_array_job->array_job_id) | 
|  | return; | 
|  |  | 
|  | if (job_ptr == reject_array_job) | 
|  | return; | 
|  |  | 
|  | /* | 
|  | * If the current job is part of the rejected job array... | 
|  | * And if the reason isn't properly set yet... | 
|  | */ | 
|  | if ((job_ptr->array_job_id == reject_array_job->array_job_id) && | 
|  | (job_ptr->state_reason != reject_array_job->state_reason)) { | 
|  | /* Set the reason for the subsequent array task */ | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->state_reason = reject_array_job->state_reason; | 
|  | last_job_update = time(NULL); | 
|  | debug3("%s: Setting reason of array task %pJ to %s", | 
|  | __func__, job_ptr, | 
|  | job_state_reason_string(job_ptr->state_reason)); | 
|  | } | 
|  | } | 
|  |  | 
|  | static job_queue_rec_t *_create_job_queue_rec(job_queue_req_t *job_queue_req) | 
|  | { | 
|  | job_queue_rec_t *job_queue_rec = xmalloc(sizeof(*job_queue_rec)); | 
|  | job_queue_rec->array_task_id = job_queue_req->job_ptr->array_task_id; | 
|  | job_queue_rec->job_id   = job_queue_req->job_ptr->job_id; | 
|  | job_queue_rec->job_ptr  = job_queue_req->job_ptr; | 
|  | job_queue_rec->part_ptr = job_queue_req->part_ptr; | 
|  | job_queue_rec->priority = job_queue_req->prio; | 
|  | job_queue_rec->qos_ptr = job_queue_req->job_ptr->qos_ptr; | 
|  | job_queue_rec->resv_ptr = job_queue_req->resv_ptr; | 
|  |  | 
|  | return job_queue_rec; | 
|  | } | 
|  |  | 
|  | extern void job_queue_append_internal(job_queue_req_t *job_queue_req) | 
|  | { | 
|  | job_queue_rec_t *job_queue_rec; | 
|  |  | 
|  | xassert(job_queue_req); | 
|  | xassert(job_queue_req->job_ptr); | 
|  | xassert(job_queue_req->job_queue); | 
|  | xassert(job_queue_req->part_ptr); | 
|  |  | 
|  | if (job_queue_req->job_ptr->details && | 
|  | job_queue_req->job_ptr->details->prefer) { | 
|  | job_queue_rec = _create_job_queue_rec(job_queue_req); | 
|  | job_queue_rec->use_prefer = true; | 
|  | list_append(job_queue_req->job_queue, job_queue_rec); | 
|  | } | 
|  |  | 
|  | job_queue_rec = _create_job_queue_rec(job_queue_req); | 
|  |  | 
|  | list_append(job_queue_req->job_queue, job_queue_rec); | 
|  | } | 
|  |  | 
|  | static void _set_features(job_record_t *job_ptr, bool use_prefer) | 
|  | { | 
|  | /* | 
|  | * feature_list_use is a temporary variable and should | 
|  | * be reset before each use. Do this after the check for | 
|  | * pending because the job could have started with | 
|  | * "preferred" job_queue_rec. | 
|  | */ | 
|  | if (use_prefer) { | 
|  | job_ptr->details->features_use = | 
|  | job_ptr->details->prefer; | 
|  | job_ptr->details->feature_list_use = | 
|  | job_ptr->details->prefer_list; | 
|  | } else { | 
|  | job_ptr->details->features_use = | 
|  | job_ptr->details->features; | 
|  | job_ptr->details->feature_list_use = | 
|  | job_ptr->details->feature_list; | 
|  | } | 
|  | } | 
|  |  | 
|  | static void _set_schedule_exit(schedule_exit_t code) | 
|  | { | 
|  | xassert(code < SCHEDULE_EXIT_COUNT); | 
|  |  | 
|  | slurmctld_diag_stats.schedule_exit[code]++; | 
|  | } | 
|  |  | 
|  | static int _get_nodes_in_reservations(void *x, void *arg) | 
|  | { | 
|  | slurmctld_resv_t *resv_ptr = x; | 
|  | bitstr_t *node_bitmap = arg; | 
|  |  | 
|  | xassert(resv_ptr); | 
|  | xassert(node_bitmap); | 
|  |  | 
|  | if (resv_ptr->node_bitmap) | 
|  | bit_or(node_bitmap, resv_ptr->node_bitmap); | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | static int _schedule(bool full_queue) | 
|  | { | 
|  | list_t *job_queue = NULL; | 
|  | int job_cnt = 0; | 
|  | int error_code, i, time_limit, pend_time; | 
|  | uint32_t job_depth = 0, array_task_id; | 
|  | job_queue_rec_t *job_queue_rec; | 
|  | job_record_t *job_ptr = NULL; | 
|  | part_record_t *part_ptr, *skip_part_ptr = NULL; | 
|  | bitstr_t *save_avail_node_bitmap; | 
|  | int bb_wait_cnt = 0; | 
|  | /* Locks: Read config, write job, write node, read partition */ | 
|  | slurmctld_lock_t job_write_lock = | 
|  | { READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; | 
|  | bool is_job_array_head; | 
|  | static time_t sched_update = 0; | 
|  | static bool assoc_limit_stop = false; | 
|  | static int sched_timeout = 0; | 
|  | static int sched_max_job_start = 0; | 
|  | static int bf_min_age_reserve = 0; | 
|  | static uint32_t bf_min_prio_reserve = 0; | 
|  | static bool bf_licenses = false; | 
|  | static int def_job_limit = 100; | 
|  | static int max_jobs_per_part = 0; | 
|  | static int defer_rpc_cnt = 0; | 
|  | static bool reduce_completing_frag = false; | 
|  | time_t now, last_job_sched_start, sched_start; | 
|  | job_record_t *reject_array_job = NULL; | 
|  | part_record_t *reject_array_part = NULL; | 
|  | slurmctld_resv_t *reject_array_resv = NULL; | 
|  | bool reject_array_use_prefer = false; | 
|  | bool use_prefer; | 
|  | bool fail_by_part, wait_on_resv, fail_by_part_non_reserve; | 
|  | uint32_t deadline_time_limit, save_time_limit = 0; | 
|  | uint32_t prio_reserve; | 
|  | DEF_TIMERS; | 
|  | job_node_select_t job_node_select = { 0 }; | 
|  | static bool ignore_prefer_val = false; | 
|  |  | 
|  | if (slurmctld_config.shutdown_time) | 
|  | return 0; | 
|  |  | 
|  | if (sched_update != slurm_conf.last_update) { | 
|  | char *tmp_ptr; | 
|  |  | 
|  | if (xstrcasestr(slurm_conf.sched_params, "assoc_limit_stop")) | 
|  | assoc_limit_stop = true; | 
|  | else | 
|  | assoc_limit_stop = false; | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "batch_sched_delay="))) { | 
|  | batch_sched_delay = atoi(tmp_ptr + 18); | 
|  | if (batch_sched_delay < 0) { | 
|  | error("Invalid batch_sched_delay: %d", | 
|  | batch_sched_delay); | 
|  | batch_sched_delay = 3; | 
|  | } | 
|  | } else { | 
|  | batch_sched_delay = 3; | 
|  | } | 
|  |  | 
|  | bb_array_stage_cnt = 10; | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "bb_array_stage_cnt="))) { | 
|  | int task_cnt = atoi(tmp_ptr + 19); | 
|  | if (task_cnt > 0) | 
|  | bb_array_stage_cnt = task_cnt; | 
|  | } | 
|  |  | 
|  | bf_min_age_reserve = 0; | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "bf_min_age_reserve="))) { | 
|  | int min_age = atoi(tmp_ptr + 19); | 
|  | if (min_age > 0) | 
|  | bf_min_age_reserve = min_age; | 
|  | } | 
|  |  | 
|  | bf_min_prio_reserve = 0; | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "bf_min_prio_reserve="))) { | 
|  | int64_t min_prio = (int64_t) atoll(tmp_ptr + 20); | 
|  | if (min_prio > 0) | 
|  | bf_min_prio_reserve = (uint32_t) min_prio; | 
|  | } | 
|  |  | 
|  | bf_licenses = false; | 
|  | if (xstrcasestr(slurm_conf.sched_params, "bf_licenses")) { | 
|  | if (!xstrcmp(slurm_conf.schedtype, "sched/builtin")) | 
|  | error("Ignoring SchedulerParameters=bf_licenses, this option is incompatible with sched/builtin."); | 
|  | else | 
|  | bf_licenses = true; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "build_queue_timeout="))) { | 
|  | build_queue_timeout = atoi(tmp_ptr + 20); | 
|  | if (build_queue_timeout < 100) { | 
|  | error("Invalid build_queue_time: %d", | 
|  | build_queue_timeout); | 
|  | build_queue_timeout = BUILD_TIMEOUT; | 
|  | } | 
|  | } else { | 
|  | build_queue_timeout = BUILD_TIMEOUT; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "correspond_after_task_cnt="))) { | 
|  | correspond_after_task_cnt = atoi(tmp_ptr + 26); | 
|  | if (correspond_after_task_cnt < | 
|  | CORRESPOND_ARRAY_TASK_CNT) { | 
|  | error("Invalid correspond_after_task_cnt: %d, the value can't be lower than %d", | 
|  | correspond_after_task_cnt, | 
|  | CORRESPOND_ARRAY_TASK_CNT); | 
|  | correspond_after_task_cnt = | 
|  | CORRESPOND_ARRAY_TASK_CNT; | 
|  | } | 
|  | } else { | 
|  | correspond_after_task_cnt = CORRESPOND_ARRAY_TASK_CNT; | 
|  | } | 
|  |  | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "default_queue_depth="))) { | 
|  | def_job_limit = atoi(tmp_ptr + 20); | 
|  | if (def_job_limit < 0) { | 
|  | error("ignoring SchedulerParameters: " | 
|  | "default_queue_depth value of %d", | 
|  | def_job_limit); | 
|  | def_job_limit = 100; | 
|  | } | 
|  | } else { | 
|  | def_job_limit = 100; | 
|  | } | 
|  |  | 
|  | bf_hetjob_prio = 0; | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "bf_hetjob_prio="))) { | 
|  | tmp_ptr += 15; | 
|  | if (!xstrncasecmp(tmp_ptr, "min", 3)) | 
|  | bf_hetjob_prio |= HETJOB_PRIO_MIN; | 
|  | else if (!xstrncasecmp(tmp_ptr, "max", 3)) | 
|  | bf_hetjob_prio |= HETJOB_PRIO_MAX; | 
|  | else if (!xstrncasecmp(tmp_ptr, "avg", 3)) | 
|  | bf_hetjob_prio |= HETJOB_PRIO_AVG; | 
|  | else | 
|  | error("Invalid SchedulerParameters bf_hetjob_prio: %s", | 
|  | tmp_ptr); | 
|  | } | 
|  |  | 
|  | bf_hetjob_immediate = false; | 
|  | if (xstrcasestr(slurm_conf.sched_params, "bf_hetjob_immediate")) | 
|  | bf_hetjob_immediate = true; | 
|  |  | 
|  | if (bf_hetjob_immediate && !bf_hetjob_prio) { | 
|  | bf_hetjob_prio |= HETJOB_PRIO_MIN; | 
|  | info("bf_hetjob_immediate automatically sets bf_hetjob_prio=min"); | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "partition_job_depth="))) { | 
|  | max_jobs_per_part = atoi(tmp_ptr + 20); | 
|  | if (max_jobs_per_part < 0) { | 
|  | error("ignoring SchedulerParameters: " | 
|  | "partition_job_depth value of %d", | 
|  | max_jobs_per_part); | 
|  | max_jobs_per_part = 0; | 
|  | } | 
|  | } else { | 
|  | max_jobs_per_part = 0; | 
|  | } | 
|  |  | 
|  | if (xstrcasestr(slurm_conf.sched_params, | 
|  | "reduce_completing_frag")) | 
|  | reduce_completing_frag = true; | 
|  | else | 
|  | reduce_completing_frag = false; | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "max_rpc_cnt="))) | 
|  | defer_rpc_cnt = atoi(tmp_ptr + 12); | 
|  | else if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "max_rpc_count="))) | 
|  | defer_rpc_cnt = atoi(tmp_ptr + 14); | 
|  | else | 
|  | defer_rpc_cnt = 0; | 
|  | if (defer_rpc_cnt < 0) { | 
|  | error("Invalid max_rpc_cnt: %d", defer_rpc_cnt); | 
|  | defer_rpc_cnt = 0; | 
|  | } | 
|  |  | 
|  | time_limit = slurm_conf.msg_timeout / 2; | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "max_sched_time="))) { | 
|  | sched_timeout = atoi(tmp_ptr + 15); | 
|  | if ((sched_timeout <= 0) || | 
|  | (sched_timeout > time_limit)) { | 
|  | error("Invalid max_sched_time: %d", | 
|  | sched_timeout); | 
|  | sched_timeout = 0; | 
|  | } | 
|  | } else { | 
|  | sched_timeout = 0; | 
|  | } | 
|  | if (sched_timeout == 0) { | 
|  | sched_timeout = MAX(time_limit, 1); | 
|  | sched_timeout = MIN(sched_timeout, 2); | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "sched_interval="))) { | 
|  | sched_interval = atoi(tmp_ptr + 15); | 
|  | if (sched_interval == -1) { | 
|  | sched_debug("schedule() returning, sched_interval=-1"); | 
|  | /* | 
|  | * Exit without setting sched_update.  This gets | 
|  | * verbose, but makes this setting easy to | 
|  | * happen. | 
|  | * | 
|  | * No memory is allocated above this. | 
|  | */ | 
|  | return 0; | 
|  | } else if (sched_interval < 0) { | 
|  | error("Invalid sched_interval: %d", | 
|  | sched_interval); | 
|  | sched_interval = 60; | 
|  | } | 
|  | } else { | 
|  | sched_interval = 60; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "sched_min_interval="))) { | 
|  | i = atoi(tmp_ptr + 19); | 
|  | if (i < 0) | 
|  | error("Invalid sched_min_interval: %d", i); | 
|  | else | 
|  | sched_min_interval = i; | 
|  | } else { | 
|  | sched_min_interval = 2; | 
|  | } | 
|  |  | 
|  | if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, | 
|  | "sched_max_job_start="))) { | 
|  | sched_max_job_start = atoi(tmp_ptr + 20); | 
|  | if (sched_max_job_start < 0) { | 
|  | error("Invalid sched_max_job_start: %d", | 
|  | sched_max_job_start); | 
|  | sched_max_job_start = 0; | 
|  | } | 
|  | } else { | 
|  | sched_max_job_start = 0; | 
|  | } | 
|  |  | 
|  | if (xstrcasestr(slurm_conf.sched_params, | 
|  | "ignore_prefer_validation")) | 
|  | ignore_prefer_val = true; | 
|  | else | 
|  | ignore_prefer_val = false; | 
|  |  | 
|  | sched_update = slurm_conf.last_update; | 
|  | if (slurm_conf.sched_params && strlen(slurm_conf.sched_params)) | 
|  | info("SchedulerParameters=%s", slurm_conf.sched_params); | 
|  | } | 
|  |  | 
|  | slurm_mutex_lock(&slurmctld_config.thread_count_lock); | 
|  | if ((defer_rpc_cnt > 0) && | 
|  | (slurmctld_config.server_thread_count >= defer_rpc_cnt)) { | 
|  | sched_debug("schedule() returning, too many RPCs"); | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  | goto out; | 
|  | } | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  |  | 
|  | if (!fed_mgr_sibs_synced()) { | 
|  | sched_info("schedule() returning, federation siblings not synced yet"); | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | lock_slurmctld(job_write_lock); | 
|  | now = time(NULL); | 
|  | sched_start = now; | 
|  | last_job_sched_start = now; | 
|  | START_TIMER; | 
|  | if (!reduce_completing_frag && job_is_completing(NULL)) { | 
|  | unlock_slurmctld(job_write_lock); | 
|  | sched_debug("schedule() returning, some job is still completing"); | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | (void) list_for_each(part_list, _foreach_setup_part_sched, NULL); | 
|  | (void) list_for_each(resv_list, _foreach_setup_resv_sched, NULL); | 
|  |  | 
|  | save_avail_node_bitmap = bit_copy(avail_node_bitmap); | 
|  |  | 
|  | /* Avoid resource fragmentation if important */ | 
|  | if (reduce_completing_frag) { | 
|  | bitstr_t *eff_cg_bitmap = bit_alloc(node_record_count); | 
|  | if (job_is_completing(eff_cg_bitmap)) { | 
|  | part_reduce_frag_t part_reduce_frag = { | 
|  | .eff_cg_bitmap = eff_cg_bitmap, | 
|  | }; | 
|  | (void) list_for_each(part_list, | 
|  | _foreach_part_reduce_frag, | 
|  | &part_reduce_frag); | 
|  | if (part_reduce_frag.cg_part_str) { | 
|  | sched_debug("some job is still completing, skipping partitions '%s'", | 
|  | part_reduce_frag.cg_part_str); | 
|  | xfree(part_reduce_frag.cg_part_str); | 
|  | } | 
|  | } | 
|  | FREE_NULL_BITMAP(eff_cg_bitmap); | 
|  | } | 
|  |  | 
|  | sched_debug("Running job scheduler %s.", full_queue ? "for full queue":"for default depth"); | 
|  | job_queue = build_job_queue(false, false); | 
|  | slurmctld_diag_stats.schedule_queue_len = list_count(job_queue); | 
|  | sort_job_queue(job_queue); | 
|  |  | 
|  | job_ptr = NULL; | 
|  | wait_on_resv = false; | 
|  | while (1) { | 
|  | /* Run some final guaranteed logic after each job iteration */ | 
|  | if (job_ptr) { | 
|  | job_resv_clear_magnetic_flag(job_ptr); | 
|  | fill_array_reasons(job_ptr, reject_array_job); | 
|  | } | 
|  |  | 
|  | job_queue_rec = list_pop(job_queue); | 
|  | if (!job_queue_rec) { | 
|  | _set_schedule_exit(SCHEDULE_EXIT_END); | 
|  | break; | 
|  | } | 
|  | array_task_id = job_queue_rec->array_task_id; | 
|  | job_ptr = job_queue_rec->job_ptr; | 
|  | part_ptr = job_queue_rec->part_ptr; | 
|  |  | 
|  | if ((job_ptr->array_task_id != array_task_id) && | 
|  | (array_task_id == NO_VAL)) { | 
|  | /* Job array element started in other partition, | 
|  | * reset pointer to "master" job array record */ | 
|  | job_ptr = find_job_record(job_ptr->array_job_id); | 
|  | job_queue_rec->job_ptr = job_ptr; | 
|  | } | 
|  | if (!job_ptr || | 
|  | !IS_JOB_PENDING(job_ptr) || /* started in other part/qos */ | 
|  | !job_ptr->priority) { /* held from fail in other part/qos */ | 
|  | xfree(job_queue_rec); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | use_prefer = job_queue_rec->use_prefer; | 
|  | _set_features(job_ptr, use_prefer); | 
|  |  | 
|  | if (job_ptr->resv_list) | 
|  | job_queue_rec_resv_list(job_queue_rec); | 
|  | else | 
|  | job_queue_rec_magnetic_resv(job_queue_rec); | 
|  |  | 
|  | if (!_job_runnable_test3(job_ptr, part_ptr)) { | 
|  | xfree(job_queue_rec); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | job_ptr->qos_ptr = job_queue_rec->qos_ptr; | 
|  | job_ptr->part_ptr = part_ptr; | 
|  | job_ptr->priority = job_queue_rec->priority; | 
|  |  | 
|  | xfree(job_queue_rec); | 
|  |  | 
|  | job_ptr->last_sched_eval = time(NULL); | 
|  |  | 
|  | if (job_ptr->preempt_in_progress) | 
|  | continue;	/* scheduled in another partition */ | 
|  |  | 
|  | if (job_ptr->het_job_id) { | 
|  | fail_by_part = true; | 
|  | fail_by_part_non_reserve = false; | 
|  | goto fail_this_part; | 
|  | } | 
|  |  | 
|  | if (job_ptr->array_recs && (job_ptr->array_task_id == NO_VAL)) | 
|  | is_job_array_head = true; | 
|  | else | 
|  | is_job_array_head = false; | 
|  |  | 
|  | next_task: | 
|  | if ((time(NULL) - sched_start) >= sched_timeout) { | 
|  | sched_debug("loop taking too long, breaking out"); | 
|  | _set_schedule_exit(SCHEDULE_EXIT_TIMEOUT); | 
|  | break; | 
|  | } | 
|  | if (sched_max_job_start && (job_cnt >= sched_max_job_start)) { | 
|  | sched_debug("sched_max_job_start reached, breaking out"); | 
|  | _set_schedule_exit(SCHEDULE_EXIT_MAX_JOB_START); | 
|  | break; | 
|  | } | 
|  |  | 
|  | if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) { | 
|  | if (reject_array_job && | 
|  | (reject_array_job->array_job_id == | 
|  | job_ptr->array_job_id) && | 
|  | (reject_array_part == part_ptr) && | 
|  | (reject_array_resv == job_ptr->resv_ptr) && | 
|  | (reject_array_use_prefer == use_prefer)) | 
|  | continue;  /* already rejected array element */ | 
|  |  | 
|  |  | 
|  | /* assume reject whole array for now, clear if OK */ | 
|  | reject_array_job = job_ptr; | 
|  | reject_array_part = part_ptr; | 
|  | reject_array_resv = job_ptr->resv_ptr; | 
|  | reject_array_use_prefer = use_prefer; | 
|  |  | 
|  | if (!job_array_start_test(job_ptr)) | 
|  | continue; | 
|  | } | 
|  | if (max_jobs_per_part && | 
|  | (max_jobs_per_part < ++job_ptr->part_ptr->num_sched_jobs)) { | 
|  | if (job_ptr->state_reason == WAIT_NO_REASON) { | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->state_reason = WAIT_PRIORITY; | 
|  | last_job_update = now; | 
|  | } | 
|  | if (job_ptr->part_ptr == skip_part_ptr) | 
|  | continue; | 
|  | sched_debug2("reached partition %s job limit", | 
|  | job_ptr->part_ptr->name); | 
|  | skip_part_ptr = job_ptr->part_ptr; | 
|  | continue; | 
|  | } | 
|  | if (!full_queue && (job_depth++ > def_job_limit)) { | 
|  | sched_debug("already tested %u jobs, breaking out", | 
|  | job_depth); | 
|  | _set_schedule_exit(SCHEDULE_EXIT_MAX_DEPTH); | 
|  | break; | 
|  | } | 
|  |  | 
|  | slurm_mutex_lock(&slurmctld_config.thread_count_lock); | 
|  | if ((defer_rpc_cnt > 0) && | 
|  | (slurmctld_config.server_thread_count >= defer_rpc_cnt)) { | 
|  | sched_debug("schedule() returning, too many RPCs"); | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  | _set_schedule_exit(SCHEDULE_EXIT_RPC_CNT); | 
|  | break; | 
|  | } | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  |  | 
|  | if (job_limits_check(&job_ptr, false) != WAIT_NO_REASON) { | 
|  | /* should never happen */ | 
|  | continue; | 
|  | } | 
|  |  | 
|  | slurmctld_diag_stats.schedule_cycle_depth++; | 
|  |  | 
|  | if (job_ptr->resv_name) { | 
|  | /* | 
|  | * If we have a MaxStartDelay we need to make sure we | 
|  | * don't schedule any jobs that could potentially run to | 
|  | * avoid starvation of this job. | 
|  | */ | 
|  | if (job_ptr->resv_ptr && | 
|  | job_ptr->resv_ptr->max_start_delay) | 
|  | wait_on_resv = true; | 
|  |  | 
|  | if (job_ptr->resv_ptr->flags & | 
|  | RESERVE_FLAG_SCHED_FAILED) { | 
|  | job_ptr->state_reason = WAIT_PRIORITY; | 
|  | xfree(job_ptr->state_desc); | 
|  | last_job_update = now; | 
|  | sched_debug3("%pJ. State=PENDING. Reason=Priority. Priority=%u. Resv=%s.", | 
|  | job_ptr, | 
|  | job_ptr->priority, | 
|  | job_ptr->resv_name); | 
|  | continue; | 
|  | } | 
|  | } else if (job_ptr->part_ptr->flags & PART_FLAG_SCHED_FAILED) { | 
|  | if (!(job_ptr->part_ptr->flags & | 
|  | PART_FLAG_SCHED_CLEARED)) { | 
|  | bit_and_not(avail_node_bitmap, | 
|  | part_ptr->node_bitmap); | 
|  | job_ptr->part_ptr->flags |= | 
|  | PART_FLAG_SCHED_CLEARED; | 
|  | } | 
|  |  | 
|  | if ((job_ptr->state_reason == WAIT_NO_REASON) || | 
|  | (job_ptr->state_reason == WAIT_RESOURCES)) { | 
|  | sched_debug("%pJ unable to schedule in Partition=%s (per PART_FLAG_SCHED_FAILED). State=PENDING. Previous-Reason=%s. Previous-Desc=%s. New-Reason=Priority. Priority=%u.", | 
|  | job_ptr, | 
|  | job_ptr->part_ptr->name, | 
|  | job_state_reason_string( | 
|  | job_ptr->state_reason), | 
|  | job_ptr->state_desc, | 
|  | job_ptr->priority); | 
|  | job_ptr->state_reason = WAIT_PRIORITY; | 
|  | xfree(job_ptr->state_desc); | 
|  | last_job_update = now; | 
|  | } else { | 
|  | /* | 
|  | * Log job can not run even though we are not | 
|  | * overriding the reason */ | 
|  | sched_debug2("%pJ. unable to schedule in Partition=%s (per PART_FLAG_SCHED_FAILED). Retaining previous scheduling Reason=%s. Desc=%s. Priority=%u.", | 
|  | job_ptr, | 
|  | job_ptr->part_ptr->name, | 
|  | job_state_reason_string( | 
|  | job_ptr->state_reason), | 
|  | job_ptr->state_desc, | 
|  | job_ptr->priority); | 
|  | } | 
|  | last_job_update = now; | 
|  |  | 
|  | continue; | 
|  | } else if (wait_on_resv && | 
|  | (job_ptr->warn_flags & KILL_JOB_RESV)) { | 
|  | sched_debug("%pJ. State=PENDING. Reason=Priority, Priority=%u. May be able to backfill on MaxStartDelay reservations.", | 
|  | job_ptr, job_ptr->priority); | 
|  | continue; | 
|  |  | 
|  | } | 
|  |  | 
|  | /* Test for valid QOS and required nodes on each pass */ | 
|  | if (job_ptr->qos_ptr) { | 
|  | assoc_mgr_lock_t locks = | 
|  | { .assoc = READ_LOCK, .qos = READ_LOCK }; | 
|  |  | 
|  | assoc_mgr_lock(&locks); | 
|  | if (job_ptr->assoc_ptr | 
|  | && (accounting_enforce & ACCOUNTING_ENFORCE_QOS) | 
|  | && ((job_ptr->qos_ptr->id >= g_qos_count) || | 
|  | !job_ptr->assoc_ptr->usage || | 
|  | !job_ptr->assoc_ptr->usage->valid_qos || | 
|  | !bit_test(job_ptr->assoc_ptr->usage->valid_qos, | 
|  | job_ptr->qos_ptr->id)) | 
|  | && !job_ptr->limit_set.qos) { | 
|  | assoc_mgr_unlock(&locks); | 
|  | sched_debug("%pJ has invalid QOS", job_ptr); | 
|  | job_fail_qos(job_ptr, __func__, false); | 
|  | last_job_update = now; | 
|  | continue; | 
|  | } else if (job_ptr->state_reason == FAIL_QOS) { | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->state_reason = WAIT_NO_REASON; | 
|  | last_job_update = now; | 
|  | } | 
|  | assoc_mgr_unlock(&locks); | 
|  | } | 
|  |  | 
|  | deadline_time_limit = 0; | 
|  | if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) { | 
|  | if (!deadline_ok(job_ptr, __func__)) | 
|  | continue; | 
|  |  | 
|  | deadline_time_limit = job_ptr->deadline - now; | 
|  | deadline_time_limit /= 60; | 
|  | if ((job_ptr->time_limit != NO_VAL) && | 
|  | (job_ptr->time_limit != INFINITE)) { | 
|  | deadline_time_limit = MIN(job_ptr->time_limit, | 
|  | deadline_time_limit); | 
|  | } else { | 
|  | if ((job_ptr->part_ptr->default_time != NO_VAL) && | 
|  | (job_ptr->part_ptr->default_time != INFINITE)){ | 
|  | deadline_time_limit = MIN( | 
|  | job_ptr->part_ptr->default_time, | 
|  | deadline_time_limit); | 
|  | } else if ((job_ptr->part_ptr->max_time != NO_VAL) && | 
|  | (job_ptr->part_ptr->max_time != INFINITE)){ | 
|  | deadline_time_limit = MIN( | 
|  | job_ptr->part_ptr->max_time, | 
|  | deadline_time_limit); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | if (job_state_reason_check(job_ptr->state_reason, | 
|  | JSR_QOS_ASSOC) && | 
|  | !acct_policy_job_runnable_pre_select(job_ptr, false)) | 
|  | continue; | 
|  |  | 
|  | if ((job_ptr->state_reason == WAIT_NODE_NOT_AVAIL) && | 
|  | job_ptr->details && job_ptr->details->req_node_bitmap && | 
|  | !bit_super_set(job_ptr->details->req_node_bitmap, | 
|  | avail_node_bitmap)) { | 
|  | continue; | 
|  | } | 
|  |  | 
|  | if (!job_ptr->part_ptr) | 
|  | continue; | 
|  | i = bit_overlap(avail_node_bitmap, | 
|  | job_ptr->part_ptr->node_bitmap); | 
|  | if ((job_ptr->details && | 
|  | (job_ptr->details->min_nodes != NO_VAL) && | 
|  | (job_ptr->details->min_nodes >  i)) || | 
|  | (!job_ptr->details && (i == 0))) { | 
|  | /* | 
|  | * Too many nodes DRAIN, DOWN, or | 
|  | * reserved for jobs in higher priority partition | 
|  | */ | 
|  | job_ptr->state_reason = WAIT_RESOURCES; | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->state_desc = | 
|  | xstrdup_printf("Nodes required for job are DOWN, DRAINED%s or reserved for jobs in higher priority partitions", | 
|  | bit_overlap(rs_node_bitmap, | 
|  | job_ptr->part_ptr-> | 
|  | node_bitmap) ? ", REBOOTING" : ""); | 
|  | last_job_update = now; | 
|  | sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u. Partition=%s.", | 
|  | job_ptr, | 
|  | job_state_string(job_ptr->job_state), | 
|  | job_state_reason_string( | 
|  | job_ptr->state_reason), | 
|  | job_ptr->priority, job_ptr->partition); | 
|  | fail_by_part = true; | 
|  | fail_by_part_non_reserve = false; | 
|  | goto fail_this_part; | 
|  | } | 
|  |  | 
|  | if (assoc_mgr_validate_assoc_id(acct_db_conn, | 
|  | job_ptr->assoc_id, | 
|  | accounting_enforce)) { | 
|  | /* NOTE: This only happens if a user's account is | 
|  | * disabled between when the job was submitted and | 
|  | * the time we consider running it. It should be | 
|  | * very rare. */ | 
|  | sched_info("%pJ has invalid account", job_ptr); | 
|  | last_job_update = now; | 
|  | job_ptr->state_reason = FAIL_ACCOUNT; | 
|  | xfree(job_ptr->state_desc); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | last_job_sched_start = MAX(last_job_sched_start, | 
|  | job_ptr->start_time); | 
|  | if (deadline_time_limit) { | 
|  | save_time_limit = job_ptr->time_limit; | 
|  | job_ptr->time_limit = deadline_time_limit; | 
|  | } | 
|  |  | 
|  | /* get fed job lock from origin cluster */ | 
|  | if (fed_mgr_job_lock(job_ptr)) { | 
|  | error_code = ESLURM_FED_JOB_LOCK; | 
|  | goto skip_start; | 
|  | } | 
|  |  | 
|  | job_node_select.job_ptr = job_ptr; | 
|  | error_code = select_nodes(&job_node_select, | 
|  | false, false, | 
|  | SLURMDB_JOB_FLAG_SCHED); | 
|  |  | 
|  | if (error_code == SLURM_SUCCESS) { | 
|  | /* | 
|  | * If the following fails because of network | 
|  | * connectivity, the origin cluster should ask | 
|  | * when it comes back up if the cluster_lock | 
|  | * cluster actually started the job | 
|  | */ | 
|  | fed_mgr_job_start(job_ptr, job_ptr->start_time); | 
|  | } else { | 
|  | /* | 
|  | * Node config unavailable plus state_reason | 
|  | * FAIL_BAD_CONSTRAINTS causes the job to be held | 
|  | * later. If job specs were unsatisfied due to | 
|  | * --prefer, give the opportunity to test the record | 
|  | * without it in a second attempt by resetting | 
|  | * state_reason to FAIL_CONSTRAINTS. | 
|  | */ | 
|  | if (ignore_prefer_val && job_ptr->details->prefer && | 
|  | job_ptr->details->prefer_list && | 
|  | (job_ptr->details->prefer_list == | 
|  | job_ptr->details->feature_list_use) && | 
|  | (error_code == | 
|  | ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && | 
|  | (job_ptr->state_reason == FAIL_BAD_CONSTRAINTS)) { | 
|  | sched_debug2("StateReason='%s' set after evaluating %pJ in partition %s (maybe unsatisfied due to --prefer while ignore_prefer_validation configured). Re-testing without --prefer if needed.", | 
|  | job_state_reason_string(job_ptr->state_reason), job_ptr, job_ptr->part_ptr->name); | 
|  | job_ptr->state_reason = FAIL_CONSTRAINTS; | 
|  | } | 
|  |  | 
|  | fed_mgr_job_unlock(job_ptr); | 
|  | } | 
|  |  | 
|  | skip_start: | 
|  |  | 
|  | fail_by_part = false; | 
|  | fail_by_part_non_reserve = false; | 
|  | if ((error_code != SLURM_SUCCESS) && deadline_time_limit) | 
|  | job_ptr->time_limit = save_time_limit; | 
|  | if (error_code == ESLURM_NODES_BUSY) { | 
|  | sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u. Partition=%s.", | 
|  | job_ptr, | 
|  | job_state_string(job_ptr->job_state), | 
|  | job_state_reason_string( | 
|  | job_ptr->state_reason), | 
|  | job_ptr->priority, job_ptr->partition); | 
|  | fail_by_part = true; | 
|  | } else if (error_code == ESLURM_LICENSES_UNAVAILABLE) { | 
|  | sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u.", | 
|  | job_ptr, | 
|  | job_state_string(job_ptr->job_state), | 
|  | job_state_reason_string( | 
|  | job_ptr->state_reason), | 
|  | job_ptr->priority); | 
|  | if (bf_licenses) { | 
|  | sched_debug("%pJ is blocked on licenses. Stopping scheduling so license backfill can handle this", | 
|  | job_ptr); | 
|  | _set_schedule_exit(SCHEDULE_EXIT_LIC); | 
|  | break; | 
|  | } | 
|  | } else if (error_code == ESLURM_BURST_BUFFER_WAIT) { | 
|  | if (job_ptr->start_time == 0) { | 
|  | job_ptr->start_time = last_job_sched_start; | 
|  | bb_wait_cnt++; | 
|  | /* | 
|  | * Since start time wasn't set yet until this | 
|  | * point, this means that the job hasn't had a | 
|  | * chance to start stage-in yet. Clear | 
|  | * reject_array_job so that other jobs in this | 
|  | * array (if it was an array) may also have | 
|  | * a chance to have a start time set and | 
|  | * therefore have a chance to start stage-in. | 
|  | */ | 
|  | reject_array_job = NULL; | 
|  | reject_array_part = NULL; | 
|  | reject_array_resv = NULL; | 
|  | } | 
|  | sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u.", | 
|  | job_ptr, | 
|  | job_state_string(job_ptr->job_state), | 
|  | job_state_reason_string( | 
|  | job_ptr->state_reason), | 
|  | job_ptr->priority); | 
|  | continue; | 
|  | } else if ((error_code == ESLURM_RESERVATION_BUSY) || | 
|  | (error_code == ESLURM_RESERVATION_NOT_USABLE)) { | 
|  | if (job_ptr->resv_ptr && | 
|  | job_ptr->resv_ptr->node_bitmap) { | 
|  | sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u.", | 
|  | job_ptr, | 
|  | job_state_string(job_ptr->job_state), | 
|  | job_state_reason_string( | 
|  | job_ptr->state_reason), | 
|  | job_ptr->priority); | 
|  | bit_and_not(avail_node_bitmap, | 
|  | job_ptr->resv_ptr->node_bitmap); | 
|  | } else { | 
|  | /* | 
|  | * The job has no reservation but requires | 
|  | * nodes that are currently in some reservation | 
|  | * so just skip over this job and try running | 
|  | * the next lower priority job | 
|  | */ | 
|  | sched_debug3("%pJ. State=%s. Reason=Required nodes are reserved. Priority=%u", | 
|  | job_ptr, | 
|  | job_state_string(job_ptr->job_state), | 
|  | job_ptr->priority); | 
|  | } | 
|  | } else if (error_code == ESLURM_FED_JOB_LOCK) { | 
|  | job_ptr->state_reason = WAIT_FED_JOB_LOCK; | 
|  | xfree(job_ptr->state_desc); | 
|  | last_job_update = now; | 
|  | sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u. Partition=%s. Couldn't get federation job lock.", | 
|  | job_ptr, | 
|  | job_state_string(job_ptr->job_state), | 
|  | job_state_reason_string( | 
|  | job_ptr->state_reason), | 
|  | job_ptr->priority, job_ptr->partition); | 
|  | fail_by_part = true; | 
|  | } else if (error_code == SLURM_SUCCESS) { | 
|  | /* job initiated */ | 
|  | sched_debug3("%pJ initiated", job_ptr); | 
|  | last_job_update = now; | 
|  |  | 
|  | /* Clear assumed rejected array status */ | 
|  | reject_array_job = NULL; | 
|  | reject_array_part = NULL; | 
|  | reject_array_resv = NULL; | 
|  |  | 
|  | sched_info("Allocate %pJ NodeList=%s #CPUs=%u Partition=%s", | 
|  | job_ptr, job_ptr->nodes, | 
|  | job_ptr->total_cpus, | 
|  | job_ptr->part_ptr->name); | 
|  | if (job_ptr->batch_flag == 0) | 
|  | srun_allocate(job_ptr); | 
|  | else if (!IS_JOB_CONFIGURING(job_ptr)) | 
|  | launch_job(job_ptr); | 
|  | rebuild_job_part_list(job_ptr); | 
|  | job_cnt++; | 
|  | if (is_job_array_head && | 
|  | (job_ptr->array_task_id != NO_VAL)) { | 
|  | /* Try starting another task of the job array */ | 
|  | job_record_t *tmp = job_ptr; | 
|  | job_ptr = find_job_record(job_ptr->array_job_id); | 
|  | if (job_ptr && (job_ptr != tmp) && | 
|  | IS_JOB_PENDING(job_ptr) && | 
|  | (bb_g_job_test_stage_in(job_ptr, false) == | 
|  | 1)) { | 
|  | _set_features(job_ptr, use_prefer); | 
|  | goto next_task; | 
|  | } | 
|  | } | 
|  | continue; | 
|  | } else if ((error_code == | 
|  | ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && | 
|  | (job_ptr->resv_ptr)) { | 
|  | debug("%pJ non-runnable in reservation %s: %s", | 
|  | job_ptr, job_ptr->resv_ptr->name, | 
|  | slurm_strerror(error_code)); | 
|  | } else if ((error_code == | 
|  | ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && | 
|  | job_ptr->part_ptr_list) { | 
|  | debug("%pJ non-runnable in partition %s: %s", | 
|  | job_ptr, job_ptr->part_ptr->name, | 
|  | slurm_strerror(error_code)); | 
|  | } else if ((error_code == | 
|  | ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && | 
|  | (job_ptr->state_reason == FAIL_CONSTRAINTS)) { | 
|  | sched_info("%pJ current node constraints not satisfied", | 
|  | job_ptr); | 
|  | /* | 
|  | * Future node updates may satisfy the constraints, so | 
|  | * do not hold the job. | 
|  | */ | 
|  | } else if (error_code == ESLURM_ACCOUNTING_POLICY) { | 
|  | sched_debug3("%pJ delayed for accounting policy", | 
|  | job_ptr); | 
|  | /* potentially starve this job */ | 
|  | if (assoc_limit_stop) | 
|  | fail_by_part = true; | 
|  | } else if (error_code == ESLURM_MAX_POWERED_NODES) { | 
|  | sched_debug2("%pJ cannot start: %s", | 
|  | job_ptr, slurm_strerror(error_code)); | 
|  | job_ptr->state_reason = WAIT_MAX_POWERED_NODES; | 
|  | xfree(job_ptr->state_desc); | 
|  | } else if (error_code == ESLURM_PORTS_BUSY) { | 
|  | /* | 
|  | * This can only happen if using stepd step manager. | 
|  | * The nodes selected for the job ran out of ports. | 
|  | */ | 
|  | fail_by_part = true; | 
|  | job_ptr->state_reason = WAIT_MPI_PORTS_BUSY; | 
|  | xfree(job_ptr->state_desc); | 
|  | sched_debug3("%pJ. State=%s. Reason=%s. Priority=%u. Partition=%s.", | 
|  | job_ptr, | 
|  | job_state_string(job_ptr->job_state), | 
|  | job_state_reason_string( | 
|  | job_ptr->state_reason), | 
|  | job_ptr->priority, job_ptr->partition); | 
|  | } else if ((error_code != | 
|  | ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) && | 
|  | (error_code != ESLURM_NODE_NOT_AVAIL)      && | 
|  | (error_code != ESLURM_INVALID_BURST_BUFFER_REQUEST)){ | 
|  | sched_info("schedule: %pJ non-runnable: %s", | 
|  | job_ptr, slurm_strerror(error_code)); | 
|  | last_job_update = now; | 
|  | job_state_set(job_ptr, JOB_PENDING); | 
|  | job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->start_time = job_ptr->end_time = now; | 
|  | job_ptr->priority = 0; | 
|  | debug2("%s: setting %pJ to \"%s\" (%s)", | 
|  | __func__, job_ptr, | 
|  | job_state_reason_string(job_ptr->state_reason), | 
|  | slurm_strerror(error_code)); | 
|  | } | 
|  |  | 
|  | if (job_ptr->details && job_ptr->details->req_node_bitmap && | 
|  | (bit_set_count(job_ptr->details->req_node_bitmap) >= | 
|  | job_ptr->details->min_nodes)) { | 
|  | fail_by_part = false; | 
|  | /* Do not schedule more jobs on nodes required by this | 
|  | * job, but don't block the entire queue/partition. */ | 
|  | bit_and_not(avail_node_bitmap, | 
|  | job_ptr->details->req_node_bitmap); | 
|  | } | 
|  | if (fail_by_part && job_ptr->resv_name) { | 
|  | /* | 
|  | * If the reservation is not FLEX or ANY_NODES, other | 
|  | * jobs in this partition can be scheduled. | 
|  | * | 
|  | * Jobs submitted to FLEX or ANY_NODES reservations can | 
|  | * use nodes outside of the reservation. If the | 
|  | * reservation is FLEX or ANY_NODES, other jobs in | 
|  | * this partition submitted to other reservations can | 
|  | * be scheduled. | 
|  | * | 
|  | * In both cases, do not schedule more jobs in this | 
|  | * reservation. | 
|  | */ | 
|  | if ((job_ptr->resv_ptr->flags & RESERVE_FLAG_FLEX) || | 
|  | (job_ptr->resv_ptr->flags & RESERVE_FLAG_ANY_NODES)) | 
|  | fail_by_part_non_reserve = true; | 
|  | else | 
|  | fail_by_part = false; | 
|  |  | 
|  | job_ptr->resv_ptr->flags |= RESERVE_FLAG_SCHED_FAILED; | 
|  | } | 
|  | if (fail_by_part && bf_min_age_reserve) { | 
|  | /* Consider other jobs in this partition if job has been | 
|  | * waiting for less than bf_min_age_reserve time */ | 
|  | if (job_ptr->details->begin_time == 0) { | 
|  | fail_by_part = false; | 
|  | } else { | 
|  | pend_time = difftime( | 
|  | now, job_ptr->details->begin_time); | 
|  | if (pend_time < bf_min_age_reserve) | 
|  | fail_by_part = false; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (!(prio_reserve = acct_policy_get_prio_thresh( | 
|  | job_ptr, false))) | 
|  | prio_reserve = bf_min_prio_reserve; | 
|  |  | 
|  | if (fail_by_part && prio_reserve && | 
|  | (job_ptr->priority < prio_reserve)) | 
|  | fail_by_part = false; | 
|  |  | 
|  | fail_this_part:	if (fail_by_part) { | 
|  | /* Search for duplicates */ | 
|  | if (job_ptr->part_ptr->flags & PART_FLAG_SCHED_FAILED) { | 
|  | fail_by_part = false; | 
|  | break; | 
|  | } | 
|  | } | 
|  | if (fail_by_part) { | 
|  | /* | 
|  | * Do not schedule more jobs in this partition or on | 
|  | * nodes in this partition | 
|  | */ | 
|  | job_ptr->part_ptr->flags |= PART_FLAG_SCHED_FAILED; | 
|  |  | 
|  | if (fail_by_part_non_reserve) { | 
|  | /* | 
|  | * If a FLEX or ANY_NODES reservation job fails | 
|  | * by part, remove all nodes that are not in | 
|  | * reservations from avail_node_bitmap. | 
|  | * | 
|  | * Jobs submitted to FLEX or ANY_NODES | 
|  | * reservations can be scheduled on nodes | 
|  | * outside of the reservation. If we allowed | 
|  | * lower priority jobs to be scheduled on nodes | 
|  | * not in this reservation, they could delay | 
|  | * the higher priority job submitted to this | 
|  | * reservation. | 
|  | * | 
|  | * We only remove nodes not in reservations, | 
|  | * so lower priority jobs submitted to other | 
|  | * reservations can still be scheduled. | 
|  | * | 
|  | * We don't mark the partition as being | 
|  | * cleared. Once the first non-reservation job | 
|  | * in the partition gets evaluated, which cannot | 
|  | * be scheduled since non-reserved nodes have | 
|  | * been removed, the partition's reserved nodes | 
|  | * will be removed and it will be marked as | 
|  | * cleared. | 
|  | */ | 
|  | bitstr_t *remove_nodes = | 
|  | bit_alloc(node_record_count); | 
|  |  | 
|  | list_for_each(resv_list, | 
|  | _get_nodes_in_reservations, | 
|  | remove_nodes); | 
|  | bit_not(remove_nodes); | 
|  | bit_and(remove_nodes, | 
|  | job_ptr->part_ptr->node_bitmap); | 
|  | bit_and_not(avail_node_bitmap, remove_nodes); | 
|  | FREE_NULL_BITMAP(remove_nodes); | 
|  | } else { | 
|  | job_ptr->part_ptr->flags |= | 
|  | PART_FLAG_SCHED_CLEARED; | 
|  | bit_and_not(avail_node_bitmap, | 
|  | job_ptr->part_ptr->node_bitmap); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | if (bb_wait_cnt) | 
|  | (void) bb_g_job_try_stage_in(); | 
|  |  | 
|  | if (job_ptr) | 
|  | job_resv_clear_magnetic_flag(job_ptr); | 
|  | FREE_NULL_BITMAP(avail_node_bitmap); | 
|  | avail_node_bitmap = save_avail_node_bitmap; | 
|  | FREE_NULL_LIST(job_queue); | 
|  |  | 
|  | slurm_mutex_lock(&slurmctld_config.thread_count_lock); | 
|  | if ((slurmctld_config.server_thread_count >= 150) && | 
|  | (defer_rpc_cnt == 0)) { | 
|  | sched_info("%d pending RPCs at cycle end, consider configuring max_rpc_cnt", | 
|  | slurmctld_config.server_thread_count); | 
|  | } | 
|  | slurm_mutex_unlock(&slurmctld_config.thread_count_lock); | 
|  | unlock_slurmctld(job_write_lock); | 
|  | END_TIMER2(__func__); | 
|  |  | 
|  | _do_diag_stats(DELTA_TIMER); | 
|  |  | 
|  | out: | 
|  | return job_cnt; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * sort_job_queue - sort job_queue in descending priority order | 
|  | * IN/OUT job_queue - sorted job queue | 
|  | */ | 
|  | extern void sort_job_queue(list_t *job_queue) | 
|  | { | 
|  | list_sort(job_queue, sort_job_queue2); | 
|  | } | 
|  |  | 
|  | /* Note this differs from the ListCmpF typedef since we want jobs sorted | 
|  | * in order of decreasing priority then submit time and the by increasing | 
|  | * job id */ | 
|  | extern int sort_job_queue2(void *x, void *y) | 
|  | { | 
|  | job_queue_rec_t *job_rec1 = *(job_queue_rec_t **) x; | 
|  | job_queue_rec_t *job_rec2 = *(job_queue_rec_t **) y; | 
|  | het_job_details_t *details = NULL; | 
|  | bool has_resv1, has_resv2; | 
|  | static time_t config_update = 0; | 
|  | static bool preemption_enabled = true; | 
|  | uint32_t job_id1, job_id2; | 
|  | uint32_t p1, p2; | 
|  |  | 
|  | /* The following block of code is designed to minimize run time in | 
|  | * typical configurations for this frequently executed function. */ | 
|  | if (config_update != slurm_conf.last_update) { | 
|  | preemption_enabled = slurm_preemption_enabled(); | 
|  | config_update = slurm_conf.last_update; | 
|  | } | 
|  | if (preemption_enabled) { | 
|  | if (preempt_g_job_preempt_check(job_rec1, job_rec2)) | 
|  | return -1; | 
|  | if (preempt_g_job_preempt_check(job_rec2, job_rec1)) | 
|  | return 1; | 
|  | } | 
|  |  | 
|  | if (bf_hetjob_prio && job_rec1->job_ptr->het_job_id && | 
|  | (job_rec1->job_ptr->het_job_id != | 
|  | job_rec2->job_ptr->het_job_id)) { | 
|  | if ((details = job_rec1->job_ptr->het_details)) | 
|  | has_resv1 = details->any_resv; | 
|  | else | 
|  | has_resv1 = (job_rec1->job_ptr->resv_id != 0) || | 
|  | job_rec1->resv_ptr; | 
|  | } else | 
|  | has_resv1 = (job_rec1->job_ptr->resv_id != 0) || | 
|  | job_rec1->resv_ptr; | 
|  |  | 
|  | if (bf_hetjob_prio && job_rec2->job_ptr->het_job_id && | 
|  | (job_rec2->job_ptr->het_job_id != | 
|  | job_rec1->job_ptr->het_job_id)) { | 
|  | if ((details = job_rec2->job_ptr->het_details)) | 
|  | has_resv2 = details->any_resv; | 
|  | else | 
|  | has_resv2 = (job_rec2->job_ptr->resv_id != 0) || | 
|  | job_rec2->resv_ptr; | 
|  | } else | 
|  | has_resv2 = (job_rec2->job_ptr->resv_id != 0) || | 
|  | job_rec2->resv_ptr; | 
|  |  | 
|  | if (has_resv1 && !has_resv2) | 
|  | return -1; | 
|  | if (!has_resv1 && has_resv2) | 
|  | return 1; | 
|  |  | 
|  | if (job_rec1->part_ptr && job_rec2->part_ptr) { | 
|  | if (bf_hetjob_prio && job_rec1->job_ptr->het_job_id && | 
|  | (job_rec1->job_ptr->het_job_id != | 
|  | job_rec2->job_ptr->het_job_id)) { | 
|  | if ((details = job_rec1->job_ptr->het_details)) | 
|  | p1 = details->priority_tier; | 
|  | else | 
|  | p1 = job_rec1->part_ptr->priority_tier; | 
|  | } else | 
|  | p1 = job_rec1->part_ptr->priority_tier; | 
|  |  | 
|  | if (bf_hetjob_prio && job_rec2->job_ptr->het_job_id && | 
|  | (job_rec2->job_ptr->het_job_id != | 
|  | job_rec1->job_ptr->het_job_id)) { | 
|  | if ((details = job_rec2->job_ptr->het_details)) | 
|  | p2 = details->priority_tier; | 
|  | else | 
|  | p2 = job_rec2->part_ptr->priority_tier; | 
|  | } else | 
|  | p2 = job_rec2->part_ptr->priority_tier; | 
|  |  | 
|  | if (p1 < p2) | 
|  | return 1; | 
|  | if (p1 > p2) | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | if (bf_hetjob_prio && job_rec1->job_ptr->het_job_id && | 
|  | (job_rec1->job_ptr->het_job_id != | 
|  | job_rec2->job_ptr->het_job_id)) { | 
|  | if ((details = job_rec1->job_ptr->het_details)) | 
|  | p1 = details->priority; | 
|  | else { | 
|  | if (job_rec1->job_ptr->part_ptr_list && | 
|  | job_rec1->job_ptr->prio_mult && | 
|  | job_rec1->job_ptr->prio_mult->priority_array) | 
|  | p1 = job_rec1->priority; | 
|  | else | 
|  | p1 = job_rec1->job_ptr->priority; | 
|  | } | 
|  | } else { | 
|  | if (job_rec1->job_ptr->part_ptr_list && | 
|  | job_rec1->job_ptr->prio_mult && | 
|  | job_rec1->job_ptr->prio_mult->priority_array) | 
|  | p1 = job_rec1->priority; | 
|  | else | 
|  | p1 = job_rec1->job_ptr->priority; | 
|  | } | 
|  |  | 
|  | if (bf_hetjob_prio && job_rec2->job_ptr->het_job_id && | 
|  | (job_rec2->job_ptr->het_job_id != | 
|  | job_rec1->job_ptr->het_job_id)) { | 
|  | if ((details = job_rec2->job_ptr->het_details)) | 
|  | p2 = details->priority; | 
|  | else { | 
|  | if (job_rec2->job_ptr->part_ptr_list && | 
|  | job_rec2->job_ptr->prio_mult && | 
|  | job_rec2->job_ptr->prio_mult->priority_array) | 
|  | p2 = job_rec2->priority; | 
|  | else | 
|  | p2 = job_rec2->job_ptr->priority; | 
|  | } | 
|  | } else { | 
|  | if (job_rec2->job_ptr->part_ptr_list && | 
|  | job_rec2->job_ptr->prio_mult && | 
|  | job_rec2->job_ptr->prio_mult->priority_array) | 
|  | p2 = job_rec2->priority; | 
|  | else | 
|  | p2 = job_rec2->job_ptr->priority; | 
|  | } | 
|  |  | 
|  | if (p1 < p2) | 
|  | return 1; | 
|  | if (p1 > p2) | 
|  | return -1; | 
|  |  | 
|  | /* If the priorities are the same sort by submission time */ | 
|  | if (job_rec1->job_ptr->details && job_rec2->job_ptr->details) { | 
|  | if (job_rec1->job_ptr->details->submit_time > | 
|  | job_rec2->job_ptr->details->submit_time) | 
|  | return 1; | 
|  | if (job_rec2->job_ptr->details->submit_time > | 
|  | job_rec1->job_ptr->details->submit_time) | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | /* If the submission times are the same sort by increasing job id's */ | 
|  | if (job_rec1->array_task_id == NO_VAL) | 
|  | job_id1 = job_rec1->job_id; | 
|  | else | 
|  | job_id1 = job_rec1->job_ptr->array_job_id; | 
|  | if (job_rec2->array_task_id == NO_VAL) | 
|  | job_id2 = job_rec2->job_id; | 
|  | else | 
|  | job_id2 = job_rec2->job_ptr->array_job_id; | 
|  | if (job_id1 > job_id2) | 
|  | return 1; | 
|  | else if (job_id1 < job_id2) | 
|  | return -1; | 
|  |  | 
|  | /* If job IDs match compare task IDs */ | 
|  | if (job_rec1->array_task_id > job_rec2->array_task_id) | 
|  | return 1; | 
|  |  | 
|  | /* Magnetic or multi-reservation. */ | 
|  | if (job_rec1->resv_ptr && job_rec2->resv_ptr && | 
|  | (job_rec1->resv_ptr->start_time > job_rec2->resv_ptr->start_time)) | 
|  | return 1; | 
|  |  | 
|  | if (job_rec1->use_prefer && !job_rec2->use_prefer) | 
|  | return -1; | 
|  | else if (!job_rec1->use_prefer && job_rec2->use_prefer) | 
|  | return 1; | 
|  |  | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | /* The environment" variable is points to one big xmalloc. In order to | 
|  | * manipulate the array for a hetjob, we need to split it into an array | 
|  | * containing multiple xmalloc variables */ | 
|  | static void _split_env(batch_job_launch_msg_t *launch_msg_ptr) | 
|  | { | 
|  | int i; | 
|  |  | 
|  | for (i = 1; i < launch_msg_ptr->envc; i++) { | 
|  | launch_msg_ptr->environment[i] = | 
|  | xstrdup(launch_msg_ptr->environment[i]); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Given a scheduled job, return a pointer to it batch_job_launch_msg_t data */ | 
|  | static batch_job_launch_msg_t *_build_launch_job_msg(job_record_t *job_ptr, | 
|  | uint16_t protocol_version) | 
|  | { | 
|  | char *fail_why = NULL; | 
|  | batch_job_launch_msg_t *launch_msg_ptr; | 
|  |  | 
|  | /* Initialization of data structures */ | 
|  | launch_msg_ptr = (batch_job_launch_msg_t *) | 
|  | xmalloc(sizeof(batch_job_launch_msg_t)); | 
|  | launch_msg_ptr->job_id = job_ptr->job_id; | 
|  | launch_msg_ptr->het_job_id = job_ptr->het_job_id; | 
|  | launch_msg_ptr->array_job_id = job_ptr->array_job_id; | 
|  | launch_msg_ptr->array_task_id = job_ptr->array_task_id; | 
|  |  | 
|  | if (!(launch_msg_ptr->script_buf = get_job_script(job_ptr))) { | 
|  | fail_why = "Unable to load job batch script"; | 
|  | goto job_failed; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * We only want send the number of tasks if we explicitly requested | 
|  | * them: num_tasks could be set (job_mgr.c | 
|  | * _figure_out_num_tasks()). Otherwise a step requesting less than the | 
|  | * allocation will be polluted with this calculated task count | 
|  | * erroneously. | 
|  | */ | 
|  | if (job_ptr->bit_flags & JOB_NTASKS_SET) | 
|  | launch_msg_ptr->ntasks = job_ptr->details->num_tasks; | 
|  | launch_msg_ptr->container = xstrdup(job_ptr->container); | 
|  | launch_msg_ptr->cpu_freq_min = job_ptr->details->cpu_freq_min; | 
|  | launch_msg_ptr->cpu_freq_max = job_ptr->details->cpu_freq_max; | 
|  | launch_msg_ptr->cpu_freq_gov = job_ptr->details->cpu_freq_gov; | 
|  | launch_msg_ptr->nodes = xstrdup(job_ptr->nodes); | 
|  | launch_msg_ptr->overcommit = job_ptr->details->overcommit; | 
|  | launch_msg_ptr->open_mode  = job_ptr->details->open_mode; | 
|  | launch_msg_ptr->cpus_per_task = job_ptr->details->cpus_per_task; | 
|  | launch_msg_ptr->pn_min_memory = job_ptr->details->pn_min_memory; | 
|  | launch_msg_ptr->restart_cnt   = job_ptr->restart_cnt; | 
|  | launch_msg_ptr->profile       = job_ptr->profile; | 
|  |  | 
|  | if (make_batch_job_cred(launch_msg_ptr, job_ptr, protocol_version)) { | 
|  | error("%s: slurm_cred_create failure for %pJ, holding job", | 
|  | __func__, job_ptr); | 
|  | slurm_free_job_launch_msg(launch_msg_ptr); | 
|  | job_mgr_handle_cred_failure(job_ptr); | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | launch_msg_ptr->acctg_freq = xstrdup(job_ptr->details->acctg_freq); | 
|  | if (job_ptr->part_ptr) | 
|  | launch_msg_ptr->partition = xstrdup(job_ptr->part_ptr->name); | 
|  | else | 
|  | launch_msg_ptr->partition = xstrdup(job_ptr->partition); | 
|  | launch_msg_ptr->std_err = xstrdup(job_ptr->details->std_err); | 
|  | launch_msg_ptr->std_in = xstrdup(job_ptr->details->std_in); | 
|  | launch_msg_ptr->std_out = xstrdup(job_ptr->details->std_out); | 
|  | launch_msg_ptr->work_dir = xstrdup(job_ptr->details->work_dir); | 
|  |  | 
|  | launch_msg_ptr->argc = job_ptr->details->argc; | 
|  | launch_msg_ptr->argv = xduparray(job_ptr->details->argc, | 
|  | job_ptr->details->argv); | 
|  | launch_msg_ptr->spank_job_env_size = job_ptr->spank_job_env_size; | 
|  | launch_msg_ptr->spank_job_env = xduparray(job_ptr->spank_job_env_size, | 
|  | job_ptr->spank_job_env); | 
|  | launch_msg_ptr->environment = get_job_env(job_ptr, | 
|  | &launch_msg_ptr->envc); | 
|  | if (!launch_msg_ptr->container && !launch_msg_ptr->environment) { | 
|  | fail_why = "Unable to load job environment"; | 
|  | goto job_failed; | 
|  | } | 
|  |  | 
|  | _split_env(launch_msg_ptr); | 
|  |  | 
|  | if (job_ptr->bit_flags & STEPMGR_ENABLED) { | 
|  | env_array_overwrite(&launch_msg_ptr->environment, | 
|  | "SLURM_STEPMGR", job_ptr->batch_host); | 
|  | /* Update envc if env was added to */ | 
|  | launch_msg_ptr->envc = | 
|  | PTR_ARRAY_SIZE(launch_msg_ptr->environment) - 1; | 
|  | } | 
|  |  | 
|  | launch_msg_ptr->job_mem = job_ptr->details->pn_min_memory; | 
|  | launch_msg_ptr->num_cpu_groups = job_ptr->job_resrcs->cpu_array_cnt; | 
|  | launch_msg_ptr->cpus_per_node  = xmalloc( | 
|  | sizeof(uint16_t) * job_ptr->job_resrcs->cpu_array_cnt); | 
|  | memcpy(launch_msg_ptr->cpus_per_node, | 
|  | job_ptr->job_resrcs->cpu_array_value, | 
|  | (sizeof(uint16_t) * job_ptr->job_resrcs->cpu_array_cnt)); | 
|  | launch_msg_ptr->cpu_count_reps  = xmalloc( | 
|  | sizeof(uint32_t) * job_ptr->job_resrcs->cpu_array_cnt); | 
|  | memcpy(launch_msg_ptr->cpu_count_reps, | 
|  | job_ptr->job_resrcs->cpu_array_reps, | 
|  | (sizeof(uint32_t) * job_ptr->job_resrcs->cpu_array_cnt)); | 
|  |  | 
|  | launch_msg_ptr->account = xstrdup(job_ptr->account); | 
|  | if (job_ptr->qos_ptr) | 
|  | launch_msg_ptr->qos = xstrdup(job_ptr->qos_ptr->name); | 
|  |  | 
|  | if (job_ptr->details->oom_kill_step != NO_VAL16) | 
|  | launch_msg_ptr->oom_kill_step = job_ptr->details->oom_kill_step; | 
|  | else | 
|  | launch_msg_ptr->oom_kill_step = | 
|  | slurm_conf.task_plugin_param & OOM_KILL_STEP; | 
|  | /* | 
|  | * Use resv_ptr->name instead of job_ptr->resv_name as the job | 
|  | * could contain multiple reservation names. | 
|  | */ | 
|  | if (job_ptr->resv_ptr) | 
|  | launch_msg_ptr->resv_name = xstrdup(job_ptr->resv_ptr->name); | 
|  |  | 
|  | launch_msg_ptr->tres_per_task = xstrdup(job_ptr->tres_per_task); | 
|  |  | 
|  | xassert(!fail_why); | 
|  | return launch_msg_ptr; | 
|  |  | 
|  | job_failed: | 
|  | /* fatal or kill the job as it can never be recovered */ | 
|  | if (!ignore_state_errors) | 
|  | fatal("%s: %s for %pJ. Check file system serving StateSaveLocation as that directory may be missing or corrupted. Start with '-i' to ignore this error and kill the afflicted jobs.", | 
|  | __func__, fail_why, job_ptr); | 
|  |  | 
|  | error("%s: %s for %pJ. %pJ will be killed due to system error.", | 
|  | __func__, fail_why, job_ptr, job_ptr); | 
|  | xfree(job_ptr->state_desc); | 
|  | job_ptr->state_desc = xstrdup(fail_why); | 
|  | job_ptr->state_reason = FAIL_SYSTEM; | 
|  | last_job_update = time(NULL); | 
|  | slurm_free_job_launch_msg(launch_msg_ptr); | 
|  | /* ignore the return as job is in an unknown state anyway */ | 
|  | job_complete(job_ptr->job_id, slurm_conf.slurm_user_id, false, false, | 
|  | 1); | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | static int _foreach_het_job_ready(void *x, void *arg) | 
|  | { | 
|  | job_record_t *het_job = x; | 
|  | het_job_ready_t *ready_struct = arg; | 
|  | bool prolog = false; | 
|  |  | 
|  | if (ready_struct->het_job_leader->het_job_id != het_job->het_job_id) { | 
|  | error("%s: Bad het_job_list for %pJ", | 
|  | __func__, ready_struct->het_job_leader); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | ready_struct->het_job = het_job; | 
|  |  | 
|  | if (het_job->details) | 
|  | prolog = het_job->details->prolog_running; | 
|  | if (prolog || IS_JOB_CONFIGURING(het_job) || | 
|  | !test_job_nodes_ready(het_job)) { | 
|  | ready_struct->het_job_leader = NULL; | 
|  | return -1; | 
|  | } | 
|  | if (!ready_struct->job_ptr->batch_flag || | 
|  | (!IS_JOB_RUNNING(ready_struct->job_ptr) && | 
|  | !IS_JOB_SUSPENDED(ready_struct->job_ptr))) { | 
|  | ready_struct->het_job_leader = NULL; | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | ready_struct->het_job = NULL; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* Validate the job is ready for launch | 
|  | * RET pointer to batch job to launch or NULL if not ready yet */ | 
|  | static job_record_t *_het_job_ready(job_record_t *job_ptr) | 
|  | { | 
|  | het_job_ready_t ready_struct = { 0 }; | 
|  |  | 
|  | if (job_ptr->het_job_id == 0)	/* Not a hetjob */ | 
|  | return job_ptr; | 
|  | ready_struct.het_job_leader = find_job_record(job_ptr->het_job_id); | 
|  | if (!ready_struct.het_job_leader) { | 
|  | error("Hetjob leader %pJ not found", job_ptr); | 
|  | return NULL; | 
|  | } | 
|  | if (!ready_struct.het_job_leader->het_job_list) { | 
|  | error("Hetjob leader %pJ lacks het_job_list", job_ptr); | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | ready_struct.job_ptr = job_ptr; | 
|  | (void) list_for_each(ready_struct.het_job_leader->het_job_list, | 
|  | _foreach_het_job_ready, &ready_struct); | 
|  |  | 
|  | if (ready_struct.het_job_leader) | 
|  | log_flag(HETJOB, "Batch hetjob %pJ being launched", | 
|  | ready_struct.het_job_leader); | 
|  | else if (ready_struct.het_job) | 
|  | log_flag(HETJOB, "Batch hetjob %pJ waiting for job to be ready", | 
|  | ready_struct.het_job); | 
|  |  | 
|  | return ready_struct.het_job_leader; | 
|  | } | 
|  |  | 
|  | static void _set_job_env(job_record_t *job, batch_job_launch_msg_t *launch) | 
|  | { | 
|  | if (job->name) | 
|  | env_array_overwrite(&launch->environment, "SLURM_JOB_NAME", | 
|  | job->name); | 
|  |  | 
|  | if (job->details->open_mode) { | 
|  | /* Propagate mode to spawned job using environment variable */ | 
|  | if (job->details->open_mode == OPEN_MODE_APPEND) | 
|  | env_array_overwrite(&launch->environment, | 
|  | "SLURM_OPEN_MODE", "a"); | 
|  | else | 
|  | env_array_overwrite(&launch->environment, | 
|  | "SLURM_OPEN_MODE", "t"); | 
|  | } | 
|  |  | 
|  | if (job->details->dependency) | 
|  | env_array_overwrite(&launch->environment, | 
|  | "SLURM_JOB_DEPENDENCY", | 
|  | job->details->dependency); | 
|  |  | 
|  | /* intentionally skipping SLURM_EXPORT_ENV */ | 
|  |  | 
|  | if (job->profile) { | 
|  | char tmp[128] = {0}; | 
|  | acct_gather_profile_to_string_r(job->profile, tmp); | 
|  | env_array_overwrite(&launch->environment, "SLURM_PROFILE", tmp); | 
|  | } | 
|  |  | 
|  | if (job->details->acctg_freq) | 
|  | env_array_overwrite(&launch->environment, "SLURM_ACCTG_FREQ", | 
|  | job->details->acctg_freq); | 
|  |  | 
|  | if (job->network) | 
|  | env_array_overwrite(&launch->environment, "SLURM_NETWORK", | 
|  | job->network); | 
|  |  | 
|  | if (job->details->cpu_freq_min || job->details->cpu_freq_max || | 
|  | job->details->cpu_freq_gov) { | 
|  | char *tmp = cpu_freq_to_cmdline(job->details->cpu_freq_min, | 
|  | job->details->cpu_freq_max, | 
|  | job->details->cpu_freq_gov); | 
|  |  | 
|  | if (tmp) | 
|  | env_array_overwrite(&launch->environment, | 
|  | "SLURM_CPU_FREQ_REQ", tmp); | 
|  |  | 
|  | xfree(tmp); | 
|  | } | 
|  |  | 
|  | if (job->details->segment_size) | 
|  | env_array_overwrite_fmt(&launch->environment, | 
|  | "SLURM_JOB_SEGMENT_SIZE", "%u", | 
|  | job->details->segment_size); | 
|  |  | 
|  | /* update size of env in case it changed */ | 
|  | if (launch->environment) | 
|  | launch->envc = PTR_ARRAY_SIZE(launch->environment) - 1; | 
|  | } | 
|  |  | 
|  | static int _foreach_set_het_job_env(void *x, void *arg) | 
|  | { | 
|  | job_record_t *het_job = x; | 
|  | het_job_env_t *het_job_env = arg; | 
|  | job_record_t *het_job_leader = het_job_env->het_job_leader; | 
|  | int het_job_offset = het_job_env->het_job_offset; | 
|  | batch_job_launch_msg_t *launch_msg_ptr = het_job_env->launch_msg_ptr; | 
|  | uint32_t num_cpus = 0; | 
|  | uint64_t tmp_mem = 0; | 
|  | char *tmp_str = NULL; | 
|  |  | 
|  | if (het_job_leader->het_job_id != het_job->het_job_id) { | 
|  | error("%s: Bad het_job_list for %pJ", | 
|  | __func__, het_job_leader); | 
|  | return 0; | 
|  | } | 
|  | if (het_job->account) { | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_JOB_ACCOUNT", | 
|  | het_job_offset, "%s", het_job->account); | 
|  | } | 
|  |  | 
|  | if (het_job->job_resrcs) { | 
|  | tmp_str = uint32_compressed_to_str( | 
|  | het_job->job_resrcs->cpu_array_cnt, | 
|  | het_job->job_resrcs->cpu_array_value, | 
|  | het_job->job_resrcs->cpu_array_reps); | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_JOB_CPUS_PER_NODE", | 
|  | het_job_offset, "%s", tmp_str); | 
|  | xfree(tmp_str); | 
|  | } | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_JOB_ID", | 
|  | het_job_offset, "%u", het_job->job_id); | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_JOB_NAME", | 
|  | het_job_offset, "%s", het_job->name); | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_JOB_NODELIST", | 
|  | het_job_offset, "%s", het_job->nodes); | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_JOB_NUM_NODES", | 
|  | het_job_offset, "%u", het_job->node_cnt); | 
|  | if (het_job->partition) { | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_JOB_PARTITION", | 
|  | het_job_offset, "%s", het_job->partition); | 
|  | } | 
|  | if (het_job->qos_ptr) { | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_JOB_QOS", | 
|  | het_job_offset, "%s", het_job->qos_ptr->name); | 
|  | } | 
|  | if (het_job->resv_ptr) { | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_JOB_RESERVATION", | 
|  | het_job_offset, "%s", het_job->resv_ptr->name); | 
|  | } | 
|  | if (het_job->details) | 
|  | tmp_mem = het_job->details->pn_min_memory; | 
|  | if (tmp_mem & MEM_PER_CPU) { | 
|  | tmp_mem &= (~MEM_PER_CPU); | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_MEM_PER_CPU", | 
|  | het_job_offset, "%"PRIu64"", tmp_mem); | 
|  | } else if (tmp_mem) { | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_MEM_PER_NODE", | 
|  | het_job_offset, "%"PRIu64"", tmp_mem); | 
|  | } | 
|  |  | 
|  | if (het_job->details && het_job->details->segment_size) | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, "SLURM_JOB_SEGMENT_SIZE", | 
|  | het_job_offset, "%u", het_job->details->segment_size); | 
|  |  | 
|  | if (het_job->details && het_job->job_resrcs) { | 
|  | /* Both should always be set for active jobs */ | 
|  | struct job_resources *resrcs_ptr = het_job->job_resrcs; | 
|  | slurm_step_layout_t *step_layout = NULL; | 
|  | uint16_t cpus_per_task_array[1]; | 
|  | uint32_t cpus_task_reps[1], task_dist; | 
|  | uint16_t cpus_per_task = 1; | 
|  | slurm_step_layout_req_t step_layout_req = { | 
|  | .cpu_count_reps = resrcs_ptr->cpu_array_reps, | 
|  | .cpus_per_node = resrcs_ptr->cpu_array_value, | 
|  | .cpus_per_task = cpus_per_task_array, | 
|  | .cpus_task_reps = cpus_task_reps, | 
|  | .num_hosts = het_job->node_cnt, | 
|  | .plane_size = NO_VAL16, | 
|  | }; | 
|  |  | 
|  | cpus_task_reps[0] = het_job->node_cnt; | 
|  |  | 
|  | for (int i = 0; i < resrcs_ptr->cpu_array_cnt; i++) { | 
|  | num_cpus += resrcs_ptr->cpu_array_value[i] * | 
|  | resrcs_ptr->cpu_array_reps[i]; | 
|  | } | 
|  |  | 
|  | if ((het_job->details->cpus_per_task > 0) && | 
|  | (het_job->details->cpus_per_task != NO_VAL16)) | 
|  | cpus_per_task = het_job->details->cpus_per_task; | 
|  |  | 
|  | cpus_per_task_array[0] = cpus_per_task; | 
|  | if (het_job->details->num_tasks) { | 
|  | step_layout_req.num_tasks = | 
|  | het_job->details->num_tasks; | 
|  | } else { | 
|  | step_layout_req.num_tasks = num_cpus / | 
|  | cpus_per_task; | 
|  | } | 
|  |  | 
|  | if ((step_layout_req.node_list = | 
|  | getenvp(launch_msg_ptr->environment, | 
|  | "SLURM_ARBITRARY_NODELIST"))) { | 
|  | task_dist = SLURM_DIST_ARBITRARY; | 
|  | } else { | 
|  | step_layout_req.node_list = het_job->nodes; | 
|  | task_dist = SLURM_DIST_BLOCK; | 
|  | } | 
|  | step_layout_req.task_dist = task_dist; | 
|  | step_layout = slurm_step_layout_create(&step_layout_req); | 
|  | if (step_layout) { | 
|  | tmp_str = uint16_array_to_str( | 
|  | step_layout->node_cnt, | 
|  | step_layout->tasks); | 
|  | slurm_step_layout_destroy(step_layout); | 
|  | (void) env_array_overwrite_het_fmt( | 
|  | &launch_msg_ptr->environment, | 
|  | "SLURM_TASKS_PER_NODE", | 
|  | het_job_offset,"%s", tmp_str); | 
|  | xfree(tmp_str); | 
|  | } | 
|  | } else if (IS_JOB_RUNNING(het_job)) { | 
|  | if (!het_job->details) | 
|  | error("%s: %pJ has null details member", | 
|  | __func__, het_job); | 
|  | if (!het_job->job_resrcs) | 
|  | error("%s: %pJ has null job_resrcs member", | 
|  | __func__, het_job); | 
|  | } | 
|  | het_job_env->het_job_offset++; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Set some hetjob environment variables. This will include information | 
|  | * about multiple job components (i.e. different slurmctld job records). | 
|  | */ | 
|  | static void _set_het_job_env(job_record_t *het_job_leader, | 
|  | batch_job_launch_msg_t *launch_msg_ptr) | 
|  | { | 
|  | int i; | 
|  | het_job_env_t het_job_env = { | 
|  | .het_job_leader = het_job_leader, | 
|  | .het_job_offset = 0, | 
|  | .launch_msg_ptr = launch_msg_ptr, | 
|  | }; | 
|  |  | 
|  | if (het_job_leader->het_job_id == 0) | 
|  | return; | 
|  | if (!launch_msg_ptr->environment) { | 
|  | error("%pJ lacks environment", het_job_leader); | 
|  | return; | 
|  | } | 
|  | if (!het_job_leader->het_job_list) { | 
|  | error("Hetjob leader %pJ lacks het_job_list", | 
|  | het_job_leader); | 
|  | return; | 
|  | } | 
|  |  | 
|  | (void) list_for_each(het_job_leader->het_job_list, | 
|  | _foreach_set_het_job_env, | 
|  | &het_job_env); | 
|  |  | 
|  | /* Continue support for old hetjob terminology. */ | 
|  | (void) env_array_overwrite_fmt(&launch_msg_ptr->environment, | 
|  | "SLURM_PACK_SIZE", "%d", | 
|  | het_job_env.het_job_offset); | 
|  | (void) env_array_overwrite_fmt(&launch_msg_ptr->environment, | 
|  | "SLURM_HET_SIZE", "%d", | 
|  | het_job_env.het_job_offset); | 
|  |  | 
|  | for (i = 0; launch_msg_ptr->environment[i]; i++) | 
|  | ; | 
|  | launch_msg_ptr->envc = i; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * launch_job - send an RPC to a slurmd to initiate a batch job | 
|  | * IN job_ptr - pointer to job that will be initiated | 
|  | */ | 
|  | extern void launch_job(job_record_t *job_ptr) | 
|  | { | 
|  | batch_job_launch_msg_t *launch_msg_ptr; | 
|  | uint16_t protocol_version = NO_VAL16; | 
|  | agent_arg_t *agent_arg_ptr; | 
|  | job_record_t *launch_job_ptr; | 
|  | node_record_t *node_ptr; | 
|  |  | 
|  | xassert(job_ptr); | 
|  | xassert(job_ptr->batch_flag); | 
|  |  | 
|  | if (job_ptr->total_cpus == 0) | 
|  | return; | 
|  |  | 
|  | launch_job_ptr = _het_job_ready(job_ptr); | 
|  | if (!launch_job_ptr) | 
|  | return; | 
|  |  | 
|  | if (pick_batch_host(launch_job_ptr) != SLURM_SUCCESS) | 
|  | return; | 
|  |  | 
|  | node_ptr = find_node_record(job_ptr->batch_host); | 
|  | if (node_ptr) | 
|  | protocol_version = node_ptr->protocol_version; | 
|  |  | 
|  | (void)build_batch_step(job_ptr); | 
|  |  | 
|  | launch_msg_ptr = _build_launch_job_msg(launch_job_ptr,protocol_version); | 
|  | if (launch_msg_ptr == NULL) | 
|  | return; | 
|  | if (launch_job_ptr->het_job_id) | 
|  | _set_het_job_env(launch_job_ptr, launch_msg_ptr); | 
|  |  | 
|  | _set_job_env(launch_job_ptr, launch_msg_ptr); | 
|  |  | 
|  | agent_arg_ptr = xmalloc(sizeof(agent_arg_t)); | 
|  | agent_arg_ptr->protocol_version = protocol_version; | 
|  | agent_arg_ptr->node_count = 1; | 
|  | agent_arg_ptr->retry = 0; | 
|  | xassert(job_ptr->batch_host); | 
|  | agent_arg_ptr->hostlist = hostlist_create(launch_job_ptr->batch_host); | 
|  | agent_arg_ptr->msg_type = REQUEST_BATCH_JOB_LAUNCH; | 
|  | agent_arg_ptr->msg_args = (void *) launch_msg_ptr; | 
|  | set_agent_arg_r_uid(agent_arg_ptr, SLURM_AUTH_UID_ANY); | 
|  |  | 
|  | /* Launch the RPC via agent */ | 
|  | agent_queue_request(agent_arg_ptr); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * make_batch_job_cred - add a job credential to the batch_job_launch_msg | 
|  | * IN/OUT launch_msg_ptr - batch_job_launch_msg in which job_id, step_id, | 
|  | *                         uid and nodes have already been set | 
|  | * IN job_ptr - pointer to job record | 
|  | * RET 0 or error code | 
|  | */ | 
|  | extern int make_batch_job_cred(batch_job_launch_msg_t *launch_msg_ptr, | 
|  | job_record_t *job_ptr, | 
|  | uint16_t protocol_version) | 
|  | { | 
|  | slurm_cred_arg_t cred_arg; | 
|  | job_resources_t *job_resrcs_ptr; | 
|  |  | 
|  | xassert(job_ptr->job_resrcs); | 
|  | job_resrcs_ptr = job_ptr->job_resrcs; | 
|  |  | 
|  | if (job_ptr->job_resrcs == NULL) { | 
|  | error("%s: %pJ is missing job_resrcs info", | 
|  | __func__, job_ptr); | 
|  | return SLURM_ERROR; | 
|  | } | 
|  |  | 
|  | setup_cred_arg(&cred_arg, job_ptr); | 
|  |  | 
|  | cred_arg.step_id.job_id = launch_msg_ptr->job_id; | 
|  | cred_arg.step_id.step_id = SLURM_BATCH_SCRIPT; | 
|  | cred_arg.step_id.step_het_comp = NO_VAL; | 
|  | if (job_resrcs_ptr->memory_allocated) { | 
|  | int batch_inx = job_get_node_inx( | 
|  | job_ptr->batch_host, job_ptr->node_bitmap); | 
|  |  | 
|  | if (batch_inx == -1) { | 
|  | error("%s: Invalid batch host %s for %pJ; this should never happen", | 
|  | __func__, job_ptr->batch_host, job_ptr); | 
|  | batch_inx = 0; | 
|  | } | 
|  | cred_arg.job_mem_alloc = xmalloc(sizeof(uint64_t)); | 
|  | cred_arg.job_mem_alloc[0] = | 
|  | job_resrcs_ptr->memory_allocated[batch_inx]; | 
|  | cred_arg.job_mem_alloc_rep_count = xmalloc(sizeof(uint64_t)); | 
|  | cred_arg.job_mem_alloc_rep_count[0] = 1; | 
|  | cred_arg.job_mem_alloc_size = 1; | 
|  | } | 
|  | /*	cred_arg.step_gres_list      = NULL; */ | 
|  |  | 
|  | xassert(job_ptr->batch_host); | 
|  | cred_arg.step_hostlist       = job_ptr->batch_host; | 
|  | cred_arg.step_core_bitmap    = job_resrcs_ptr->core_bitmap; | 
|  |  | 
|  | launch_msg_ptr->cred = slurm_cred_create(&cred_arg, false, | 
|  | protocol_version); | 
|  | xfree(cred_arg.job_mem_alloc); | 
|  | xfree(cred_arg.job_mem_alloc_rep_count); | 
|  |  | 
|  | if (launch_msg_ptr->cred) | 
|  | return SLURM_SUCCESS; | 
|  | error("slurm_cred_create failure for batch job %u", | 
|  | cred_arg.step_id.job_id); | 
|  | return SLURM_ERROR; | 
|  | } | 
|  |  | 
|  | static int _foreach_depend_list_copy(void *x, void *arg) | 
|  | { | 
|  | depend_spec_t *dep_src = x; | 
|  | list_t **depend_list_dest = arg; | 
|  | depend_spec_t *dep_dest = xmalloc(sizeof(depend_spec_t)); | 
|  |  | 
|  | memcpy(dep_dest, dep_src, sizeof(depend_spec_t)); | 
|  | list_append(*depend_list_dest, dep_dest); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Copy a job's dependency list | 
|  | * IN depend_list_src - a job's depend_lst | 
|  | * RET copy of depend_list_src, must bee freed by caller | 
|  | */ | 
|  | extern list_t *depended_list_copy(list_t *depend_list_src) | 
|  | { | 
|  | list_t *depend_list_dest = NULL; | 
|  |  | 
|  | if (!depend_list_src) | 
|  | return depend_list_dest; | 
|  |  | 
|  | depend_list_dest = list_create(xfree_ptr); | 
|  | (void) list_for_each(depend_list_src, _foreach_depend_list_copy, | 
|  | &depend_list_dest); | 
|  | return depend_list_dest; | 
|  | } | 
|  |  | 
|  | static char *_depend_type2str(depend_spec_t *dep_ptr) | 
|  | { | 
|  | xassert(dep_ptr); | 
|  |  | 
|  | switch (dep_ptr->depend_type) { | 
|  | case SLURM_DEPEND_AFTER: | 
|  | return "after"; | 
|  | case SLURM_DEPEND_AFTER_ANY: | 
|  | return "afterany"; | 
|  | case SLURM_DEPEND_AFTER_NOT_OK: | 
|  | return "afternotok"; | 
|  | case SLURM_DEPEND_AFTER_OK: | 
|  | return "afterok"; | 
|  | case SLURM_DEPEND_AFTER_CORRESPOND: | 
|  | return "aftercorr"; | 
|  | case SLURM_DEPEND_EXPAND: | 
|  | return "expand"; | 
|  | case SLURM_DEPEND_BURST_BUFFER: | 
|  | return "afterburstbuffer"; | 
|  | case SLURM_DEPEND_SINGLETON: | 
|  | return "singleton"; | 
|  | default: | 
|  | return "unknown"; | 
|  | } | 
|  | } | 
|  |  | 
|  | static uint32_t _depend_state_str2state(char *state_str) | 
|  | { | 
|  | if (!xstrcasecmp(state_str, "fulfilled")) | 
|  | return DEPEND_FULFILLED; | 
|  | if (!xstrcasecmp(state_str, "failed")) | 
|  | return DEPEND_FAILED; | 
|  | /* Default to not fulfilled */ | 
|  | return DEPEND_NOT_FULFILLED; | 
|  | } | 
|  |  | 
|  | static char *_depend_state2str(depend_spec_t *dep_ptr) | 
|  | { | 
|  | xassert(dep_ptr); | 
|  |  | 
|  | switch(dep_ptr->depend_state) { | 
|  | case DEPEND_NOT_FULFILLED: | 
|  | return "unfulfilled"; | 
|  | case DEPEND_FULFILLED: | 
|  | return "fulfilled"; | 
|  | case DEPEND_FAILED: | 
|  | return "failed"; | 
|  | default: | 
|  | return "unknown"; | 
|  | } | 
|  | } | 
|  |  | 
|  | static int _foreach_depend_list2str(void *x, void *arg) | 
|  | { | 
|  | depend_spec_t *dep_ptr = x; | 
|  | depend_str_t *depend_str = arg; | 
|  | job_record_t *job_ptr = depend_str->job_ptr; | 
|  |  | 
|  | /* | 
|  | * Show non-fulfilled (including failed) dependencies, but don't | 
|  | * show fulfilled dependencies. | 
|  | */ | 
|  | if (dep_ptr->depend_state == DEPEND_FULFILLED) | 
|  | return 0; | 
|  | if (dep_ptr->depend_type == SLURM_DEPEND_SINGLETON) { | 
|  | xstrfmtcat(job_ptr->details->dependency, | 
|  | "%ssingleton(%s)", | 
|  | depend_str->sep, _depend_state2str(dep_ptr)); | 
|  | } else { | 
|  | char *dep_str = _depend_type2str(dep_ptr); | 
|  |  | 
|  | if (dep_ptr->array_task_id == INFINITE) | 
|  | xstrfmtcat(job_ptr->details->dependency, "%s%s:%u_*", | 
|  | depend_str->sep, dep_str, dep_ptr->job_id); | 
|  | else if (dep_ptr->array_task_id == NO_VAL) | 
|  | xstrfmtcat(job_ptr->details->dependency, "%s%s:%u", | 
|  | depend_str->sep, dep_str, dep_ptr->job_id); | 
|  | else | 
|  | xstrfmtcat(job_ptr->details->dependency, "%s%s:%u_%u", | 
|  | depend_str->sep, dep_str, dep_ptr->job_id, | 
|  | dep_ptr->array_task_id); | 
|  |  | 
|  | if (dep_ptr->depend_time) | 
|  | xstrfmtcat(job_ptr->details->dependency, | 
|  | "+%u", dep_ptr->depend_time / 60); | 
|  | xstrfmtcat(job_ptr->details->dependency, "(%s)", | 
|  | _depend_state2str(dep_ptr)); | 
|  | } | 
|  | if (depend_str->set_or_flag) | 
|  | dep_ptr->depend_flags |= SLURM_FLAGS_OR; | 
|  | if (dep_ptr->depend_flags & SLURM_FLAGS_OR) | 
|  | depend_str->sep = "?"; | 
|  | else | 
|  | depend_str->sep = ","; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static void _depend_list2str(job_record_t *job_ptr, bool set_or_flag) | 
|  | { | 
|  | depend_str_t depend_str = { | 
|  | .job_ptr = job_ptr, | 
|  | .sep = "", | 
|  | .set_or_flag = set_or_flag, | 
|  | }; | 
|  |  | 
|  | if (job_ptr->details == NULL) | 
|  | return; | 
|  |  | 
|  | xfree(job_ptr->details->dependency); | 
|  |  | 
|  | if (job_ptr->details->depend_list == NULL | 
|  | || list_count(job_ptr->details->depend_list) == 0) | 
|  | return; | 
|  |  | 
|  | (void) list_for_each(job_ptr->details->depend_list, | 
|  | _foreach_depend_list2str, | 
|  | &depend_str); | 
|  | } | 
|  |  | 
|  | /* Print a job's dependency information based upon job_ptr->depend_list */ | 
|  | extern void print_job_dependency(job_record_t *job_ptr, const char *func) | 
|  | { | 
|  | if ((job_ptr->details == NULL) || | 
|  | (job_ptr->details->depend_list == NULL)) { | 
|  | info("%s: %pJ has no dependency.", func, job_ptr); | 
|  | return; | 
|  | } | 
|  | _depend_list2str(job_ptr, false); | 
|  | info("%s: Dependency information for %pJ:\n  %s", | 
|  | func, job_ptr, job_ptr->details->dependency); | 
|  | } | 
|  |  | 
|  | static int _test_job_dependency_common( | 
|  | bool is_complete, bool is_completed, bool is_pending, | 
|  | bool *clear_dep, bool *failure, | 
|  | job_record_t *job_ptr, struct depend_spec *dep_ptr) | 
|  | { | 
|  | int rc = 0; | 
|  | job_record_t *djob_ptr = dep_ptr->job_ptr; | 
|  | time_t now = time(NULL); | 
|  |  | 
|  | xassert(clear_dep); | 
|  | xassert(failure); | 
|  |  | 
|  | if (dep_ptr->depend_type == SLURM_DEPEND_AFTER) { | 
|  | if (!is_pending) { | 
|  | if (!dep_ptr->depend_time || | 
|  | (djob_ptr->start_time && | 
|  | ((now - djob_ptr->start_time) >= | 
|  | dep_ptr->depend_time)) || | 
|  | fed_mgr_job_started_on_sib(djob_ptr)) { | 
|  | *clear_dep = true; | 
|  | } /* else still depends */ | 
|  | } /* else still depends */ | 
|  | rc = 1; | 
|  | } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_ANY) { | 
|  | if (is_completed) | 
|  | *clear_dep = true; | 
|  | /* else still depends */ | 
|  | rc = 1; | 
|  | } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_NOT_OK) { | 
|  | if (djob_ptr->job_state & JOB_SPECIAL_EXIT) | 
|  | *clear_dep = true; | 
|  | else if (!is_completed) { /* Still depends */ | 
|  | } else if (!is_complete) | 
|  | *clear_dep = true; | 
|  | else | 
|  | *failure = true; | 
|  | rc = 1; | 
|  | } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_OK) { | 
|  | if (!is_completed) { /* Still depends */ | 
|  | } else if (is_complete) | 
|  | *clear_dep = true; | 
|  | else | 
|  | *failure = true; | 
|  | rc = 1; | 
|  | } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_CORRESPOND) { | 
|  | job_record_t *dcjob_ptr = NULL; | 
|  | if ((job_ptr->array_task_id == NO_VAL) || | 
|  | (job_ptr->array_task_id == INFINITE)) | 
|  | dcjob_ptr = NULL; | 
|  | else | 
|  | dcjob_ptr = find_job_array_rec(dep_ptr->job_id, | 
|  | job_ptr->array_task_id); | 
|  |  | 
|  | if (dcjob_ptr) { | 
|  | if (!IS_JOB_COMPLETED(dcjob_ptr)) { /* Still depends */ | 
|  | } else if (IS_JOB_COMPLETE(dcjob_ptr)) | 
|  | *clear_dep = true; | 
|  | else | 
|  | *failure = true; | 
|  | } else { | 
|  | if (!is_completed) { /* Still depends */ | 
|  | } else if (is_complete) | 
|  | *clear_dep = true; | 
|  | else if (job_ptr->array_recs && | 
|  | (job_ptr->array_task_id == NO_VAL)) { | 
|  | /* Still depends */ | 
|  | } else | 
|  | *failure = true; | 
|  | } | 
|  | rc = 1; | 
|  | } else if (dep_ptr->depend_type == SLURM_DEPEND_BURST_BUFFER) { | 
|  | if (is_completed && | 
|  | (bb_g_job_test_stage_out(djob_ptr) == 1)) | 
|  | *clear_dep = true; | 
|  | /* else still depends */ | 
|  | rc = 1; | 
|  | } else if (dep_ptr->depend_type == SLURM_DEPEND_EXPAND) { | 
|  | time_t now = time(NULL); | 
|  | if (is_pending) { /* Still depends */ | 
|  | } else if (is_completed) | 
|  | *failure = true; | 
|  | else if ((djob_ptr->end_time != 0) && | 
|  | (djob_ptr->end_time > now)) { | 
|  | job_ptr->time_limit = djob_ptr->end_time - now; | 
|  | job_ptr->time_limit /= 60;  /* sec to min */ | 
|  | *clear_dep = true; | 
|  | } | 
|  | if (!*failure && job_ptr->details && djob_ptr->details) { | 
|  | job_ptr->details->share_res = | 
|  | djob_ptr->details->share_res; | 
|  | job_ptr->details->whole_node = | 
|  | djob_ptr->details->whole_node; | 
|  | } | 
|  | rc = 1; | 
|  | } | 
|  |  | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | static void _test_dependency_state(depend_spec_t *dep_ptr, | 
|  | test_job_dep_t *test_job_dep) | 
|  | { | 
|  | xassert(test_job_dep); | 
|  |  | 
|  | test_job_dep->or_flag = | 
|  | (dep_ptr->depend_flags & SLURM_FLAGS_OR) ? true : false; | 
|  |  | 
|  | if (test_job_dep->or_flag) { | 
|  | if (dep_ptr->depend_state == DEPEND_FULFILLED) | 
|  | test_job_dep->or_satisfied = true; | 
|  | else if (dep_ptr->depend_state == DEPEND_NOT_FULFILLED) | 
|  | test_job_dep->has_unfulfilled = true; | 
|  | } else { /* AND'd dependencies */ | 
|  | if (dep_ptr->depend_state == DEPEND_FAILED) | 
|  | test_job_dep->and_failed = true; | 
|  | else if (dep_ptr->depend_state == DEPEND_NOT_FULFILLED) | 
|  | test_job_dep->has_unfulfilled = true; | 
|  | } | 
|  | } | 
|  |  | 
|  | static int _foreach_test_job_dependency(void *x, void *arg) | 
|  | { | 
|  | depend_spec_t *dep_ptr = x; | 
|  | test_job_dep_t *test_job_dep = arg; | 
|  | job_record_t *job_ptr = test_job_dep->job_ptr; | 
|  | job_record_t *djob_ptr; | 
|  | bool clear_dep = false, failure = false; | 
|  | bool remote = (dep_ptr->depend_flags & SLURM_FLAGS_REMOTE) ? | 
|  | true : false; | 
|  | /* | 
|  | * If the job id is for a cluster that's not in the federation | 
|  | * (it's likely the cluster left the federation), then set | 
|  | * this dependency's state to failed. | 
|  | */ | 
|  | if (remote) { | 
|  | if (fed_mgr_is_origin_job(job_ptr) && | 
|  | (dep_ptr->depend_state == DEPEND_NOT_FULFILLED) && | 
|  | (dep_ptr->depend_type != SLURM_DEPEND_SINGLETON) && | 
|  | (!fed_mgr_is_job_id_in_fed(dep_ptr->job_id))) { | 
|  | log_flag(DEPENDENCY, "%s: %pJ dependency %s:%u failed due to job_id not in federation.", | 
|  | __func__, job_ptr, | 
|  | _depend_type2str(dep_ptr), | 
|  | dep_ptr->job_id); | 
|  | test_job_dep->changed = true; | 
|  | dep_ptr->depend_state = DEPEND_FAILED; | 
|  | } | 
|  | } | 
|  | if ((dep_ptr->depend_state != DEPEND_NOT_FULFILLED) || remote) { | 
|  | _test_dependency_state(dep_ptr, test_job_dep); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* Test local, unfulfilled dependency: */ | 
|  | test_job_dep->has_local_depend = true; | 
|  | dep_ptr->job_ptr = find_job_array_rec(dep_ptr->job_id, | 
|  | dep_ptr->array_task_id); | 
|  | djob_ptr = dep_ptr->job_ptr; | 
|  | if ((dep_ptr->depend_type == SLURM_DEPEND_SINGLETON) && | 
|  | job_ptr->name) { | 
|  | if (list_find_first(job_list, _find_singleton_job, | 
|  | job_ptr) || | 
|  | !fed_mgr_is_singleton_satisfied(job_ptr, | 
|  | dep_ptr, true)) { | 
|  | /* Still depends */ | 
|  | } else | 
|  | clear_dep = true; | 
|  | } else if (!djob_ptr || (djob_ptr->magic != JOB_MAGIC) || | 
|  | ((djob_ptr->job_id != dep_ptr->job_id) && | 
|  | (djob_ptr->array_job_id != dep_ptr->job_id))) { | 
|  | /* job is gone, dependency lifted */ | 
|  | clear_dep = true; | 
|  | } else { | 
|  | bool is_complete, is_completed, is_pending; | 
|  |  | 
|  | /* Special case, apply test to job array as a whole */ | 
|  | if (dep_ptr->array_task_id == INFINITE) { | 
|  | is_complete = test_job_array_complete( | 
|  | dep_ptr->job_id); | 
|  | is_completed = test_job_array_completed( | 
|  | dep_ptr->job_id); | 
|  | is_pending = test_job_array_pending( | 
|  | dep_ptr->job_id); | 
|  | } else { | 
|  | /* Normal job */ | 
|  | is_complete = IS_JOB_COMPLETE(djob_ptr); | 
|  | is_completed = IS_JOB_COMPLETED(djob_ptr); | 
|  | is_pending = IS_JOB_PENDING(djob_ptr) || | 
|  | IS_JOB_CONFIGURING(djob_ptr); | 
|  | } | 
|  |  | 
|  | if (!_test_job_dependency_common( | 
|  | is_complete, is_completed, is_pending, | 
|  | &clear_dep, &failure, | 
|  | job_ptr, dep_ptr)) | 
|  | failure = true; | 
|  | } | 
|  |  | 
|  | if (failure) { | 
|  | dep_ptr->depend_state = DEPEND_FAILED; | 
|  | test_job_dep->changed = true; | 
|  | log_flag(DEPENDENCY, "%s: %pJ dependency %s:%u failed.", | 
|  | __func__, job_ptr, _depend_type2str(dep_ptr), | 
|  | dep_ptr->job_id); | 
|  | } else if (clear_dep) { | 
|  | dep_ptr->depend_state = DEPEND_FULFILLED; | 
|  | test_job_dep->changed = true; | 
|  | log_flag(DEPENDENCY, "%s: %pJ dependency %s:%u fulfilled.", | 
|  | __func__, job_ptr, _depend_type2str(dep_ptr), | 
|  | dep_ptr->job_id); | 
|  | } | 
|  |  | 
|  | _test_dependency_state(dep_ptr, test_job_dep); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Determine if a job's dependencies are met | 
|  | * Inputs: job_ptr | 
|  | * Outputs: was_changed (optional) - | 
|  | *          If it exists, set it to true if at least 1 dependency changed | 
|  | *          state, otherwise false. | 
|  | * RET: NO_DEPEND = no dependencies | 
|  | *      LOCAL_DEPEND = local dependencies remain | 
|  | *      FAIL_DEPEND = failure (job completion code not per dependency), | 
|  | *                    delete the job | 
|  | *      REMOTE_DEPEND = only remote dependencies remain | 
|  | */ | 
|  | extern int test_job_dependency(job_record_t *job_ptr, bool *was_changed) | 
|  | { | 
|  | test_job_dep_t test_job_dep = { | 
|  | .job_ptr = job_ptr, | 
|  | }; | 
|  | int results = NO_DEPEND; | 
|  |  | 
|  | if ((job_ptr->details == NULL) || | 
|  | (job_ptr->details->depend_list == NULL) || | 
|  | (list_count(job_ptr->details->depend_list) == 0)) { | 
|  | job_ptr->bit_flags &= ~JOB_DEPENDENT; | 
|  | if (was_changed) | 
|  | *was_changed = false; | 
|  | return NO_DEPEND; | 
|  | } | 
|  |  | 
|  | (void) list_for_each(job_ptr->details->depend_list, | 
|  | _foreach_test_job_dependency, | 
|  | &test_job_dep); | 
|  |  | 
|  | if (test_job_dep.or_satisfied && | 
|  | (job_ptr->state_reason == WAIT_DEP_INVALID)) { | 
|  | job_ptr->state_reason = WAIT_NO_REASON; | 
|  | xfree(job_ptr->state_desc); | 
|  | last_job_update = time(NULL); | 
|  | } | 
|  |  | 
|  | if (test_job_dep.or_satisfied || | 
|  | (!test_job_dep.or_flag && | 
|  | !test_job_dep.and_failed && | 
|  | !test_job_dep.has_unfulfilled)) { | 
|  | /* Dependency fulfilled */ | 
|  | fed_mgr_remove_remote_dependencies(job_ptr); | 
|  | job_ptr->bit_flags &= ~JOB_DEPENDENT; | 
|  | /* | 
|  | * Don't flush the list if this job isn't on the origin - that | 
|  | * means that we were called from | 
|  | * fed_mgr_test_remote_dependencies() and need to send back the | 
|  | * dependency list to the origin. | 
|  | */ | 
|  | if (fed_mgr_is_origin_job(job_ptr)) | 
|  | list_flush(job_ptr->details->depend_list); | 
|  | _depend_list2str(job_ptr, false); | 
|  | results = NO_DEPEND; | 
|  | log_flag(DEPENDENCY, "%s: %pJ dependency fulfilled", | 
|  | __func__, job_ptr); | 
|  | } else { | 
|  | if (test_job_dep.changed) { | 
|  | _depend_list2str(job_ptr, false); | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_DEPENDENCY) | 
|  | print_job_dependency(job_ptr, __func__); | 
|  | } | 
|  | job_ptr->bit_flags |= JOB_DEPENDENT; | 
|  | acct_policy_remove_accrue_time(job_ptr, false); | 
|  | if (test_job_dep.and_failed || | 
|  | (test_job_dep.or_flag && !test_job_dep.has_unfulfilled)) | 
|  | /* Dependency failed */ | 
|  | results = FAIL_DEPEND; | 
|  | else | 
|  | /* Still dependent */ | 
|  | results = test_job_dep.has_local_depend ? LOCAL_DEPEND : | 
|  | REMOTE_DEPEND; | 
|  | } | 
|  |  | 
|  | if (was_changed) | 
|  | *was_changed = test_job_dep.changed; | 
|  | return results; | 
|  | } | 
|  |  | 
|  | /* Given a new job dependency specification, expand job array specifications | 
|  | * into a collection of task IDs that update_job_dependency can parse. | 
|  | * (e.g. "after:123_[4-5]" to "after:123_4:123_5") | 
|  | * Returns NULL if not valid job array specification. | 
|  | * Returned value must be xfreed. */ | 
|  | static char *_xlate_array_dep(char *new_depend) | 
|  | { | 
|  | char *new_array_dep = NULL, *array_tmp, *jobid_ptr = NULL, *sep; | 
|  | bitstr_t *array_bitmap; | 
|  | int i; | 
|  | uint32_t job_id; | 
|  | int32_t t, t_first, t_last; | 
|  |  | 
|  | if (strstr(new_depend, "_[") == NULL) | 
|  | return NULL;	/* No job array expressions */ | 
|  |  | 
|  | if (max_array_size == NO_VAL) { | 
|  | max_array_size = slurm_conf.max_array_sz; | 
|  | } | 
|  |  | 
|  | for (i = 0; new_depend[i]; i++) { | 
|  | xstrfmtcat(new_array_dep, "%c", new_depend[i]); | 
|  | if ((new_depend[i] >= '0') && (new_depend[i] <= '9')) { | 
|  | if (jobid_ptr == NULL) | 
|  | jobid_ptr = new_depend + i; | 
|  | } else if ((new_depend[i] == '_') && (new_depend[i+1] == '[') && | 
|  | (jobid_ptr != NULL)) { | 
|  | job_id = (uint32_t) atol(jobid_ptr); | 
|  | i += 2;	/* Skip over "_[" */ | 
|  | array_tmp = xstrdup(new_depend + i); | 
|  | sep = strchr(array_tmp, ']'); | 
|  | if (sep) | 
|  | sep[0] = '\0'; | 
|  | array_bitmap = bit_alloc(max_array_size); | 
|  | if ((sep == NULL) || | 
|  | (bit_unfmt(array_bitmap, array_tmp) != 0) || | 
|  | ((t_first = bit_ffs(array_bitmap)) == -1)) { | 
|  | /* Invalid format */ | 
|  | xfree(array_tmp); | 
|  | FREE_NULL_BITMAP(array_bitmap); | 
|  | xfree(new_array_dep); | 
|  | return NULL; | 
|  | } | 
|  | i += (sep - array_tmp);	/* Move to location of ']' */ | 
|  | xfree(array_tmp); | 
|  | t_last = bit_fls(array_bitmap); | 
|  | for (t = t_first; t <= t_last; t++) { | 
|  | if (!bit_test(array_bitmap, t)) | 
|  | continue; | 
|  | if (t == t_first) { | 
|  | xstrfmtcat(new_array_dep, "%d", t); | 
|  | } else { | 
|  | xstrfmtcat(new_array_dep, ":%u_%d", | 
|  | job_id, t); | 
|  | } | 
|  | } | 
|  | FREE_NULL_BITMAP(array_bitmap); | 
|  | jobid_ptr = NULL; | 
|  | } else { | 
|  | jobid_ptr = NULL; | 
|  | } | 
|  | } | 
|  |  | 
|  | return new_array_dep; | 
|  | } | 
|  |  | 
|  | /* Copy dependent job's TRES options into another job's options  */ | 
|  | static void _copy_tres_opts(job_record_t *job_ptr, job_record_t *dep_job_ptr) | 
|  | { | 
|  | xfree(job_ptr->cpus_per_tres); | 
|  | job_ptr->cpus_per_tres = xstrdup(dep_job_ptr->cpus_per_tres); | 
|  | xfree(job_ptr->tres_per_job); | 
|  | job_ptr->tres_per_job = xstrdup(dep_job_ptr->tres_per_job); | 
|  | xfree(job_ptr->tres_per_node); | 
|  | job_ptr->tres_per_node = xstrdup(dep_job_ptr->tres_per_node); | 
|  | xfree(job_ptr->tres_per_socket); | 
|  | job_ptr->tres_per_socket = xstrdup(dep_job_ptr->tres_per_socket); | 
|  | xfree(job_ptr->tres_per_task); | 
|  | job_ptr->tres_per_task = xstrdup(dep_job_ptr->tres_per_task); | 
|  | xfree(job_ptr->mem_per_tres); | 
|  | job_ptr->mem_per_tres = xstrdup(dep_job_ptr->mem_per_tres); | 
|  | } | 
|  |  | 
|  | static int _find_dependency(void *arg, void *key) | 
|  | { | 
|  | /* Does arg (dependency in the list) match key (new dependency)? */ | 
|  | depend_spec_t *dep_ptr = (depend_spec_t *)arg; | 
|  | depend_spec_t *new_dep = (depend_spec_t *)key; | 
|  | return (dep_ptr->job_id == new_dep->job_id) && | 
|  | (dep_ptr->array_task_id == new_dep->array_task_id) && | 
|  | (dep_ptr->depend_type == new_dep->depend_type); | 
|  | } | 
|  |  | 
|  | extern depend_spec_t *find_dependency(job_record_t *job_ptr, | 
|  | depend_spec_t *dep_ptr) | 
|  | { | 
|  | if (!job_ptr->details || !job_ptr->details->depend_list) | 
|  | return NULL; | 
|  | return list_find_first(job_ptr->details->depend_list, | 
|  | _find_dependency, dep_ptr); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Add a new dependency to the list, ensuring that the list is unique. | 
|  | * Dependencies are uniquely identified by a combination of job_id and | 
|  | * depend_type. | 
|  | */ | 
|  | static void _add_dependency_to_list(list_t *depend_list, | 
|  | depend_spec_t *dep_ptr) | 
|  | { | 
|  | if (!list_find_first(depend_list, _find_dependency, dep_ptr)) | 
|  | list_append(depend_list, dep_ptr); | 
|  | } | 
|  |  | 
|  | static int _parse_depend_state(char **str_ptr, uint32_t *depend_state) | 
|  | { | 
|  | char *sep_ptr; | 
|  |  | 
|  | if ((sep_ptr = strchr(*str_ptr, '('))) { | 
|  | /* Get the whole string before ")", convert to state */ | 
|  | char *paren = strchr(*str_ptr, ')'); | 
|  | if (!paren) | 
|  | return SLURM_ERROR; | 
|  | else | 
|  | *paren = '\0'; | 
|  | sep_ptr++; /* skip over "(" */ | 
|  | *depend_state = _depend_state_str2state(sep_ptr); | 
|  | /* Don't allow depend_fulfilled as a string. */ | 
|  | if (*depend_state != DEPEND_FAILED) | 
|  | *depend_state = DEPEND_NOT_FULFILLED; | 
|  | *str_ptr = paren + 1; /* skip over ")" */ | 
|  | } else | 
|  | *depend_state = DEPEND_NOT_FULFILLED; | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | static job_record_t *_find_dependent_job_ptr(uint32_t job_id, | 
|  | uint32_t *array_task_id) | 
|  | { | 
|  | job_record_t *dep_job_ptr; | 
|  |  | 
|  | if (*array_task_id == NO_VAL) { | 
|  | dep_job_ptr = find_job_record(job_id); | 
|  | if (!dep_job_ptr) | 
|  | dep_job_ptr = find_job_array_rec(job_id, INFINITE); | 
|  | if (dep_job_ptr && | 
|  | (dep_job_ptr->array_job_id == job_id) && | 
|  | ((dep_job_ptr->array_task_id != NO_VAL) || | 
|  | (dep_job_ptr->array_recs != NULL))) | 
|  | *array_task_id = INFINITE; | 
|  | } else | 
|  | dep_job_ptr = find_job_array_rec(job_id, *array_task_id); | 
|  |  | 
|  | return dep_job_ptr; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * job_ptr - job that is getting a new dependency | 
|  | * dep_job_ptr - pointer to the job that job_ptr wants to depend on | 
|  | *   - This can be NULL, for example if it's a remote dependency. That's okay. | 
|  | * job_id - job_id of the dependency string | 
|  | * array_task_id - array_task_id of the dependency string | 
|  | *   - Equals NO_VAL if the dependency isn't a job array. | 
|  | *   - Equals INFINITE if the dependency is the whole job array. | 
|  | *   - Otherwise this equals a specific task of the job array (0, 1, 2, etc.) | 
|  | * | 
|  | * RET true if job_ptr is the same job as the new dependency, false otherwise. | 
|  | * | 
|  | * Example: | 
|  | *   scontrol update jobid=123 dependency=afterok:456_5 | 
|  | * | 
|  | * job_ptr points to the job record for jobid=123. | 
|  | * dep_job_ptr points to the job record for 456_5. | 
|  | * job_id == 456. (This is probably different from dep_job_ptr->job_id.) | 
|  | * array_task_id == 5. | 
|  | */ | 
|  | static bool _depends_on_same_job(job_record_t *job_ptr, | 
|  | job_record_t *dep_job_ptr, | 
|  | uint32_t job_id, uint32_t array_task_id) | 
|  | { | 
|  | if (array_task_id == INFINITE) { | 
|  | /* job_ptr wants to set a dependency on a whole job array */ | 
|  | if ((job_ptr->array_task_id != NO_VAL) || | 
|  | (job_ptr->array_recs)) { | 
|  | /* | 
|  | * job_ptr is a specific task in a job array, or is | 
|  | * the meta job of a job array. | 
|  | * Test if job_ptr belongs to the array indicated by | 
|  | * the dependency string's "job_id" | 
|  | */ | 
|  | return (job_ptr->array_job_id == job_id); | 
|  | } else { | 
|  | /* job_ptr is a normal job */ | 
|  | return (job_ptr == dep_job_ptr); | 
|  | } | 
|  | } else { | 
|  | /* Doesn't depend on a whole job array; test normally */ | 
|  | return (job_ptr == dep_job_ptr); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * The new dependency format is: | 
|  | * | 
|  | * <type:job_id[:job_id][,type:job_id[:job_id]]> or | 
|  | * <type:job_id[:job_id][?type:job_id[:job_id]]> | 
|  | * | 
|  | * This function parses the all job id's within a single dependency type. | 
|  | * One char past the end of valid job id's is returned in (*sep_ptr2). | 
|  | * Set (*rc) to ESLURM_DEPENDENCY for invalid job id's. | 
|  | */ | 
|  | static void _parse_dependency_jobid_new(job_record_t *job_ptr, | 
|  | list_t *new_depend_list, char *sep_ptr, | 
|  | char **sep_ptr2, char *tok, | 
|  | uint16_t depend_type, int select_hetero, | 
|  | int *rc) | 
|  | { | 
|  | depend_spec_t *dep_ptr; | 
|  | job_record_t *dep_job_ptr = NULL; | 
|  | int expand_cnt = 0; | 
|  | uint32_t job_id, array_task_id, depend_state; | 
|  | char *tmp = NULL; | 
|  | int depend_time = 0; | 
|  |  | 
|  | while (!(*rc)) { | 
|  | job_id = strtol(sep_ptr, &tmp, 10); | 
|  | if ((tmp != NULL) && (tmp[0] == '_')) { | 
|  | if (tmp[1] == '*') { | 
|  | array_task_id = INFINITE; | 
|  | tmp += 2;	/* Past "_*" */ | 
|  | } else { | 
|  | array_task_id = strtol(tmp+1, | 
|  | &tmp, 10); | 
|  | } | 
|  | } else | 
|  | array_task_id = NO_VAL; | 
|  | if ((tmp == NULL) || (job_id == 0) || | 
|  | ((tmp[0] != '\0') && (tmp[0] != ',') && | 
|  | (tmp[0] != '?')  && (tmp[0] != ':') && | 
|  | (tmp[0] != '+') && (tmp[0] != '('))) { | 
|  | *rc = ESLURM_DEPENDENCY; | 
|  | break; | 
|  | } | 
|  |  | 
|  | dep_job_ptr = _find_dependent_job_ptr(job_id, &array_task_id); | 
|  |  | 
|  | if (!dep_job_ptr && fed_mgr_is_origin_job_id(job_id) && | 
|  | ((depend_type == SLURM_DEPEND_AFTER_OK) || | 
|  | (depend_type == SLURM_DEPEND_AFTER_NOT_OK))) { | 
|  | /* | 
|  | * Reject the job since we won't be able to check if | 
|  | * job dependency was fulfilled or not. | 
|  | */ | 
|  | *rc = ESLURM_DEPENDENCY; | 
|  | break; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * _find_dependent_job_ptr() may modify array_task_id, so check | 
|  | * if the job is the same after that. | 
|  | */ | 
|  | if (_depends_on_same_job(job_ptr, dep_job_ptr, job_id, | 
|  | array_task_id)) { | 
|  | *rc = ESLURM_DEPENDENCY; | 
|  | break; | 
|  | } | 
|  | if ((depend_type == SLURM_DEPEND_EXPAND) && | 
|  | ((expand_cnt++ > 0) || (dep_job_ptr == NULL) || | 
|  | (!IS_JOB_RUNNING(dep_job_ptr))		|| | 
|  | (dep_job_ptr->qos_id != job_ptr->qos_id)	|| | 
|  | (dep_job_ptr->part_ptr == NULL)		|| | 
|  | (job_ptr->part_ptr     == NULL)		|| | 
|  | (dep_job_ptr->part_ptr != job_ptr->part_ptr))) { | 
|  | /* | 
|  | * Expand only jobs in the same QOS and partition | 
|  | */ | 
|  | *rc = ESLURM_DEPENDENCY; | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (tmp[0] == '+') { | 
|  | sep_ptr = &tmp[1]; /* skip over "+" */ | 
|  | depend_time = strtol(sep_ptr, &tmp, 10); | 
|  |  | 
|  | if (depend_time <= 0) { | 
|  | *rc = ESLURM_DEPENDENCY; | 
|  | break; | 
|  | } | 
|  | depend_time *= 60; | 
|  | } | 
|  |  | 
|  | if (_parse_depend_state(&tmp, &depend_state)) { | 
|  | *rc = ESLURM_DEPENDENCY; | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (depend_type == SLURM_DEPEND_EXPAND) { | 
|  | assoc_mgr_lock_t locks = { .tres = READ_LOCK }; | 
|  | job_details_t *detail_ptr = job_ptr->details; | 
|  | multi_core_data_t *mc_ptr = detail_ptr->mc_ptr; | 
|  | gres_job_state_validate_t gres_js_val = { | 
|  | .cpus_per_task = | 
|  | &detail_ptr->orig_cpus_per_task, | 
|  | .max_nodes = &detail_ptr->max_nodes, | 
|  | .min_cpus = &detail_ptr->min_cpus, | 
|  | .min_nodes = &detail_ptr->min_nodes, | 
|  | .ntasks_per_node = &detail_ptr->ntasks_per_node, | 
|  | .ntasks_per_socket = &mc_ptr->ntasks_per_socket, | 
|  | .ntasks_per_tres = &detail_ptr->ntasks_per_tres, | 
|  | .num_tasks = &detail_ptr->num_tasks, | 
|  | .sockets_per_node = &mc_ptr->sockets_per_node, | 
|  |  | 
|  | .gres_list = &job_ptr->gres_list_req, | 
|  | }; | 
|  |  | 
|  | job_ptr->details->expanding_jobid = job_id; | 
|  | if (select_hetero == 0) { | 
|  | /* | 
|  | * GRES per node of this job must match | 
|  | * the job being expanded. Other options | 
|  | * are ignored. | 
|  | */ | 
|  | _copy_tres_opts(job_ptr, dep_job_ptr); | 
|  | } | 
|  |  | 
|  | gres_js_val.cpus_per_tres = job_ptr->cpus_per_tres; | 
|  | gres_js_val.mem_per_tres = job_ptr->mem_per_tres; | 
|  | gres_js_val.tres_freq = job_ptr->tres_freq; | 
|  | gres_js_val.tres_per_job = job_ptr->tres_per_job; | 
|  | gres_js_val.tres_per_node = job_ptr->tres_per_node; | 
|  | gres_js_val.tres_per_socket = job_ptr->tres_per_socket; | 
|  | gres_js_val.tres_per_task = job_ptr->tres_per_task; | 
|  |  | 
|  | FREE_NULL_LIST(job_ptr->gres_list_req); | 
|  | (void) gres_job_state_validate(&gres_js_val); | 
|  | assoc_mgr_lock(&locks); | 
|  | gres_stepmgr_set_job_tres_cnt( | 
|  | job_ptr->gres_list_req, | 
|  | job_ptr->details->min_nodes, | 
|  | job_ptr->tres_req_cnt, | 
|  | true); | 
|  | xfree(job_ptr->tres_req_str); | 
|  | job_ptr->tres_req_str = | 
|  | assoc_mgr_make_tres_str_from_array( | 
|  | job_ptr->tres_req_cnt, | 
|  | TRES_STR_FLAG_SIMPLE, true); | 
|  | assoc_mgr_unlock(&locks); | 
|  | } | 
|  |  | 
|  | dep_ptr = xmalloc(sizeof(depend_spec_t)); | 
|  | dep_ptr->array_task_id = array_task_id; | 
|  | dep_ptr->depend_type = depend_type; | 
|  | if (job_ptr->fed_details && !fed_mgr_is_origin_job_id(job_id)) { | 
|  | if (depend_type == SLURM_DEPEND_EXPAND) { | 
|  | error("%s: Job expansion not permitted for remote jobs", | 
|  | __func__); | 
|  | *rc = ESLURM_DEPENDENCY; | 
|  | xfree(dep_ptr); | 
|  | break; | 
|  | } | 
|  | /* The dependency is on a remote cluster */ | 
|  | dep_ptr->depend_flags |= SLURM_FLAGS_REMOTE; | 
|  | dep_job_ptr = NULL; | 
|  | } | 
|  | if (dep_job_ptr) {	/* job still active */ | 
|  | if (array_task_id == NO_VAL) | 
|  | dep_ptr->job_id = dep_job_ptr->job_id; | 
|  | else | 
|  | dep_ptr->job_id = dep_job_ptr->array_job_id; | 
|  | } else | 
|  | dep_ptr->job_id = job_id; | 
|  | dep_ptr->job_ptr = dep_job_ptr; | 
|  | dep_ptr->depend_time = depend_time; | 
|  | dep_ptr->depend_state = depend_state; | 
|  | _add_dependency_to_list(new_depend_list, dep_ptr); | 
|  | if (tmp[0] != ':') | 
|  | break; | 
|  | sep_ptr = tmp + 1;	/* skip over ":" */ | 
|  |  | 
|  | } | 
|  | *sep_ptr2 = tmp; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * The old dependency format is a comma-separated list of job id's. | 
|  | * Parse a single jobid. | 
|  | * One char past the end of a valid job id will be returned in (*sep_ptr). | 
|  | * For an invalid job id, (*rc) will be set to ESLURM_DEPENDENCY. | 
|  | */ | 
|  | static void _parse_dependency_jobid_old(job_record_t *job_ptr, | 
|  | list_t *new_depend_list, char **sep_ptr, | 
|  | char *tok, int *rc) | 
|  | { | 
|  | depend_spec_t *dep_ptr; | 
|  | job_record_t *dep_job_ptr = NULL; | 
|  | uint32_t job_id, array_task_id; | 
|  | char *tmp = NULL; | 
|  |  | 
|  | job_id = strtol(tok, &tmp, 10); | 
|  | if ((tmp != NULL) && (tmp[0] == '_')) { | 
|  | if (tmp[1] == '*') { | 
|  | array_task_id = INFINITE; | 
|  | tmp += 2;	/* Past "_*" */ | 
|  | } else { | 
|  | array_task_id = strtol(tmp+1, &tmp, 10); | 
|  | } | 
|  | } else { | 
|  | array_task_id = NO_VAL; | 
|  | } | 
|  | *sep_ptr = tmp; | 
|  | if ((tmp == NULL) || (job_id == 0) || | 
|  | ((tmp[0] != '\0') && (tmp[0] != ','))) { | 
|  | *rc = ESLURM_DEPENDENCY; | 
|  | return; | 
|  | } | 
|  | /* | 
|  | * _find_dependent_job_ptr() may modify array_task_id, so check | 
|  | * if the job is the same after that. | 
|  | */ | 
|  | dep_job_ptr = _find_dependent_job_ptr(job_id, &array_task_id); | 
|  | if (_depends_on_same_job(job_ptr, dep_job_ptr, job_id, array_task_id)) { | 
|  | *rc = ESLURM_DEPENDENCY; | 
|  | return; | 
|  | } | 
|  |  | 
|  | dep_ptr = xmalloc(sizeof(depend_spec_t)); | 
|  | dep_ptr->array_task_id = array_task_id; | 
|  | dep_ptr->depend_type = SLURM_DEPEND_AFTER_ANY; | 
|  | if (job_ptr->fed_details && | 
|  | !fed_mgr_is_origin_job_id(job_id)) { | 
|  | /* The dependency is on a remote cluster */ | 
|  | dep_ptr->depend_flags |= SLURM_FLAGS_REMOTE; | 
|  | dep_job_ptr = NULL; | 
|  | } | 
|  | if (dep_job_ptr) { | 
|  | if (array_task_id == NO_VAL) { | 
|  | dep_ptr->job_id = dep_job_ptr->job_id; | 
|  | } else { | 
|  | dep_ptr->job_id = dep_job_ptr->array_job_id; | 
|  | } | 
|  | } else | 
|  | dep_ptr->job_id = job_id; | 
|  | dep_ptr->job_ptr = dep_job_ptr; /* Can be NULL */ | 
|  | _add_dependency_to_list(new_depend_list, dep_ptr); | 
|  | } | 
|  |  | 
|  | static int _foreach_update_job_depenency_list(void *x, void *arg) | 
|  | { | 
|  | depend_spec_t *dep_ptr = x, *job_depend; | 
|  | test_job_dep_t *test_job_dep = arg; | 
|  | job_record_t *job_ptr = test_job_dep->job_ptr; | 
|  |  | 
|  | /* | 
|  | * If the dependency is marked as remote, then it wasn't updated | 
|  | * by the sibling cluster. Skip it. | 
|  | */ | 
|  | if (dep_ptr->depend_flags & SLURM_FLAGS_REMOTE) | 
|  | return 0; | 
|  |  | 
|  | /* | 
|  | * Find the dependency in job_ptr that matches this one. | 
|  | * Then update job_ptr's dependency state (not fulfilled, | 
|  | * fulfilled, or failed) to match this one. | 
|  | */ | 
|  | job_depend = list_find_first(job_ptr->details->depend_list, | 
|  | _find_dependency, | 
|  | dep_ptr); | 
|  | if (!job_depend) { | 
|  | /* | 
|  | * This can happen if the job's dependency is updated | 
|  | * and the update doesn't get to the sibling before | 
|  | * the sibling sends back an update to the origin (us). | 
|  | */ | 
|  | log_flag(DEPENDENCY, "%s: Cannot find dependency %s:%u for %pJ, it may have been cleared before we got here.", | 
|  | __func__, _depend_type2str(dep_ptr), | 
|  | dep_ptr->job_id, job_ptr); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * If the dependency is already fulfilled, don't update it. | 
|  | * Otherwise update the dependency state. | 
|  | */ | 
|  | if ((job_depend->depend_state == DEPEND_FULFILLED) || | 
|  | (job_depend->depend_state == dep_ptr->depend_state)) | 
|  | return 0; | 
|  | if (job_depend->depend_type == SLURM_DEPEND_SINGLETON) { | 
|  | /* | 
|  | * We need to update the singleton dependency with | 
|  | * the cluster bit, but test_job_dependency() will test | 
|  | * if it is fulfilled, so don't change the depend_state | 
|  | * here. | 
|  | */ | 
|  | job_depend->singleton_bits |= dep_ptr->singleton_bits; | 
|  | if (!fed_mgr_is_singleton_satisfied(job_ptr, job_depend, | 
|  | false)) | 
|  | return 0; | 
|  | } | 
|  | job_depend->depend_state = dep_ptr->depend_state; | 
|  | test_job_dep->changed = true; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | extern bool update_job_dependency_list(job_record_t *job_ptr, | 
|  | list_t *new_depend_list) | 
|  | { | 
|  | test_job_dep_t test_job_dep = { | 
|  | .job_ptr = job_ptr, | 
|  | }; | 
|  |  | 
|  | xassert(job_ptr); | 
|  | xassert(job_ptr->details); | 
|  | xassert(job_ptr->details->depend_list); | 
|  |  | 
|  | (void) list_for_each(new_depend_list, | 
|  | _foreach_update_job_depenency_list, | 
|  | &test_job_dep); | 
|  |  | 
|  | return test_job_dep.changed; | 
|  | } | 
|  |  | 
|  | static int _foreach_handle_job_dependency_updates(void *x, void *arg) | 
|  | { | 
|  | depend_spec_t *dep_ptr = x; | 
|  | test_job_dep_t *test_job_dep = arg; | 
|  |  | 
|  | _test_dependency_state(dep_ptr, test_job_dep); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | extern int handle_job_dependency_updates(void *object, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = (job_record_t *) object; | 
|  | time_t now = time(NULL); | 
|  | test_job_dep_t test_job_dep = { | 
|  | .job_ptr = job_ptr, | 
|  | }; | 
|  | xassert(job_ptr->details); | 
|  | xassert(job_ptr->details->depend_list); | 
|  |  | 
|  | /* | 
|  | * Check the depend_state of each dependency. | 
|  | * All dependencies are OR'd or AND'd - we don't allow a mix. | 
|  | * OR'd dependencies: | 
|  | *   - If one dependency succeeded, the whole thing passes. | 
|  | *   - If there is at least one unfulfilled dependency, | 
|  | *     the job is still dependent. | 
|  | *   - All dependencies failed == dependency never satisfied. | 
|  | * AND'd dependencies: | 
|  | *   - One failure == dependency never satisfied | 
|  | *   - One+ not fulfilled == still dependent | 
|  | *   - All succeeded == dependency fulfilled | 
|  | */ | 
|  | (void) list_for_each(job_ptr->details->depend_list, | 
|  | _foreach_handle_job_dependency_updates, | 
|  | &test_job_dep); | 
|  |  | 
|  | if (test_job_dep.or_satisfied || | 
|  | (!test_job_dep.or_flag && | 
|  | !test_job_dep.and_failed && | 
|  | !test_job_dep.has_unfulfilled)) { | 
|  | /* Dependency fulfilled */ | 
|  | fed_mgr_remove_remote_dependencies(job_ptr); | 
|  | job_ptr->bit_flags &= ~JOB_DEPENDENT; | 
|  | list_flush(job_ptr->details->depend_list); | 
|  | if ((job_ptr->state_reason == WAIT_DEP_INVALID) || | 
|  | (job_ptr->state_reason == WAIT_DEPENDENCY)) { | 
|  | job_ptr->state_reason = WAIT_NO_REASON; | 
|  | xfree(job_ptr->state_desc); | 
|  | last_job_update = now; | 
|  | } | 
|  | _depend_list2str(job_ptr, false); | 
|  | fed_mgr_job_requeue(job_ptr); | 
|  | } else { | 
|  | _depend_list2str(job_ptr, false); | 
|  | job_ptr->bit_flags |= JOB_DEPENDENT; | 
|  | acct_policy_remove_accrue_time(job_ptr, false); | 
|  | if (test_job_dep.and_failed || | 
|  | (test_job_dep.or_flag && !test_job_dep.has_unfulfilled)) { | 
|  | /* Dependency failed */ | 
|  | handle_invalid_dependency(job_ptr); | 
|  | } else { | 
|  | /* Still dependent */ | 
|  | job_ptr->state_reason = WAIT_DEPENDENCY; | 
|  | xfree(job_ptr->state_desc); | 
|  | last_job_update = now; | 
|  | } | 
|  | } | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_DEPENDENCY) | 
|  | print_job_dependency(job_ptr, __func__); | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Parse a job dependency string and use it to establish a "depend_spec" | 
|  | * list of dependencies. We accept both old format (a single job ID) and | 
|  | * new format (e.g. "afterok:123:124,after:128"). | 
|  | * IN job_ptr - job record to have dependency and depend_list updated | 
|  | * IN new_depend - new dependency description | 
|  | * RET returns an error code from slurm_errno.h | 
|  | */ | 
|  | extern int update_job_dependency(job_record_t *job_ptr, char *new_depend) | 
|  | { | 
|  | static int select_hetero = -1; | 
|  | int rc = SLURM_SUCCESS; | 
|  | uint16_t depend_type = 0; | 
|  | char *tok, *new_array_dep, *sep_ptr, *sep_ptr2 = NULL; | 
|  | list_t *new_depend_list = NULL; | 
|  | depend_spec_t *dep_ptr; | 
|  | bool or_flag = false; | 
|  |  | 
|  | if (job_ptr->details == NULL) | 
|  | return EINVAL; | 
|  |  | 
|  | if (select_hetero == -1) { | 
|  | /* | 
|  | * Determine if the select plugin supports heterogeneous | 
|  | * GRES allocations (count differ by node): 1=yes, 0=no | 
|  | */ | 
|  | if (xstrstr(slurm_conf.select_type, "cons_tres")) | 
|  | select_hetero = 1; | 
|  | else | 
|  | select_hetero = 0; | 
|  | } | 
|  |  | 
|  | /* Clear dependencies on NULL, "0", or empty dependency input */ | 
|  | job_ptr->details->expanding_jobid = 0; | 
|  | if ((new_depend == NULL) || (new_depend[0] == '\0') || | 
|  | ((new_depend[0] == '0') && (new_depend[1] == '\0'))) { | 
|  | xfree(job_ptr->details->dependency); | 
|  | FREE_NULL_LIST(job_ptr->details->depend_list); | 
|  | return rc; | 
|  |  | 
|  | } | 
|  |  | 
|  | new_depend_list = list_create(xfree_ptr); | 
|  | if ((new_array_dep = _xlate_array_dep(new_depend))) | 
|  | tok = new_array_dep; | 
|  | else | 
|  | tok = new_depend; | 
|  | /* validate new dependency string */ | 
|  | while (rc == SLURM_SUCCESS) { | 
|  | /* test singleton dependency flag */ | 
|  | if (xstrncasecmp(tok, "singleton", 9) == 0) { | 
|  | uint32_t state; | 
|  | tok += 9; /* skip past "singleton" */ | 
|  | depend_type = SLURM_DEPEND_SINGLETON; | 
|  | if (_parse_depend_state(&tok, &state)) { | 
|  | rc = ESLURM_DEPENDENCY; | 
|  | break; | 
|  | } | 
|  | if (disable_remote_singleton && | 
|  | !fed_mgr_is_origin_job(job_ptr)) { | 
|  | /* Singleton disabled for non-origin cluster */ | 
|  | } else { | 
|  | dep_ptr = xmalloc(sizeof(depend_spec_t)); | 
|  | dep_ptr->depend_state = state; | 
|  | dep_ptr->depend_type = depend_type; | 
|  | /* dep_ptr->job_id = 0;	set by xmalloc */ | 
|  | /* dep_ptr->job_ptr = NULL; set by xmalloc */ | 
|  | /* dep_ptr->singleton_bits = 0;set by xmalloc */ | 
|  | _add_dependency_to_list(new_depend_list, | 
|  | dep_ptr); | 
|  | } | 
|  | if (tok[0] == ',') { | 
|  | tok++; | 
|  | continue; | 
|  | } else if (tok[0] == '?') { | 
|  | tok++; | 
|  | or_flag = true; | 
|  | continue; | 
|  | } | 
|  | if (tok[0] != '\0') | 
|  | rc = ESLURM_DEPENDENCY; | 
|  | break; | 
|  | } | 
|  |  | 
|  | /* Test for old format, just a job ID */ | 
|  | sep_ptr = strchr(tok, ':'); | 
|  | if ((sep_ptr == NULL) && (tok[0] >= '0') && (tok[0] <= '9')) { | 
|  | _parse_dependency_jobid_old(job_ptr, new_depend_list, | 
|  | &sep_ptr, tok, &rc); | 
|  | if (rc) | 
|  | break; | 
|  | if (sep_ptr && (sep_ptr[0] == ',')) { | 
|  | tok = sep_ptr + 1; | 
|  | continue; | 
|  | } else { | 
|  | break; | 
|  | } | 
|  | } else if (sep_ptr == NULL) { | 
|  | rc = ESLURM_DEPENDENCY; | 
|  | break; | 
|  | } | 
|  |  | 
|  | /* New format, <test>:job_ID */ | 
|  | if (!xstrncasecmp(tok, "afternotok:", 11)) | 
|  | depend_type = SLURM_DEPEND_AFTER_NOT_OK; | 
|  | else if (!xstrncasecmp(tok, "aftercorr:", 10)) | 
|  | depend_type = SLURM_DEPEND_AFTER_CORRESPOND; | 
|  | else if (!xstrncasecmp(tok, "afterany:", 9)) | 
|  | depend_type = SLURM_DEPEND_AFTER_ANY; | 
|  | else if (!xstrncasecmp(tok, "afterok:", 8)) | 
|  | depend_type = SLURM_DEPEND_AFTER_OK; | 
|  | else if (!xstrncasecmp(tok, "afterburstbuffer:", 11)) | 
|  | depend_type = SLURM_DEPEND_BURST_BUFFER; | 
|  | else if (!xstrncasecmp(tok, "after:", 6)) | 
|  | depend_type = SLURM_DEPEND_AFTER; | 
|  | else if (!xstrncasecmp(tok, "expand:", 7)) { | 
|  | if (!permit_job_expansion()) { | 
|  | rc = ESLURM_NOT_SUPPORTED; | 
|  | break; | 
|  | } | 
|  | depend_type = SLURM_DEPEND_EXPAND; | 
|  | } else { | 
|  | rc = ESLURM_DEPENDENCY; | 
|  | break; | 
|  | } | 
|  | sep_ptr++;	/* skip over ":" */ | 
|  | _parse_dependency_jobid_new(job_ptr, new_depend_list, sep_ptr, | 
|  | &sep_ptr2, tok, depend_type, | 
|  | select_hetero, &rc); | 
|  |  | 
|  | if (sep_ptr2 && (sep_ptr2[0] == ',')) { | 
|  | tok = sep_ptr2 + 1; | 
|  | } else if (sep_ptr2 && (sep_ptr2[0] == '?')) { | 
|  | tok = sep_ptr2 + 1; | 
|  | or_flag = true; | 
|  | } else { | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (rc == SLURM_SUCCESS) { | 
|  | /* test for circular dependencies (e.g. A -> B -> A) */ | 
|  | (void) _scan_depend(NULL, job_ptr); | 
|  | if (_scan_depend(new_depend_list, job_ptr)) | 
|  | rc = ESLURM_CIRCULAR_DEPENDENCY; | 
|  | } | 
|  |  | 
|  | if (rc == SLURM_SUCCESS) { | 
|  | FREE_NULL_LIST(job_ptr->details->depend_list); | 
|  | job_ptr->details->depend_list = new_depend_list; | 
|  | _depend_list2str(job_ptr, or_flag); | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_DEPENDENCY) | 
|  | print_job_dependency(job_ptr, __func__); | 
|  | } else { | 
|  | FREE_NULL_LIST(new_depend_list); | 
|  | } | 
|  | xfree(new_array_dep); | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | static int _foreach_scan_depend(void *x, void *arg) | 
|  | { | 
|  | depend_spec_t *dep_ptr = x; | 
|  | test_job_dep_t *test_job_dep = arg; | 
|  | job_record_t *job_ptr = test_job_dep->job_ptr; | 
|  |  | 
|  | if (dep_ptr->job_id == 0)	/* Singleton */ | 
|  | return 0; | 
|  | /* | 
|  | * We can't test for circular dependencies if the job_ptr | 
|  | * wasn't found - the job may not be on this cluster, or the | 
|  | * job was already purged when the dependency submitted, | 
|  | * or the job just didn't exist. | 
|  | */ | 
|  | if (!dep_ptr->job_ptr) | 
|  | return 0; | 
|  | if ((test_job_dep->changed = _depends_on_same_job( | 
|  | job_ptr, dep_ptr->job_ptr, | 
|  | dep_ptr->job_id, | 
|  | dep_ptr->array_task_id))) | 
|  | return -1; | 
|  | else if (dep_ptr->job_ptr->magic != JOB_MAGIC) | 
|  | return 0;	/* purged job, ptr not yet cleared */ | 
|  | else if (!IS_JOB_FINISHED(dep_ptr->job_ptr) && | 
|  | dep_ptr->job_ptr->details && | 
|  | dep_ptr->job_ptr->details->depend_list) { | 
|  | test_job_dep->changed = _scan_depend( | 
|  | dep_ptr->job_ptr->details->depend_list, | 
|  | job_ptr); | 
|  | if (test_job_dep->changed) { | 
|  | info("circular dependency: %pJ is dependent upon %pJ", | 
|  | dep_ptr->job_ptr, job_ptr); | 
|  | return -1; | 
|  | } | 
|  | } | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* Return true if the job job_ptr is found in dependency_list. | 
|  | * Pass NULL dependency list to clear the counter. | 
|  | * Execute recursively for each dependent job */ | 
|  | static bool _scan_depend(list_t *dependency_list, job_record_t *job_ptr) | 
|  | { | 
|  | static int job_counter = 0; | 
|  | test_job_dep_t test_job_dep = { | 
|  | .job_ptr = job_ptr, | 
|  | }; | 
|  |  | 
|  | if (dependency_list == NULL) { | 
|  | job_counter = 0; | 
|  | return false; | 
|  | } else if (job_counter++ >= max_depend_depth) { | 
|  | return false; | 
|  | } | 
|  |  | 
|  | xassert(job_ptr); | 
|  |  | 
|  | (void) list_for_each(dependency_list, | 
|  | _foreach_scan_depend, | 
|  | &test_job_dep); | 
|  |  | 
|  | return test_job_dep.changed; | 
|  | } | 
|  |  | 
|  | static int _foreach_delayed_job_start_time(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_q_ptr = x; | 
|  | delay_start_t *delay_start = arg; | 
|  | job_record_t *job_ptr = delay_start->job_ptr; | 
|  | uint32_t job_size_cpus, job_size_nodes, job_time; | 
|  |  | 
|  | if (!IS_JOB_PENDING(job_q_ptr) || !job_q_ptr->details || | 
|  | (job_q_ptr->part_ptr != job_ptr->part_ptr) || | 
|  | (job_q_ptr->priority < job_ptr->priority) || | 
|  | (job_q_ptr->job_id == job_ptr->job_id) || | 
|  | (IS_JOB_REVOKED(job_q_ptr))) | 
|  | return 0; | 
|  |  | 
|  | if (job_q_ptr->details->min_nodes == NO_VAL) | 
|  | job_size_nodes = 1; | 
|  | else | 
|  | job_size_nodes = job_q_ptr->details->min_nodes; | 
|  | if (job_q_ptr->details->min_cpus == NO_VAL) | 
|  | job_size_cpus = 1; | 
|  | else | 
|  | job_size_cpus = job_q_ptr->details->min_cpus; | 
|  | job_size_cpus = MAX(job_size_cpus, | 
|  | (job_size_nodes * delay_start->part_cpus_per_node)); | 
|  | if (job_q_ptr->time_limit == NO_VAL) | 
|  | job_time = job_q_ptr->part_ptr->max_time; | 
|  | else | 
|  | job_time = job_q_ptr->time_limit; | 
|  | delay_start->cume_space_time += job_size_cpus * job_time; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* If there are higher priority queued jobs in this job's partition, then | 
|  | * delay the job's expected initiation time as needed to run those jobs. | 
|  | * NOTE: This is only a rough estimate of the job's start time as it ignores | 
|  | * job dependencies, feature requirements, specific node requirements, etc. */ | 
|  | static void _delayed_job_start_time(job_record_t *job_ptr) | 
|  | { | 
|  | uint32_t part_node_cnt, part_cpu_cnt; | 
|  | delay_start_t delay_start = { | 
|  | .job_ptr = job_ptr, | 
|  | .part_cpus_per_node = 1, | 
|  | }; | 
|  |  | 
|  | if (job_ptr->part_ptr == NULL) | 
|  | return; | 
|  | part_node_cnt = job_ptr->part_ptr->total_nodes; | 
|  | part_cpu_cnt  = job_ptr->part_ptr->total_cpus; | 
|  | if (part_cpu_cnt > part_node_cnt) | 
|  | delay_start.part_cpus_per_node = part_cpu_cnt / part_node_cnt; | 
|  |  | 
|  | (void) list_for_each(job_list, | 
|  | _foreach_delayed_job_start_time, | 
|  | &delay_start); | 
|  | delay_start.cume_space_time /= part_cpu_cnt;/* Factor out size */ | 
|  | delay_start.cume_space_time *= 60;		/* Minutes to seconds */ | 
|  | debug2("Increasing estimated start of %pJ by %"PRIu64" secs", | 
|  | job_ptr, delay_start.cume_space_time); | 
|  | job_ptr->start_time += delay_start.cume_space_time; | 
|  | } | 
|  |  | 
|  | static int _foreach_add_to_preemptee_job_id(void *x, void *arg) | 
|  | { | 
|  | job_record_t *job_ptr = x; | 
|  | will_run_response_msg_t *resp_data = arg; | 
|  | uint32_t *preemptee_jid = xmalloc(sizeof(uint32_t)); | 
|  |  | 
|  | (*preemptee_jid) = job_ptr->job_id; | 
|  |  | 
|  | if (!resp_data->preemptee_job_id) | 
|  | resp_data->preemptee_job_id = list_create(xfree_ptr); | 
|  |  | 
|  | list_append(resp_data->preemptee_job_id, preemptee_jid); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _foreach_job_start_data_part(void *x, void *arg) | 
|  | { | 
|  | part_record_t *part_ptr = x; | 
|  | job_start_data_t *job_start_data = arg; | 
|  | job_record_t *job_ptr = job_start_data->job_ptr; | 
|  |  | 
|  | bitstr_t *active_bitmap = NULL, *avail_bitmap = NULL; | 
|  | bitstr_t *resv_bitmap = NULL; | 
|  | uint32_t min_nodes, max_nodes, req_nodes; | 
|  | int rc2 = SLURM_SUCCESS; | 
|  | time_t start_res, orig_start_time = (time_t) 0; | 
|  | list_t *preemptee_candidates = NULL, *preemptee_job_list = NULL; | 
|  | bool resv_overlap = false; | 
|  | resv_exc_t resv_exc = { 0 }; | 
|  |  | 
|  | job_start_data->rc = SLURM_SUCCESS; | 
|  | if (!part_ptr) { | 
|  | job_start_data->rc = ESLURM_INVALID_PARTITION_NAME; | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | if (job_ptr->details->req_nodes && job_ptr->details->req_nodes[0]) { | 
|  | if (node_name2bitmap(job_ptr->details->req_nodes, false, | 
|  | &avail_bitmap, NULL)) { | 
|  | job_start_data->rc = ESLURM_INVALID_NODE_NAME; | 
|  | return -1; | 
|  | } | 
|  | } else { | 
|  | /* assume all nodes available to job for testing */ | 
|  | avail_bitmap = node_conf_get_active_bitmap(); | 
|  | } | 
|  |  | 
|  | /* Consider only nodes in this job's partition */ | 
|  | if (part_ptr->node_bitmap) | 
|  | bit_and(avail_bitmap, part_ptr->node_bitmap); | 
|  | else | 
|  | job_start_data->rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; | 
|  | if (job_req_node_filter(job_ptr, avail_bitmap, true)) | 
|  | job_start_data->rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; | 
|  | if (job_ptr->details->exc_node_bitmap) { | 
|  | bit_and_not(avail_bitmap, job_ptr->details->exc_node_bitmap); | 
|  | } | 
|  | if (job_ptr->details->req_node_bitmap) { | 
|  | if (!bit_super_set(job_ptr->details->req_node_bitmap, | 
|  | avail_bitmap)) { | 
|  | job_start_data->rc = | 
|  | ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Enforce reservation: access control, time and nodes */ | 
|  | if (job_ptr->details->begin_time && | 
|  | (job_ptr->details->begin_time > job_start_data->now)) | 
|  | start_res = job_ptr->details->begin_time; | 
|  | else | 
|  | start_res = job_start_data->now; | 
|  |  | 
|  | rc2 = job_test_resv(job_ptr, &start_res, true, &resv_bitmap, | 
|  | &resv_exc, &resv_overlap, false); | 
|  | if (rc2 != SLURM_SUCCESS) { | 
|  | FREE_NULL_BITMAP(avail_bitmap); | 
|  | reservation_delete_resv_exc_parts(&resv_exc); | 
|  | job_start_data->rc = rc2; | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | bit_and(avail_bitmap, resv_bitmap); | 
|  | FREE_NULL_BITMAP(resv_bitmap); | 
|  |  | 
|  | /* Only consider nodes that are not DOWN or DRAINED */ | 
|  | bit_and(avail_bitmap, avail_node_bitmap); | 
|  |  | 
|  | if (job_start_data->rc == SLURM_SUCCESS) { | 
|  | int test_fini = -1; | 
|  | uint8_t save_share_res, save_whole_node; | 
|  | /* On BlueGene systems don't adjust the min/max node limits | 
|  | here.  We are working on midplane values. */ | 
|  | min_nodes = MAX(job_ptr->details->min_nodes, | 
|  | part_ptr->min_nodes); | 
|  | if (job_ptr->details->max_nodes == 0) | 
|  | max_nodes = part_ptr->max_nodes; | 
|  | else | 
|  | max_nodes = MIN(job_ptr->details->max_nodes, | 
|  | part_ptr->max_nodes); | 
|  | max_nodes = MIN(max_nodes, 500000);	/* prevent overflows */ | 
|  | if (!job_ptr->limit_set.tres[TRES_ARRAY_NODE] && | 
|  | job_ptr->details->max_nodes) | 
|  | req_nodes = max_nodes; | 
|  | else | 
|  | req_nodes = min_nodes; | 
|  | preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); | 
|  |  | 
|  | /* The orig_start is based upon the backfill scheduler data | 
|  | * and considers all higher priority jobs. The logic below | 
|  | * only considers currently running jobs, so the expected | 
|  | * start time will almost certainly be earlier and not as | 
|  | * accurate, but this algorithm is much faster. */ | 
|  | orig_start_time = job_ptr->start_time; | 
|  | build_active_feature_bitmap(job_ptr, avail_bitmap, | 
|  | &active_bitmap); | 
|  | if (active_bitmap) { | 
|  | job_start_data->rc = select_g_job_test( | 
|  | job_ptr, active_bitmap, | 
|  | min_nodes, max_nodes, req_nodes, | 
|  | SELECT_MODE_WILL_RUN, | 
|  | preemptee_candidates, | 
|  | &preemptee_job_list, | 
|  | &resv_exc, | 
|  | NULL); | 
|  | if (job_start_data->rc == SLURM_SUCCESS) { | 
|  | FREE_NULL_BITMAP(avail_bitmap); | 
|  | avail_bitmap = active_bitmap; | 
|  | active_bitmap = NULL; | 
|  | test_fini = 1; | 
|  | } else { | 
|  | FREE_NULL_BITMAP(active_bitmap); | 
|  | save_share_res  = job_ptr->details->share_res; | 
|  | save_whole_node = job_ptr->details->whole_node; | 
|  | job_ptr->details->share_res = 0; | 
|  | job_ptr->details->whole_node |= | 
|  | WHOLE_NODE_REQUIRED; | 
|  | test_fini = 0; | 
|  | } | 
|  | } | 
|  | if (test_fini != 1) { | 
|  | job_start_data->rc = select_g_job_test( | 
|  | job_ptr, avail_bitmap, | 
|  | min_nodes, max_nodes, req_nodes, | 
|  | SELECT_MODE_WILL_RUN, | 
|  | preemptee_candidates, | 
|  | &preemptee_job_list, | 
|  | &resv_exc, | 
|  | NULL); | 
|  | if (test_fini == 0) { | 
|  | job_ptr->details->share_res = save_share_res; | 
|  | job_ptr->details->whole_node = save_whole_node; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | if (job_start_data->rc == SLURM_SUCCESS) { | 
|  | will_run_response_msg_t *resp_data; | 
|  | resp_data = xmalloc(sizeof(will_run_response_msg_t)); | 
|  | resp_data->job_id     = job_ptr->job_id; | 
|  | resp_data->proc_cnt = job_ptr->total_cpus; | 
|  | _delayed_job_start_time(job_ptr); | 
|  | resp_data->start_time = MAX(job_ptr->start_time, | 
|  | orig_start_time); | 
|  | resp_data->start_time = MAX(resp_data->start_time, start_res); | 
|  | job_ptr->start_time   = 0;  /* restore pending job start time */ | 
|  | resp_data->node_list  = bitmap2node_name(avail_bitmap); | 
|  | resp_data->part_name  = xstrdup(part_ptr->name); | 
|  |  | 
|  | if (preemptee_job_list) | 
|  | (void) list_for_each(preemptee_job_list, | 
|  | _foreach_add_to_preemptee_job_id, | 
|  | resp_data); | 
|  |  | 
|  | *job_start_data->resp = resp_data; | 
|  | } else { | 
|  | job_start_data->rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; | 
|  | } | 
|  |  | 
|  | FREE_NULL_LIST(preemptee_candidates); | 
|  | FREE_NULL_LIST(preemptee_job_list); | 
|  | FREE_NULL_BITMAP(avail_bitmap); | 
|  | reservation_delete_resv_exc_parts(&resv_exc); | 
|  |  | 
|  | if (job_start_data->rc) | 
|  | return 0; | 
|  |  | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Determine if a pending job will run using only the specified nodes, build | 
|  | * response message and return SLURM_SUCCESS on success. Otherwise return an | 
|  | * error code. Caller must free response message. | 
|  | */ | 
|  | extern int job_start_data(job_record_t *job_ptr, | 
|  | will_run_response_msg_t **resp) | 
|  | { | 
|  | job_start_data_t job_start_data = { | 
|  | .job_ptr = job_ptr, | 
|  | .now = time(NULL), | 
|  | .resp = resp, | 
|  | }; | 
|  |  | 
|  | if (job_ptr == NULL) | 
|  | return ESLURM_INVALID_JOB_ID; | 
|  |  | 
|  | /* | 
|  | * NOTE: Do not use IS_JOB_PENDING since that doesn't take | 
|  | * into account the COMPLETING FLAG which we need to since we don't want | 
|  | * to schedule a requeued job until it is actually done completing | 
|  | * the first time. | 
|  | */ | 
|  | if ((job_ptr->details == NULL) || (job_ptr->job_state != JOB_PENDING)) | 
|  | return ESLURM_DISABLED; | 
|  |  | 
|  | if (job_ptr->part_ptr_list) | 
|  | (void) list_for_each(job_ptr->part_ptr_list, | 
|  | _foreach_job_start_data_part, | 
|  | &job_start_data); | 
|  | else | 
|  | (void) _foreach_job_start_data_part(job_ptr->part_ptr, | 
|  | &job_start_data); | 
|  |  | 
|  | return job_start_data.rc; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * epilog_slurmctld - execute the epilog_slurmctld for a job that has just | 
|  | *	terminated. | 
|  | * IN job_ptr - pointer to job that has been terminated | 
|  | */ | 
|  | extern void epilog_slurmctld(job_record_t *job_ptr) | 
|  | { | 
|  | xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); | 
|  |  | 
|  | prep_g_epilog_slurmctld(job_ptr); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Determine which nodes must be rebooted for a job | 
|  | * IN job_ptr - pointer to job that will be initiated | 
|  | * IN/OUT reboot_features - features that should be applied to the node on | 
|  | *                          reboot. Caller must xfree(). | 
|  | * RET bitmap of nodes requiring a reboot for NodeFeaturesPlugin or NULL if none | 
|  | */ | 
|  | extern bitstr_t *node_features_reboot(job_record_t *job_ptr, | 
|  | char **reboot_features) | 
|  | { | 
|  | bitstr_t *active_bitmap = NULL, *boot_node_bitmap = NULL; | 
|  | bitstr_t *feature_node_bitmap, *tmp_bitmap; | 
|  |  | 
|  | xassert(reboot_features); | 
|  | xassert(!(*reboot_features)); /* It needs to start out NULL */ | 
|  |  | 
|  | if ((node_features_g_count() == 0) || | 
|  | !node_features_g_user_update(job_ptr->user_id)) | 
|  | return NULL; | 
|  |  | 
|  | /* | 
|  | * Check if all features supported with AND/OR combinations | 
|  | */ | 
|  | build_active_feature_bitmap(job_ptr, job_ptr->node_bitmap, | 
|  | &active_bitmap); | 
|  | if (active_bitmap == NULL)	/* All nodes have desired features */ | 
|  | return NULL; | 
|  | FREE_NULL_BITMAP(active_bitmap); | 
|  |  | 
|  | /* | 
|  | * If some MOR/XAND option, filter out only first set of features | 
|  | * for NodeFeaturesPlugin | 
|  | */ | 
|  | feature_node_bitmap = node_features_g_get_node_bitmap(); | 
|  | if (feature_node_bitmap == NULL) /* No nodes under NodeFeaturesPlugin */ | 
|  | return NULL; | 
|  |  | 
|  | *reboot_features = node_features_g_job_xlate( | 
|  | job_ptr->details->features_use, | 
|  | job_ptr->details->feature_list_use, | 
|  | job_ptr->node_bitmap); | 
|  | tmp_bitmap = build_active_feature_bitmap2(*reboot_features); | 
|  | boot_node_bitmap = bit_copy(job_ptr->node_bitmap); | 
|  | bit_and(boot_node_bitmap, feature_node_bitmap); | 
|  | FREE_NULL_BITMAP(feature_node_bitmap); | 
|  | if (tmp_bitmap) { | 
|  | bit_and_not(boot_node_bitmap, tmp_bitmap); | 
|  | FREE_NULL_BITMAP(tmp_bitmap); | 
|  | } | 
|  | if (bit_ffs(boot_node_bitmap) == -1) | 
|  | FREE_NULL_BITMAP(boot_node_bitmap); | 
|  |  | 
|  | return boot_node_bitmap; | 
|  | } | 
|  |  | 
/*
 * reboot_job_nodes - Reboot the compute nodes allocated to a job.
 * Also change the modes of KNL nodes for node_features/knl_generic plugin.
 * IN job_ptr - pointer to job that will be initiated
 *
 * NOTE: reboot_job_nodes() itself is defined further below and returns void;
 * the static functions that follow are its helpers.
 */
|  | static void _send_reboot_msg(bitstr_t *node_bitmap, char *features, | 
|  | uint16_t protocol_version) | 
|  | { | 
|  | agent_arg_t *reboot_agent_args = NULL; | 
|  | reboot_msg_t *reboot_msg; | 
|  | hostlist_t *hostlist; | 
|  |  | 
|  | reboot_agent_args = xmalloc(sizeof(agent_arg_t)); | 
|  | reboot_agent_args->msg_type = REQUEST_REBOOT_NODES; | 
|  | reboot_agent_args->retry = 0; | 
|  | reboot_agent_args->node_count = 0; | 
|  | reboot_agent_args->protocol_version = protocol_version; | 
|  |  | 
|  | if ((hostlist = bitmap2hostlist(node_bitmap))) { | 
|  | reboot_agent_args->hostlist = hostlist; | 
|  | reboot_agent_args->node_count = hostlist_count(hostlist); | 
|  | } | 
|  |  | 
|  | reboot_msg = xmalloc(sizeof(reboot_msg_t)); | 
|  | slurm_init_reboot_msg(reboot_msg, false); | 
|  | reboot_agent_args->msg_args = reboot_msg; | 
|  | reboot_msg->features = xstrdup(features); | 
|  |  | 
|  | set_agent_arg_r_uid(reboot_agent_args, SLURM_AUTH_UID_ANY); | 
|  | agent_queue_request(reboot_agent_args); | 
|  | } | 
|  |  | 
|  | static void _do_reboot(bool power_save_on, bitstr_t *node_bitmap, | 
|  | job_record_t *job_ptr, char *reboot_features, | 
|  | uint16_t protocol_version) | 
|  | { | 
|  | xassert(node_bitmap); | 
|  |  | 
|  | if (bit_ffs(node_bitmap) == -1) | 
|  | return; | 
|  |  | 
|  | if (power_save_on) | 
|  | power_job_reboot(node_bitmap, job_ptr, reboot_features); | 
|  | else | 
|  | _send_reboot_msg(node_bitmap, reboot_features, | 
|  | protocol_version); | 
|  | if (get_log_level() >= LOG_LEVEL_DEBUG) { | 
|  | char *nodes = bitmap2node_name(node_bitmap); | 
|  | if (nodes) { | 
|  | debug("%s: reboot nodes %s features %s", | 
|  | __func__, nodes, | 
|  | reboot_features ? "reboot_features" : "N/A"); | 
|  | } else { | 
|  | error("%s: bitmap2nodename", __func__); | 
|  | } | 
|  | xfree(nodes); | 
|  | } | 
|  | } | 
|  |  | 
|  | static void _set_reboot_features_active(bitstr_t *node_bitmap, | 
|  | char *reboot_features) | 
|  | { | 
|  | node_record_t *node_ptr; | 
|  |  | 
|  | for (int i = 0; (node_ptr = next_node_bitmap(node_bitmap, &i)); i++) { | 
|  | char *tmp_feature; | 
|  |  | 
|  | tmp_feature = node_features_g_node_xlate(reboot_features, | 
|  | node_ptr->features_act, | 
|  | node_ptr->features, i); | 
|  | xfree(node_ptr->features_act); | 
|  | node_ptr->features_act = tmp_feature; | 
|  | (void) update_node_active_features(node_ptr->name, | 
|  | node_ptr->features_act, | 
|  | FEATURE_MODE_IND); | 
|  | } | 
|  | } | 
|  |  | 
|  | extern void reboot_job_nodes(job_record_t *job_ptr) | 
|  | { | 
|  | node_record_t *node_ptr; | 
|  | time_t now = time(NULL); | 
|  | bitstr_t *boot_node_bitmap = NULL, *feature_node_bitmap = NULL; | 
|  | bitstr_t *non_feature_node_bitmap = NULL; | 
|  | char *reboot_features = NULL; | 
|  | uint16_t protocol_version = SLURM_PROTOCOL_VERSION; | 
|  | static bool power_save_on = false; | 
|  | static time_t sched_update = 0; | 
|  | static bool logged = false; | 
|  |  | 
|  | if (sched_update != slurm_conf.last_update) { | 
|  | power_save_on = power_save_test(); | 
|  | sched_update = slurm_conf.last_update; | 
|  | } | 
|  |  | 
|  | if ((job_ptr->details == NULL) || (job_ptr->node_bitmap == NULL)) | 
|  | return; | 
|  |  | 
|  | if (job_ptr->reboot) | 
|  | boot_node_bitmap = bit_copy(job_ptr->node_bitmap); | 
|  | else | 
|  | boot_node_bitmap = node_features_reboot(job_ptr, | 
|  | &reboot_features); | 
|  |  | 
|  | if (!logged && boot_node_bitmap && | 
|  | (!power_save_on && | 
|  | ((slurm_conf.reboot_program == NULL) || | 
|  | (slurm_conf.reboot_program[0] == '\0')))) { | 
|  | info("%s: Preparing node reboot without power saving and RebootProgram", | 
|  | __func__); | 
|  | logged = true; | 
|  | } | 
|  |  | 
|  | if (boot_node_bitmap && | 
|  | job_ptr->details->features_use && | 
|  | node_features_g_user_update(job_ptr->user_id)) { | 
|  | non_feature_node_bitmap = bit_copy(boot_node_bitmap); | 
|  | /* | 
|  | * node_features_g_job_xlate is called from | 
|  | * node_features_reboot, which we may have already called. | 
|  | * Avoid calling node_features_g_job_xlate twice. | 
|  | */ | 
|  | if (!reboot_features) { | 
|  | reboot_features = node_features_g_job_xlate( | 
|  | job_ptr->details->features_use, | 
|  | job_ptr->details->feature_list_use, | 
|  | job_ptr->node_bitmap); | 
|  | } | 
|  | if (reboot_features) | 
|  | feature_node_bitmap = node_features_g_get_node_bitmap(); | 
|  | if (feature_node_bitmap) | 
|  | bit_and(feature_node_bitmap, non_feature_node_bitmap); | 
|  | if (!feature_node_bitmap || | 
|  | (bit_ffs(feature_node_bitmap) == -1)) { | 
|  | /* No KNL nodes to reboot */ | 
|  | FREE_NULL_BITMAP(feature_node_bitmap); | 
|  | } else { | 
|  | bit_and_not(non_feature_node_bitmap, | 
|  | feature_node_bitmap); | 
|  | if (bit_ffs(non_feature_node_bitmap) == -1) { | 
|  | /* No non-KNL nodes to reboot */ | 
|  | FREE_NULL_BITMAP(non_feature_node_bitmap); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | if (feature_node_bitmap) { | 
|  | /* | 
|  | * Update node features now to avoid a race where a | 
|  | * second job may request that this node gets rebooted | 
|  | * (in order to get a new active feature) *after* the | 
|  | * first reboot request but *before* slurmd actually | 
|  | * starts up. If that would happen then the second job | 
|  | * would stay configuring forever, waiting for the node | 
|  | * to reboot even though the node already rebooted. | 
|  | * | 
|  | * By setting the node's active features right now, any | 
|  | * other job that wants that active feature can be | 
|  | * scheduled onto this node, which will also already be | 
|  | * rebooting, so those other jobs won't send additional | 
|  | * reboot requests to change the feature. | 
|  | */ | 
|  | _set_reboot_features_active(feature_node_bitmap, | 
|  | reboot_features); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Assume the power save thread will handle the boot if any of the nodes | 
|  | * are cloud nodes. In KNL/features, the node is being rebooted and not | 
|  | * brought up from being powered down. | 
|  | */ | 
|  | if ((boot_node_bitmap == NULL) || | 
|  | bit_overlap_any(cloud_node_bitmap, job_ptr->node_bitmap)) { | 
|  | /* launch_job() when all nodes have booted */ | 
|  | if (bit_overlap_any(power_down_node_bitmap, | 
|  | job_ptr->node_bitmap) || | 
|  | bit_overlap_any(booting_node_bitmap, | 
|  | job_ptr->node_bitmap)) { | 
|  | /* Reset job start time when nodes are booted */ | 
|  | job_state_set_flag(job_ptr, (JOB_CONFIGURING | | 
|  | JOB_POWER_UP_NODE)); | 
|  | job_ptr->wait_all_nodes = 1; | 
|  | } | 
|  |  | 
|  | goto cleanup; | 
|  | } | 
|  |  | 
|  | /* Reset job start time when nodes are booted */ | 
|  | job_state_set_flag(job_ptr, (JOB_CONFIGURING | JOB_POWER_UP_NODE)); | 
|  | /* launch_job() when all nodes have booted */ | 
|  | job_ptr->wait_all_nodes = 1; | 
|  |  | 
|  | /* Modify state information for all nodes, KNL and others */ | 
|  | for (int i = 0; (node_ptr = next_node_bitmap(boot_node_bitmap, &i)); | 
|  | i++) { | 
|  | if (protocol_version > node_ptr->protocol_version) | 
|  | protocol_version = node_ptr->protocol_version; | 
|  |  | 
|  | if (IS_NODE_POWERED_DOWN(node_ptr)) { | 
|  | node_ptr->node_state &= (~NODE_STATE_POWERED_DOWN); | 
|  | clusteracct_storage_g_node_up(acct_db_conn, node_ptr, | 
|  | now); | 
|  | } | 
|  | node_ptr->node_state |= NODE_STATE_NO_RESPOND; | 
|  | node_ptr->node_state |= NODE_STATE_POWERING_UP; | 
|  | bit_clear(avail_node_bitmap, i); | 
|  | bit_clear(power_down_node_bitmap, i); | 
|  | bit_set(booting_node_bitmap, i); | 
|  | node_ptr->boot_req_time = now; | 
|  | } | 
|  |  | 
|  | if (feature_node_bitmap) { | 
|  | /* Reboot nodes to change KNL NUMA and/or MCDRAM mode */ | 
|  | _do_reboot(power_save_on, feature_node_bitmap, job_ptr, | 
|  | reboot_features, protocol_version); | 
|  | bit_and_not(boot_node_bitmap, feature_node_bitmap); | 
|  | } | 
|  |  | 
|  | if (non_feature_node_bitmap) { | 
|  | /* Reboot nodes with no feature changes */ | 
|  | _do_reboot(power_save_on, non_feature_node_bitmap, job_ptr, | 
|  | NULL, protocol_version); | 
|  | bit_and_not(boot_node_bitmap, non_feature_node_bitmap); | 
|  | } | 
|  |  | 
|  | if (job_ptr->reboot) { | 
|  | /* Reboot the remaining nodes blindly as per direct request */ | 
|  | _do_reboot(power_save_on, boot_node_bitmap, job_ptr, NULL, | 
|  | protocol_version); | 
|  | } | 
|  |  | 
|  | cleanup: | 
|  | xfree(reboot_features); | 
|  | FREE_NULL_BITMAP(boot_node_bitmap); | 
|  | FREE_NULL_BITMAP(non_feature_node_bitmap); | 
|  | FREE_NULL_BITMAP(feature_node_bitmap); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Deferring this setup ensures that all calling paths into select_nodes() | 
|  | * have had a chance to update all appropriate job records. | 
|  | * This works since select_nodes() will always be holding the job_write lock, | 
|  | * and thus this new thread will be blocked waiting to acquire job_write | 
|  | * until that has completed. | 
|  | * For HetJobs in particular, this is critical to ensure that all components | 
|  | * have been setup properly before prolog_slurmctld actually runs. | 
|  | */ | 
|  | static void *_start_prolog_slurmctld_thread(void *x) | 
|  | { | 
|  | slurmctld_lock_t node_write_lock = { | 
|  | .conf = READ_LOCK, .job = WRITE_LOCK, | 
|  | .node = WRITE_LOCK, .fed = READ_LOCK }; | 
|  | uint32_t *job_id = (uint32_t *) x; | 
|  | job_record_t *job_ptr; | 
|  |  | 
|  | lock_slurmctld(node_write_lock); | 
|  | if (!(job_ptr = find_job_record(*job_id))) { | 
|  | error("%s: missing JobId=%u", __func__, *job_id); | 
|  | unlock_slurmctld(node_write_lock); | 
|  | return NULL; | 
|  | } | 
|  | prep_g_prolog_slurmctld(job_ptr); | 
|  |  | 
|  | /* | 
|  | * No async prolog_slurmctld threads running, so decrement now to move | 
|  | * on with the job launch. | 
|  | */ | 
|  | if (!job_ptr->prep_prolog_cnt) { | 
|  | debug2("%s: no async prolog_slurmctld running", __func__); | 
|  | prolog_running_decr(job_ptr); | 
|  | } | 
|  |  | 
|  | unlock_slurmctld(node_write_lock); | 
|  | xfree(job_id); | 
|  |  | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * prolog_slurmctld - execute the prolog_slurmctld for a job that has just | 
|  | *	been allocated resources. | 
|  | * IN job_ptr - pointer to job that will be initiated | 
|  | */ | 
|  | extern void prolog_slurmctld(job_record_t *job_ptr) | 
|  | { | 
|  | uint32_t *job_id; | 
|  | xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); | 
|  |  | 
|  | if (!prep_g_required(PREP_PROLOG_SLURMCTLD)) | 
|  | return; | 
|  | job_ptr->details->prolog_running++; | 
|  | job_state_set_flag(job_ptr, JOB_CONFIGURING); | 
|  |  | 
|  | job_id = xmalloc(sizeof(*job_id)); | 
|  | *job_id = job_ptr->job_id; | 
|  | slurm_thread_create_detached(_start_prolog_slurmctld_thread, job_id); | 
|  | } | 
|  |  | 
|  | /* Decrement a job's prolog_running counter and launch the job if zero */ | 
|  | extern void prolog_running_decr(job_record_t *job_ptr) | 
|  | { | 
|  | xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); | 
|  | xassert(verify_lock(FED_LOCK, READ_LOCK)); | 
|  |  | 
|  | if (!job_ptr) | 
|  | return; | 
|  |  | 
|  | if (job_ptr->details && job_ptr->details->prolog_running && | 
|  | (--job_ptr->details->prolog_running > 0)) | 
|  | return; | 
|  |  | 
|  | /* Federated job notified the origin that the job is to be requeued, | 
|  | * need to wait for this job to be cancelled. */ | 
|  | if (job_ptr->job_state & JOB_REQUEUE_FED) | 
|  | return; | 
|  |  | 
|  | if (IS_JOB_CONFIGURING(job_ptr) && test_job_nodes_ready(job_ptr)) { | 
|  | info("%s: Configuration for %pJ is complete", | 
|  | __func__, job_ptr); | 
|  | job_config_fini(job_ptr); | 
|  | if (job_ptr->batch_flag && | 
|  | (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { | 
|  | launch_job(job_ptr); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | static int _foreach_feature_list_copy(void *x, void *arg) | 
|  | { | 
|  | job_feature_t *feat_src = x, *feat_dest; | 
|  | list_t **feature_list_dest = arg; | 
|  |  | 
|  | feat_dest = xmalloc(sizeof(job_feature_t)); | 
|  | memcpy(feat_dest, feat_src, sizeof(job_feature_t)); | 
|  | if (feat_src->node_bitmap_active) | 
|  | feat_dest->node_bitmap_active = | 
|  | bit_copy(feat_src->node_bitmap_active); | 
|  | if (feat_src->node_bitmap_avail) | 
|  | feat_dest->node_bitmap_avail = | 
|  | bit_copy(feat_src->node_bitmap_avail); | 
|  | feat_dest->name = xstrdup(feat_src->name); | 
|  | list_append(*feature_list_dest, feat_dest); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Copy a job's feature list | 
|  | * IN feature_list_src - a job's depend_lst | 
|  | * RET copy of feature_list_src, must be freed by caller | 
|  | */ | 
|  | extern list_t *feature_list_copy(list_t *feature_list_src) | 
|  | { | 
|  | list_t *feature_list_dest = NULL; | 
|  |  | 
|  | if (!feature_list_src) | 
|  | return feature_list_dest; | 
|  |  | 
|  | feature_list_dest = list_create(feature_list_delete); | 
|  | (void) list_for_each(feature_list_src, | 
|  | _foreach_feature_list_copy, | 
|  | &feature_list_dest); | 
|  |  | 
|  | return feature_list_dest; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * IN/OUT convert_to_matching_or - | 
|  | * If at least one changeable feature is requested, then all the nodes | 
|  | * in the job allocation need to match the same feature set. | 
|  | * | 
|  | * As an input: if true, then mark all '|' operators as matching OR, and also | 
|  | * imply that it is surrounded by brackets by setting bracket=1 for all the | 
|  | * features except the last one. The AND operators are still treated as normal | 
|  | * AND (not XAND), as if they were surrounded by parentheses within the | 
|  | * brackets. | 
|  | * | 
|  | * As an output: if multiple changeable features are requested, | 
|  | * and bar (OR) was requested, then set this to true. | 
|  | * | 
|  | * This is needed for the scheduling logic with parentheses and matching OR. | 
|  | */ | 
|  | static int _feature_string2list(char *features, char *debug_str, | 
|  | list_t **feature_list, | 
|  | bool *convert_to_matching_or) | 
|  | { | 
|  | int rc = SLURM_SUCCESS; | 
|  | int bracket = 0, count = 0, i, paren = 0; | 
|  | int brack_set_count = 0; | 
|  | char *tmp_requested; | 
|  | char *str_ptr, *feature = NULL; | 
|  | bool has_changeable = false; | 
|  | bool has_or = false; | 
|  | bool has_asterisk = false; | 
|  |  | 
|  | xassert(feature_list); | 
|  |  | 
|  | /* Use of commas separator is a common error. Replace them with '&' */ | 
|  | while ((str_ptr = strstr(features, ","))) | 
|  | str_ptr[0] = '&'; | 
|  |  | 
|  | tmp_requested = xstrdup(features); | 
|  | *feature_list = list_create(feature_list_delete); | 
|  |  | 
|  | for (i = 0; ; i++) { | 
|  | job_feature_t *feat; | 
|  |  | 
|  | if (tmp_requested[i] == '*') { | 
|  | tmp_requested[i] = '\0'; | 
|  | count = strtol(&tmp_requested[i+1], &str_ptr, 10); | 
|  | if (!bracket) | 
|  | has_asterisk = true; | 
|  | if ((feature == NULL) || (count <= 0) || (paren != 0)) { | 
|  | verbose("%s constraint invalid, '*' must be requested with a positive integer, and after a feature or parentheses: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | i = str_ptr - tmp_requested - 1; | 
|  | } else if (tmp_requested[i] == '&') { | 
|  | tmp_requested[i] = '\0'; | 
|  | if (feature == NULL) { | 
|  | verbose("%s constraint requested '&' without a feature: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | feat = xmalloc(sizeof(job_feature_t)); | 
|  | feat->bracket = *convert_to_matching_or ? 1 : bracket; | 
|  | feat->name = xstrdup(feature); | 
|  | feat->changeable = node_features_g_changeable_feature( | 
|  | feature); | 
|  | feat->count = count; | 
|  | feat->paren = paren; | 
|  |  | 
|  | has_changeable |= feat->changeable; | 
|  |  | 
|  | if (paren || *convert_to_matching_or) | 
|  | feat->op_code = FEATURE_OP_AND; | 
|  | else if (bracket) | 
|  | feat->op_code = FEATURE_OP_XAND; | 
|  | else | 
|  | feat->op_code = FEATURE_OP_AND; | 
|  | list_append(*feature_list, feat); | 
|  | feature = NULL; | 
|  | count = 0; | 
|  | } else if (tmp_requested[i] == '|') { | 
|  | bool changeable; | 
|  |  | 
|  | tmp_requested[i] = '\0'; | 
|  | if (feature == NULL) { | 
|  | verbose("%s constraint requested '|' without a feature: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | changeable = node_features_g_changeable_feature( | 
|  | feature); | 
|  | feat = xmalloc(sizeof(job_feature_t)); | 
|  | feat->bracket = *convert_to_matching_or ? 1 : bracket; | 
|  | feat->name = xstrdup(feature); | 
|  | feat->changeable = changeable; | 
|  | feat->count = count; | 
|  | feat->paren = paren; | 
|  |  | 
|  | has_changeable |= changeable; | 
|  | has_or = true; | 
|  |  | 
|  | /* | 
|  | * The if-else-if is like this for priority: | 
|  | * - paren is highest priority | 
|  | * - then bracket | 
|  | * - then outside of paren/bracket | 
|  | */ | 
|  | if (paren && !(*convert_to_matching_or)) | 
|  | feat->op_code = FEATURE_OP_OR; | 
|  | else if (bracket || changeable || | 
|  | (*convert_to_matching_or)) | 
|  | feat->op_code = FEATURE_OP_MOR; | 
|  | else | 
|  | feat->op_code = FEATURE_OP_OR; | 
|  | list_append(*feature_list, feat); | 
|  | feature = NULL; | 
|  | count = 0; | 
|  | } else if (tmp_requested[i] == '[') { | 
|  | tmp_requested[i] = '\0'; | 
|  | if ((feature != NULL) || bracket || paren) { | 
|  | verbose("%s constraint has imbalanced brackets: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | bracket++; | 
|  | brack_set_count++; | 
|  | if (brack_set_count > 1) { | 
|  | verbose("%s constraint has more than one set of brackets: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | } else if (tmp_requested[i] == ']') { | 
|  | tmp_requested[i] = '\0'; | 
|  | if ((feature == NULL) || (bracket == 0) || paren) { | 
|  | verbose("%s constraint has imbalanced brackets: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | bracket--; | 
|  | } else if (tmp_requested[i] == '(') { | 
|  | tmp_requested[i] = '\0'; | 
|  | if ((feature != NULL) || paren) { | 
|  | verbose("%s constraint has imbalanced parentheses: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | paren++; | 
|  | } else if (tmp_requested[i] == ')') { | 
|  | tmp_requested[i] = '\0'; | 
|  | if ((feature == NULL) || (paren == 0)) { | 
|  | verbose("%s constraint has imbalanced parentheses: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | paren--; | 
|  | } else if (tmp_requested[i] == '\0') { | 
|  | if (feature) { | 
|  | feat = xmalloc(sizeof(job_feature_t)); | 
|  | feat->bracket = bracket; | 
|  | feat->name = xstrdup(feature); | 
|  | feat->changeable = node_features_g_changeable_feature( | 
|  | feature); | 
|  | feat->count = count; | 
|  | feat->paren = paren; | 
|  | feat->op_code = FEATURE_OP_END; | 
|  | list_append(*feature_list, feat); | 
|  |  | 
|  | has_changeable |= feat->changeable; | 
|  | } | 
|  | break; | 
|  | } else if (feature == NULL) { | 
|  | feature = &tmp_requested[i]; | 
|  | } else if (i && (tmp_requested[i - 1] == '\0')) { | 
|  | /* ')' and ']' should be followed by a token. */ | 
|  | verbose("%s constraint has an unexpected character: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (bracket != 0) { | 
|  | verbose("%s constraint has unbalanced brackets: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | if (paren != 0) { | 
|  | verbose("%s constraint has unbalanced parenthesis: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  | if (has_asterisk && (list_count(*feature_list) > 1)) { | 
|  | verbose("%s constraint has '*' outside of brackets with more than one feature: %s", | 
|  | debug_str, features); | 
|  | rc = ESLURM_INVALID_FEATURE; | 
|  | goto fini; | 
|  | } | 
|  |  | 
|  | *convert_to_matching_or = (has_changeable && has_or); | 
|  |  | 
|  | fini: | 
|  | if (rc != SLURM_SUCCESS) { | 
|  | FREE_NULL_LIST(*feature_list); | 
|  | info("%s invalid constraint: %s", | 
|  | debug_str, features); | 
|  | } | 
|  | xfree(tmp_requested); | 
|  |  | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * build_feature_list - Translate a job's feature string into a feature_list | 
|  | * NOTE: This function is also used for reservations if is_reservation is true | 
|  | * and for job_desc_msg_t if job_id == 0 | 
|  | * IN  details->features | 
|  | * OUT details->feature_list | 
|  | * RET error code | 
|  | */ | 
|  | extern int build_feature_list(job_record_t *job_ptr, bool prefer, | 
|  | bool is_reservation) | 
|  | { | 
|  | job_details_t *detail_ptr = job_ptr->details; | 
|  | list_t **feature_list; | 
|  | int rc; | 
|  | int feature_err; | 
|  | bool convert_to_matching_or = false; | 
|  | valid_feature_t valid_feature = { | 
|  | .rc = SLURM_SUCCESS, | 
|  | }; | 
|  |  | 
|  | /* no hard constraints */ | 
|  | if (!detail_ptr || (!detail_ptr->features && !detail_ptr->prefer)) { | 
|  | if (job_ptr->batch_features) | 
|  | return ESLURM_BATCH_CONSTRAINT; | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | if (prefer) { | 
|  | valid_feature.features = detail_ptr->prefer; | 
|  | feature_list = &detail_ptr->prefer_list; | 
|  | feature_err = ESLURM_INVALID_PREFER; | 
|  | } else { | 
|  | valid_feature.features = detail_ptr->features; | 
|  | feature_list = &detail_ptr->feature_list; | 
|  | feature_err = ESLURM_INVALID_FEATURE; | 
|  | } | 
|  |  | 
|  | if (!valid_feature.features) /* The other constraint is non NULL. */ | 
|  | return SLURM_SUCCESS; | 
|  |  | 
|  | if (*feature_list)		/* already processed */ | 
|  | return SLURM_SUCCESS; | 
|  |  | 
|  | if (is_reservation) | 
|  | valid_feature.debug_str = xstrdup("Reservation"); | 
|  | else if (!job_ptr->job_id) | 
|  | valid_feature.debug_str = xstrdup("Job specs"); | 
|  | else | 
|  | valid_feature.debug_str = | 
|  | xstrdup_printf("JobId=%u", job_ptr->job_id); | 
|  |  | 
|  | valid_feature.can_reboot = | 
|  | node_features_g_user_update(job_ptr->user_id); | 
|  | rc = _feature_string2list(valid_feature.features, | 
|  | valid_feature.debug_str, | 
|  | feature_list, &convert_to_matching_or); | 
|  | if (rc != SLURM_SUCCESS) { | 
|  | rc = feature_err; | 
|  | goto fini; | 
|  | } | 
|  |  | 
|  | if (convert_to_matching_or) { | 
|  | char *str = NULL; | 
|  | list_t *feature_sets; | 
|  |  | 
|  | /* | 
|  | * Restructure the list into a format of AND'ing features in | 
|  | * parentheses and matching OR each parentheses together. The | 
|  | * current scheduling logic does not know how to handle matching | 
|  | * OR inside of parentheses; however, it does know how to handle | 
|  | * matching OR outside of parentheses, so we restructure the | 
|  | * feature list to a format the scheduling logic understands. | 
|  | * This is needed for changeable features which need all nodes | 
|  | * in the job allocation to match the same feature set, so they | 
|  | * cannot have any boolean OR in the feature list. | 
|  | * | 
|  | * For example, "(a|b)&c" becomes "(a&c)|(b&c)" | 
|  | * | 
|  | * Restructure only the feature list; leave the original | 
|  | * constraint expression intact. | 
|  | */ | 
|  | feature_sets = job_features_list2feature_sets( | 
|  | valid_feature.features, | 
|  | *feature_list, | 
|  | false); | 
|  | list_for_each(feature_sets, job_features_set2str, &str); | 
|  | FREE_NULL_LIST(feature_sets); | 
|  | FREE_NULL_LIST(*feature_list); | 
|  | rc = _feature_string2list(str, valid_feature.debug_str, | 
|  | feature_list, | 
|  | &convert_to_matching_or); | 
|  | if (rc != SLURM_SUCCESS) { | 
|  | /* | 
|  | * Something went wrong - we should have caught this | 
|  | * error the first time we called _feature_string2list. | 
|  | */ | 
|  | error("%s: Problem converting feature string %s to matching OR list", | 
|  | __func__, str); | 
|  | rc = feature_err; | 
|  | xfree(str); | 
|  | goto fini; | 
|  | } | 
|  | log_flag(NODE_FEATURES, "%s: Converted %sfeature list:'%s' to matching OR:'%s'", | 
|  | __func__, prefer ? "prefer " : "", | 
|  | valid_feature.features, str); | 
|  | xfree(str); | 
|  | } | 
|  |  | 
|  | if (job_ptr->batch_features) { | 
|  | detail_ptr->feature_list_use = *feature_list; | 
|  | detail_ptr->features_use = valid_feature.features; | 
|  | rc = _valid_batch_features(job_ptr, valid_feature.can_reboot); | 
|  | detail_ptr->feature_list_use = NULL; | 
|  | detail_ptr->features_use = NULL; | 
|  | if (rc != SLURM_SUCCESS) | 
|  | goto fini; | 
|  | } | 
|  |  | 
|  | valid_feature.feature_list = *feature_list; | 
|  | rc = _valid_feature_list(job_ptr, &valid_feature, is_reservation); | 
|  | if (rc != SLURM_SUCCESS) { | 
|  | rc = feature_err; | 
|  | goto fini; | 
|  | } | 
|  |  | 
|  | fini: | 
|  | xfree(valid_feature.debug_str); | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Delete a record from a job's feature_list | 
|  | */ | 
|  | extern void feature_list_delete(void *x) | 
|  | { | 
|  | job_feature_t *feature_ptr = (job_feature_t *)x; | 
|  | xfree(feature_ptr->name); | 
|  | FREE_NULL_BITMAP(feature_ptr->node_bitmap_active); | 
|  | FREE_NULL_BITMAP(feature_ptr->node_bitmap_avail); | 
|  | xfree(feature_ptr); | 
|  | } | 
|  |  | 
|  | static int _match_job_feature(void *x, void *key) | 
|  | { | 
|  | job_feature_t *feat = (job_feature_t *) x; | 
|  | char *tok = (char *) key; | 
|  |  | 
|  | if (!xstrcmp(feat->name, tok))	/* Found matching feature name */ | 
|  | return 1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _valid_batch_features(job_record_t *job_ptr, bool can_reboot) | 
|  | { | 
|  | char *tmp, *tok, *save_ptr = NULL; | 
|  | int rc = SLURM_SUCCESS; | 
|  | bool have_or = false, success_or = false; | 
|  |  | 
|  | if (!job_ptr->batch_features) | 
|  | return SLURM_SUCCESS; | 
|  | if (!job_ptr->details || !job_ptr->details->feature_list_use) | 
|  | return ESLURM_BATCH_CONSTRAINT; | 
|  |  | 
|  | if (strchr(job_ptr->batch_features, '|')) | 
|  | have_or = true; | 
|  | tmp = xstrdup(job_ptr->batch_features); | 
|  | tok = strtok_r(tmp, "&|", &save_ptr); | 
|  | while (tok) { | 
|  | if (!list_find_first(job_ptr->details->feature_list_use, | 
|  | _match_job_feature, tok)) { | 
|  | rc = ESLURM_BATCH_CONSTRAINT; | 
|  | break; | 
|  | } | 
|  | rc = _valid_node_feature(tok, can_reboot); | 
|  | if (have_or) { | 
|  | if (rc == SLURM_SUCCESS) | 
|  | success_or = true; | 
|  | /* Ignore failure on some OR components */ | 
|  | } else if (rc != SLURM_SUCCESS) { | 
|  | rc = ESLURM_BATCH_CONSTRAINT; | 
|  | break; | 
|  | } | 
|  | tok = strtok_r(NULL, "&|", &save_ptr); | 
|  | } | 
|  | xfree(tmp); | 
|  |  | 
|  | if (have_or && success_or) | 
|  | return SLURM_SUCCESS; | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | static int _foreach_valid_feature_list(void *x, void *arg) | 
|  | { | 
|  | job_feature_t *feat_ptr = x; | 
|  | valid_feature_t *valid_feature = arg; | 
|  |  | 
|  | if ((feat_ptr->op_code == FEATURE_OP_MOR) || | 
|  | (feat_ptr->op_code == FEATURE_OP_XAND)) { | 
|  | valid_feature->bracket = feat_ptr->paren + 1; | 
|  | } | 
|  | if (feat_ptr->paren > valid_feature->paren) { | 
|  | valid_feature->paren = feat_ptr->paren; | 
|  | } | 
|  | if (feat_ptr->paren < valid_feature->paren) { | 
|  | valid_feature->paren = feat_ptr->paren; | 
|  | } | 
|  | if ((valid_feature->rc == SLURM_SUCCESS) && | 
|  | !valid_feature->skip_validation) { | 
|  | valid_feature->rc = | 
|  | _valid_node_feature(feat_ptr->name, | 
|  | valid_feature->can_reboot); | 
|  | if (valid_feature->rc != SLURM_SUCCESS) | 
|  | verbose("%s feature %s is not usable on any node: %s", | 
|  | valid_feature->debug_str, feat_ptr->name, | 
|  | valid_feature->features); | 
|  | } | 
|  | if ((feat_ptr->op_code == FEATURE_OP_XAND) && !feat_ptr->count) { | 
|  | verbose("%s feature %s invalid, count must be used with XAND: %s", | 
|  | valid_feature->debug_str, feat_ptr->name, | 
|  | valid_feature->features); | 
|  | valid_feature->rc = ESLURM_INVALID_FEATURE; | 
|  | } | 
|  | if ((feat_ptr->op_code == FEATURE_OP_MOR) && feat_ptr->count) { | 
|  | verbose("%s feature %s invalid, count must not be used with MOR: %s", | 
|  | valid_feature->debug_str, feat_ptr->name, | 
|  | valid_feature->features); | 
|  | valid_feature->rc = ESLURM_INVALID_FEATURE; | 
|  | } | 
|  |  | 
|  | /* In brackets, outside of paren */ | 
|  | if ((valid_feature->bracket > valid_feature->paren) && | 
|  | ((feat_ptr->op_code != FEATURE_OP_MOR) && | 
|  | (feat_ptr->op_code != FEATURE_OP_XAND))) { | 
|  | if (valid_feature->has_xand && !feat_ptr->count) { | 
|  | valid_feature->rc = ESLURM_INVALID_FEATURE; | 
|  | verbose("%s feature %s invalid, count must be used with XAND: %s", | 
|  | valid_feature->debug_str, feat_ptr->name, | 
|  | valid_feature->features); | 
|  | } | 
|  | if (valid_feature->has_mor && feat_ptr->count) { | 
|  | valid_feature->rc = ESLURM_INVALID_FEATURE; | 
|  | verbose("%s feature %s invalid, count must not be used with MOR: %s", | 
|  | valid_feature->debug_str, feat_ptr->name, | 
|  | valid_feature->features); | 
|  | } | 
|  | valid_feature->bracket = 0; | 
|  | valid_feature->has_xand = false; | 
|  | valid_feature->has_mor = false; | 
|  | } | 
|  | if (feat_ptr->op_code == FEATURE_OP_XAND) | 
|  | valid_feature->has_xand = true; | 
|  | if (feat_ptr->op_code == FEATURE_OP_MOR) | 
|  | valid_feature->has_mor = true; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _valid_feature_list(job_record_t *job_ptr, | 
|  | valid_feature_t *valid_feature, | 
|  | bool is_reservation) | 
|  | { | 
|  | static time_t sched_update = 0; | 
|  | static bool ignore_prefer_val = false, ignore_constraint_val = false; | 
|  | bool is_prefer_list, skip_validation; | 
|  |  | 
|  | if (!valid_feature->feature_list) { | 
|  | debug2("%s feature list is empty", | 
|  | valid_feature->debug_str); | 
|  | return valid_feature->rc; | 
|  | } | 
|  |  | 
|  | if (sched_update != slurm_conf.last_update) { | 
|  | sched_update = slurm_conf.last_update; | 
|  | if (xstrcasestr(slurm_conf.sched_params, | 
|  | "ignore_prefer_validation")) | 
|  | ignore_prefer_val = true; | 
|  | else | 
|  | ignore_prefer_val = false; | 
|  | if (xstrcasestr(slurm_conf.sched_params, | 
|  | "ignore_constraint_validation")) | 
|  | ignore_constraint_val = true; | 
|  | else | 
|  | ignore_constraint_val = false; | 
|  |  | 
|  | } | 
|  |  | 
|  | is_prefer_list = (valid_feature->feature_list == | 
|  | job_ptr->details->prefer_list); | 
|  | skip_validation = (is_prefer_list && ignore_prefer_val) || | 
|  | (!is_prefer_list && ignore_constraint_val); | 
|  |  | 
|  | valid_feature->skip_validation = skip_validation; | 
|  |  | 
|  | (void) list_for_each(valid_feature->feature_list, | 
|  | _foreach_valid_feature_list, | 
|  | valid_feature); | 
|  |  | 
|  | if (valid_feature->rc == SLURM_SUCCESS) { | 
|  | debug("%s feature list: %s", | 
|  | valid_feature->debug_str, valid_feature->features); | 
|  | } else { | 
|  | if (is_reservation) { | 
|  | info("Reservation has invalid feature list: %s", | 
|  | valid_feature->features); | 
|  | } else { | 
|  | if (valid_feature->can_reboot) | 
|  | info("%s has invalid feature list: %s", | 
|  | valid_feature->debug_str, | 
|  | valid_feature->features); | 
|  | else | 
|  | info("%s has invalid feature list (%s) or the features are not active and this user cannot reboot to update node features", | 
|  | valid_feature->debug_str, | 
|  | valid_feature->features); | 
|  | } | 
|  | } | 
|  |  | 
|  | return valid_feature->rc; | 
|  | } | 
|  |  | 
|  | static int _find_feature_in_list(void *x, void *arg) | 
|  | { | 
|  | node_feature_t *feature_ptr = x; | 
|  | char *feature = arg; | 
|  |  | 
|  | if (!xstrcmp(feature_ptr->name, feature)) | 
|  | return 1; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* Validate that job's feature is available on some node(s) */ | 
|  | static int _valid_node_feature(char *feature, bool can_reboot) | 
|  | { | 
|  | int rc = ESLURM_INVALID_FEATURE; | 
|  | list_t *use_list = | 
|  | can_reboot ? avail_feature_list : active_feature_list; | 
|  |  | 
|  | if (list_find_first(use_list, _find_feature_in_list, feature)) | 
|  | rc = SLURM_SUCCESS; | 
|  |  | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | #define REBUILD_PENDING SLURM_BIT(0) | 
|  | #define REBUILD_ACTIVE SLURM_BIT(1) | 
|  |  | 
|  | typedef struct { | 
|  | uint16_t flags; | 
|  | job_record_t *job_ptr; | 
|  | } rebuild_args_t; | 
|  |  | 
|  | static int _build_partition_string(void *object, void *arg) { | 
|  | part_record_t *part_ptr = object; | 
|  | rebuild_args_t *args = arg; | 
|  | uint16_t flags = args->flags; | 
|  | job_record_t *job_ptr = args->job_ptr; | 
|  |  | 
|  | if (flags & REBUILD_PENDING) { | 
|  | job_ptr->part_ptr = part_ptr; | 
|  | flags &= ~(REBUILD_PENDING); | 
|  | } | 
|  | if ((flags & REBUILD_ACTIVE) && (part_ptr == job_ptr->part_ptr)) | 
|  | return SLURM_SUCCESS;       /* already added */ | 
|  | if (job_ptr->partition) | 
|  | xstrcat(job_ptr->partition, ","); | 
|  | xstrcat(job_ptr->partition, part_ptr->name); | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | /* If a job can run in multiple partitions, when it is started we want to | 
|  | * put the name of the partition used _first_ in that list. When slurmctld | 
|  | * restarts, that will be used to set the job's part_ptr and that will be | 
|  | * reported to squeue. We leave all of the partitions in the list though, | 
|  | * so the job can be requeued and have access to them all. */ | 
|  | extern void rebuild_job_part_list(job_record_t *job_ptr) | 
|  | { | 
|  | rebuild_args_t arg = { | 
|  | .job_ptr = job_ptr, | 
|  | }; | 
|  |  | 
|  | xfree(job_ptr->partition); | 
|  |  | 
|  | if (!job_ptr->part_ptr_list) { | 
|  | job_ptr->partition = xstrdup(job_ptr->part_ptr->name); | 
|  | last_job_update = time(NULL); | 
|  | return; | 
|  | } | 
|  |  | 
|  | if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) { | 
|  | arg.flags |= REBUILD_ACTIVE; | 
|  | job_ptr->partition = xstrdup(job_ptr->part_ptr->name); | 
|  | } else if (IS_JOB_PENDING(job_ptr)) | 
|  | arg.flags |= REBUILD_PENDING; | 
|  | list_for_each(job_ptr->part_ptr_list, _build_partition_string, &arg); | 
|  | last_job_update = time(NULL); | 
|  | } | 
|  |  | 
|  | /* cleanup_completing() | 
|  | * | 
|  | * Clean up the JOB_COMPLETING flag and eventually | 
|  | * requeue the job if there is a pending request | 
|  | * for it. This function assumes the caller has the | 
|  | * appropriate locks on the job_record. | 
|  | */ | 
|  | void cleanup_completing(job_record_t *job_ptr, bool requeue) | 
|  | { | 
|  | time_t delay; | 
|  | if (job_ptr->epilog_running || job_ptr->node_cnt) | 
|  | return; | 
|  | log_flag(TRACE_JOBS, "%s: %pJ", __func__, job_ptr); | 
|  |  | 
|  | delay = last_job_update - job_ptr->end_time; | 
|  | if (delay > 60) { | 
|  | info("%s: %pJ completion process took %ld seconds", | 
|  | __func__, job_ptr, (long) delay); | 
|  | } | 
|  |  | 
|  | license_job_return(job_ptr); | 
|  | gs_job_fini(job_ptr); | 
|  |  | 
|  | delete_step_records(job_ptr); | 
|  | job_state_unset_flag(job_ptr, JOB_COMPLETING); | 
|  | job_hold_requeue(job_ptr); | 
|  |  | 
|  | /* | 
|  | * Clear alloc tres fields after a requeue. job_set_alloc_tres will | 
|  | * clear the fields when the job is pending and not completing. | 
|  | */ | 
|  | if (IS_JOB_PENDING(job_ptr)) | 
|  | job_set_alloc_tres(job_ptr, false); | 
|  |  | 
|  | /* Job could be pending if the job was requeued due to a node failure */ | 
|  | if (IS_JOB_COMPLETED(job_ptr)) | 
|  | fed_mgr_job_complete(job_ptr, job_ptr->exit_code, | 
|  | job_ptr->start_time); | 
|  | if (requeue) | 
|  | batch_requeue_fini(job_ptr); | 
|  | } | 
|  |  | 
|  | void main_sched_init(void) | 
|  | { | 
|  | if (thread_id_sched) | 
|  | return; | 
|  | slurm_thread_create(&thread_id_sched, _sched_agent, NULL); | 
|  | } | 
|  |  | 
|  | void main_sched_fini(void) | 
|  | { | 
|  | if (!thread_id_sched) | 
|  | return; | 
|  | slurm_mutex_lock(&sched_mutex); | 
|  | slurm_cond_broadcast(&sched_cond); | 
|  | slurm_mutex_unlock(&sched_mutex); | 
|  | slurm_thread_join(thread_id_sched); | 
|  | } |