| /*****************************************************************************\ |
| * job_scheduler.c - manage the scheduling of pending jobs in priority order |
| * Note there is a global job list (job_list) |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <http://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #ifdef HAVE_CONFIG_H |
| # include "config.h" |
| #endif |
| |
| #if defined(__NetBSD__) |
| #include <sys/types.h> /* for pid_t */ |
| #include <sys/signal.h> /* for SIGKILL */ |
| #endif |
| #if defined(__FreeBSD__) |
| #include <signal.h> /* for SIGKILL */ |
| #endif |
| #include <ctype.h> |
| #include <errno.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <unistd.h> |
| |
| #include "src/common/assoc_mgr.h" |
| #include "src/common/env.h" |
| #include "src/common/gres.h" |
| #include "src/common/list.h" |
| #include "src/common/macros.h" |
| #include "src/common/node_select.h" |
| #include "src/common/slurm_accounting_storage.h" |
| #include "src/common/slurm_acct_gather.h" |
| #include "src/common/timers.h" |
| #include "src/common/uid.h" |
| #include "src/common/xassert.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/slurmctld/acct_policy.h" |
| #include "src/slurmctld/agent.h" |
| #include "src/slurmctld/front_end.h" |
| #include "src/slurmctld/job_scheduler.h" |
| #include "src/slurmctld/licenses.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/node_scheduler.h" |
| #include "src/slurmctld/preempt.h" |
| #include "src/slurmctld/proc_req.h" |
| #include "src/slurmctld/reservation.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmctld/srun_comm.h" |
| #include "src/slurmctld/sched_plugin.h" |
| |
| #define _DEBUG 0 |
| #define MAX_FAILED_RESV 10 |
| #define MAX_RETRIES 10 |
| |
/* Argument bundle handed to the _run_epilog() thread for one job. */
typedef struct epilog_arg {
	char *epilog_slurmctld;	/* program to execute (EpilogSlurmctld) */
	uint32_t job_id;	/* ID of the job the epilog runs for */
	char **my_env;		/* environment passed to the program */
} epilog_arg_t;
| |
| static char ** _build_env(struct job_record *job_ptr); |
| static void _depend_list_del(void *dep_ptr); |
| static void _feature_list_delete(void *x); |
| static void _job_queue_append(List job_queue, struct job_record *job_ptr, |
| struct part_record *part_ptr, uint32_t priority); |
| static void _job_queue_rec_del(void *x); |
| static bool _job_runnable_test1(struct job_record *job_ptr, |
| bool clear_start); |
| static bool _job_runnable_test2(struct job_record *job_ptr, |
| bool check_min_time); |
| static void * _run_epilog(void *arg); |
| static void * _run_prolog(void *arg); |
| static bool _scan_depend(List dependency_list, uint32_t job_id); |
| static int _valid_feature_list(uint32_t job_id, List feature_list); |
| static int _valid_node_feature(char *feature); |
| |
| static int save_last_part_update = 0; |
| |
| extern diag_stats_t slurmctld_diag_stats; |
| |
| /* |
| * _build_user_job_list - build list of jobs for a given user |
| * and an optional job name |
| * IN user_id - user id |
| * IN job_name - job name constraint |
| * RET the job queue |
| * NOTE: the caller must call list_destroy() on RET value to free memory |
| */ |
| static List _build_user_job_list(uint32_t user_id, char* job_name) |
| { |
| List job_queue; |
| ListIterator job_iterator; |
| struct job_record *job_ptr = NULL; |
| |
| job_queue = list_create(NULL); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| xassert (job_ptr->magic == JOB_MAGIC); |
| if (job_ptr->user_id != user_id) |
| continue; |
| if (job_name && job_ptr->name && |
| strcmp(job_name, job_ptr->name)) |
| continue; |
| list_append(job_queue, job_ptr); |
| } |
| list_iterator_destroy(job_iterator); |
| |
| return job_queue; |
| } |
| |
| static void _job_queue_append(List job_queue, struct job_record *job_ptr, |
| struct part_record *part_ptr, uint32_t prio) |
| { |
| job_queue_rec_t *job_queue_rec; |
| |
| job_queue_rec = xmalloc(sizeof(job_queue_rec_t)); |
| job_queue_rec->job_id = job_ptr->job_id; |
| job_queue_rec->job_ptr = job_ptr; |
| job_queue_rec->part_ptr = part_ptr; |
| job_queue_rec->priority = prio; |
| list_append(job_queue, job_queue_rec); |
| } |
| |
/* List destructor callback: free one job_queue_rec_t queue element */
static void _job_queue_rec_del(void *x)
{
	xfree(x);
}
| |
/* Job test for ability to run now, excludes partition specific tests.
 * IN job_ptr - job to test
 * IN clear_start - if true, zero the job's start_time
 * RET true if the job could run now (pending, independent, not held,
 *	not still cleaning up from a previous run) */
static bool _job_runnable_test1(struct job_record *job_ptr, bool clear_start)
{
	bool job_indepen = false;
	uint16_t cleaning = 0;

	xassert(job_ptr->magic == JOB_MAGIC);
	/* Only pending jobs which are done completing are candidates */
	if (!IS_JOB_PENDING(job_ptr) || IS_JOB_COMPLETING(job_ptr))
		return false;

	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_CLEANING,
				    &cleaning);
	if (cleaning) {
		/* Job's been requeued and the
		 * previous run hasn't finished yet */
		job_ptr->state_reason = WAIT_CLEANING;
		xfree(job_ptr->state_desc);
		debug3("sched: JobId=%u. State=PENDING. "
		       "Reason=Cleaning.",
		       job_ptr->job_id);
		return false;
	}

#ifdef HAVE_FRONT_END
	/* At least one front-end node up at this point, so a job waiting
	 * on a front end can be reconsidered */
	if (job_ptr->state_reason == WAIT_FRONT_END) {
		job_ptr->state_reason = WAIT_NO_REASON;
		xfree(job_ptr->state_desc);
		last_job_update = time(NULL);
	}
#endif

	job_indepen = job_independent(job_ptr, 0);
	if (clear_start)
		job_ptr->start_time = (time_t) 0;
	if (job_ptr->priority == 0)	{ /* held */
		/* Record the hold reason unless one is already set */
		if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS
		    && (job_ptr->state_reason != WAIT_HELD)
		    && (job_ptr->state_reason != WAIT_HELD_USER)) {
			job_ptr->state_reason = WAIT_HELD;
			xfree(job_ptr->state_desc);
			last_job_update = time(NULL);
		}
		debug3("sched: JobId=%u. State=%s. Reason=%s. Priority=%u.",
		       job_ptr->job_id,
		       job_state_string(job_ptr->job_state),
		       job_reason_string(job_ptr->state_reason),
		       job_ptr->priority);
		return false;
	}

	if (!job_indepen &&
	    ((job_ptr->state_reason == WAIT_HELD) ||
	     (job_ptr->state_reason == WAIT_HELD_USER))) {
		/* released behind active dependency? */
		job_ptr->state_reason = WAIT_DEPENDENCY;
		xfree(job_ptr->state_desc);
	}

	if (!job_indepen)	/* can not run now */
		return false;
	return true;
}
| |
| /* |
| * Job and partition tests for ability to run now |
| * IN job_ptr - job to test |
| * IN check_min_time - If set, test job's minimum time limit |
| * otherwise test maximum time limit |
| */ |
| static bool _job_runnable_test2(struct job_record *job_ptr, bool check_min_time) |
| { |
| int reason; |
| |
| reason = job_limits_check(&job_ptr, check_min_time); |
| if ((reason != job_ptr->state_reason) && |
| ((reason != WAIT_NO_REASON) || |
| (!part_policy_job_runnable_state(job_ptr)))) { |
| job_ptr->state_reason = reason; |
| xfree(job_ptr->state_desc); |
| } |
| if (reason != WAIT_NO_REASON) |
| return false; |
| return true; |
| } |
| |
/*
 * build_job_queue - build (non-priority ordered) list of pending jobs
 * IN clear_start - if set then clear the start_time for pending jobs
 * IN backfill - true if running backfill scheduler, enforce min time limit
 * RET the job queue
 * NOTE: A job submitted to multiple partitions yields one queue record
 *	per job:partition pair.
 * NOTE: the caller must call list_destroy() on RET value to free memory
 */
extern List build_job_queue(bool clear_start, bool backfill)
{
	List job_queue;
	ListIterator job_iterator, part_iterator;
	struct job_record *job_ptr = NULL;
	struct part_record *part_ptr;
	int reason;

	job_queue = list_create(_job_queue_rec_del);
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		job_ptr->preempt_in_progress = false;	/* initialize */
		if (!_job_runnable_test1(job_ptr, clear_start))
			continue;

		if (job_ptr->part_ptr_list) {
			/* Multi-partition job: test limits against each
			 * partition and enqueue one record per partition
			 * that passes */
			int inx = -1;
			part_iterator = list_iterator_create(
						job_ptr->part_ptr_list);
			while ((part_ptr = (struct part_record *)
					list_next(part_iterator))) {
				job_ptr->part_ptr = part_ptr;
				reason = job_limits_check(&job_ptr, backfill);
				if ((reason != WAIT_NO_REASON) &&
				    (reason != job_ptr->state_reason) &&
				    (!part_policy_job_runnable_state(job_ptr))){
					job_ptr->state_reason = reason;
					xfree(job_ptr->state_desc);
				}
				/* priority_array index matches part_ptr_list
				 * position: increment inx */
				inx++;
				if (reason != WAIT_NO_REASON)
					continue;
				if (job_ptr->priority_array) {
					_job_queue_append(job_queue, job_ptr,
							  part_ptr,
							  job_ptr->
							  priority_array[inx]);
				} else {
					_job_queue_append(job_queue, job_ptr,
							  part_ptr,
							  job_ptr->priority);
				}
			}
			list_iterator_destroy(part_iterator);
		} else {
			if (job_ptr->part_ptr == NULL) {
				/* Repair a missing partition pointer from
				 * the job's partition name, if possible */
				part_ptr = find_part_record(job_ptr->partition);
				if (part_ptr == NULL) {
					error("Could not find partition %s "
					      "for job %u", job_ptr->partition,
					      job_ptr->job_id);
					continue;
				}
				job_ptr->part_ptr = part_ptr;
				error("partition pointer reset for job %u, "
				      "part %s", job_ptr->job_id,
				      job_ptr->partition);
			}
			if (!_job_runnable_test2(job_ptr, backfill))
				continue;
			_job_queue_append(job_queue, job_ptr,
					  job_ptr->part_ptr, job_ptr->priority);
		}
	}
	list_iterator_destroy(job_iterator);

	return job_queue;
}
| |
| /* |
| * job_is_completing - Determine if jobs are in the process of completing. |
| * RET - True of any job is in the process of completing AND |
| * CompleteWait is configured non-zero |
| * NOTE: This function can reduce resource fragmentation, which is a |
| * critical issue on Elan interconnect based systems. |
| */ |
| extern bool job_is_completing(void) |
| { |
| bool completing = false; |
| ListIterator job_iterator; |
| struct job_record *job_ptr = NULL; |
| uint16_t complete_wait = slurm_get_complete_wait(); |
| time_t recent; |
| |
| if ((job_list == NULL) || (complete_wait == 0)) |
| return completing; |
| |
| recent = time(NULL) - complete_wait; |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (IS_JOB_COMPLETING(job_ptr) && |
| (job_ptr->end_time >= recent)) { |
| completing = true; |
| break; |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| |
| return completing; |
| } |
| |
/*
 * set_job_elig_time - set the eligible time for pending jobs once their
 *	dependencies are lifted (in job->details->begin_time)
 * Jobs which can never run in their partition (time limit or node count
 * out of range) are skipped without modification.
 */
extern void set_job_elig_time(void)
{
	struct job_record *job_ptr = NULL;
	struct part_record *part_ptr = NULL;
	ListIterator job_iterator;
	slurmctld_lock_t job_write_lock =
		{ READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
#ifdef HAVE_BG
	/* On BlueGene node counts are derived from CPU counts */
	static uint16_t cpus_per_node = 0;
	if (!cpus_per_node)
		select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT,
					&cpus_per_node);
#endif

	lock_slurmctld(job_write_lock);
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		uint32_t job_min_nodes, job_max_nodes;
		uint32_t part_min_nodes, part_max_nodes;
		part_ptr = job_ptr->part_ptr;
		if (!IS_JOB_PENDING(job_ptr))
			continue;
		if (part_ptr == NULL)
			continue;
		/* Skip jobs whose eligible time is already established */
		if ((job_ptr->details == NULL) || job_ptr->details->begin_time)
			continue;
		if ((part_ptr->state_up & PARTITION_SCHED) == 0)
			continue;
		if ((job_ptr->time_limit != NO_VAL) &&
		    (job_ptr->time_limit > part_ptr->max_time))
			continue;
#ifdef HAVE_BG
		job_min_nodes = job_ptr->details->min_cpus / cpus_per_node;
		job_max_nodes = job_ptr->details->max_cpus / cpus_per_node;
		part_min_nodes = part_ptr->min_nodes_orig;
		part_max_nodes = part_ptr->max_nodes_orig;
#else
		job_min_nodes = job_ptr->details->min_nodes;
		job_max_nodes = job_ptr->details->max_nodes;
		part_min_nodes = part_ptr->min_nodes;
		part_max_nodes = part_ptr->max_nodes;
#endif
		/* Skip jobs whose size can never fit this partition */
		if ((job_max_nodes != 0) &&
		    ((job_max_nodes < part_min_nodes) ||
		     (job_min_nodes > part_max_nodes)))
			continue;
		/* Job's eligible time is set in job_independent() */
		if (!job_independent(job_ptr, 0))
			continue;
	}
	list_iterator_destroy(job_iterator);
	unlock_slurmctld(job_write_lock);
}
| |
| /* Test of part_ptr can still run jobs or if its nodes have |
| * already been reserved by higher priority jobs (those in |
| * the failed_parts array) */ |
| static bool _failed_partition(struct part_record *part_ptr, |
| struct part_record **failed_parts, |
| int failed_part_cnt) |
| { |
| int i; |
| |
| for (i = 0; i < failed_part_cnt; i++) { |
| if (failed_parts[i] == part_ptr) |
| return true; |
| } |
| return false; |
| } |
| |
| static void _do_diag_stats(long delta_t) |
| { |
| if (delta_t > slurmctld_diag_stats.schedule_cycle_max) |
| slurmctld_diag_stats.schedule_cycle_max = delta_t; |
| |
| slurmctld_diag_stats.schedule_cycle_sum += delta_t; |
| slurmctld_diag_stats.schedule_cycle_last = delta_t; |
| slurmctld_diag_stats.schedule_cycle_counter++; |
| } |
| |
| |
/*
 * Given that one batch job just completed, attempt to launch a suitable
 * replacement batch job in a response message as a REQUEST_BATCH_JOB_LAUNCH
 * message type, alternately send a return code of SLURM_SUCCESS
 * msg IN - The original message from slurmd
 * fini_job_ptr IN - Pointer to job that just completed and needs replacement
 * RET true if there are pending jobs that might use the resources
 * NOTE: This fast-path only operates when select/serial is configured.
 */
extern bool replace_batch_job(slurm_msg_t * msg, void *fini_job)
{
	/* Cached test for select/serial plugin; -1 means not yet tested */
	static int select_serial = -1;
	/* Locks: Read config, write job, write node, read partition */
	slurmctld_lock_t job_write_lock =
	    { READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
	struct job_record *job_ptr = NULL;
	struct job_record *fini_job_ptr = (struct job_record *) fini_job;
	struct part_record *part_ptr;
	ListIterator job_iterator = NULL, part_iterator = NULL;
	batch_job_launch_msg_t *launch_msg = NULL;
	bitstr_t *orig_exc_bitmap = NULL;
	bool have_node_bitmaps, pending_jobs = false;
	time_t now, min_age;
	int error_code;

	if (select_serial == -1) {
		if (strcmp(slurmctld_conf.select_type, "select/serial"))
			select_serial = 0;
		else
			select_serial = 1;
	}
	if ((select_serial != 1) || (fini_job_ptr == NULL) ||
	    (msg->msg_type != REQUEST_COMPLETE_BATCH_JOB))
		goto send_reply;

	now = time(NULL);
	min_age = now - slurmctld_conf.min_job_age;
	lock_slurmctld(job_write_lock);
	if (!fini_job_ptr->job_resrcs ||
	    !fini_job_ptr->job_resrcs->node_bitmap) {
		/* This should never happen, but if it does, avoid using
		 * a bad pointer below. */
		error("job_resrcs empty for job %u", fini_job_ptr->job_id);
		unlock_slurmctld(job_write_lock);
		goto send_reply;
	}
	job_iterator = list_iterator_create(job_list);
	while (1) {
		/* Resume testing the same job in its next partition, if a
		 * partition iterator is still active from the prior pass */
		if (job_ptr && part_iterator)
			goto next_part;

		job_ptr = (struct job_record *) list_next(job_iterator);
		if (!job_ptr)
			break;

		if ((job_ptr == fini_job_ptr) ||
		    (job_ptr->priority == 0)  ||
		    (job_ptr->details == NULL) ||
		    !avail_front_end(job_ptr))
			continue;

		if (!IS_JOB_PENDING(job_ptr)) {
			/* Purge old finished jobs while we scan the list */
			if (IS_JOB_FINISHED(job_ptr)   &&
			    (job_ptr != fini_job_ptr) &&
			    (job_ptr->end_time <= min_age)) {
				/* If we don't have a db_index by now and we
				 * are running with the slurmdbd lets put it on
				 * the list to be handled later when it comes
				 * back up since we won't get another chance */
				if (with_slurmdbd && !job_ptr->db_index) {
					jobacct_storage_g_job_start(acct_db_conn,
								    job_ptr);
				}
				list_delete_item(job_iterator);
			}
			continue;
		}

		/* Tests dependencies, begin time and reservations */
		if (!job_independent(job_ptr, 0))
			continue;

		if (job_ptr->part_ptr_list) {
			part_iterator = list_iterator_create(job_ptr->
							     part_ptr_list);
next_part:		part_ptr = (struct part_record *)
				   list_next(part_iterator);
			if (part_ptr) {
				job_ptr->part_ptr = part_ptr;
			} else {
				list_iterator_destroy(part_iterator);
				part_iterator = NULL;
				continue;
			}
		}
		if (job_limits_check(&job_ptr, false) != WAIT_NO_REASON)
			continue;

		/* Test for valid account, QOS and required nodes on each pass */
		if (job_ptr->state_reason == FAIL_ACCOUNT) {
			slurmdb_association_rec_t assoc_rec;
			memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t));
			assoc_rec.acct      = job_ptr->account;
			if (job_ptr->part_ptr)
				assoc_rec.partition = job_ptr->part_ptr->name;
			assoc_rec.uid       = job_ptr->user_id;

			/* Account lookup succeeded: clear the failure */
			if (!assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
						    accounting_enforce,
						    (slurmdb_association_rec_t **)
						     &job_ptr->assoc_ptr,
						    false)) {
				job_ptr->state_reason = WAIT_NO_REASON;
				xfree(job_ptr->state_desc);
				job_ptr->assoc_id = assoc_rec.id;
				last_job_update = now;
			} else {
				continue;
			}
		}
		if (job_ptr->qos_id) {
			slurmdb_association_rec_t *assoc_ptr =
				(slurmdb_association_rec_t *)job_ptr->assoc_ptr;
			if (assoc_ptr &&
			    !bit_test(assoc_ptr->usage->valid_qos,
				      job_ptr->qos_id) &&
			    !job_ptr->limit_set_qos) {
				info("sched: JobId=%u has invalid QOS",
				     job_ptr->job_id);
				xfree(job_ptr->state_desc);
				job_ptr->state_reason = FAIL_QOS;
				last_job_update = now;
				continue;
			} else if (job_ptr->state_reason == FAIL_QOS) {
				xfree(job_ptr->state_desc);
				job_ptr->state_reason = WAIT_NO_REASON;
				last_job_update = now;
			}
		}

		/* Clear stale QOS-limit wait reasons; they are re-tested
		 * by the allocation attempt below */
		if ((job_ptr->state_reason == WAIT_QOS_JOB_LIMIT) ||
		    (job_ptr->state_reason == WAIT_QOS_RESOURCE_LIMIT) ||
		    (job_ptr->state_reason == WAIT_QOS_TIME_LIMIT)) {
			job_ptr->state_reason = WAIT_NO_REASON;
			xfree(job_ptr->state_desc);
			last_job_update = now;
		}

		if ((job_ptr->state_reason == WAIT_NODE_NOT_AVAIL) &&
		    job_ptr->details->req_node_bitmap &&
		    !bit_super_set(job_ptr->details->req_node_bitmap,
				   avail_node_bitmap)) {
			continue;
		}

		if (bit_overlap(avail_node_bitmap,
				job_ptr->part_ptr->node_bitmap) == 0) {
			/* This node DRAIN or DOWN */
			continue;
		}

		if (license_job_test(job_ptr, now) != SLURM_SUCCESS) {
			job_ptr->state_reason = WAIT_LICENSES;
			xfree(job_ptr->state_desc);
			last_job_update = now;
			continue;
		}

		if (assoc_mgr_validate_assoc_id(acct_db_conn,
						job_ptr->assoc_id,
						accounting_enforce)) {
			/* NOTE: This only happens if a user's account is
			 * disabled between when the job was submitted and
			 * the time we consider running it. It should be
			 * very rare. */
			info("sched: JobId=%u has invalid account",
			     job_ptr->job_id);
			last_job_update = now;
			job_ptr->state_reason = FAIL_ACCOUNT;
			xfree(job_ptr->state_desc);
			continue;
		}

		if (job_ptr->details->exc_node_bitmap)
			have_node_bitmaps = true;
		else
			have_node_bitmaps = false;
		/* Skip jobs which exclude nodes used by the finished job */
		if (have_node_bitmaps &&
		    (bit_overlap(job_ptr->details->exc_node_bitmap,
				 fini_job_ptr->job_resrcs->node_bitmap) != 0))
			continue;

		if (!job_ptr->batch_flag) {  /* Can't pull interactive jobs */
			pending_jobs = true;
			break;
		}

		/* Temporarily restrict the job to the finished job's nodes
		 * by excluding everything else, attempt the allocation,
		 * then restore the original exclusion bitmap */
		if (have_node_bitmaps)
			orig_exc_bitmap = job_ptr->details->exc_node_bitmap;
		else
			orig_exc_bitmap = NULL;
		job_ptr->details->exc_node_bitmap =
			bit_copy(fini_job_ptr->job_resrcs->node_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
		error_code = select_nodes(job_ptr, false, NULL);
		bit_free(job_ptr->details->exc_node_bitmap);
		job_ptr->details->exc_node_bitmap = orig_exc_bitmap;
		if (error_code == SLURM_SUCCESS) {
			last_job_update = now;
			info("sched: Allocate JobId=%u NodeList=%s #CPUs=%u",
			     job_ptr->job_id, job_ptr->nodes,
			     job_ptr->total_cpus);
			if (job_ptr->details->prolog_running == 0) {
				launch_msg = build_launch_job_msg(job_ptr,
							msg->protocol_version);
			}
		}
		break;
	}
	unlock_slurmctld(job_write_lock);
	if (job_iterator)
		list_iterator_destroy(job_iterator);
	if (part_iterator)
		list_iterator_destroy(part_iterator);

send_reply:
	if (launch_msg) {
		/* Reply with the replacement job's launch request */
		slurm_msg_t response_msg;
		slurm_msg_t_init(&response_msg);
		response_msg.flags = msg->flags;
		response_msg.protocol_version = msg->protocol_version;
		response_msg.address = msg->address;
		response_msg.msg_type = REQUEST_BATCH_JOB_LAUNCH;
		response_msg.data = launch_msg;
		slurm_send_node_msg(msg->conn_fd, &response_msg);
		slurmctld_free_batch_job_launch_msg(launch_msg);
		return false;
	}
	slurm_send_rc_msg(msg, SLURM_SUCCESS);
	return pending_jobs;
}
| |
| /* Return true of all partitions have the same priority, otherwise false. */ |
| static bool _all_partition_priorities_same(void) |
| { |
| struct part_record *part_ptr; |
| ListIterator iter; |
| bool part_priority_set = false; |
| uint32_t part_priority = 0; |
| bool result = true; |
| |
| iter = list_iterator_create(part_list); |
| while ((part_ptr = (struct part_record *) list_next(iter))) { |
| if (!part_priority_set) { |
| part_priority = part_ptr->priority; |
| part_priority_set = true; |
| } else if (part_priority != part_ptr->priority) { |
| result = false; |
| break; |
| } |
| } |
| list_iterator_destroy(iter); |
| |
| return result; |
| } |
| |
| /* |
| * schedule - attempt to schedule all pending jobs |
| * pending jobs for each partition will be scheduled in priority |
| * order until a request fails |
| * IN job_limit - maximum number of jobs to test now, avoid testing the full |
| * queue on every job submit (0 means to use the system default, |
| * SchedulerParameters for default_queue_depth) |
| * RET count of jobs scheduled |
| * Note: We re-build the queue every time. Jobs can not only be added |
| * or removed from the queue, but have their priority or partition |
| * changed with the update_job RPC. In general nodes will be in priority |
| * order (by submit time), so the sorting should be pretty fast. |
| */ |
| extern int schedule(uint32_t job_limit) |
| { |
| ListIterator job_iterator = NULL, part_iterator = NULL; |
| List job_queue = NULL; |
| int failed_part_cnt = 0, failed_resv_cnt = 0, job_cnt = 0; |
| int error_code, i, j, part_cnt, time_limit; |
| uint32_t job_depth = 0; |
| job_queue_rec_t *job_queue_rec; |
| struct job_record *job_ptr = NULL; |
| struct part_record *part_ptr, **failed_parts = NULL; |
| struct part_record *skip_part_ptr = NULL; |
| struct slurmctld_resv **failed_resv = NULL; |
| bitstr_t *save_avail_node_bitmap; |
| struct part_record **sched_part_ptr = NULL; |
| int *sched_part_jobs = NULL; |
| /* Locks: Read config, write job, write node, read partition */ |
| slurmctld_lock_t job_write_lock = |
| { READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; |
| #ifdef HAVE_BG |
| char *ionodes = NULL; |
| char tmp_char[256]; |
| static bool backfill_sched = false; |
| #endif |
| static time_t sched_update = 0; |
| static bool wiki_sched = false; |
| static bool fifo_sched = false; |
| static int sched_timeout = 0; |
| static int def_job_limit = 100; |
| static int max_jobs_per_part = 0; |
| static int defer_rpc_cnt = 0; |
| time_t now, sched_start; |
| uint32_t reject_array_job_id = 0; |
| struct part_record *reject_array_part = NULL; |
| uint16_t reject_state_reason = WAIT_NO_REASON; |
| DEF_TIMERS; |
| |
| if (slurmctld_config.shutdown_time) |
| return 0; |
| |
| if (sched_update != slurmctld_conf.last_update) { |
| char *sched_params, *tmp_ptr; |
| char *sched_type = slurm_get_sched_type(); |
| char *prio_type = slurm_get_priority_type(); |
| #ifdef HAVE_BG |
| /* On BlueGene, do FIFO only with sched/backfill */ |
| if (strcmp(sched_type, "sched/backfill") == 0) |
| backfill_sched = true; |
| #endif |
| if ((strcmp(sched_type, "sched/builtin") == 0) && |
| (strcmp(prio_type, "priority/basic") == 0) && |
| _all_partition_priorities_same()) |
| fifo_sched = true; |
| else |
| fifo_sched = false; |
| /* Disable avoiding of fragmentation with sched/wiki */ |
| if ((strcmp(sched_type, "sched/wiki") == 0) || |
| (strcmp(sched_type, "sched/wiki2") == 0)) |
| wiki_sched = true; |
| xfree(sched_type); |
| xfree(prio_type); |
| |
| sched_params = slurm_get_sched_params(); |
| |
| |
| if (sched_params && |
| (tmp_ptr=strstr(sched_params, "batch_sched_delay="))) |
| /* 012345678901234567 */ |
| batch_sched_delay = atoi(tmp_ptr + 18); |
| if (batch_sched_delay < 0) { |
| error("Invalid batch_sched_delay: %d", |
| batch_sched_delay); |
| batch_sched_delay = 3; |
| } |
| |
| if (sched_params && |
| (tmp_ptr = strstr(sched_params, "default_queue_depth="))) { |
| /* 01234567890123456789 */ |
| i = atoi(tmp_ptr + 20); |
| if (i < 0) { |
| error("ignoring SchedulerParameters: " |
| "default_queue_depth value of %d", i); |
| } else { |
| def_job_limit = i; |
| } |
| } |
| |
| if (sched_params && |
| (tmp_ptr=strstr(sched_params, "partition_job_depth="))) { |
| /* 01234567890123456789 */ |
| i = atoi(tmp_ptr + 20); |
| if (i < 0) { |
| error("ignoring SchedulerParameters: " |
| "partition_job_depth value of %d", i); |
| } else { |
| max_jobs_per_part = i; |
| } |
| } |
| if (sched_params && |
| (tmp_ptr=strstr(sched_params, "max_rpc_cnt="))) |
| defer_rpc_cnt = atoi(tmp_ptr + 12); |
| if (defer_rpc_cnt < 0) { |
| error("Invalid max_rpc_cnt: %d", defer_rpc_cnt); |
| defer_rpc_cnt = 0; |
| } |
| |
| time_limit = slurm_get_msg_timeout() / 2; |
| if (sched_params && |
| (tmp_ptr=strstr(sched_params, "max_sched_time="))) { |
| sched_timeout = atoi(tmp_ptr + 15); |
| if ((sched_timeout <= 0) || |
| (sched_timeout > time_limit)) { |
| error("Invalid max_sched_time: %d", |
| sched_timeout); |
| sched_timeout = 0; |
| } |
| } |
| if (sched_timeout == 0) { |
| sched_timeout = MAX(time_limit, 1); |
| sched_timeout = MIN(sched_timeout, 4); |
| } |
| |
| if (sched_params && |
| (tmp_ptr=strstr(sched_params, "sched_interval="))) |
| sched_interval = atoi(tmp_ptr + 15); |
| if (sched_interval < 0) { |
| error("Invalid sched_interval: %d", sched_interval); |
| sched_interval = 60; |
| } |
| |
| xfree(sched_params); |
| sched_update = slurmctld_conf.last_update; |
| info("SchedulerParameters=default_queue_depth=%d," |
| "max_rpc_cnt=%d,max_sched_time=%d,partition_job_depth=%d", |
| def_job_limit, defer_rpc_cnt, sched_timeout, |
| max_jobs_per_part); |
| } |
| |
| if ((defer_rpc_cnt > 0) && |
| (slurmctld_config.server_thread_count >= defer_rpc_cnt)) { |
| debug("sched: schedule() returning, too many RPCs"); |
| return 0; |
| } |
| |
| if (job_limit == 0) |
| job_limit = def_job_limit; |
| |
| lock_slurmctld(job_write_lock); |
| now = time(NULL); |
| sched_start = now; |
| START_TIMER; |
| if (!avail_front_end(NULL)) { |
| ListIterator job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) |
| list_next(job_iterator))) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| continue; |
| if ((job_ptr->state_reason != WAIT_NO_REASON) && |
| (job_ptr->state_reason != WAIT_RESOURCES) && |
| (job_ptr->state_reason != WAIT_NODE_NOT_AVAIL)) |
| continue; |
| job_ptr->state_reason = WAIT_FRONT_END; |
| } |
| list_iterator_destroy(job_iterator); |
| |
| unlock_slurmctld(job_write_lock); |
| debug("sched: schedule() returning, no front end nodes are " |
| "available"); |
| return 0; |
| } |
| /* Avoid resource fragmentation if important */ |
| if ((!wiki_sched) && job_is_completing()) { |
| unlock_slurmctld(job_write_lock); |
| debug("sched: schedule() returning, some job is still " |
| "completing"); |
| return 0; |
| } |
| |
| #ifdef HAVE_ALPS_CRAY |
| /* |
| * Run a Basil Inventory immediately before scheduling, to avoid |
| * race conditions caused by ALPS node state change (caused e.g. |
| * by the node health checker). |
| * This relies on the above write lock for the node state. |
| */ |
| if (select_g_reconfigure()) { |
| unlock_slurmctld(job_write_lock); |
| debug4("sched: not scheduling due to ALPS"); |
| return SLURM_SUCCESS; |
| } |
| #endif |
| |
| part_cnt = list_count(part_list); |
| failed_parts = xmalloc(sizeof(struct part_record *) * part_cnt); |
| failed_resv = xmalloc(sizeof(struct slurmctld_resv*) * MAX_FAILED_RESV); |
| save_avail_node_bitmap = bit_copy(avail_node_bitmap); |
| |
| if (max_jobs_per_part) { |
| ListIterator part_iterator; |
| sched_part_ptr = xmalloc(sizeof(struct part_record *) * |
| part_cnt); |
| sched_part_jobs = xmalloc(sizeof(int) * part_cnt); |
| part_iterator = list_iterator_create(part_list); |
| i = 0; |
| while ((part_ptr = (struct part_record *) |
| list_next(part_iterator))) { |
| sched_part_ptr[i++] = part_ptr; |
| } |
| list_iterator_destroy(part_iterator); |
| } |
| |
| debug("sched: Running job scheduler"); |
| /* |
| * If we are doing FIFO scheduling, use the job records right off the |
| * job list. |
| * |
| * If a job is submitted to multiple partitions then build_job_queue() |
| * will return a separate record for each job:partition pair. |
| * |
| * In both cases, we test each partition associated with the job. |
| */ |
| if (fifo_sched) { |
| slurmctld_diag_stats.schedule_queue_len = list_count(job_list); |
| job_iterator = list_iterator_create(job_list); |
| } else { |
| job_queue = build_job_queue(false, false); |
| slurmctld_diag_stats.schedule_queue_len = list_count(job_queue); |
| sort_job_queue(job_queue); |
| } |
| while (1) { |
| if (fifo_sched) { |
| if (job_ptr && part_iterator && |
| IS_JOB_PENDING(job_ptr)) /* test job in next part */ |
| goto next_part; |
| job_ptr = (struct job_record *) list_next(job_iterator); |
| if (!job_ptr) |
| break; |
| if (!avail_front_end(job_ptr)) { |
| job_ptr->state_reason = WAIT_FRONT_END; |
| xfree(job_ptr->state_desc); |
| last_job_update = now; |
| continue; |
| } |
| if (!_job_runnable_test1(job_ptr, false)) |
| continue; |
| if (job_ptr->part_ptr_list) { |
| part_iterator = list_iterator_create( |
| job_ptr->part_ptr_list); |
| next_part: part_ptr = (struct part_record *) |
| list_next(part_iterator); |
| if (part_ptr) { |
| job_ptr->part_ptr = part_ptr; |
| if (job_limits_check(&job_ptr, false) != |
| WAIT_NO_REASON) |
| continue; |
| } else { |
| list_iterator_destroy(part_iterator); |
| part_iterator = NULL; |
| continue; |
| } |
| } else { |
| if (!_job_runnable_test2(job_ptr, false)) |
| continue; |
| } |
| } else { |
| job_queue_rec = list_pop(job_queue); |
| if (!job_queue_rec) |
| break; |
| job_ptr = job_queue_rec->job_ptr; |
| part_ptr = job_queue_rec->part_ptr; |
| xfree(job_queue_rec); |
| if (!avail_front_end(job_ptr)) { |
| job_ptr->state_reason = WAIT_FRONT_END; |
| xfree(job_ptr->state_desc); |
| last_job_update = now; |
| continue; |
| } |
| if (!IS_JOB_PENDING(job_ptr)) |
| continue; /* started in another partition */ |
| job_ptr->part_ptr = part_ptr; |
| } |
| if (job_ptr->preempt_in_progress) |
| continue; /* scheduled in another partition */ |
| if ((time(NULL) - sched_start) >= sched_timeout) { |
| debug("sched: loop taking too long, breaking out"); |
| break; |
| } |
| |
| if (job_ptr->array_task_id != NO_VAL) { |
| if ((reject_array_job_id == job_ptr->array_job_id) && |
| (reject_array_part == job_ptr->part_ptr)) { |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = reject_state_reason; |
| continue; /* already rejected array element */ |
| } |
| |
| /* assume reject whole array for now, clear if OK */ |
| reject_array_job_id = job_ptr->array_job_id; |
| reject_array_part = job_ptr->part_ptr; |
| } |
| if (max_jobs_per_part) { |
| bool skip_job = false; |
| for (j = 0; j < part_cnt; j++) { |
| if (sched_part_ptr[j] != job_ptr->part_ptr) |
| continue; |
| if (sched_part_jobs[j]++ >= |
| max_jobs_per_part) |
| skip_job = true; |
| break; |
| } |
| if (skip_job) { |
| if (job_ptr->part_ptr == skip_part_ptr) |
| continue; |
| debug2("sched: reached partition %s job limit", |
| job_ptr->part_ptr->name); |
| if (job_ptr->state_reason == WAIT_NO_REASON) { |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = WAIT_PRIORITY; |
| } |
| skip_part_ptr = job_ptr->part_ptr; |
| continue; |
| } |
| } |
| if (job_depth++ > job_limit) { |
| debug("sched: already tested %u jobs, breaking out", |
| job_depth); |
| break; |
| } |
| if ((defer_rpc_cnt > 0) && |
| (slurmctld_config.server_thread_count >= defer_rpc_cnt)) { |
| debug("sched: schedule() returning, too many RPCs"); |
| break; |
| } |
| |
| slurmctld_diag_stats.schedule_cycle_depth++; |
| |
| if (job_ptr->resv_name) { |
| bool found_resv = false; |
| for (i = 0; i < failed_resv_cnt; i++) { |
| if (failed_resv[i] == job_ptr->resv_ptr) { |
| found_resv = true; |
| break; |
| } |
| } |
| if (found_resv) { |
| if (job_ptr->state_reason == WAIT_NO_REASON) { |
| job_ptr->state_reason = WAIT_PRIORITY; |
| xfree(job_ptr->state_desc); |
| } |
| debug3("sched: JobId=%u. State=PENDING. " |
| "Reason=%s(Priority). Priority=%u, " |
| "Resv=%s.", |
| job_ptr->job_id, |
| job_reason_string(job_ptr->state_reason), |
| job_ptr->priority, job_ptr->resv_name); |
| continue; |
| } |
| } else if (_failed_partition(job_ptr->part_ptr, failed_parts, |
| failed_part_cnt)) { |
| if ((job_ptr->state_reason == WAIT_NODE_NOT_AVAIL) || |
| (job_ptr->state_reason == WAIT_NO_REASON)) { |
| job_ptr->state_reason = WAIT_PRIORITY; |
| xfree(job_ptr->state_desc); |
| last_job_update = now; |
| } |
| debug("sched: JobId=%u. State=PENDING. " |
| "Reason=%s(Priority), Priority=%u, " |
| "Partition=%s.", |
| job_ptr->job_id, |
| job_reason_string(job_ptr->state_reason), |
| job_ptr->priority, job_ptr->partition); |
| continue; |
| } |
| |
| /* Test for valid account, QOS and required nodes on each pass */ |
| if (job_ptr->state_reason == FAIL_ACCOUNT) { |
| slurmdb_association_rec_t assoc_rec; |
| memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t)); |
| assoc_rec.acct = job_ptr->account; |
| if (job_ptr->part_ptr) |
| assoc_rec.partition = job_ptr->part_ptr->name; |
| assoc_rec.uid = job_ptr->user_id; |
| |
| if (!assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| (slurmdb_association_rec_t **) |
| &job_ptr->assoc_ptr, |
| false)) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| job_ptr->assoc_id = assoc_rec.id; |
| last_job_update = now; |
| } else { |
| debug("sched: JobId=%u has invalid association", |
| job_ptr->job_id); |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = |
| WAIT_ASSOC_RESOURCE_LIMIT; |
| continue; |
| } |
| } |
| if (job_ptr->qos_id) { |
| slurmdb_association_rec_t *assoc_ptr; |
| assoc_ptr = (slurmdb_association_rec_t *)job_ptr->assoc_ptr; |
| if (assoc_ptr && |
| !bit_test(assoc_ptr->usage->valid_qos, |
| job_ptr->qos_id) && |
| !job_ptr->limit_set_qos) { |
| debug("sched: JobId=%u has invalid QOS", |
| job_ptr->job_id); |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = FAIL_QOS; |
| last_job_update = now; |
| continue; |
| } else if (job_ptr->state_reason == FAIL_QOS) { |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = WAIT_NO_REASON; |
| last_job_update = now; |
| } |
| } |
| |
| if (!acct_policy_job_runnable_state(job_ptr) && |
| !acct_policy_job_runnable_pre_select(job_ptr)) |
| continue; |
| |
| if ((job_ptr->state_reason == WAIT_NODE_NOT_AVAIL) && |
| job_ptr->details && job_ptr->details->req_node_bitmap && |
| !bit_super_set(job_ptr->details->req_node_bitmap, |
| avail_node_bitmap)) { |
| continue; |
| } |
| |
| i = bit_overlap(avail_node_bitmap, |
| job_ptr->part_ptr->node_bitmap); |
| if ((job_ptr->details && |
| (job_ptr->details->min_nodes != NO_VAL) && |
| (job_ptr->details->min_nodes > i)) || |
| (!job_ptr->details && (i == 0))) { |
| /* Too many nodes DRAIN, DOWN, or |
| * reserved for jobs in higher priority partition */ |
| job_ptr->state_reason = WAIT_RESOURCES; |
| xfree(job_ptr->state_desc); |
| last_job_update = now; |
| debug3("sched: JobId=%u. State=%s. Reason=%s. " |
| "Priority=%u. Partition=%s.", |
| job_ptr->job_id, |
| job_state_string(job_ptr->job_state), |
| job_reason_string(job_ptr->state_reason), |
| job_ptr->priority, |
| job_ptr->partition); |
| continue; |
| } |
| if (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS) { |
| job_ptr->state_reason = WAIT_LICENSES; |
| xfree(job_ptr->state_desc); |
| last_job_update = now; |
| debug3("sched: JobId=%u. State=%s. Reason=%s. " |
| "Priority=%u.", |
| job_ptr->job_id, |
| job_state_string(job_ptr->job_state), |
| job_reason_string(job_ptr->state_reason), |
| job_ptr->priority); |
| continue; |
| } |
| |
| if (assoc_mgr_validate_assoc_id(acct_db_conn, |
| job_ptr->assoc_id, |
| accounting_enforce)) { |
| /* NOTE: This only happens if a user's account is |
| * disabled between when the job was submitted and |
| * the time we consider running it. It should be |
| * very rare. */ |
| info("sched: JobId=%u has invalid account", |
| job_ptr->job_id); |
| last_job_update = now; |
| job_ptr->state_reason = FAIL_ACCOUNT; |
| xfree(job_ptr->state_desc); |
| continue; |
| } |
| |
| error_code = select_nodes(job_ptr, false, NULL); |
| if (error_code == ESLURM_NODES_BUSY) { |
| debug3("sched: JobId=%u. State=%s. Reason=%s. " |
| "Priority=%u. Partition=%s.", |
| job_ptr->job_id, |
| job_state_string(job_ptr->job_state), |
| job_reason_string(job_ptr->state_reason), |
| job_ptr->priority, job_ptr->partition); |
| bool fail_by_part = true; |
| #ifdef HAVE_BG |
| /* When we use static or overlap partitioning on |
| * BlueGene, each job can possibly be scheduled |
| * independently, without impacting other jobs of |
| * different sizes. Therefore we sort and try to |
| * schedule every pending job unless the backfill |
| * scheduler is configured. */ |
| if (!backfill_sched) |
| fail_by_part = false; |
| #else |
| if (job_ptr->details && |
| job_ptr->details->req_node_bitmap && |
| (bit_set_count(job_ptr->details-> |
| req_node_bitmap)>= |
| job_ptr->details->min_nodes)) { |
| fail_by_part = false; |
| /* Do not schedule more jobs on nodes required |
| * by this job, but don't block the entire |
| * queue/partition. */ |
| bit_not(job_ptr->details->req_node_bitmap); |
| bit_and(avail_node_bitmap, |
| job_ptr->details->req_node_bitmap); |
| bit_not(job_ptr->details->req_node_bitmap); |
| } |
| #endif |
| |
| if (fail_by_part && job_ptr->resv_name) { |
| /* do not schedule more jobs in this |
| * reservation, but other jobs in this partition |
| * can be scheduled. */ |
| fail_by_part = false; |
| if (failed_resv_cnt < MAX_FAILED_RESV) { |
| failed_resv[failed_resv_cnt++] = |
| job_ptr->resv_ptr; |
| } |
| } |
| |
| if (fail_by_part) { |
| /* do not schedule more jobs in this partition |
| * or on nodes in this partition */ |
| failed_parts[failed_part_cnt++] = |
| job_ptr->part_ptr; |
| bit_not(job_ptr->part_ptr->node_bitmap); |
| bit_and(avail_node_bitmap, |
| job_ptr->part_ptr->node_bitmap); |
| bit_not(job_ptr->part_ptr->node_bitmap); |
| } |
| } else if (error_code == ESLURM_RESERVATION_NOT_USABLE) { |
| if (job_ptr->resv_ptr && |
| job_ptr->resv_ptr->node_bitmap) { |
| debug3("sched: JobId=%u. State=%s. " |
| "Reason=%s. Priority=%u.", |
| job_ptr->job_id, |
| job_state_string(job_ptr->job_state), |
| job_reason_string(job_ptr-> |
| state_reason), |
| job_ptr->priority); |
| bit_not(job_ptr->resv_ptr->node_bitmap); |
| bit_and(avail_node_bitmap, |
| job_ptr->resv_ptr->node_bitmap); |
| bit_not(job_ptr->resv_ptr->node_bitmap); |
| } else { |
| /* The job has no reservation but requires |
| * nodes that are currently in some reservation |
| * so just skip over this job and try running |
| * the next lower priority job */ |
| debug3("sched: JobId=%u State=%s. " |
| "Reason=Required nodes are reserved." |
| "Priority=%u",job_ptr->job_id, |
| job_state_string(job_ptr->job_state), |
| job_ptr->priority); |
| } |
| } else if (error_code == SLURM_SUCCESS) { |
| /* job initiated */ |
| debug3("sched: JobId=%u initiated", job_ptr->job_id); |
| last_job_update = now; |
| #ifdef HAVE_BG |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_IONODES, |
| &ionodes); |
| if (ionodes) { |
| sprintf(tmp_char,"%s[%s]", |
| job_ptr->nodes, ionodes); |
| } else { |
| sprintf(tmp_char,"%s",job_ptr->nodes); |
| } |
| info("sched: Allocate JobId=%u MidplaneList=%s", |
| job_ptr->job_id, tmp_char); |
| xfree(ionodes); |
| #else |
| info("sched: Allocate JobId=%u NodeList=%s #CPUs=%u", |
| job_ptr->job_id, job_ptr->nodes, |
| job_ptr->total_cpus); |
| #endif |
| if (job_ptr->batch_flag == 0) |
| srun_allocate(job_ptr->job_id); |
| else if (job_ptr->details->prolog_running == 0) |
| launch_job(job_ptr); |
| rebuild_job_part_list(job_ptr); |
| job_cnt++; |
| reject_array_job_id = 0; |
| reject_array_part = NULL; |
| } else if ((error_code == |
| ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && |
| job_ptr->part_ptr_list) { |
| debug("JobId=%u non-runnable in partition %s: %s", |
| job_ptr->job_id, job_ptr->part_ptr->name, |
| slurm_strerror(error_code)); |
| } else if ((error_code != |
| ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) && |
| (error_code != ESLURM_NODE_NOT_AVAIL) && |
| (error_code != ESLURM_ACCOUNTING_POLICY)) { |
| info("sched: schedule: JobId=%u non-runnable: %s", |
| job_ptr->job_id, slurm_strerror(error_code)); |
| if (!wiki_sched) { |
| last_job_update = now; |
| job_ptr->job_state = JOB_PENDING; |
| job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = now; |
| job_ptr->priority = 0; |
| } |
| } |
| |
| if ((reject_array_job_id == job_ptr->array_job_id) && |
| (reject_array_part == job_ptr->part_ptr)) { |
| /* All other elements of this job array get the |
| * same reason */ |
| reject_state_reason = job_ptr->state_reason; |
| } |
| } |
| |
| save_last_part_update = last_part_update; |
| FREE_NULL_BITMAP(avail_node_bitmap); |
| avail_node_bitmap = save_avail_node_bitmap; |
| xfree(failed_parts); |
| xfree(failed_resv); |
| if (fifo_sched) { |
| if (job_iterator) |
| list_iterator_destroy(job_iterator); |
| if (part_iterator) |
| list_iterator_destroy(part_iterator); |
| } else if (job_queue) { |
| FREE_NULL_LIST(job_queue); |
| } |
| xfree(sched_part_ptr); |
| xfree(sched_part_jobs); |
| unlock_slurmctld(job_write_lock); |
| END_TIMER2("schedule"); |
| |
| _do_diag_stats(DELTA_TIMER); |
| |
| return job_cnt; |
| } |
| |
| /* |
| * sort_job_queue - sort job_queue in descending priority order |
| * IN/OUT job_queue - sorted job queue |
| */ |
| extern void sort_job_queue(List job_queue) |
| { |
| list_sort(job_queue, sort_job_queue2); |
| } |
| |
| /* Note this differs from the ListCmpF typedef since we want jobs sorted |
| * in order of decreasing priority then by increasing job id */ |
| extern int sort_job_queue2(void *x, void *y) |
| { |
| job_queue_rec_t *job_rec1 = *(job_queue_rec_t **) x; |
| job_queue_rec_t *job_rec2 = *(job_queue_rec_t **) y; |
| bool has_resv1, has_resv2; |
| static time_t config_update = 0; |
| static bool preemption_enabled = true; |
| uint32_t p1, p2; |
| |
| /* The following block of code is designed to minimize run time in |
| * typical configurations for this frequently executed function. */ |
| if (config_update != slurmctld_conf.last_update) { |
| preemption_enabled = slurm_preemption_enabled(); |
| config_update = slurmctld_conf.last_update; |
| } |
| if (preemption_enabled) { |
| if (slurm_job_preempt_check(job_rec1, job_rec2)) |
| return -1; |
| if (slurm_job_preempt_check(job_rec2, job_rec1)) |
| return 1; |
| } |
| |
| has_resv1 = (job_rec1->job_ptr->resv_id != 0); |
| has_resv2 = (job_rec2->job_ptr->resv_id != 0); |
| if (has_resv1 && !has_resv2) |
| return -1; |
| if (!has_resv1 && has_resv2) |
| return 1; |
| |
| if (job_rec1->part_ptr && job_rec2->part_ptr) { |
| p1 = job_rec1->part_ptr->priority; |
| p2 = job_rec2->part_ptr->priority; |
| if (p1 < p2) |
| return 1; |
| if (p1 > p2) |
| return -1; |
| } |
| |
| if (job_rec1->job_ptr->part_ptr_list && |
| job_rec1->job_ptr->priority_array) |
| p1 = job_rec1->priority; |
| else |
| p1 = job_rec1->job_ptr->priority; |
| |
| |
| if (job_rec2->job_ptr->part_ptr_list && |
| job_rec2->job_ptr->priority_array) |
| p2 = job_rec2->priority; |
| else |
| p2 = job_rec2->job_ptr->priority; |
| |
| |
| if (p1 < p2) |
| return 1; |
| if (p1 > p2) |
| return -1; |
| |
| /* If the priorities are the same sort by increasing job id's */ |
| if (job_rec1->job_id > job_rec2->job_id) |
| return 1; |
| |
| return -1; |
| } |
| |
/* Given a scheduled batch job, build and return its batch_job_launch_msg_t
 * data (the RPC payload later sent to slurmd to start the batch script).
 * IN job_ptr - the scheduled job; details and job_resrcs are dereferenced
 * IN protocol_version - slurmd protocol version used when packing the
 *	credential
 * RET the launch message (caller frees with slurm_free_job_launch_msg())
 *	or NULL on fatal error, in which case the job has already been
 *	completed or requeued here */
extern batch_job_launch_msg_t *build_launch_job_msg(struct job_record *job_ptr,
						    uint16_t protocol_version)
{
	batch_job_launch_msg_t *launch_msg_ptr;
	struct passwd pwd, *result;
	char buffer[PW_BUF_SIZE];

	/* Initialization of data structures */
	launch_msg_ptr = (batch_job_launch_msg_t *)
		xmalloc(sizeof(batch_job_launch_msg_t));
	launch_msg_ptr->job_id = job_ptr->job_id;
	launch_msg_ptr->step_id = NO_VAL;	/* batch step, no real step id */
	launch_msg_ptr->array_job_id = job_ptr->array_job_id;
	launch_msg_ptr->array_task_id = job_ptr->array_task_id;
	launch_msg_ptr->uid = job_ptr->user_id;

	/* Resolve the user name via the reentrant getpwuid() variant */
	if (slurm_getpwuid_r(launch_msg_ptr->uid,
			     &pwd,
			     buffer,
			     PW_BUF_SIZE,
			     &result)
	    || !result) {
#ifdef HAVE_NATIVE_CRAY
		/* On a Cray this needs to happen before the launch of
		 * the tasks. So fail if it doesn't work. On a
		 * normal system this isn't a big deal just go on your way.
		 */
		error("uid %ld not found on system, aborting job %u",
		      (long)launch_msg_ptr->uid, job_ptr->job_id);
		slurm_free_job_launch_msg(launch_msg_ptr);
		(void) job_complete(job_ptr->job_id, getuid(), false, true, 0);
		return NULL;
#endif
		/* NOTE(review): on non-Cray builds a failed lookup leaves
		 * user_name NULL and the launch continues regardless */
	} else
		launch_msg_ptr->user_name = xstrdup(result->pw_name);

	/* Copy basic launch parameters from the job record */
	launch_msg_ptr->gid = job_ptr->group_id;
	launch_msg_ptr->ntasks = job_ptr->details->num_tasks;
	launch_msg_ptr->alias_list = xstrdup(job_ptr->alias_list);
	launch_msg_ptr->nodes = xstrdup(job_ptr->nodes);
	launch_msg_ptr->partition = xstrdup(job_ptr->partition);
	launch_msg_ptr->overcommit = job_ptr->details->overcommit;
	launch_msg_ptr->open_mode = job_ptr->details->open_mode;
	launch_msg_ptr->acctg_freq = xstrdup(job_ptr->details->acctg_freq);
	launch_msg_ptr->cpus_per_task = job_ptr->details->cpus_per_task;
	launch_msg_ptr->pn_min_memory = job_ptr->details->pn_min_memory;
	launch_msg_ptr->restart_cnt = job_ptr->restart_cnt;

	if (make_batch_job_cred(launch_msg_ptr, job_ptr, protocol_version)) {
		/* FIXME: This is a kludge, but this event indicates a serious
		 * problem with Munge or OpenSSH and should never happen. We
		 * are too deep into the job launch to gracefully clean up from
		 * from the launch, so requeue if possible. */
		error("Can not create job credential, attempting to requeue "
		      "batch job %u", job_ptr->job_id);
		slurm_free_job_launch_msg(launch_msg_ptr);
		job_ptr->batch_flag = 1;	/* Allow repeated requeue */
		job_ptr->details->begin_time = time(NULL) + 120;
		(void) job_complete(job_ptr->job_id, getuid(), true, false, 0);
		return NULL;
	}

	/* Stdio paths, directories, and script arguments */
	launch_msg_ptr->std_err = xstrdup(job_ptr->details->std_err);
	launch_msg_ptr->std_in = xstrdup(job_ptr->details->std_in);
	launch_msg_ptr->std_out = xstrdup(job_ptr->details->std_out);
	launch_msg_ptr->work_dir = xstrdup(job_ptr->details->work_dir);
	launch_msg_ptr->ckpt_dir = xstrdup(job_ptr->details->ckpt_dir);
	launch_msg_ptr->restart_dir = xstrdup(job_ptr->details->restart_dir);
	launch_msg_ptr->argc = job_ptr->details->argc;
	launch_msg_ptr->argv = xduparray(job_ptr->details->argc,
					 job_ptr->details->argv);
	launch_msg_ptr->spank_job_env_size = job_ptr->spank_job_env_size;
	launch_msg_ptr->spank_job_env = xduparray(job_ptr->spank_job_env_size,
						  job_ptr->spank_job_env);
	/* The batch script and environment are retrieved separately;
	 * either being missing is fatal to the launch */
	if ((launch_msg_ptr->script = get_job_script(job_ptr)) == NULL) {
		error("Batch script is missing, aborting job %u",
		      job_ptr->job_id);
		slurm_free_job_launch_msg(launch_msg_ptr);
		(void) job_complete(job_ptr->job_id, getuid(), false, true, 0);
		return NULL;
	}
	launch_msg_ptr->environment = get_job_env(job_ptr,
						  &launch_msg_ptr->envc);
	if (launch_msg_ptr->environment == NULL) {
		error("%s: environment missing or corrupted aborting job %u",
		      __func__, job_ptr->job_id);
		slurm_free_job_launch_msg(launch_msg_ptr);
		job_complete(job_ptr->job_id, getuid(), false, true, 0);
		return NULL;
	}

	/* Per-node CPU allocation, run-length encoded across cpu_array_cnt
	 * groups (value[i] CPUs repeated reps[i] times) */
	launch_msg_ptr->job_mem = job_ptr->details->pn_min_memory;
	launch_msg_ptr->num_cpu_groups = job_ptr->job_resrcs->cpu_array_cnt;
	launch_msg_ptr->cpus_per_node = xmalloc(sizeof(uint16_t) *
			job_ptr->job_resrcs->cpu_array_cnt);
	memcpy(launch_msg_ptr->cpus_per_node,
	       job_ptr->job_resrcs->cpu_array_value,
	       (sizeof(uint16_t) * job_ptr->job_resrcs->cpu_array_cnt));
	launch_msg_ptr->cpu_count_reps = xmalloc(sizeof(uint32_t) *
			job_ptr->job_resrcs->cpu_array_cnt);
	memcpy(launch_msg_ptr->cpu_count_reps,
	       job_ptr->job_resrcs->cpu_array_reps,
	       (sizeof(uint32_t) * job_ptr->job_resrcs->cpu_array_cnt));

	launch_msg_ptr->select_jobinfo = select_g_select_jobinfo_copy(
			job_ptr->select_jobinfo);

	return launch_msg_ptr;
}
| |
| /* |
| * launch_job - send an RPC to a slurmd to initiate a batch job |
| * IN job_ptr - pointer to job that will be initiated |
| */ |
| extern void launch_job(struct job_record *job_ptr) |
| { |
| batch_job_launch_msg_t *launch_msg_ptr; |
| uint16_t protocol_version = (uint16_t) NO_VAL; |
| agent_arg_t *agent_arg_ptr; |
| |
| #ifdef HAVE_FRONT_END |
| front_end_record_t *front_end_ptr; |
| front_end_ptr = find_front_end_record(job_ptr->batch_host); |
| if (front_end_ptr) |
| protocol_version = front_end_ptr->protocol_version; |
| #else |
| struct node_record *node_ptr; |
| node_ptr = find_node_record(job_ptr->batch_host); |
| if (node_ptr) |
| protocol_version = node_ptr->protocol_version; |
| #endif |
| |
| launch_msg_ptr = build_launch_job_msg(job_ptr, protocol_version); |
| if (launch_msg_ptr == NULL) |
| return; |
| |
| agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t)); |
| agent_arg_ptr->protocol_version = protocol_version; |
| agent_arg_ptr->node_count = 1; |
| agent_arg_ptr->retry = 0; |
| xassert(job_ptr->batch_host); |
| agent_arg_ptr->hostlist = hostlist_create(job_ptr->batch_host); |
| agent_arg_ptr->msg_type = REQUEST_BATCH_JOB_LAUNCH; |
| agent_arg_ptr->msg_args = (void *) launch_msg_ptr; |
| |
| /* Launch the RPC via agent */ |
| agent_queue_request(agent_arg_ptr); |
| } |
| |
| /* |
| * make_batch_job_cred - add a job credential to the batch_job_launch_msg |
| * IN/OUT launch_msg_ptr - batch_job_launch_msg in which job_id, step_id, |
| * uid and nodes have already been set |
| * IN job_ptr - pointer to job record |
| * RET 0 or error code |
| */ |
| extern int make_batch_job_cred(batch_job_launch_msg_t *launch_msg_ptr, |
| struct job_record *job_ptr, |
| uint16_t protocol_version) |
| { |
| slurm_cred_arg_t cred_arg; |
| job_resources_t *job_resrcs_ptr; |
| |
| xassert(job_ptr->job_resrcs); |
| job_resrcs_ptr = job_ptr->job_resrcs; |
| |
| memset(&cred_arg, 0, sizeof(slurm_cred_arg_t)); |
| |
| cred_arg.jobid = launch_msg_ptr->job_id; |
| cred_arg.stepid = launch_msg_ptr->step_id; |
| cred_arg.uid = launch_msg_ptr->uid; |
| |
| cred_arg.job_hostlist = job_resrcs_ptr->nodes; |
| cred_arg.job_core_bitmap = job_resrcs_ptr->core_bitmap; |
| cred_arg.job_core_spec = job_ptr->details->core_spec; |
| cred_arg.job_mem_limit = job_ptr->details->pn_min_memory; |
| cred_arg.job_nhosts = job_resrcs_ptr->nhosts; |
| cred_arg.job_gres_list = job_ptr->gres_list; |
| /* cred_arg.step_gres_list = NULL; */ |
| |
| #ifdef HAVE_FRONT_END |
| xassert(job_ptr->batch_host); |
| cred_arg.step_hostlist = job_ptr->batch_host; |
| #else |
| cred_arg.step_hostlist = launch_msg_ptr->nodes; |
| #endif |
| cred_arg.step_core_bitmap = job_resrcs_ptr->core_bitmap; |
| cred_arg.step_mem_limit = job_ptr->details->pn_min_memory; |
| |
| cred_arg.cores_per_socket = job_resrcs_ptr->cores_per_socket; |
| cred_arg.sockets_per_node = job_resrcs_ptr->sockets_per_node; |
| cred_arg.sock_core_rep_count = job_resrcs_ptr->sock_core_rep_count; |
| |
| launch_msg_ptr->cred = slurm_cred_create(slurmctld_config.cred_ctx, |
| &cred_arg, protocol_version); |
| |
| if (launch_msg_ptr->cred) |
| return SLURM_SUCCESS; |
| error("slurm_cred_create failure for batch job %u", cred_arg.jobid); |
| return SLURM_ERROR; |
| } |
| |
/* List delete callback: free one depend_spec record when it is removed
 * from a dependency list (see list_create(_depend_list_del)) */
static void _depend_list_del(void *dep_ptr)
{
	xfree(dep_ptr);
}
| |
| /* |
| * Copy a job's dependency list |
| * IN depend_list_src - a job's depend_lst |
| * RET copy of depend_list_src, must bee freed by caller |
| */ |
| extern List depended_list_copy(List depend_list_src) |
| { |
| struct depend_spec *dep_src, *dep_dest; |
| ListIterator iter; |
| List depend_list_dest = NULL; |
| |
| if (!depend_list_src) |
| return depend_list_dest; |
| |
| depend_list_dest = list_create(_depend_list_del); |
| iter = list_iterator_create(depend_list_src); |
| while ((dep_src = (struct depend_spec *) list_next(iter))) { |
| dep_dest = xmalloc(sizeof(struct depend_spec)); |
| memcpy(dep_dest, dep_src, sizeof(struct depend_spec)); |
| list_append(depend_list_dest, dep_dest); |
| } |
| list_iterator_destroy(iter); |
| return depend_list_dest; |
| } |
| |
| /* Print a job's dependency information based upon job_ptr->depend_list */ |
| extern void print_job_dependency(struct job_record *job_ptr) |
| { |
| ListIterator depend_iter; |
| struct depend_spec *dep_ptr; |
| char *array_task_id, *dep_str; |
| |
| info("Dependency information for job %u", job_ptr->job_id); |
| if ((job_ptr->details == NULL) || |
| (job_ptr->details->depend_list == NULL)) |
| return; |
| |
| depend_iter = list_iterator_create(job_ptr->details->depend_list); |
| while ((dep_ptr = list_next(depend_iter))) { |
| if (dep_ptr->depend_type == SLURM_DEPEND_SINGLETON) { |
| info(" singleton"); |
| continue; |
| } |
| |
| if (dep_ptr->depend_type == SLURM_DEPEND_AFTER) |
| dep_str = "after"; |
| else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_ANY) |
| dep_str = "afterany"; |
| else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_NOT_OK) |
| dep_str = "afternotok"; |
| else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_OK) |
| dep_str = "afterok"; |
| else if (dep_ptr->depend_type == SLURM_DEPEND_EXPAND) |
| dep_str = "expand"; |
| else |
| dep_str = "unknown"; |
| if (dep_ptr->array_task_id == INFINITE) |
| array_task_id = "_*"; |
| else |
| array_task_id = ""; |
| info(" %s:%u%s", dep_str, dep_ptr->job_id, array_task_id); |
| } |
| list_iterator_destroy(depend_iter); |
| } |
| |
| static void _depend_list2str(struct job_record *job_ptr) |
| { |
| ListIterator depend_iter; |
| struct depend_spec *dep_ptr; |
| char *array_task_id, *dep_str, *sep = ""; |
| |
| if (job_ptr->details == NULL) |
| return; |
| xfree(job_ptr->details->dependency); |
| if (job_ptr->details->depend_list == NULL) |
| return; |
| |
| depend_iter = list_iterator_create(job_ptr->details->depend_list); |
| while ((dep_ptr = list_next(depend_iter))) { |
| if (dep_ptr->depend_type == SLURM_DEPEND_SINGLETON) { |
| xstrfmtcat(job_ptr->details->dependency, |
| "%ssingleton", sep); |
| sep = ","; |
| continue; |
| } |
| |
| if (dep_ptr->depend_type == SLURM_DEPEND_AFTER) |
| dep_str = "after"; |
| else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_ANY) |
| dep_str = "afterany"; |
| else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_NOT_OK) |
| dep_str = "afternotok"; |
| else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_OK) |
| dep_str = "afterok"; |
| else if (dep_ptr->depend_type == SLURM_DEPEND_EXPAND) |
| dep_str = "expand"; |
| else |
| dep_str = "unknown"; |
| if (dep_ptr->array_task_id == INFINITE) |
| array_task_id = "_*"; |
| else |
| array_task_id = ""; |
| xstrfmtcat(job_ptr->details->dependency, "%s%s:%u%s", |
| sep, dep_str, dep_ptr->job_id, array_task_id); |
| sep = ","; |
| } |
| list_iterator_destroy(depend_iter); |
| } |
| |
| /* |
| * Determine if a job's dependencies are met |
| * RET: 0 = no dependencies |
| * 1 = dependencies remain |
| * 2 = failure (job completion code not per dependency), delete the job |
| */ |
| extern int test_job_dependency(struct job_record *job_ptr) |
| { |
| ListIterator depend_iter, job_iterator; |
| struct depend_spec *dep_ptr; |
| bool failure = false, depends = false, rebuild_str = false; |
| List job_queue = NULL; |
| bool run_now; |
| int results = 0; |
| struct job_record *qjob_ptr, *djob_ptr; |
| time_t now = time(NULL); |
| /* For performance reasons with job arrays, we cache dependency |
| * results and re-use them whenever possible */ |
| static uint32_t cache_job_id = 0; |
| static struct job_record *cache_job_ptr = NULL; |
| static int cache_results; |
| static time_t cache_time = 0; |
| |
| if ((job_ptr->details == NULL) || |
| (job_ptr->details->depend_list == NULL) || |
| (list_count(job_ptr->details->depend_list) == 0)) |
| return 0; |
| |
| if ((job_ptr->array_task_id != NO_VAL) && |
| (cache_time == now) && |
| (cache_job_ptr->magic == JOB_MAGIC) && |
| (cache_job_ptr->job_id == cache_job_id) && |
| (cache_job_ptr->array_job_id == job_ptr->array_job_id) && |
| (cache_job_ptr->details) && |
| (cache_job_ptr->details->orig_dependency) && |
| (job_ptr->details->orig_dependency) && |
| (!strcmp(cache_job_ptr->details->orig_dependency, |
| job_ptr->details->orig_dependency))) { |
| return cache_results; |
| } |
| |
| depend_iter = list_iterator_create(job_ptr->details->depend_list); |
| while ((dep_ptr = list_next(depend_iter))) { |
| bool clear_dep = false; |
| if (dep_ptr->array_task_id == INFINITE) { |
| /* Advance to latest element of this job array */ |
| dep_ptr->job_ptr = find_job_array_rec(dep_ptr->job_id, |
| dep_ptr->array_task_id); |
| } |
| djob_ptr = dep_ptr->job_ptr; |
| if ((dep_ptr->depend_type == SLURM_DEPEND_SINGLETON) && |
| job_ptr->name) { |
| /* get user jobs with the same user and name */ |
| job_queue = _build_user_job_list(job_ptr->user_id, |
| job_ptr->name); |
| run_now = true; |
| job_iterator = list_iterator_create(job_queue); |
| while ((qjob_ptr = (struct job_record *) |
| list_next(job_iterator))) { |
| /* already running/suspended job or previously |
| * submitted pending job */ |
| if (IS_JOB_RUNNING(qjob_ptr) || |
| IS_JOB_SUSPENDED(qjob_ptr) || |
| (IS_JOB_PENDING(qjob_ptr) && |
| (qjob_ptr->job_id < job_ptr->job_id))) { |
| run_now = false; |
| break; |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| list_destroy(job_queue); |
| /* job can run now, delete dependency */ |
| if (run_now) |
| list_delete_item(depend_iter); |
| else |
| depends = true; |
| } else if ((djob_ptr == NULL) || |
| (djob_ptr->magic != JOB_MAGIC) || |
| ((djob_ptr->job_id != dep_ptr->job_id) && |
| (djob_ptr->array_job_id != dep_ptr->job_id))) { |
| /* job is gone, dependency lifted */ |
| clear_dep = true; |
| } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER) { |
| if (!IS_JOB_PENDING(dep_ptr->job_ptr)) { |
| clear_dep = true; |
| } else |
| depends = true; |
| } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_ANY) { |
| if (IS_JOB_COMPLETED(dep_ptr->job_ptr)) { |
| clear_dep = true; |
| } else |
| depends = true; |
| } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_NOT_OK) { |
| if (dep_ptr->job_ptr->job_state & JOB_SPECIAL_EXIT) { |
| clear_dep = true; |
| } else if (!IS_JOB_COMPLETED(dep_ptr->job_ptr)) { |
| depends = true; |
| } else if (!IS_JOB_COMPLETE(dep_ptr->job_ptr)) { |
| clear_dep = true; |
| } else { |
| failure = true; |
| break; |
| } |
| } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_OK) { |
| if (!IS_JOB_COMPLETED(dep_ptr->job_ptr)) |
| depends = true; |
| else if (IS_JOB_COMPLETE(dep_ptr->job_ptr)) { |
| clear_dep = true; |
| } else { |
| failure = true; |
| break; |
| } |
| } else if (dep_ptr->depend_type == SLURM_DEPEND_EXPAND) { |
| time_t now = time(NULL); |
| if (IS_JOB_PENDING(dep_ptr->job_ptr)) { |
| depends = true; |
| } else if (IS_JOB_COMPLETED(dep_ptr->job_ptr)) { |
| failure = true; |
| break; |
| } else if ((dep_ptr->job_ptr->end_time != 0) && |
| (dep_ptr->job_ptr->end_time > now)) { |
| job_ptr->time_limit = dep_ptr->job_ptr-> |
| end_time - now; |
| job_ptr->time_limit /= 60; /* sec to min */ |
| } |
| if (job_ptr->details && dep_ptr->job_ptr->details) { |
| job_ptr->details->share_res = |
| dep_ptr->job_ptr->details->share_res; |
| job_ptr->details->whole_node = |
| dep_ptr->job_ptr->details->whole_node; |
| } |
| } else |
| failure = true; |
| if (clear_dep) { |
| list_delete_item(depend_iter); |
| rebuild_str = true; |
| } |
| } |
| list_iterator_destroy(depend_iter); |
| if (rebuild_str) |
| _depend_list2str(job_ptr); |
| if (list_count(job_ptr->details->depend_list) == 0) |
| xfree(job_ptr->details->dependency); |
| |
| if (failure) |
| results = 2; |
| else if (depends) |
| results = 1; |
| |
| if (job_ptr->array_task_id != NO_VAL) { |
| cache_job_id = job_ptr->job_id; |
| cache_job_ptr = job_ptr; |
| cache_results = results; |
| cache_time = now; |
| } |
| |
| return results; |
| } |
| |
| /* |
| * Parse a job dependency string and use it to establish a "depend_spec" |
| * list of dependencies. We accept both old format (a single job ID) and |
| * new format (e.g. "afterok:123:124,after:128"). |
| * IN job_ptr - job record to have dependency and depend_list updated |
| * IN new_depend - new dependency description |
| * RET returns an error code from slurm_errno.h |
| */ |
| extern int update_job_dependency(struct job_record *job_ptr, char *new_depend) |
| { |
| int rc = SLURM_SUCCESS; |
| uint16_t depend_type = 0; |
| uint32_t job_id = 0; |
| uint32_t array_task_id; |
| char *tok = new_depend, *sep_ptr, *sep_ptr2 = NULL; |
| List new_depend_list = NULL; |
| struct depend_spec *dep_ptr; |
| struct job_record *dep_job_ptr; |
| bool expand_cnt = 0; |
| |
| if (job_ptr->details == NULL) |
| return EINVAL; |
| |
| /* Clear dependencies on NULL, "0", or empty dependency input */ |
| job_ptr->details->expanding_jobid = 0; |
| if ((new_depend == NULL) || (new_depend[0] == '\0') || |
| ((new_depend[0] == '0') && (new_depend[1] == '\0'))) { |
| xfree(job_ptr->details->dependency); |
| if (job_ptr->details->depend_list) { |
| list_destroy(job_ptr->details->depend_list); |
| job_ptr->details->depend_list = NULL; |
| } |
| return rc; |
| |
| } |
| |
| new_depend_list = list_create(_depend_list_del); |
| |
| /* validate new dependency string */ |
| while (rc == SLURM_SUCCESS) { |
| |
| /* test singleton dependency flag */ |
| if ( strncasecmp(tok, "singleton", 9) == 0 ) { |
| depend_type = SLURM_DEPEND_SINGLETON; |
| dep_ptr = xmalloc(sizeof(struct depend_spec)); |
| dep_ptr->depend_type = depend_type; |
| /* dep_ptr->job_id = 0; set by xmalloc */ |
| /* dep_ptr->job_ptr = NULL; set by xmalloc */ |
| (void) list_append(new_depend_list, dep_ptr); |
| if (tok[9] == ',') { |
| tok += 10; |
| continue; |
| } |
| if (tok[9] != '\0') |
| rc = ESLURM_DEPENDENCY; |
| break; |
| } |
| |
| /* Test for old format, just a job ID */ |
| sep_ptr = strchr(tok, ':'); |
| if ((sep_ptr == NULL) && (tok[0] >= '0') && (tok[0] <= '9')) { |
| job_id = strtol(tok, &sep_ptr, 10); |
| if ((sep_ptr != NULL) && (sep_ptr[0] == '_')) { |
| if (sep_ptr[1] == '*') { |
| array_task_id = INFINITE; |
| sep_ptr += 2; /* Past "_*" */ |
| } else { |
| array_task_id = strtol(sep_ptr+1, |
| &sep_ptr, 10); |
| } |
| } else { |
| array_task_id = NO_VAL; |
| } |
| if ((sep_ptr == NULL) || |
| (job_id == 0) || (job_id == job_ptr->job_id) || |
| ((sep_ptr[0] != '\0') && (sep_ptr[0] != ','))) { |
| rc = ESLURM_DEPENDENCY; |
| break; |
| } |
| if (array_task_id == NO_VAL) { |
| dep_job_ptr = find_job_record(job_id); |
| if (!dep_job_ptr) { |
| dep_job_ptr = find_job_array_rec(job_id, |
| INFINITE); |
| } |
| if (dep_job_ptr && |
| (dep_job_ptr->array_job_id == job_id) && |
| (dep_job_ptr->array_task_id != NO_VAL)) { |
| array_task_id = INFINITE; |
| } |
| } else { |
| dep_job_ptr = find_job_array_rec(job_id, |
| array_task_id); |
| } |
| if (dep_job_ptr) { |
| dep_ptr = xmalloc(sizeof(struct depend_spec)); |
| dep_ptr->array_task_id = array_task_id; |
| dep_ptr->depend_type = SLURM_DEPEND_AFTER_ANY; |
| if (array_task_id == NO_VAL) { |
| dep_ptr->job_id = dep_job_ptr->job_id; |
| } else { |
| dep_ptr->job_id = |
| dep_job_ptr->array_job_id; |
| } |
| dep_ptr->job_ptr = dep_job_ptr; |
| (void) list_append(new_depend_list, dep_ptr); |
| } |
| if (sep_ptr && (sep_ptr[0] == ',')) { |
| tok = sep_ptr + 1; |
| continue; |
| } else { |
| break; |
| } |
| } else if (sep_ptr == NULL) { |
| rc = ESLURM_DEPENDENCY; |
| break; |
| } |
| |
| /* New format, <test>:job_ID */ |
| if (strncasecmp(tok, "afternotok", 10) == 0) |
| depend_type = SLURM_DEPEND_AFTER_NOT_OK; |
| else if (strncasecmp(tok, "afterany", 8) == 0) |
| depend_type = SLURM_DEPEND_AFTER_ANY; |
| else if (strncasecmp(tok, "afterok", 7) == 0) |
| depend_type = SLURM_DEPEND_AFTER_OK; |
| else if (strncasecmp(tok, "after", 5) == 0) |
| depend_type = SLURM_DEPEND_AFTER; |
| else if (strncasecmp(tok, "expand", 6) == 0) { |
| if (!select_g_job_expand_allow()) { |
| rc = ESLURM_DEPENDENCY; |
| break; |
| } |
| depend_type = SLURM_DEPEND_EXPAND; |
| } else { |
| rc = ESLURM_DEPENDENCY; |
| break; |
| } |
| sep_ptr++; /* skip over ":" */ |
| while (rc == SLURM_SUCCESS) { |
| job_id = strtol(sep_ptr, &sep_ptr2, 10); |
| if ((sep_ptr2 != NULL) && (sep_ptr2[0] == '_')) { |
| if (sep_ptr2[1] == '*') { |
| array_task_id = INFINITE; |
| sep_ptr2 += 2; /* Past "_*" */ |
| } else { |
| array_task_id = strtol(sep_ptr2+1, |
| &sep_ptr2, 10); |
| } |
| } else |
| array_task_id = NO_VAL; |
| if ((sep_ptr2 == NULL) || |
| (job_id == 0) || (job_id == job_ptr->job_id) || |
| ((sep_ptr2[0] != '\0') && (sep_ptr2[0] != ',') && |
| (sep_ptr2[0] != ':'))) { |
| rc = ESLURM_DEPENDENCY; |
| break; |
| } |
| if (array_task_id == NO_VAL) { |
| dep_job_ptr = find_job_record(job_id); |
| if (!dep_job_ptr) { |
| dep_job_ptr = find_job_array_rec(job_id, |
| INFINITE); |
| } |
| if (dep_job_ptr && |
| (dep_job_ptr->array_job_id == job_id) && |
| (dep_job_ptr->array_task_id != NO_VAL)) { |
| array_task_id = INFINITE; |
| } |
| } else { |
| dep_job_ptr = find_job_array_rec(job_id, |
| array_task_id); |
| } |
| if ((depend_type == SLURM_DEPEND_EXPAND) && |
| ((expand_cnt++ > 0) || (dep_job_ptr == NULL) || |
| (!IS_JOB_RUNNING(dep_job_ptr)) || |
| (dep_job_ptr->qos_id != job_ptr->qos_id) || |
| (dep_job_ptr->part_ptr == NULL) || |
| (job_ptr->part_ptr == NULL) || |
| (dep_job_ptr->part_ptr != job_ptr->part_ptr))) { |
| /* Expand only jobs in the same QOS and |
| * and partition */ |
| rc = ESLURM_DEPENDENCY; |
| break; |
| } |
| if (depend_type == SLURM_DEPEND_EXPAND) { |
| job_ptr->details->expanding_jobid = job_id; |
| /* GRES configuration of this job must match |
| * the job being expanded */ |
| xfree(job_ptr->gres); |
| job_ptr->gres = xstrdup(dep_job_ptr->gres); |
| if (job_ptr->gres_list) |
| list_destroy(job_ptr->gres_list); |
| gres_plugin_job_state_validate(job_ptr->gres, |
| &job_ptr->gres_list); |
| } |
| if (dep_job_ptr) { /* job still active */ |
| dep_ptr = xmalloc(sizeof(struct depend_spec)); |
| dep_ptr->array_task_id = array_task_id; |
| dep_ptr->depend_type = depend_type; |
| if (array_task_id == NO_VAL) |
| dep_ptr->job_id = dep_job_ptr->job_id; |
| else { |
| dep_ptr->job_id = |
| dep_job_ptr->array_job_id; |
| } |
| dep_ptr->job_ptr = dep_job_ptr; |
| (void) list_append(new_depend_list, dep_ptr); |
| } |
| if (sep_ptr2[0] != ':') |
| break; |
| sep_ptr = sep_ptr2 + 1; /* skip over ":" */ |
| } |
| if (sep_ptr2 && (sep_ptr2[0] == ',')) |
| tok = sep_ptr2 + 1; |
| else |
| break; |
| } |
| |
| if (rc == SLURM_SUCCESS) { |
| /* test for circular dependencies (e.g. A -> B -> A) */ |
| (void) _scan_depend(NULL, job_ptr->job_id); |
| if (_scan_depend(new_depend_list, job_ptr->job_id)) |
| rc = ESLURM_CIRCULAR_DEPENDENCY; |
| } |
| |
| if (rc == SLURM_SUCCESS) { |
| if (job_ptr->details->depend_list) |
| list_destroy(job_ptr->details->depend_list); |
| job_ptr->details->depend_list = new_depend_list; |
| _depend_list2str(job_ptr); |
| #if _DEBUG |
| print_job_dependency(job_ptr); |
| #endif |
| } else { |
| list_destroy(new_depend_list); |
| } |
| return rc; |
| } |
| |
/* Return TRUE if job_id is found in dependency_list.
 * Pass NULL dependency list to clear the counter before starting a new scan.
 * Execute recursively for each dependent job, giving up (returning FALSE)
 * once max_depend_depth jobs have been visited in a single scan */
static bool _scan_depend(List dependency_list, uint32_t job_id)
{
	static time_t sched_update = 0;
	static int max_depend_depth = 10;	/* recursion abort limit */
	static int job_counter = 0;	/* jobs visited in the current scan */
	bool rc = false;
	ListIterator iter;
	struct depend_spec *dep_ptr;

	/* Re-read max_depend_depth from SchedulerParameters whenever the
	 * configuration has been updated since our last check */
	if (sched_update != slurmctld_conf.last_update) {
		char *sched_params, *tmp_ptr;

		sched_params = slurm_get_sched_params();
		if (sched_params &&
		    (tmp_ptr = strstr(sched_params, "max_depend_depth="))) {
		/*                                   01234567890123456 */
			int i = atoi(tmp_ptr + 17);
			if (i < 0) {
				error("ignoring SchedulerParameters: "
				      "max_depend_depth value of %d", i);
			} else {
				max_depend_depth = i;
			}
		}
		xfree(sched_params);
		sched_update = slurmctld_conf.last_update;
	}

	if (dependency_list == NULL) {
		/* NULL list is the caller's request to reset the counter */
		job_counter = 0;
		return FALSE;
	} else if (job_counter++ >= max_depend_depth) {
		/* Scan too deep; give up (a cycle may go undetected) */
		return FALSE;
	}

	xassert(job_id);
	iter = list_iterator_create(dependency_list);
	while (!rc && (dep_ptr = (struct depend_spec *) list_next(iter))) {
		if (dep_ptr->job_id == 0)	/* Singleton */
			continue;
		if (dep_ptr->job_id == job_id)
			rc = true;
		else if ((dep_ptr->job_id != dep_ptr->job_ptr->job_id) ||
			 (dep_ptr->job_ptr->magic != JOB_MAGIC))
			continue;	/* purged job, ptr not yet cleared */
		else if (!IS_JOB_FINISHED(dep_ptr->job_ptr) &&
			 dep_ptr->job_ptr->details &&
			 dep_ptr->job_ptr->details->depend_list) {
			/* Recurse into this dependency's own dependencies */
			rc = _scan_depend(dep_ptr->job_ptr->details->
					  depend_list, job_id);
			if (rc) {
				info("circular dependency: job %u is dependent "
				     "upon job %u", dep_ptr->job_id, job_id);
			}
		}
	}
	list_iterator_destroy(iter);
	return rc;
}
| |
/* Destructor for elements of the preemptee_job_id list; each element is
 * an xmalloc'ed uint32_t job ID, so a simple xfree suffices */
static void _pre_list_del(void *x)
{
	xfree(x);
}
| |
| /* If there are higher priority queued jobs in this job's partition, then |
| * delay the job's expected initiation time as needed to run those jobs. |
| * NOTE: This is only a rough estimate of the job's start time as it ignores |
| * job dependencies, feature requirements, specific node requirements, etc. */ |
| static void _delayed_job_start_time(struct job_record *job_ptr) |
| { |
| uint32_t part_node_cnt, part_cpu_cnt, part_cpus_per_node; |
| uint32_t job_size_cpus, job_size_nodes, job_time; |
| uint64_t cume_space_time = 0; |
| struct job_record *job_q_ptr; |
| ListIterator job_iterator; |
| |
| if (job_ptr->part_ptr == NULL) |
| return; |
| part_node_cnt = job_ptr->part_ptr->total_nodes; |
| part_cpu_cnt = job_ptr->part_ptr->total_cpus; |
| if (part_cpu_cnt > part_node_cnt) |
| part_cpus_per_node = part_cpu_cnt / part_node_cnt; |
| else |
| part_cpus_per_node = 1; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_q_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (!IS_JOB_PENDING(job_q_ptr) || !job_q_ptr->details || |
| (job_q_ptr->part_ptr != job_ptr->part_ptr) || |
| (job_q_ptr->priority < job_ptr->priority) || |
| (job_q_ptr->job_id == job_ptr->job_id)) |
| continue; |
| if (job_q_ptr->details->min_nodes == NO_VAL) |
| job_size_nodes = 1; |
| else |
| job_size_nodes = job_q_ptr->details->min_nodes; |
| if (job_q_ptr->details->min_cpus == NO_VAL) |
| job_size_cpus = 1; |
| else |
| job_size_cpus = job_q_ptr->details->min_nodes; |
| job_size_cpus = MAX(job_size_cpus, |
| (job_size_nodes * part_cpus_per_node)); |
| if (job_q_ptr->time_limit == NO_VAL) |
| job_time = job_q_ptr->part_ptr->max_time; |
| else |
| job_time = job_q_ptr->time_limit; |
| cume_space_time += job_size_cpus * job_time; |
| } |
| list_iterator_destroy(job_iterator); |
| cume_space_time /= part_cpu_cnt;/* Factor out size */ |
| cume_space_time *= 60; /* Minutes to seconds */ |
| debug2("Increasing estimated start of job %u by %"PRIu64" secs", |
| job_ptr->job_id, cume_space_time); |
| job_ptr->start_time += cume_space_time; |
| } |
| |
| /* Determine if a pending job will run using only the specified nodes |
| * (in job_desc_msg->req_nodes), build response message and return |
| * SLURM_SUCCESS on success. Otherwise return an error code. Caller |
| * must free response message */ |
| extern int job_start_data(job_desc_msg_t *job_desc_msg, |
| will_run_response_msg_t **resp) |
| { |
| struct job_record *job_ptr; |
| struct part_record *part_ptr; |
| bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; |
| bitstr_t *exc_core_bitmap = NULL; |
| uint32_t min_nodes, max_nodes, req_nodes; |
| int i, rc = SLURM_SUCCESS; |
| time_t now = time(NULL), start_res, orig_start_time = (time_t) 0; |
| List preemptee_candidates = NULL, preemptee_job_list = NULL; |
| |
| job_ptr = find_job_record(job_desc_msg->job_id); |
| if (job_ptr == NULL) |
| return ESLURM_INVALID_JOB_ID; |
| |
| part_ptr = job_ptr->part_ptr; |
| if (part_ptr == NULL) |
| return ESLURM_INVALID_PARTITION_NAME; |
| |
| if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) |
| return ESLURM_DISABLED; |
| |
| if ((job_desc_msg->req_nodes == NULL) || |
| (job_desc_msg->req_nodes == '\0')) { |
| /* assume all nodes available to job for testing */ |
| avail_bitmap = bit_alloc(node_record_count); |
| bit_nset(avail_bitmap, 0, (node_record_count - 1)); |
| } else if (node_name2bitmap(job_desc_msg->req_nodes, false, |
| &avail_bitmap) != 0) { |
| return ESLURM_INVALID_NODE_NAME; |
| } |
| |
| /* Consider only nodes in this job's partition */ |
| if (part_ptr->node_bitmap) |
| bit_and(avail_bitmap, part_ptr->node_bitmap); |
| else |
| rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| if (job_req_node_filter(job_ptr, avail_bitmap)) |
| rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| if (job_ptr->details->exc_node_bitmap) { |
| bitstr_t *exc_node_mask = NULL; |
| exc_node_mask = bit_copy(job_ptr->details->exc_node_bitmap); |
| bit_not(exc_node_mask); |
| bit_and(avail_bitmap, exc_node_mask); |
| FREE_NULL_BITMAP(exc_node_mask); |
| } |
| if (job_ptr->details->req_node_bitmap) { |
| if (!bit_super_set(job_ptr->details->req_node_bitmap, |
| avail_bitmap)) { |
| rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| } |
| } |
| |
| /* Enforce reservation: access control, time and nodes */ |
| if (job_ptr->details->begin_time) |
| start_res = job_ptr->details->begin_time; |
| else |
| start_res = now; |
| i = job_test_resv(job_ptr, &start_res, false, &resv_bitmap, |
| &exc_core_bitmap); |
| if (i != SLURM_SUCCESS) |
| return i; |
| bit_and(avail_bitmap, resv_bitmap); |
| FREE_NULL_BITMAP(resv_bitmap); |
| |
| /* Only consider nodes that are not DOWN or DRAINED */ |
| bit_and(avail_bitmap, avail_node_bitmap); |
| |
| if (rc == SLURM_SUCCESS) { |
| /* On BlueGene systems don't adjust the min/max node limits |
| here. We are working on midplane values. */ |
| min_nodes = MAX(job_ptr->details->min_nodes, |
| part_ptr->min_nodes); |
| if (job_ptr->details->max_nodes == 0) |
| max_nodes = part_ptr->max_nodes; |
| else |
| max_nodes = MIN(job_ptr->details->max_nodes, |
| part_ptr->max_nodes); |
| max_nodes = MIN(max_nodes, 500000); /* prevent overflows */ |
| if (!job_ptr->limit_set_max_nodes && |
| job_ptr->details->max_nodes) |
| req_nodes = max_nodes; |
| else |
| req_nodes = min_nodes; |
| preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); |
| |
| /* The orig_start is based upon the backfill scheduler data |
| * and considers all higher priority jobs. The logic below |
| * only considers currently running jobs, so the expected |
| * start time will almost certainly be earlier and not as |
| * accurate, but this algorithm is much faster. */ |
| orig_start_time = job_ptr->start_time; |
| rc = select_g_job_test(job_ptr, avail_bitmap, |
| min_nodes, max_nodes, req_nodes, |
| SELECT_MODE_WILL_RUN, |
| preemptee_candidates, |
| &preemptee_job_list, exc_core_bitmap); |
| } |
| |
| if (rc == SLURM_SUCCESS) { |
| will_run_response_msg_t *resp_data; |
| resp_data = xmalloc(sizeof(will_run_response_msg_t)); |
| resp_data->job_id = job_ptr->job_id; |
| #ifdef HAVE_BG |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_NODE_CNT, |
| &resp_data->proc_cnt); |
| |
| #else |
| resp_data->proc_cnt = job_ptr->total_cpus; |
| #endif |
| _delayed_job_start_time(job_ptr); |
| resp_data->start_time = MAX(job_ptr->start_time, |
| orig_start_time); |
| resp_data->start_time = MAX(resp_data->start_time, start_res); |
| job_ptr->start_time = 0; /* restore pending job start time */ |
| resp_data->node_list = bitmap2node_name(avail_bitmap); |
| |
| if (preemptee_job_list) { |
| ListIterator preemptee_iterator; |
| uint32_t *preemptee_jid; |
| struct job_record *tmp_job_ptr; |
| resp_data->preemptee_job_id=list_create(_pre_list_del); |
| preemptee_iterator = list_iterator_create( |
| preemptee_job_list); |
| while ((tmp_job_ptr = (struct job_record *) |
| list_next(preemptee_iterator))) { |
| preemptee_jid = xmalloc(sizeof(uint32_t)); |
| (*preemptee_jid) = tmp_job_ptr->job_id; |
| list_append(resp_data->preemptee_job_id, |
| preemptee_jid); |
| } |
| list_iterator_destroy(preemptee_iterator); |
| } |
| *resp = resp_data; |
| } else { |
| rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| } |
| |
| if (preemptee_candidates) |
| list_destroy(preemptee_candidates); |
| if (preemptee_job_list) |
| list_destroy(preemptee_job_list); |
| FREE_NULL_BITMAP(avail_bitmap); |
| return rc; |
| } |
| |
| /* |
| * epilog_slurmctld - execute the epilog_slurmctld for a job that has just |
| * terminated. |
| * IN job_ptr - pointer to job that has been terminated |
| * RET SLURM_SUCCESS(0) or error code |
| */ |
| extern int epilog_slurmctld(struct job_record *job_ptr) |
| { |
| int rc; |
| pthread_t thread_id_epilog; |
| pthread_attr_t thread_attr_epilog; |
| epilog_arg_t *epilog_arg; |
| |
| if ((slurmctld_conf.epilog_slurmctld == NULL) || |
| (slurmctld_conf.epilog_slurmctld[0] == '\0')) |
| return SLURM_SUCCESS; |
| |
| if (access(slurmctld_conf.epilog_slurmctld, X_OK) < 0) { |
| error("Invalid EpilogSlurmctld: %m"); |
| return errno; |
| } |
| |
| epilog_arg = xmalloc(sizeof(epilog_arg_t)); |
| epilog_arg->job_id = job_ptr->job_id; |
| epilog_arg->epilog_slurmctld = xstrdup(slurmctld_conf.epilog_slurmctld); |
| epilog_arg->my_env = _build_env(job_ptr); |
| |
| slurm_attr_init(&thread_attr_epilog); |
| pthread_attr_setdetachstate(&thread_attr_epilog, |
| PTHREAD_CREATE_DETACHED); |
| job_ptr->epilog_running = true; |
| while (1) { |
| rc = pthread_create(&thread_id_epilog, |
| &thread_attr_epilog, |
| _run_epilog, (void *) epilog_arg); |
| if (rc == 0) { |
| slurm_attr_destroy(&thread_attr_epilog); |
| return SLURM_SUCCESS; |
| } |
| if (errno == EAGAIN) |
| continue; |
| error("pthread_create: %m"); |
| slurm_attr_destroy(&thread_attr_epilog); |
| job_ptr->epilog_running = false; |
| return errno; |
| } |
| } |
| |
/* Build a NULL-terminated environment vector describing job_ptr for use by
 * the PrologSlurmctld/EpilogSlurmctld script. The vector and each element
 * are xmalloc'ed; the caller must xfree() each element and the vector */
static char **_build_env(struct job_record *job_ptr)
{
	char **my_env, *name;
	char buf[32];
	int exit_code;
	int signal;

	/* Start with an empty (NULL-terminated) vector */
	my_env = xmalloc(sizeof(char *));
	my_env[0] = NULL;

	/* Set SPANK env vars first so that we can overwrite as needed
	 * below. Prevent user hacking from setting SLURM_JOB_ID etc. */
	if (job_ptr->spank_job_env_size) {
		env_array_merge(&my_env,
				(const char **) job_ptr->spank_job_env);
	}

#ifdef HAVE_BG
	/* BlueGene: export the block (partition) the job ran in */
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_BLOCK_ID, &name);
	setenvf(&my_env, "MPIRUN_PARTITION", "%s", name);
# ifdef HAVE_BGP
	{
		uint16_t conn_type = (uint16_t)NO_VAL;
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_CONN_TYPE,
					    &conn_type);
		if (conn_type > SELECT_SMALL) {
			/* SUBMIT_POOL overrides
			   HTC_SUBMIT_POOL */
			setenvf(&my_env, "SUBMIT_POOL", "%s", name);
		}
	}
# endif
	xfree(name);
#elif defined HAVE_ALPS_CRAY
	/* Cray/ALPS: export the BASIL reservation identifier */
	name = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
					       SELECT_PRINT_RESV_ID);
	setenvf(&my_env, "BASIL_RESERVATION_ID", "%s", name);
	xfree(name);
#endif
	setenvf(&my_env, "SLURM_JOB_ACCOUNT", "%s", job_ptr->account);
	if (job_ptr->details) {
		setenvf(&my_env, "SLURM_JOB_CONSTRAINTS",
			"%s", job_ptr->details->features);
	}
	setenvf(&my_env, "SLURM_JOB_DERIVED_EC", "%u",
		job_ptr->derived_ec);

	/* Decompose the job's wait status into "exit_code:signal" */
	exit_code = signal = 0;
	if (WIFEXITED(job_ptr->exit_code)) {
		exit_code = WEXITSTATUS(job_ptr->exit_code);
	}
	if (WIFSIGNALED(job_ptr->exit_code)) {
		signal = WTERMSIG(job_ptr->exit_code);
	}
	sprintf(buf, "%d:%d", exit_code, signal);
	setenvf(&my_env, "SLURM_JOB_EXIT_CODE2", "%s", buf);

	/* Job array variables only apply to array tasks */
	if (job_ptr->array_task_id != NO_VAL) {
		setenvf(&my_env, "SLURM_ARRAY_JOB_ID", "%u",
			job_ptr->array_job_id);
		setenvf(&my_env, "SLURM_ARRAY_TASK_ID", "%u",
			job_ptr->array_task_id);
	}

	if (slurmctld_cluster_name) {
		setenvf(&my_env, "SLURM_CLUSTER_NAME", "%s",
			slurmctld_cluster_name);
	}

	setenvf(&my_env, "SLURM_JOB_EXIT_CODE", "%u", job_ptr->exit_code);
	setenvf(&my_env, "SLURM_JOB_GID", "%u", job_ptr->group_id);
	name = gid_to_string((uid_t) job_ptr->group_id);
	setenvf(&my_env, "SLURM_JOB_GROUP", "%s", name);
	xfree(name);
	/* SLURM_JOBID is the legacy spelling of SLURM_JOB_ID */
	setenvf(&my_env, "SLURM_JOBID", "%u", job_ptr->job_id);
	setenvf(&my_env, "SLURM_JOB_ID", "%u", job_ptr->job_id);
	setenvf(&my_env, "SLURM_JOB_NAME", "%s", job_ptr->name);
	setenvf(&my_env, "SLURM_JOB_NODELIST", "%s", job_ptr->nodes);
	setenvf(&my_env, "SLURM_JOB_PARTITION", "%s", job_ptr->partition);
	setenvf(&my_env, "SLURM_JOB_UID", "%u", job_ptr->user_id);
	name = uid_to_string((uid_t) job_ptr->user_id);
	setenvf(&my_env, "SLURM_JOB_USER", "%s", name);
	xfree(name);

	return my_env;
}
| |
| static void *_run_epilog(void *arg) |
| { |
| /* Locks: Write job */ |
| slurmctld_lock_t job_write_lock = { |
| NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; |
| struct job_record *job_ptr; |
| epilog_arg_t *epilog_arg = (epilog_arg_t *) arg; |
| pid_t cpid; |
| int i, status, wait_rc; |
| char *argv[2]; |
| |
| argv[0] = epilog_arg->epilog_slurmctld; |
| argv[1] = NULL; |
| |
| if ((cpid = fork()) < 0) { |
| error("epilog_slurmctld fork error: %m"); |
| goto fini; |
| } |
| if (cpid == 0) { |
| for (i = 0; i < 1024; i++) |
| (void) close(i); |
| #ifdef SETPGRP_TWO_ARGS |
| setpgrp(0, 0); |
| #else |
| setpgrp(); |
| #endif |
| execve(argv[0], argv, epilog_arg->my_env); |
| exit(127); |
| } |
| |
| while (1) { |
| wait_rc = waitpid(cpid, &status, 0); |
| if (wait_rc < 0) { |
| if (errno == EINTR) |
| continue; |
| error("epilog_slurmctld waitpid error: %m"); |
| break; |
| } else if (wait_rc > 0) { |
| killpg(cpid, SIGKILL); /* kill children too */ |
| break; |
| } |
| } |
| if (status != 0) { |
| error("epilog_slurmctld job %u epilog exit status %u:%u", |
| epilog_arg->job_id, WEXITSTATUS(status), |
| WTERMSIG(status)); |
| } else { |
| debug2("epilog_slurmctld job %u epilog completed", |
| epilog_arg->job_id); |
| } |
| |
| fini: lock_slurmctld(job_write_lock); |
| job_ptr = find_job_record(epilog_arg->job_id); |
| if (job_ptr) { |
| job_ptr->epilog_running = false; |
| /* Clean up the JOB_COMPLETING flag |
| * only if the node count is 0 meaning |
| * the slurmd epilog already completed. |
| */ |
| if (job_ptr->node_cnt == 0 |
| && IS_JOB_COMPLETING(job_ptr)) |
| cleanup_completing(job_ptr); |
| } |
| unlock_slurmctld(job_write_lock); |
| xfree(epilog_arg->epilog_slurmctld); |
| for (i=0; epilog_arg->my_env[i]; i++) |
| xfree(epilog_arg->my_env[i]); |
| xfree(epilog_arg->my_env); |
| xfree(epilog_arg); |
| return NULL; |
| } |
| |
| /* |
| * prolog_slurmctld - execute the prolog_slurmctld for a job that has just |
| * been allocated resources. |
| * IN job_ptr - pointer to job that will be initiated |
| * RET SLURM_SUCCESS(0) or error code |
| */ |
| extern int prolog_slurmctld(struct job_record *job_ptr) |
| { |
| int rc; |
| pthread_t thread_id_prolog; |
| pthread_attr_t thread_attr_prolog; |
| |
| if ((slurmctld_conf.prolog_slurmctld == NULL) || |
| (slurmctld_conf.prolog_slurmctld[0] == '\0')) |
| return SLURM_SUCCESS; |
| |
| if (access(slurmctld_conf.prolog_slurmctld, X_OK) < 0) { |
| error("Invalid PrologSlurmctld: %m"); |
| return errno; |
| } |
| |
| if (job_ptr->details) |
| job_ptr->details->prolog_running = 1; |
| |
| slurm_attr_init(&thread_attr_prolog); |
| pthread_attr_setdetachstate(&thread_attr_prolog, |
| PTHREAD_CREATE_DETACHED); |
| while (1) { |
| rc = pthread_create(&thread_id_prolog, |
| &thread_attr_prolog, |
| _run_prolog, (void *) job_ptr); |
| if (rc == 0) { |
| slurm_attr_destroy(&thread_attr_prolog); |
| return SLURM_SUCCESS; |
| } |
| if (errno == EAGAIN) |
| continue; |
| error("pthread_create: %m"); |
| slurm_attr_destroy(&thread_attr_prolog); |
| return errno; |
| } |
| } |
| |
| static void *_run_prolog(void *arg) |
| { |
| struct job_record *job_ptr = (struct job_record *) arg; |
| struct node_record *node_ptr; |
| uint32_t job_id; |
| pid_t cpid; |
| int i, rc, status, wait_rc; |
| char *argv[2], **my_env; |
| /* Locks: Read config, job; Write nodes */ |
| slurmctld_lock_t config_read_lock = { |
| READ_LOCK, READ_LOCK, WRITE_LOCK, NO_LOCK }; |
| bitstr_t *node_bitmap = NULL; |
| time_t now = time(NULL); |
| uint16_t resume_timeout = slurm_get_resume_timeout(); |
| |
| lock_slurmctld(config_read_lock); |
| argv[0] = xstrdup(slurmctld_conf.prolog_slurmctld); |
| argv[1] = NULL; |
| my_env = _build_env(job_ptr); |
| job_id = job_ptr->job_id; |
| if (job_ptr->node_bitmap) { |
| node_bitmap = bit_copy(job_ptr->node_bitmap); |
| for (i = 0, node_ptr = node_record_table_ptr; |
| i < node_record_count; i++, node_ptr++) { |
| if (!bit_test(node_bitmap, i)) |
| continue; |
| /* Allow time for possible reboot */ |
| node_ptr->last_response = now + resume_timeout; |
| } |
| } |
| unlock_slurmctld(config_read_lock); |
| |
| if ((cpid = fork()) < 0) { |
| error("prolog_slurmctld fork error: %m"); |
| goto fini; |
| } |
| if (cpid == 0) { |
| for (i = 0; i < 1024; i++) |
| (void) close(i); |
| #ifdef SETPGRP_TWO_ARGS |
| setpgrp(0, 0); |
| #else |
| setpgrp(); |
| #endif |
| execve(argv[0], argv, my_env); |
| exit(127); |
| } |
| |
| while (1) { |
| wait_rc = waitpid(cpid, &status, 0); |
| if (wait_rc < 0) { |
| if (errno == EINTR) |
| continue; |
| error("prolog_slurmctld waitpid error: %m"); |
| break; |
| } else if (wait_rc > 0) { |
| killpg(cpid, SIGKILL); /* kill children too */ |
| break; |
| } |
| } |
| if (status != 0) { |
| bool kill_job = false; |
| slurmctld_lock_t job_write_lock = { |
| NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; |
| error("prolog_slurmctld job %u prolog exit status %u:%u", |
| job_id, WEXITSTATUS(status), WTERMSIG(status)); |
| lock_slurmctld(job_write_lock); |
| if ((rc = job_requeue(0, job_id, -1, (uint16_t) NO_VAL, |
| false))) { |
| info("unable to requeue job %u: %m", job_id); |
| kill_job = true; |
| } |
| if (kill_job) { |
| srun_user_message(job_ptr, |
| "PrologSlurmctld failed, job killed"); |
| (void) job_signal(job_id, SIGKILL, 0, 0, false); |
| } |
| |
| unlock_slurmctld(job_write_lock); |
| } else |
| debug2("prolog_slurmctld job %u prolog completed", job_id); |
| |
| fini: xfree(argv[0]); |
| for (i=0; my_env[i]; i++) |
| xfree(my_env[i]); |
| xfree(my_env); |
| lock_slurmctld(config_read_lock); |
| if (job_ptr->job_id != job_id) { |
| error("prolog_slurmctld job %u pointer invalid", job_id); |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) |
| error("prolog_slurmctld job %u now defunct", job_id); |
| } |
| if (job_ptr) { |
| if (job_ptr->details) |
| job_ptr->details->prolog_running = 0; |
| if (job_ptr->batch_flag && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) |
| launch_job(job_ptr); |
| } |
| if (job_ptr && job_ptr->node_bitmap) { |
| for (i=0; i<node_record_count; i++) { |
| if (bit_test(job_ptr->node_bitmap, i) == 0) |
| continue; |
| node_record_table_ptr[i].node_state &= |
| (~NODE_STATE_POWER_UP); |
| } |
| } else if (node_bitmap) { |
| for (i=0; i<node_record_count; i++) { |
| if (bit_test(node_bitmap, i) == 0) |
| continue; |
| node_record_table_ptr[i].node_state &= |
| (~NODE_STATE_POWER_UP); |
| } |
| } |
| unlock_slurmctld(config_read_lock); |
| FREE_NULL_BITMAP(node_bitmap); |
| |
| return NULL; |
| } |
| |
| /* |
| * Copy a job's feature list |
| * IN feature_list_src - a job's depend_lst |
| * RET copy of feature_list_src, must be freed by caller |
| */ |
| extern List feature_list_copy(List feature_list_src) |
| { |
| struct feature_record *feat_src, *feat_dest; |
| ListIterator iter; |
| List feature_list_dest = NULL; |
| |
| if (!feature_list_src) |
| return feature_list_dest; |
| |
| feature_list_dest = list_create(_feature_list_delete); |
| iter = list_iterator_create(feature_list_src); |
| while ((feat_src = (struct feature_record *) list_next(iter))) { |
| feat_dest = xmalloc(sizeof(struct feature_record)); |
| memcpy(feat_dest, feat_src, sizeof(struct feature_record)); |
| feat_dest->name = xstrdup(feat_src->name); |
| list_append(feature_list_dest, feat_dest); |
| } |
| list_iterator_destroy(iter); |
| return feature_list_dest; |
| } |
| |
| /* |
| * build_feature_list - Translate a job's feature string into a feature_list |
| * IN details->features |
| * OUT details->feature_list |
| * RET error code |
| */ |
| extern int build_feature_list(struct job_record *job_ptr) |
| { |
| struct job_details *detail_ptr = job_ptr->details; |
| char *tmp_requested, *str_ptr, *feature = NULL; |
| int bracket = 0, count = 0, i; |
| bool have_count = false, have_or = false; |
| struct feature_record *feat; |
| |
| if (!detail_ptr || !detail_ptr->features) /* no constraints */ |
| return SLURM_SUCCESS; |
| if (detail_ptr->feature_list) /* already processed */ |
| return SLURM_SUCCESS; |
| |
| tmp_requested = xstrdup(detail_ptr->features); |
| detail_ptr->feature_list = list_create(_feature_list_delete); |
| for (i=0; ; i++) { |
| if (tmp_requested[i] == '*') { |
| tmp_requested[i] = '\0'; |
| have_count = true; |
| count = strtol(&tmp_requested[i+1], &str_ptr, 10); |
| if ((feature == NULL) || (count <= 0)) { |
| info("Job %u invalid constraint %s", |
| job_ptr->job_id, detail_ptr->features); |
| xfree(tmp_requested); |
| return ESLURM_INVALID_FEATURE; |
| } |
| i = str_ptr - tmp_requested - 1; |
| } else if (tmp_requested[i] == '&') { |
| tmp_requested[i] = '\0'; |
| if (feature == NULL) { |
| info("Job %u invalid constraint %s", |
| job_ptr->job_id, detail_ptr->features); |
| xfree(tmp_requested); |
| return ESLURM_INVALID_FEATURE; |
| } |
| feat = xmalloc(sizeof(struct feature_record)); |
| feat->name = xstrdup(feature); |
| feat->count = count; |
| if (bracket) |
| feat->op_code = FEATURE_OP_XAND; |
| else |
| feat->op_code = FEATURE_OP_AND; |
| list_append(detail_ptr->feature_list, feat); |
| feature = NULL; |
| count = 0; |
| } else if (tmp_requested[i] == '|') { |
| tmp_requested[i] = '\0'; |
| have_or = true; |
| if (feature == NULL) { |
| info("Job %u invalid constraint %s", |
| job_ptr->job_id, detail_ptr->features); |
| xfree(tmp_requested); |
| return ESLURM_INVALID_FEATURE; |
| } |
| feat = xmalloc(sizeof(struct feature_record)); |
| feat->name = xstrdup(feature); |
| feat->count = count; |
| if (bracket) |
| feat->op_code = FEATURE_OP_XOR; |
| else |
| feat->op_code = FEATURE_OP_OR; |
| list_append(detail_ptr->feature_list, feat); |
| feature = NULL; |
| count = 0; |
| } else if (tmp_requested[i] == '[') { |
| tmp_requested[i] = '\0'; |
| if ((feature != NULL) || bracket) { |
| info("Job %u invalid constraint %s", |
| job_ptr->job_id, detail_ptr->features); |
| xfree(tmp_requested); |
| return ESLURM_INVALID_FEATURE; |
| } |
| bracket++; |
| } else if (tmp_requested[i] == ']') { |
| tmp_requested[i] = '\0'; |
| if ((feature == NULL) || (bracket == 0)) { |
| info("Job %u invalid constraint %s", |
| job_ptr->job_id, detail_ptr->features); |
| xfree(tmp_requested); |
| return ESLURM_INVALID_FEATURE; |
| } |
| bracket = 0; |
| } else if (tmp_requested[i] == '\0') { |
| if (feature) { |
| feat = xmalloc(sizeof(struct feature_record)); |
| feat->name = xstrdup(feature); |
| feat->count = count; |
| feat->op_code = FEATURE_OP_END; |
| list_append(detail_ptr->feature_list, feat); |
| } |
| break; |
| } else if (tmp_requested[i] == ',') { |
| info("Job %u invalid constraint %s", |
| job_ptr->job_id, detail_ptr->features); |
| xfree(tmp_requested); |
| return ESLURM_INVALID_FEATURE; |
| } else if (feature == NULL) { |
| feature = &tmp_requested[i]; |
| } |
| } |
| xfree(tmp_requested); |
| if (have_count && have_or) { |
| info("Job %u invalid constraint (OR with feature count): %s", |
| job_ptr->job_id, detail_ptr->features); |
| return ESLURM_INVALID_FEATURE; |
| } |
| |
| return _valid_feature_list(job_ptr->job_id, detail_ptr->feature_list); |
| } |
| |
| static void _feature_list_delete(void *x) |
| { |
| struct feature_record *feature = (struct feature_record *)x; |
| xfree(feature->name); |
| xfree(feature); |
| } |
| |
| static int _valid_feature_list(uint32_t job_id, List feature_list) |
| { |
| ListIterator feat_iter; |
| struct feature_record *feat_ptr; |
| char *buf = NULL, tmp[16]; |
| int bracket = 0; |
| int rc = SLURM_SUCCESS; |
| |
| if (feature_list == NULL) { |
| debug2("Job %u feature list is empty", job_id); |
| return rc; |
| } |
| |
| feat_iter = list_iterator_create(feature_list); |
| while ((feat_ptr = (struct feature_record *)list_next(feat_iter))) { |
| if ((feat_ptr->op_code == FEATURE_OP_XOR) || |
| (feat_ptr->op_code == FEATURE_OP_XAND)) { |
| if (bracket == 0) |
| xstrcat(buf, "["); |
| bracket = 1; |
| } |
| xstrcat(buf, feat_ptr->name); |
| if (rc == SLURM_SUCCESS) |
| rc = _valid_node_feature(feat_ptr->name); |
| if (feat_ptr->count) { |
| snprintf(tmp, sizeof(tmp), "*%u", feat_ptr->count); |
| xstrcat(buf, tmp); |
| } |
| if (bracket && |
| ((feat_ptr->op_code != FEATURE_OP_XOR) && |
| (feat_ptr->op_code != FEATURE_OP_XAND))) { |
| xstrcat(buf, "]"); |
| bracket = 0; |
| } |
| if ((feat_ptr->op_code == FEATURE_OP_AND) || |
| (feat_ptr->op_code == FEATURE_OP_XAND)) |
| xstrcat(buf, "&"); |
| else if ((feat_ptr->op_code == FEATURE_OP_OR) || |
| (feat_ptr->op_code == FEATURE_OP_XOR)) |
| xstrcat(buf, "|"); |
| } |
| list_iterator_destroy(feat_iter); |
| if (rc == SLURM_SUCCESS) |
| debug("Job %u feature list: %s", job_id, buf); |
| else |
| info("Job %u has invalid feature list: %s", job_id, buf); |
| xfree(buf); |
| return rc; |
| } |
| |
| static int _valid_node_feature(char *feature) |
| { |
| int rc = ESLURM_INVALID_FEATURE; |
| struct features_record *feature_ptr; |
| ListIterator feature_iter; |
| |
| /* Clear these nodes from the feature_list record, |
| * then restore as needed */ |
| feature_iter = list_iterator_create(feature_list); |
| while ((feature_ptr = (struct features_record *) |
| list_next(feature_iter))) { |
| if (strcmp(feature_ptr->name, feature)) |
| continue; |
| rc = SLURM_SUCCESS; |
| break; |
| } |
| list_iterator_destroy(feature_iter); |
| |
| return rc; |
| } |
| |
| /* If a job can run in multiple partitions, when it is started we want to |
| * put the name of the partition used _first_ in that list. When slurmctld |
| * restarts, that will be used to set the job's part_ptr and that will be |
| * reported to squeue. We leave all of the partitions in the list though, |
| * so the job can be requeued and have access to them all. */ |
| extern void rebuild_job_part_list(struct job_record *job_ptr) |
| { |
| ListIterator part_iterator; |
| struct part_record *part_ptr; |
| |
| if (!job_ptr->part_ptr_list) |
| return; |
| if (!job_ptr->part_ptr || !job_ptr->part_ptr->name) { |
| error("Job %u has NULL part_ptr or the partition name is NULL", |
| job_ptr->job_id); |
| return; |
| } |
| |
| xfree(job_ptr->partition); |
| job_ptr->partition = xstrdup(job_ptr->part_ptr->name); |
| |
| part_iterator = list_iterator_create(job_ptr->part_ptr_list); |
| while ((part_ptr = (struct part_record *) list_next(part_iterator))) { |
| if (part_ptr == job_ptr->part_ptr) |
| continue; |
| xstrcat(job_ptr->partition, ","); |
| xstrcat(job_ptr->partition, part_ptr->name); |
| } |
| list_iterator_destroy(part_iterator); |
| } |
| |
| /* cleanup_completing() |
| * |
| * Clean up the JOB_COMPLETING flag and eventually |
| * requeue the job if there is a pending request |
| * for it. This function assumes the caller has the |
| * appropriate locks on the job_record. |
| */ |
| void |
| cleanup_completing(struct job_record *job_ptr) |
| { |
| time_t delay; |
| |
| delay = last_job_update - job_ptr->end_time; |
| if (delay > 60) { |
| info("%s: job %u completion process took %ld seconds", |
| __func__, job_ptr->job_id,(long) delay); |
| } |
| |
| job_ptr->job_state &= (~JOB_COMPLETING); |
| job_hold_requeue(job_ptr); |
| |
| delete_step_records(job_ptr); |
| slurm_sched_g_schedule(); |
| } |