| /*****************************************************************************\ |
| * job_mgr.c - manage the job information of slurm |
| * Note: there is a global job list (job_list), time stamp |
| * (last_job_update), and hash table (job_hash) |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Portions Copyright (C) 2010-2014 SchedMD <http://www.schedmd.com>. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <http://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #ifdef HAVE_CONFIG_H |
| # include "config.h" |
| #endif |
| |
| #include <ctype.h> |
| #include <dirent.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <libgen.h> |
| #include <signal.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <strings.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <sys/param.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm_errno.h" |
| |
| #include "src/common/slurm_acct_gather.h" |
| #include "src/common/assoc_mgr.h" |
| #include "src/common/bitstring.h" |
| #include "src/common/fd.h" |
| #include "src/common/forward.h" |
| #include "src/common/gres.h" |
| #include "src/common/hostlist.h" |
| #include "src/common/node_select.h" |
| #include "src/common/parse_time.h" |
| #include "src/common/slurm_accounting_storage.h" |
| #include "src/common/slurm_jobcomp.h" |
| #include "src/common/slurm_priority.h" |
| #include "src/common/slurm_protocol_pack.h" |
| #include "src/common/switch.h" |
| #include "src/common/timers.h" |
| #include "src/common/xassert.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/slurmctld/acct_policy.h" |
| #include "src/slurmctld/agent.h" |
| #include "src/slurmctld/front_end.h" |
| #include "src/slurmctld/job_scheduler.h" |
| #include "src/slurmctld/job_submit.h" |
| #include "src/slurmctld/licenses.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/node_scheduler.h" |
| #include "src/slurmctld/preempt.h" |
| #include "src/slurmctld/proc_req.h" |
| #include "src/slurmctld/reservation.h" |
| #include "src/slurmctld/sched_plugin.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmctld/slurmctld_plugstack.h" |
| #include "src/slurmctld/srun_comm.h" |
| #include "src/slurmctld/state_save.h" |
| #include "src/slurmctld/trigger_mgr.h" |
| |
| #define DETAILS_FLAG 0xdddd |
| #define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0 |
| #define STEP_FLAG 0xbbbb |
| #define TOP_PRIORITY 0xffff0000 /* large, but leave headroom for higher */ |
| |
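/* Hash table index macros: map a job ID (or a job array element, using the
 * job ID plus task ID) to a bucket in the job hash tables declared below.
 * hash_table_size is set when the hash tables are built. */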
| #define JOB_HASH_INX(_job_id) (_job_id % hash_table_size) |
| #define JOB_ARRAY_HASH_INX(_job_id, _task_id) \ |
| ((_job_id + _task_id) % hash_table_size) |
| |
| /* Change JOB_STATE_VERSION value when changing the state save format */ |
| #define JOB_STATE_VERSION "PROTOCOL_VERSION" |
| #define JOB_14_03_STATE_VERSION "VER015" /* SLURM version 14.03 */ |
| #define JOB_2_6_STATE_VERSION "VER014" /* SLURM version 2.6 */ |
| #define JOB_2_5_STATE_VERSION "VER013" /* SLURM version 2.5 */ |
| |
| #define JOB_CKPT_VERSION "PROTOCOL_VERSION" |
| #define JOB_2_2_CKPT_VERSION "JOB_CKPT_002" /* SLURM version 2.2 */ |
| #define JOB_2_1_CKPT_VERSION "JOB_CKPT_001" /* SLURM version 2.1 */ |
| |
| /* Global variables */ |
| List job_list = NULL; /* job_record list */ |
| time_t last_job_update; /* time of last update to job records */ |
| |
| /* Local variables */ |
static uint32_t highest_prio = 0;	/* highest priority of any job */
static uint32_t lowest_prio = TOP_PRIORITY;	/* lowest priority of any job */
static int hash_table_size = 0;		/* size of the job hash tables */
static int job_count = 0;		/* jobs in the system */
| static uint32_t job_id_sequence = 0; /* first job_id to assign new job */ |
static struct job_record **job_hash = NULL;	/* hash table by job_id */
static struct job_record **job_array_hash_j = NULL; /* hash by array_job_id */
static struct job_record **job_array_hash_t = NULL; /* hash by array_job_id
						      * plus array_task_id */
static time_t last_file_write_time = (time_t) 0; /* time of last job state
						   * file write */
static int select_serial = -1;		/* 1 if select/serial plugin in use,
					 * -1 if not yet determined */
static bool wiki_sched = false;		/* true if sched/wiki in use */
static bool wiki2_sched = false;	/* true if sched/wiki2 in use */
static bool wiki_sched_test = false;	/* true once scheduler type checked */
| |
| /* Local functions */ |
| static void _add_job_hash(struct job_record *job_ptr); |
| static void _add_job_array_hash(struct job_record *job_ptr); |
| static int _checkpoint_job_record (struct job_record *job_ptr, |
| char *image_dir); |
| static int _copy_job_desc_files(uint32_t job_id_src, uint32_t job_id_dest); |
| static int _copy_job_desc_to_file(job_desc_msg_t * job_desc, |
| uint32_t job_id); |
| static int _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, |
| struct job_record **job_ptr, |
| bitstr_t ** exc_bitmap, |
| bitstr_t ** req_bitmap); |
| static job_desc_msg_t * _copy_job_record_to_job_desc( |
| struct job_record *job_ptr); |
| static char *_copy_nodelist_no_dup(char *node_list); |
| static void _del_batch_list_rec(void *x); |
| static void _delete_job_desc_files(uint32_t job_id); |
| static slurmdb_qos_rec_t *_determine_and_validate_qos( |
| char *resv_name, slurmdb_association_rec_t *assoc_ptr, |
| bool admin, slurmdb_qos_rec_t *qos_rec, int *error_code); |
| static void _dump_job_details(struct job_details *detail_ptr, |
| Buf buffer); |
| static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer); |
| static int _find_batch_dir(void *x, void *key); |
| static void _get_batch_job_dir_ids(List batch_dirs); |
| static time_t _get_last_state_write_time(void); |
| static struct job_record *_job_rec_copy(struct job_record *job_ptr, |
| uint32_t array_task_id); |
| static void _job_timed_out(struct job_record *job_ptr); |
| static int _job_create(job_desc_msg_t * job_specs, int allocate, int will_run, |
| struct job_record **job_rec_ptr, uid_t submit_uid, |
| char **err_msg); |
| static void _list_delete_job(void *job_entry); |
| static int _list_find_job_id(void *job_entry, void *key); |
| static int _list_find_job_old(void *job_entry, void *key); |
| static int _load_job_details(struct job_record *job_ptr, Buf buffer, |
| uint16_t protocol_version); |
| static int _load_job_state(Buf buffer, uint16_t protocol_version); |
| static uint32_t _max_switch_wait(uint32_t input_wait); |
| static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx, |
| time_t now, time_t node_boot_time); |
| static int _open_job_state_file(char **state_file); |
| static void _pack_job_for_ckpt (struct job_record *job_ptr, Buf buffer); |
| static void _pack_default_job_details(struct job_record *job_ptr, |
| Buf buffer, |
| uint16_t protocol_version); |
| static void _pack_pending_job_details(struct job_details *detail_ptr, |
| Buf buffer, |
| uint16_t protocol_version); |
| static int _purge_job_record(uint32_t job_id); |
| static void _purge_missing_jobs(int node_inx, time_t now); |
| static int _read_data_array_from_file(char *file_name, char ***data, |
| uint32_t * size, |
| struct job_record *job_ptr); |
| static void _read_data_from_file(char *file_name, char **data); |
| static char *_read_job_ckpt_file(char *ckpt_file, int *size_ptr); |
| static void _remove_defunct_batch_dirs(List batch_dirs); |
| static int _reset_detail_bitmaps(struct job_record *job_ptr); |
| static void _reset_step_bitmaps(struct job_record *job_ptr); |
| static int _resume_job_nodes(struct job_record *job_ptr, bool indf_susp); |
| static void _send_job_kill(struct job_record *job_ptr); |
| static int _set_job_id(struct job_record *job_ptr); |
| static void _signal_batch_job(struct job_record *job_ptr, uint16_t signal); |
| static void _signal_job(struct job_record *job_ptr, int signal); |
| static void _suspend_job(struct job_record *job_ptr, uint16_t op, |
| bool indf_susp); |
| static int _suspend_job_nodes(struct job_record *job_ptr, bool indf_susp); |
| static bool _top_priority(struct job_record *job_ptr); |
| static int _valid_job_part(job_desc_msg_t * job_desc, |
| uid_t submit_uid, bitstr_t *req_bitmap, |
| struct part_record **part_pptr, |
| List part_ptr_list, |
| slurmdb_association_rec_t *assoc_ptr, |
| slurmdb_qos_rec_t *qos_ptr); |
| static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate, |
| uid_t submit_uid, struct part_record *part_ptr, |
| List part_list); |
| static void _validate_job_files(List batch_dirs); |
| static int _write_data_to_file(char *file_name, char *data); |
| static int _write_data_array_to_file(char *file_name, char **data, |
| uint32_t size); |
| static void _xmit_new_end_time(struct job_record *job_ptr); |
| static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg, |
| struct part_record *, |
| List part_list); |
| static int _copy_job_file(const char *src, const char *dst); |
| |
| /* |
| * create_job_record - create an empty job_record including job_details. |
| * load its values with defaults (zeros, nulls, and magic cookie) |
| * IN/OUT error_code - set to zero if no error, errno otherwise |
| * RET pointer to the record or NULL if error |
| * global: job_list - global job list |
| * job_count - number of jobs in the system |
| * last_job_update - time of last job table update |
| * NOTE: allocates memory that should be xfreed with _list_delete_job |
| */ |
| struct job_record *create_job_record(int *error_code) |
| { |
| struct job_record *job_ptr; |
| struct job_details *detail_ptr; |
| |
| if (job_count >= slurmctld_conf.max_job_cnt) { |
| error("create_job_record: MaxJobCount reached (%u)", |
| slurmctld_conf.max_job_cnt); |
| } |
| |
| job_count++; |
| *error_code = 0; |
| last_job_update = time(NULL); |
| |
| job_ptr = (struct job_record *) xmalloc(sizeof(struct job_record)); |
| detail_ptr = (struct job_details *)xmalloc(sizeof(struct job_details)); |
| |
| job_ptr->magic = JOB_MAGIC; |
| job_ptr->array_task_id = NO_VAL; |
| job_ptr->details = detail_ptr; |
| job_ptr->prio_factors = xmalloc(sizeof(priority_factors_object_t)); |
| job_ptr->step_list = list_create(NULL); |
| |
	detail_ptr->magic = DETAILS_MAGIC;	/* set value */
	xassert (detail_ptr->magic == DETAILS_MAGIC);
| detail_ptr->submit_time = time(NULL); |
| job_ptr->requid = -1; /* force to -1 for sacct to know this |
| * hasn't been set yet */ |
| (void) list_append(job_list, job_ptr); |
| |
| return job_ptr; |
| } |
| |
| |
| /* |
 * delete_job_details - delete a job's detail record and clear its pointer
 *	this information can be deleted as soon as the job is allocated
 *	resources and running (it may still be needed to restart a batch job)
| * IN job_entry - pointer to job_record to clear the record of |
| */ |
| void delete_job_details(struct job_record *job_entry) |
| { |
| int i; |
| |
| if (job_entry->details == NULL) |
| return; |
| |
| xassert (job_entry->details->magic == DETAILS_MAGIC); |
| if (IS_JOB_FINISHED(job_entry)) |
| _delete_job_desc_files(job_entry->job_id); |
| |
| xfree(job_entry->details->acctg_freq); |
| for (i=0; i<job_entry->details->argc; i++) |
| xfree(job_entry->details->argv[i]); |
| xfree(job_entry->details->argv); |
| xfree(job_entry->details->ckpt_dir); |
| xfree(job_entry->details->cpu_bind); |
| if (job_entry->details->depend_list) |
| list_destroy(job_entry->details->depend_list); |
| xfree(job_entry->details->dependency); |
| xfree(job_entry->details->orig_dependency); |
| for (i=0; i<job_entry->details->env_cnt; i++) |
| xfree(job_entry->details->env_sup[i]); |
| xfree(job_entry->details->env_sup); |
| xfree(job_entry->details->std_err); |
| FREE_NULL_BITMAP(job_entry->details->exc_node_bitmap); |
| xfree(job_entry->details->exc_nodes); |
| if (job_entry->details->feature_list) |
| list_destroy(job_entry->details->feature_list); |
| xfree(job_entry->details->features); |
| xfree(job_entry->details->std_in); |
| xfree(job_entry->details->mc_ptr); |
| xfree(job_entry->details->mem_bind); |
| xfree(job_entry->details->std_out); |
| FREE_NULL_BITMAP(job_entry->details->req_node_bitmap); |
| xfree(job_entry->details->req_node_layout); |
| xfree(job_entry->details->req_nodes); |
| xfree(job_entry->details->restart_dir); |
| xfree(job_entry->details->work_dir); |
| xfree(job_entry->details); /* Must be last */ |
| } |
| |
| /* _delete_job_desc_files - delete job descriptor related files */ |
| static void _delete_job_desc_files(uint32_t job_id) |
| { |
| char *dir_name, job_dir[20], *file_name; |
| struct stat sbuf; |
| |
| dir_name = slurm_get_state_save_location(); |
| |
| sprintf(job_dir, "/job.%u", job_id); |
| xstrcat(dir_name, job_dir); |
| |
| file_name = xstrdup(dir_name); |
| xstrcat(file_name, "/environment"); |
| (void) unlink(file_name); |
| xfree(file_name); |
| |
| file_name = xstrdup(dir_name); |
| xstrcat(file_name, "/script"); |
| (void) unlink(file_name); |
| xfree(file_name); |
| |
| if (stat(dir_name, &sbuf) == 0) /* remove job directory as needed */ |
| (void) rmdir(dir_name); |
| xfree(dir_name); |
| } |
| |
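/* Return the smaller of input_wait and the "max_switch_wait" value (in
 * seconds) from SchedulerParameters, re-reading the configuration when it
 * has changed since the last call */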
| static uint32_t _max_switch_wait(uint32_t input_wait) |
| { |
| static time_t sched_update = 0; |
| static uint32_t max_wait = 300; /* default max_switch_wait, seconds */ |
| char *sched_params, *tmp_ptr; |
| int i; |
| |
| if (sched_update != slurmctld_conf.last_update) { |
| sched_params = slurm_get_sched_params(); |
| if (sched_params && |
| (tmp_ptr = strstr(sched_params, "max_switch_wait="))) { |
| /* 0123456789012345 */ |
| i = atoi(tmp_ptr + 16); |
| if (i < 0) { |
| error("ignoring SchedulerParameters: " |
| "max_switch_wait of %d", i); |
| } else { |
| max_wait = i; |
| } |
| } |
| xfree(sched_params); |
| } |
| |
| if (max_wait > input_wait) |
| return input_wait; |
| return max_wait; |
| } |
| |
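/* Determine the QOS for a job and validate that the association may use it
 * (administrators bypass the association check). On success sets *error_code
 * to SLURM_SUCCESS and returns the QOS record; on failure sets *error_code
 * to ESLURM_INVALID_QOS and returns NULL. */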
| static slurmdb_qos_rec_t *_determine_and_validate_qos( |
| char *resv_name, slurmdb_association_rec_t *assoc_ptr, |
| bool admin, slurmdb_qos_rec_t *qos_rec, int *error_code) |
| { |
| slurmdb_qos_rec_t *qos_ptr = NULL; |
| |
	/* If associations are being enforced, make sure this is a valid
	   QOS for the association.  Otherwise just fill in the QOS and
	   continue. */
| |
| xassert(qos_rec); |
| |
| if (!qos_rec->name && !qos_rec->id) { |
| if (assoc_ptr && assoc_ptr->usage->valid_qos) { |
| if (assoc_ptr->def_qos_id) |
| qos_rec->id = assoc_ptr->def_qos_id; |
| else if (bit_set_count(assoc_ptr->usage->valid_qos) |
| == 1) |
| qos_rec->id = |
| bit_ffs(assoc_ptr->usage->valid_qos); |
| else if (assoc_mgr_root_assoc |
| && assoc_mgr_root_assoc->def_qos_id) |
| qos_rec->id = assoc_mgr_root_assoc->def_qos_id; |
| else |
| qos_rec->name = "normal"; |
| } else if (assoc_mgr_root_assoc |
| && assoc_mgr_root_assoc->def_qos_id) |
| qos_rec->id = assoc_mgr_root_assoc->def_qos_id; |
| else |
| qos_rec->name = "normal"; |
| } |
| |
| if (assoc_mgr_fill_in_qos(acct_db_conn, qos_rec, accounting_enforce, |
| &qos_ptr, 0) != SLURM_SUCCESS) { |
| error("Invalid qos (%s)", qos_rec->name); |
| *error_code = ESLURM_INVALID_QOS; |
| return NULL; |
| } |
| |
| if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS) |
| && assoc_ptr |
| && !admin |
| && (!assoc_ptr->usage->valid_qos |
| || !bit_test(assoc_ptr->usage->valid_qos, qos_rec->id))) { |
| error("This association %d(account='%s', " |
| "user='%s', partition='%s') does not have " |
| "access to qos %s", |
| assoc_ptr->id, assoc_ptr->acct, assoc_ptr->user, |
| assoc_ptr->partition, qos_rec->name); |
| *error_code = ESLURM_INVALID_QOS; |
| return NULL; |
| } |
| |
| if (qos_ptr && (qos_ptr->flags & QOS_FLAG_REQ_RESV) |
| && (!resv_name || resv_name[0] == '\0')) { |
| error("qos %s can only be used in a reservation", |
| qos_rec->name); |
| *error_code = ESLURM_INVALID_QOS; |
| return NULL; |
| } |
| |
| *error_code = SLURM_SUCCESS; |
| return qos_ptr; |
| } |
| |
| /* |
| * dump_all_job_state - save the state of all jobs to file for checkpoint |
| * Changes here should be reflected in load_last_job_id() and |
| * load_all_job_state(). |
| * RET 0 or error code */ |
| int dump_all_job_state(void) |
| { |
	/* Remember the buffer high-water mark so later calls allocate a
	 * large enough buffer up front, avoiding repeated growth and copies */
| static int high_buffer_size = (1024 * 1024); |
| int error_code = SLURM_SUCCESS, log_fd; |
| char *old_file, *new_file, *reg_file; |
| struct stat stat_buf; |
| /* Locks: Read config and job */ |
| slurmctld_lock_t job_read_lock = |
| { READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| Buf buffer = init_buf(high_buffer_size); |
| time_t min_age = 0, now = time(NULL); |
| time_t last_state_file_time; |
| DEF_TIMERS; |
| |
| START_TIMER; |
| /* Check that last state file was written at expected time. |
| * This is a check for two slurmctld daemons running at the same |
| * time in primary mode (a split-brain problem). */ |
| last_state_file_time = _get_last_state_write_time(); |
| if (last_file_write_time && last_state_file_time && |
| (last_file_write_time != last_state_file_time)) { |
| error("Bad job state save file time. We wrote it at time %u, " |
| "but the file contains a time stamp of %u.", |
| (uint32_t) last_file_write_time, |
| (uint32_t) last_state_file_time); |
| if (slurmctld_primary == 0) { |
| fatal("Two slurmctld daemons are running as primary. " |
| "Shutting down this daemon to avoid inconsistent " |
| "state due to split brain."); |
| } |
| } |
| |
| /* write header: version, time */ |
| packstr(JOB_STATE_VERSION, buffer); |
| pack16(SLURM_PROTOCOL_VERSION, buffer); |
| pack_time(now, buffer); |
| |
| if (slurmctld_conf.min_job_age > 0) |
| min_age = now - slurmctld_conf.min_job_age; |
| |
| /* |
| * write header: job id |
| * This is needed so that the job id remains persistent even after |
| * slurmctld is restarted. |
| */ |
| pack32( job_id_sequence, buffer); |
| |
| debug3("Writing job id %u to header record of job_state file", |
| job_id_sequence); |
| |
| /* write individual job records */ |
| lock_slurmctld(job_read_lock); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| xassert (job_ptr->magic == JOB_MAGIC); |
| if ((min_age > 0) && (job_ptr->end_time < min_age) && |
| (! IS_JOB_COMPLETING(job_ptr)) && IS_JOB_FINISHED(job_ptr)) |
| continue; /* job ready for purging, don't dump */ |
| |
| _dump_job_state(job_ptr, buffer); |
| } |
| list_iterator_destroy(job_iterator); |
| |
| /* write the buffer to file */ |
| old_file = xstrdup(slurmctld_conf.state_save_location); |
| xstrcat(old_file, "/job_state.old"); |
| reg_file = xstrdup(slurmctld_conf.state_save_location); |
| xstrcat(reg_file, "/job_state"); |
| new_file = xstrdup(slurmctld_conf.state_save_location); |
| xstrcat(new_file, "/job_state.new"); |
| unlock_slurmctld(job_read_lock); |
| |
| if (stat(reg_file, &stat_buf) == 0) { |
| static time_t last_mtime = (time_t) 0; |
| int delta_t = difftime(stat_buf.st_mtime, last_mtime); |
| if (delta_t < -10) { |
| error("The modification time of %s moved backwards " |
| "by %d seconds", |
| reg_file, (0-delta_t)); |
| error("The clock of the file system and this computer " |
| "appear to not be synchronized"); |
| /* It could be safest to exit here. We likely mounted |
| * a different file system with the state save files */ |
| } |
| last_mtime = time(NULL); |
| } |
| |
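	/* Write the new state to job_state.new, then rotate the files:
	 * job_state -> job_state.old and job_state.new -> job_state. The
	 * link/unlink sequence preserves the previous copy in case this
	 * write is interrupted. */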
| lock_state_files(); |
| log_fd = creat(new_file, 0600); |
| if (log_fd < 0) { |
| error("Can't save state, create file %s error %m", |
| new_file); |
| error_code = errno; |
| } else { |
| int pos = 0, nwrite, amount, rc; |
| char *data; |
| |
| fd_set_close_on_exec(log_fd); |
| nwrite = get_buf_offset(buffer); |
| data = (char *)get_buf_data(buffer); |
| high_buffer_size = MAX(nwrite, high_buffer_size); |
		while (nwrite > 0) {
			amount = write(log_fd, &data[pos], nwrite);
			if (amount < 0) {
				if (errno == EINTR)
					continue; /* interrupted, retry write */
				error("Error writing file %s, %m", new_file);
				error_code = errno;
				break;
			}
			nwrite -= amount;
			pos += amount;
		}
| |
| rc = fsync_and_close(log_fd, "job"); |
| if (rc && !error_code) |
| error_code = rc; |
| } |
| if (error_code) |
| (void) unlink(new_file); |
| else { /* file shuffle */ |
| (void) unlink(old_file); |
| if (link(reg_file, old_file)) |
| debug4("unable to create link for %s -> %s: %m", |
| reg_file, old_file); |
| (void) unlink(reg_file); |
| if (link(new_file, reg_file)) |
| debug4("unable to create link for %s -> %s: %m", |
| new_file, reg_file); |
| (void) unlink(new_file); |
| last_file_write_time = now; |
| } |
| xfree(old_file); |
| xfree(reg_file); |
| xfree(new_file); |
| unlock_state_files(); |
| |
| free_buf(buffer); |
| END_TIMER2("dump_all_job_state"); |
| return error_code; |
| } |
| |
/* Open the job state save file, or its backup if necessary.
 * state_file IN - the name of the state save file used
 * RET the file descriptor to read from or an error code
| */ |
| static int _open_job_state_file(char **state_file) |
| { |
| int state_fd; |
| struct stat stat_buf; |
| |
| *state_file = slurm_get_state_save_location(); |
| xstrcat(*state_file, "/job_state"); |
| state_fd = open(*state_file, O_RDONLY); |
| if (state_fd < 0) { |
| error("Could not open job state file %s: %m", *state_file); |
| } else if (fstat(state_fd, &stat_buf) < 0) { |
| error("Could not stat job state file %s: %m", *state_file); |
| (void) close(state_fd); |
| } else if (stat_buf.st_size < 10) { |
| error("Job state file %s too small", *state_file); |
| (void) close(state_fd); |
| } else /* Success */ |
| return state_fd; |
| |
| error("NOTE: Trying backup state save file. Jobs may be lost!"); |
| xstrcat(*state_file, ".old"); |
| state_fd = open(*state_file, O_RDONLY); |
| return state_fd; |
| } |
| |
| /* Note that the backup slurmctld has assumed primary control. |
| * This function can be called multiple times. */ |
| extern void backup_slurmctld_restart(void) |
| { |
| last_file_write_time = (time_t) 0; |
| } |
| |
| /* Return the time stamp in the current job state save file */ |
| static time_t _get_last_state_write_time(void) |
| { |
| int data_allocated, data_read = 0, error_code = SLURM_SUCCESS; |
| uint32_t data_size = 0; |
| int state_fd; |
| char *data, *state_file; |
| Buf buffer; |
| time_t buf_time = (time_t) 0; |
| char *ver_str = NULL; |
| uint32_t ver_str_len; |
| uint16_t protocol_version = (uint16_t)NO_VAL; |
| |
| /* read the file */ |
| state_fd = _open_job_state_file(&state_file); |
| if (state_fd < 0) { |
| info("No job state file (%s) found", state_file); |
| error_code = ENOENT; |
| } else { |
| data_allocated = 128; |
| data = xmalloc(data_allocated); |
| while (1) { |
| data_read = read(state_fd, &data[data_size], |
| (data_allocated - data_size)); |
| if (data_read < 0) { |
| if (errno == EINTR) |
| continue; |
| else { |
| error("Read error on %s: %m", |
| state_file); |
| break; |
| } |
| } else if (data_read == 0) /* eof */ |
| break; |
| data_size += data_read; |
| if (data_size >= 128) |
| break; |
| } |
| close(state_fd); |
| } |
| xfree(state_file); |
| if (error_code) |
| return error_code; |
| |
| buffer = create_buf(data, data_size); |
| safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer); |
| if (ver_str) { |
| if (!strcmp(ver_str, JOB_STATE_VERSION)) |
| safe_unpack16(&protocol_version, buffer); |
| else if (!strcmp(ver_str, JOB_2_6_STATE_VERSION)) |
| protocol_version = SLURM_2_6_PROTOCOL_VERSION; |
| else if (!strcmp(ver_str, JOB_2_5_STATE_VERSION)) |
| protocol_version = SLURM_2_5_PROTOCOL_VERSION; |
| } |
| safe_unpack_time(&buf_time, buffer); |
| |
| unpack_error: |
| xfree(ver_str); |
| free_buf(buffer); |
| return buf_time; |
| } |
| |
| /* |
| * load_all_job_state - load the job state from file, recover from last |
| * checkpoint. Execute this after loading the configuration file data. |
| * Changes here should be reflected in load_last_job_id(). |
| * RET 0 or error code |
| */ |
| extern int load_all_job_state(void) |
| { |
| int data_allocated, data_read = 0, error_code = SLURM_SUCCESS; |
| uint32_t data_size = 0; |
| int state_fd, job_cnt = 0; |
| char *data = NULL, *state_file; |
| Buf buffer; |
| time_t buf_time; |
| uint32_t saved_job_id; |
| char *ver_str = NULL; |
| uint32_t ver_str_len; |
| uint16_t protocol_version = (uint16_t)NO_VAL; |
| |
| /* read the file */ |
| lock_state_files(); |
| state_fd = _open_job_state_file(&state_file); |
| if (state_fd < 0) { |
| info("No job state file (%s) to recover", state_file); |
| error_code = ENOENT; |
| } else { |
| data_allocated = BUF_SIZE; |
| data = xmalloc(data_allocated); |
| while (1) { |
| data_read = read(state_fd, &data[data_size], |
| BUF_SIZE); |
| if (data_read < 0) { |
| if (errno == EINTR) |
| continue; |
| else { |
| error("Read error on %s: %m", |
| state_file); |
| break; |
| } |
| } else if (data_read == 0) /* eof */ |
| break; |
| data_size += data_read; |
| data_allocated += data_read; |
| xrealloc(data, data_allocated); |
| } |
| close(state_fd); |
| } |
| xfree(state_file); |
| unlock_state_files(); |
| |
| job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id); |
| if (error_code) |
| return error_code; |
| |
| buffer = create_buf(data, data_size); |
| safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer); |
| debug3("Version string in job_state header is %s", ver_str); |
| if (ver_str) { |
| if (!strcmp(ver_str, JOB_STATE_VERSION)) |
| safe_unpack16(&protocol_version, buffer); |
| else if (!strcmp(ver_str, JOB_2_6_STATE_VERSION)) |
| protocol_version = SLURM_2_6_PROTOCOL_VERSION; |
| else if (!strcmp(ver_str, JOB_2_5_STATE_VERSION)) |
| protocol_version = SLURM_2_5_PROTOCOL_VERSION; |
| } |
| |
| if (protocol_version == (uint16_t)NO_VAL) { |
| error("***********************************************"); |
| error("Can not recover job state, incompatible version"); |
| error("***********************************************"); |
| xfree(ver_str); |
| free_buf(buffer); |
| return EFAULT; |
| } |
| xfree(ver_str); |
| |
| safe_unpack_time(&buf_time, buffer); |
| safe_unpack32( &saved_job_id, buffer); |
| job_id_sequence = MAX(saved_job_id, job_id_sequence); |
| debug3("Job id in job_state header is %u", saved_job_id); |
| |
| while (remaining_buf(buffer) > 0) { |
| error_code = _load_job_state(buffer, protocol_version); |
| if (error_code != SLURM_SUCCESS) |
| goto unpack_error; |
| job_cnt++; |
| } |
| debug3("Set job_id_sequence to %u", job_id_sequence); |
| |
| free_buf(buffer); |
| info("Recovered information about %d jobs", job_cnt); |
| return error_code; |
| |
| unpack_error: |
| error("Incomplete job data checkpoint file"); |
| info("Recovered information about %d jobs", job_cnt); |
| free_buf(buffer); |
| return SLURM_FAILURE; |
| } |
| |
| /* |
| * load_last_job_id - load only the last job ID from state save file. |
| * Changes here should be reflected in load_all_job_state(). |
| * RET 0 or error code |
| */ |
| extern int load_last_job_id( void ) |
| { |
| int data_allocated, data_read = 0, error_code = SLURM_SUCCESS; |
| uint32_t data_size = 0; |
| int state_fd; |
| char *data = NULL, *state_file; |
| Buf buffer; |
| time_t buf_time; |
| char *ver_str = NULL; |
| uint32_t ver_str_len; |
| uint16_t protocol_version = (uint16_t)NO_VAL; |
| |
| /* read the file */ |
| state_file = slurm_get_state_save_location(); |
| xstrcat(state_file, "/job_state"); |
| lock_state_files(); |
| state_fd = open(state_file, O_RDONLY); |
| if (state_fd < 0) { |
| debug("No job state file (%s) to recover", state_file); |
| error_code = ENOENT; |
| } else { |
| data_allocated = BUF_SIZE; |
| data = xmalloc(data_allocated); |
| while (1) { |
| data_read = read(state_fd, &data[data_size], |
| BUF_SIZE); |
| if (data_read < 0) { |
| if (errno == EINTR) |
| continue; |
| else { |
| error("Read error on %s: %m", |
| state_file); |
| break; |
| } |
| } else if (data_read == 0) /* eof */ |
| break; |
| data_size += data_read; |
| data_allocated += data_read; |
| xrealloc(data, data_allocated); |
| } |
| close(state_fd); |
| } |
| xfree(state_file); |
| unlock_state_files(); |
| |
| if (error_code) |
| return error_code; |
| |
| buffer = create_buf(data, data_size); |
| safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer); |
| debug3("Version string in job_state header is %s", ver_str); |
| if (ver_str) { |
| if (!strcmp(ver_str, JOB_STATE_VERSION)) |
| safe_unpack16(&protocol_version, buffer); |
| else if (!strcmp(ver_str, JOB_2_6_STATE_VERSION)) |
| protocol_version = SLURM_2_6_PROTOCOL_VERSION; |
| else if (!strcmp(ver_str, JOB_2_5_STATE_VERSION)) |
| protocol_version = SLURM_2_5_PROTOCOL_VERSION; |
| } |
| xfree(ver_str); |
| |
| if (protocol_version == (uint16_t)NO_VAL) { |
| debug("*************************************************"); |
| debug("Can not recover last job ID, incompatible version"); |
| debug("*************************************************"); |
| free_buf(buffer); |
| return EFAULT; |
| } |
| |
| safe_unpack_time(&buf_time, buffer); |
| safe_unpack32( &job_id_sequence, buffer); |
| debug3("Job ID in job_state header is %u", job_id_sequence); |
| |
| /* Ignore the state for individual jobs stored here */ |
| |
| free_buf(buffer); |
| return error_code; |
| |
| unpack_error: |
| debug("Invalid job data checkpoint file"); |
| free_buf(buffer); |
| return SLURM_FAILURE; |
| } |
| |
| /* |
| * _dump_job_state - dump the state of a specific job, its details, and |
| * steps to a buffer |
| * IN dump_job_ptr - pointer to job for which information is requested |
| * IN/OUT buffer - location to store data, pointers automatically advanced |
| */ |
| static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer) |
| { |
| struct job_details *detail_ptr; |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| |
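	/* NOTE: Fields must be packed here in the same order in which
	 * _load_job_state() unpacks them for the current protocol version */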
| /* Dump basic job info */ |
| pack32(dump_job_ptr->array_job_id, buffer); |
| pack32(dump_job_ptr->array_task_id, buffer); |
| pack32(dump_job_ptr->assoc_id, buffer); |
| pack32(dump_job_ptr->job_id, buffer); |
| pack32(dump_job_ptr->user_id, buffer); |
| pack32(dump_job_ptr->group_id, buffer); |
| pack32(dump_job_ptr->time_limit, buffer); |
| pack32(dump_job_ptr->time_min, buffer); |
| pack32(dump_job_ptr->priority, buffer); |
| pack32(dump_job_ptr->alloc_sid, buffer); |
| pack32(dump_job_ptr->total_cpus, buffer); |
| if (dump_job_ptr->total_nodes) |
| pack32(dump_job_ptr->total_nodes, buffer); |
| else |
| pack32(dump_job_ptr->node_cnt_wag, buffer); |
| pack32(dump_job_ptr->cpu_cnt, buffer); |
| pack32(dump_job_ptr->exit_code, buffer); |
| pack32(dump_job_ptr->derived_ec, buffer); |
| pack32(dump_job_ptr->db_index, buffer); |
| pack32(dump_job_ptr->resv_id, buffer); |
| pack32(dump_job_ptr->next_step_id, buffer); |
| pack32(dump_job_ptr->qos_id, buffer); |
| pack32(dump_job_ptr->req_switch, buffer); |
| pack32(dump_job_ptr->wait4switch, buffer); |
| pack32(dump_job_ptr->profile, buffer); |
| |
| pack_time(dump_job_ptr->preempt_time, buffer); |
| pack_time(dump_job_ptr->start_time, buffer); |
| pack_time(dump_job_ptr->end_time, buffer); |
| pack_time(dump_job_ptr->suspend_time, buffer); |
| pack_time(dump_job_ptr->pre_sus_time, buffer); |
| pack_time(dump_job_ptr->resize_time, buffer); |
| pack_time(dump_job_ptr->tot_sus_time, buffer); |
| |
| pack16(dump_job_ptr->direct_set_prio, buffer); |
| pack16(dump_job_ptr->job_state, buffer); |
| pack16(dump_job_ptr->kill_on_node_fail, buffer); |
| pack16(dump_job_ptr->batch_flag, buffer); |
| pack16(dump_job_ptr->mail_type, buffer); |
| pack16(dump_job_ptr->state_reason, buffer); |
| pack16(dump_job_ptr->restart_cnt, buffer); |
| pack16(dump_job_ptr->wait_all_nodes, buffer); |
| pack16(dump_job_ptr->warn_flags, buffer); |
| pack16(dump_job_ptr->warn_signal, buffer); |
| pack16(dump_job_ptr->warn_time, buffer); |
| pack16(dump_job_ptr->limit_set_max_cpus, buffer); |
| pack16(dump_job_ptr->limit_set_max_nodes, buffer); |
| pack16(dump_job_ptr->limit_set_min_cpus, buffer); |
| pack16(dump_job_ptr->limit_set_min_nodes, buffer); |
| pack16(dump_job_ptr->limit_set_pn_min_memory, buffer); |
| pack16(dump_job_ptr->limit_set_time, buffer); |
| pack16(dump_job_ptr->limit_set_qos, buffer); |
| |
| packstr(dump_job_ptr->state_desc, buffer); |
| packstr(dump_job_ptr->resp_host, buffer); |
| |
| pack16(dump_job_ptr->alloc_resp_port, buffer); |
| pack16(dump_job_ptr->other_port, buffer); |
| |
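	/* For a job still completing, save the list of nodes that have not
	 * yet finished cleanup so it survives a slurmctld restart */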
| if (IS_JOB_COMPLETING(dump_job_ptr)) { |
| if (dump_job_ptr->nodes_completing == NULL) { |
| dump_job_ptr->nodes_completing = |
| bitmap2node_name(dump_job_ptr->node_bitmap); |
| } |
| packstr(dump_job_ptr->nodes_completing, buffer); |
| } |
| packstr(dump_job_ptr->nodes, buffer); |
| packstr(dump_job_ptr->partition, buffer); |
| packstr(dump_job_ptr->name, buffer); |
| packstr(dump_job_ptr->wckey, buffer); |
| packstr(dump_job_ptr->alloc_node, buffer); |
| packstr(dump_job_ptr->account, buffer); |
| packstr(dump_job_ptr->comment, buffer); |
| packstr(dump_job_ptr->gres, buffer); |
| packstr(dump_job_ptr->gres_alloc, buffer); |
| packstr(dump_job_ptr->gres_req, buffer); |
| packstr(dump_job_ptr->gres_used, buffer); |
| packstr(dump_job_ptr->network, buffer); |
| packstr(dump_job_ptr->licenses, buffer); |
| packstr(dump_job_ptr->mail_user, buffer); |
| packstr(dump_job_ptr->resv_name, buffer); |
| packstr(dump_job_ptr->batch_host, buffer); |
| |
| select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo, |
| buffer, SLURM_PROTOCOL_VERSION); |
| pack_job_resources(dump_job_ptr->job_resrcs, buffer, |
| SLURM_PROTOCOL_VERSION); |
| |
| pack16(dump_job_ptr->ckpt_interval, buffer); |
| checkpoint_pack_jobinfo(dump_job_ptr->check_job, buffer, |
| SLURM_PROTOCOL_VERSION); |
| packstr_array(dump_job_ptr->spank_job_env, |
| dump_job_ptr->spank_job_env_size, buffer); |
| |
| (void) gres_plugin_job_state_pack(dump_job_ptr->gres_list, buffer, |
| dump_job_ptr->job_id, true, |
| SLURM_PROTOCOL_VERSION); |
| |
| /* Dump job details, if available */ |
| detail_ptr = dump_job_ptr->details; |
| if (detail_ptr) { |
| xassert (detail_ptr->magic == DETAILS_MAGIC); |
| pack16((uint16_t) DETAILS_FLAG, buffer); |
| _dump_job_details(detail_ptr, buffer); |
| } else |
| pack16((uint16_t) 0, buffer); /* no details flag */ |
| |
| /* Dump job steps */ |
| step_iterator = list_iterator_create(dump_job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) |
| list_next(step_iterator))) { |
| if (step_ptr->state < JOB_RUNNING) |
| continue; |
| pack16((uint16_t) STEP_FLAG, buffer); |
| dump_job_step_state(dump_job_ptr, step_ptr, buffer); |
| } |
| list_iterator_destroy(step_iterator); |
| pack16((uint16_t) 0, buffer); /* no step flag */ |
| } |
| |
| /* Unpack a job's state information from a buffer */ |
| static int _load_job_state(Buf buffer, uint16_t protocol_version) |
| { |
| uint32_t job_id, user_id, group_id, time_limit, priority, alloc_sid; |
| uint32_t exit_code, assoc_id, db_index, name_len, time_min; |
| uint32_t next_step_id, total_cpus, total_nodes = 0, cpu_cnt; |
| uint32_t resv_id, spank_job_env_size = 0, qos_id, derived_ec = 0; |
| uint32_t array_job_id = 0, req_switch = 0, wait4switch = 0; |
| uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET; |
| time_t start_time, end_time, suspend_time, pre_sus_time, tot_sus_time; |
| time_t preempt_time = 0; |
| time_t resize_time = 0, now = time(NULL); |
| uint32_t array_task_id = NO_VAL; |
| uint16_t job_state, details, batch_flag, step_flag; |
| uint16_t kill_on_node_fail, direct_set_prio; |
| uint16_t alloc_resp_port, other_port, mail_type, state_reason; |
| uint16_t restart_cnt, ckpt_interval; |
| uint16_t wait_all_nodes, warn_flags = 0, warn_signal, warn_time; |
| uint16_t limit_set_max_cpus = 0, limit_set_max_nodes = 0; |
| uint16_t limit_set_min_cpus = 0, limit_set_min_nodes = 0; |
| uint16_t limit_set_pn_min_memory = 0; |
| uint16_t limit_set_time = 0, limit_set_qos = 0; |
| uint16_t uint16_tmp; |
| char *nodes = NULL, *partition = NULL, *name = NULL, *resp_host = NULL; |
| char *account = NULL, *network = NULL, *mail_user = NULL; |
| char *comment = NULL, *nodes_completing = NULL, *alloc_node = NULL; |
| char *licenses = NULL, *state_desc = NULL, *wckey = NULL; |
| char *resv_name = NULL, *gres = NULL, *batch_host = NULL; |
| char *gres_alloc = NULL, *gres_req = NULL, *gres_used = NULL; |
| char **spank_job_env = (char **) NULL; |
| List gres_list = NULL, part_ptr_list = NULL; |
| struct job_record *job_ptr = NULL; |
| struct part_record *part_ptr; |
| int error_code, i, qos_error; |
| dynamic_plugin_data_t *select_jobinfo = NULL; |
| job_resources_t *job_resources = NULL; |
| check_jobinfo_t check_job = NULL; |
| slurmdb_association_rec_t assoc_rec; |
| slurmdb_qos_rec_t qos_rec; |
| bool job_finished = false; |
| |
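	/* Unpack fields in the same order they were packed by
	 * _dump_job_state(), selecting the layout that matches the protocol
	 * version recorded in the state file header */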
| if (protocol_version >= SLURM_14_03_PROTOCOL_VERSION) { |
| safe_unpack32(&array_job_id, buffer); |
| safe_unpack32(&array_task_id, buffer); |
| safe_unpack32(&assoc_id, buffer); |
| safe_unpack32(&job_id, buffer); |
| |
		/* perform what validity testing we can */
| if (job_id == 0) { |
| verbose("Invalid job_id %u", job_id); |
| goto unpack_error; |
| } |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| job_ptr = create_job_record(&error_code); |
| if (error_code) { |
| error("Create job entry failed for job_id %u", |
| job_id); |
| goto unpack_error; |
| } |
| job_ptr->job_id = job_id; |
| job_ptr->array_job_id = array_job_id; |
| job_ptr->array_task_id = array_task_id; |
| _add_job_hash(job_ptr); |
| _add_job_array_hash(job_ptr); |
| } |
| |
| safe_unpack32(&user_id, buffer); |
| safe_unpack32(&group_id, buffer); |
| safe_unpack32(&time_limit, buffer); |
| safe_unpack32(&time_min, buffer); |
| safe_unpack32(&priority, buffer); |
| safe_unpack32(&alloc_sid, buffer); |
| safe_unpack32(&total_cpus, buffer); |
| safe_unpack32(&total_nodes, buffer); |
| safe_unpack32(&cpu_cnt, buffer); |
| safe_unpack32(&exit_code, buffer); |
| safe_unpack32(&derived_ec, buffer); |
| safe_unpack32(&db_index, buffer); |
| safe_unpack32(&resv_id, buffer); |
| safe_unpack32(&next_step_id, buffer); |
| safe_unpack32(&qos_id, buffer); |
| safe_unpack32(&req_switch, buffer); |
| safe_unpack32(&wait4switch, buffer); |
| safe_unpack32(&profile, buffer); |
| |
| safe_unpack_time(&preempt_time, buffer); |
| safe_unpack_time(&start_time, buffer); |
| safe_unpack_time(&end_time, buffer); |
| safe_unpack_time(&suspend_time, buffer); |
| safe_unpack_time(&pre_sus_time, buffer); |
| safe_unpack_time(&resize_time, buffer); |
| safe_unpack_time(&tot_sus_time, buffer); |
| |
| safe_unpack16(&direct_set_prio, buffer); |
| safe_unpack16(&job_state, buffer); |
| safe_unpack16(&kill_on_node_fail, buffer); |
| safe_unpack16(&batch_flag, buffer); |
| safe_unpack16(&mail_type, buffer); |
| safe_unpack16(&state_reason, buffer); |
| safe_unpack16(&restart_cnt, buffer); |
| safe_unpack16(&wait_all_nodes, buffer); |
| safe_unpack16(&warn_flags, buffer); |
| safe_unpack16(&warn_signal, buffer); |
| safe_unpack16(&warn_time, buffer); |
| safe_unpack16(&limit_set_max_cpus, buffer); |
| safe_unpack16(&limit_set_max_nodes, buffer); |
| safe_unpack16(&limit_set_min_cpus, buffer); |
| safe_unpack16(&limit_set_min_nodes, buffer); |
| safe_unpack16(&limit_set_pn_min_memory, buffer); |
| safe_unpack16(&limit_set_time, buffer); |
| safe_unpack16(&limit_set_qos, buffer); |
| |
| safe_unpackstr_xmalloc(&state_desc, &name_len, buffer); |
| safe_unpackstr_xmalloc(&resp_host, &name_len, buffer); |
| |
| safe_unpack16(&alloc_resp_port, buffer); |
| safe_unpack16(&other_port, buffer); |
| |
| if (job_state & JOB_COMPLETING) { |
| safe_unpackstr_xmalloc(&nodes_completing, |
| &name_len, buffer); |
| } |
| safe_unpackstr_xmalloc(&nodes, &name_len, buffer); |
| safe_unpackstr_xmalloc(&partition, &name_len, buffer); |
| if (partition == NULL) { |
| error("No partition for job %u", job_id); |
| goto unpack_error; |
| } |
| part_ptr = find_part_record (partition); |
| if (part_ptr == NULL) { |
| part_ptr_list = get_part_list(partition); |
| if (part_ptr_list) { |
| part_ptr = list_peek(part_ptr_list); |
| } else { |
| verbose("Invalid partition (%s) for job_id %u", |
| partition, job_id); |
| /* not fatal error, partition could have been |
				 * removed, reset_job_bitmaps() will clean up
| * this job */ |
| } |
| } |
| |
| safe_unpackstr_xmalloc(&name, &name_len, buffer); |
| safe_unpackstr_xmalloc(&wckey, &name_len, buffer); |
| safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer); |
| safe_unpackstr_xmalloc(&account, &name_len, buffer); |
| safe_unpackstr_xmalloc(&comment, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres_req, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres_used, &name_len, buffer); |
| safe_unpackstr_xmalloc(&network, &name_len, buffer); |
| safe_unpackstr_xmalloc(&licenses, &name_len, buffer); |
| safe_unpackstr_xmalloc(&mail_user, &name_len, buffer); |
| safe_unpackstr_xmalloc(&resv_name, &name_len, buffer); |
| safe_unpackstr_xmalloc(&batch_host, &name_len, buffer); |
| |
| if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer, |
| protocol_version)) |
| goto unpack_error; |
| if (unpack_job_resources(&job_resources, buffer, |
| protocol_version)) |
| goto unpack_error; |
| |
| safe_unpack16(&ckpt_interval, buffer); |
| if (checkpoint_alloc_jobinfo(&check_job) || |
| checkpoint_unpack_jobinfo(check_job, buffer, |
| protocol_version)) |
| goto unpack_error; |
| |
| safe_unpackstr_array(&spank_job_env, &spank_job_env_size, |
| buffer); |
| |
| if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id, |
| protocol_version) != |
| SLURM_SUCCESS) |
| goto unpack_error; |
| gres_plugin_job_state_log(gres_list, job_id); |
| |
| safe_unpack16(&details, buffer); |
| if ((details == DETAILS_FLAG) && |
| (_load_job_details(job_ptr, buffer, protocol_version))) { |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_SYSTEM; |
| xfree(job_ptr->state_desc); |
| job_ptr->end_time = now; |
| goto unpack_error; |
| } |
| safe_unpack16(&step_flag, buffer); |
| |
| while (step_flag == STEP_FLAG) { |
			/* No need to put these into accounting if they
			 * haven't been already, since all information
			 * will be added when the job finishes.
			 */
| if ((error_code = load_step_state(job_ptr, buffer, |
| protocol_version))) |
| goto unpack_error; |
| safe_unpack16(&step_flag, buffer); |
| } |
| } else if (protocol_version >= SLURM_2_6_PROTOCOL_VERSION) { |
| safe_unpack32(&array_job_id, buffer); |
| safe_unpack16(&uint16_tmp, buffer); |
| if (uint16_tmp == (uint16_t) NO_VAL) |
| array_task_id = NO_VAL; |
| else |
| array_task_id = (uint32_t) uint16_tmp; |
| safe_unpack32(&assoc_id, buffer); |
| safe_unpack32(&job_id, buffer); |
| |
		/* perform what validity testing we can */
| if (job_id == 0) { |
| verbose("Invalid job_id %u", job_id); |
| goto unpack_error; |
| } |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| job_ptr = create_job_record(&error_code); |
| if (error_code) { |
| error("Create job entry failed for job_id %u", |
| job_id); |
| goto unpack_error; |
| } |
| job_ptr->job_id = job_id; |
| job_ptr->array_job_id = array_job_id; |
| job_ptr->array_task_id = array_task_id; |
| _add_job_hash(job_ptr); |
| _add_job_array_hash(job_ptr); |
| } |
| |
| safe_unpack32(&user_id, buffer); |
| safe_unpack32(&group_id, buffer); |
| safe_unpack32(&time_limit, buffer); |
| safe_unpack32(&time_min, buffer); |
| safe_unpack32(&priority, buffer); |
| safe_unpack32(&alloc_sid, buffer); |
| safe_unpack32(&total_cpus, buffer); |
| safe_unpack32(&total_nodes, buffer); |
| safe_unpack32(&cpu_cnt, buffer); |
| safe_unpack32(&exit_code, buffer); |
| safe_unpack32(&derived_ec, buffer); |
| safe_unpack32(&db_index, buffer); |
| safe_unpack32(&resv_id, buffer); |
| safe_unpack32(&next_step_id, buffer); |
| safe_unpack32(&qos_id, buffer); |
| safe_unpack32(&req_switch, buffer); |
| safe_unpack32(&wait4switch, buffer); |
| safe_unpack32(&profile, buffer); |
| |
| safe_unpack_time(&preempt_time, buffer); |
| safe_unpack_time(&start_time, buffer); |
| safe_unpack_time(&end_time, buffer); |
| safe_unpack_time(&suspend_time, buffer); |
| safe_unpack_time(&pre_sus_time, buffer); |
| safe_unpack_time(&resize_time, buffer); |
| safe_unpack_time(&tot_sus_time, buffer); |
| |
| safe_unpack16(&direct_set_prio, buffer); |
| safe_unpack16(&job_state, buffer); |
| safe_unpack16(&kill_on_node_fail, buffer); |
| safe_unpack16(&batch_flag, buffer); |
| safe_unpack16(&mail_type, buffer); |
| safe_unpack16(&state_reason, buffer); |
| safe_unpack16(&restart_cnt, buffer); |
| safe_unpack16(&uint16_tmp, buffer); /* Was resv_flags */ |
| safe_unpack16(&wait_all_nodes, buffer); |
| safe_unpack16(&warn_signal, buffer); |
| safe_unpack16(&warn_time, buffer); |
| safe_unpack16(&limit_set_max_cpus, buffer); |
| safe_unpack16(&limit_set_max_nodes, buffer); |
| safe_unpack16(&limit_set_min_cpus, buffer); |
| safe_unpack16(&limit_set_min_nodes, buffer); |
| safe_unpack16(&limit_set_pn_min_memory, buffer); |
| safe_unpack16(&limit_set_time, buffer); |
| safe_unpack16(&limit_set_qos, buffer); |
| |
| safe_unpackstr_xmalloc(&state_desc, &name_len, buffer); |
| safe_unpackstr_xmalloc(&resp_host, &name_len, buffer); |
| |
| safe_unpack16(&alloc_resp_port, buffer); |
| safe_unpack16(&other_port, buffer); |
| |
| if (job_state & JOB_COMPLETING) { |
| safe_unpackstr_xmalloc(&nodes_completing, |
| &name_len, buffer); |
| } |
| safe_unpackstr_xmalloc(&nodes, &name_len, buffer); |
| safe_unpackstr_xmalloc(&partition, &name_len, buffer); |
| if (partition == NULL) { |
| error("No partition for job %u", job_id); |
| goto unpack_error; |
| } |
| part_ptr = find_part_record (partition); |
| if (part_ptr == NULL) { |
| part_ptr_list = get_part_list(partition); |
| if (part_ptr_list) { |
| part_ptr = list_peek(part_ptr_list); |
| } else { |
| verbose("Invalid partition (%s) for job_id %u", |
| partition, job_id); |
| /* not fatal error, partition could have been |
				 * removed, reset_job_bitmaps() will clean up
| * this job */ |
| } |
| } |
| |
| safe_unpackstr_xmalloc(&name, &name_len, buffer); |
| safe_unpackstr_xmalloc(&wckey, &name_len, buffer); |
| safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer); |
| safe_unpackstr_xmalloc(&account, &name_len, buffer); |
| safe_unpackstr_xmalloc(&comment, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres_req, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres_used, &name_len, buffer); |
| safe_unpackstr_xmalloc(&network, &name_len, buffer); |
| safe_unpackstr_xmalloc(&licenses, &name_len, buffer); |
| safe_unpackstr_xmalloc(&mail_user, &name_len, buffer); |
| safe_unpackstr_xmalloc(&resv_name, &name_len, buffer); |
| safe_unpackstr_xmalloc(&batch_host, &name_len, buffer); |
| |
| if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer, |
| protocol_version)) |
| goto unpack_error; |
| if (unpack_job_resources(&job_resources, buffer, |
| protocol_version)) |
| goto unpack_error; |
| |
| safe_unpack16(&ckpt_interval, buffer); |
| if (checkpoint_alloc_jobinfo(&check_job) || |
| checkpoint_unpack_jobinfo(check_job, buffer, |
| protocol_version)) |
| goto unpack_error; |
| |
| safe_unpackstr_array(&spank_job_env, &spank_job_env_size, |
| buffer); |
| |
| if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id, |
| protocol_version) != |
| SLURM_SUCCESS) |
| goto unpack_error; |
| gres_plugin_job_state_log(gres_list, job_id); |
| |
| safe_unpack16(&details, buffer); |
| if ((details == DETAILS_FLAG) && |
| (_load_job_details(job_ptr, buffer, protocol_version))) { |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_SYSTEM; |
| xfree(job_ptr->state_desc); |
| job_ptr->end_time = now; |
| goto unpack_error; |
| } |
| safe_unpack16(&step_flag, buffer); |
| |
| while (step_flag == STEP_FLAG) { |
			/* No need to put these into accounting if they
			 * haven't been already, since all information
			 * will be added when the job finishes.
			 */
| if ((error_code = load_step_state(job_ptr, buffer, |
| protocol_version))) |
| goto unpack_error; |
| safe_unpack16(&step_flag, buffer); |
| } |
| } else if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) { |
		/* NOTE: As of 12/18/12 the job state format of v2.5 and v2.6
		 * is the same, but the step state formats differ */
| safe_unpack32(&assoc_id, buffer); |
| safe_unpack32(&job_id, buffer); |
| |
		/* perform what validity testing we can */
| if (job_id == 0) { |
| verbose("Invalid job_id %u", job_id); |
| goto unpack_error; |
| } |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| job_ptr = create_job_record(&error_code); |
| if (error_code) { |
| error("Create job entry failed for job_id %u", |
| job_id); |
| goto unpack_error; |
| } |
| job_ptr->job_id = job_id; |
| job_ptr->array_job_id = array_job_id; |
| job_ptr->array_task_id = array_task_id; |
| _add_job_hash(job_ptr); |
| _add_job_array_hash(job_ptr); |
| } |
| |
| safe_unpack32(&user_id, buffer); |
| safe_unpack32(&group_id, buffer); |
| safe_unpack32(&time_limit, buffer); |
| safe_unpack32(&time_min, buffer); |
| safe_unpack32(&priority, buffer); |
| safe_unpack32(&alloc_sid, buffer); |
| safe_unpack32(&total_cpus, buffer); |
| safe_unpack32(&total_nodes, buffer); |
| safe_unpack32(&cpu_cnt, buffer); |
| safe_unpack32(&exit_code, buffer); |
| safe_unpack32(&derived_ec, buffer); |
| safe_unpack32(&db_index, buffer); |
| safe_unpack32(&resv_id, buffer); |
| safe_unpack32(&next_step_id, buffer); |
| safe_unpack32(&qos_id, buffer); |
| safe_unpack32(&req_switch, buffer); |
| safe_unpack32(&wait4switch, buffer); |
| |
| safe_unpack_time(&preempt_time, buffer); |
| safe_unpack_time(&start_time, buffer); |
| safe_unpack_time(&end_time, buffer); |
| safe_unpack_time(&suspend_time, buffer); |
| safe_unpack_time(&pre_sus_time, buffer); |
| safe_unpack_time(&resize_time, buffer); |
| safe_unpack_time(&tot_sus_time, buffer); |
| |
| safe_unpack16(&direct_set_prio, buffer); |
| safe_unpack16(&job_state, buffer); |
| safe_unpack16(&kill_on_node_fail, buffer); |
| safe_unpack16(&batch_flag, buffer); |
| safe_unpack16(&mail_type, buffer); |
| safe_unpack16(&state_reason, buffer); |
| safe_unpack16(&restart_cnt, buffer); |
| safe_unpack16(&uint16_tmp, buffer); /* Was resv_flags */ |
| safe_unpack16(&wait_all_nodes, buffer); |
| safe_unpack16(&warn_signal, buffer); |
| safe_unpack16(&warn_time, buffer); |
| safe_unpack16(&limit_set_max_cpus, buffer); |
| safe_unpack16(&limit_set_max_nodes, buffer); |
| safe_unpack16(&limit_set_min_cpus, buffer); |
| safe_unpack16(&limit_set_min_nodes, buffer); |
| safe_unpack16(&limit_set_pn_min_memory, buffer); |
| safe_unpack16(&limit_set_time, buffer); |
| safe_unpack16(&limit_set_qos, buffer); |
| |
| safe_unpackstr_xmalloc(&state_desc, &name_len, buffer); |
| safe_unpackstr_xmalloc(&resp_host, &name_len, buffer); |
| |
| safe_unpack16(&alloc_resp_port, buffer); |
| safe_unpack16(&other_port, buffer); |
| |
| if (job_state & JOB_COMPLETING) { |
| safe_unpackstr_xmalloc(&nodes_completing, |
| &name_len, buffer); |
| } |
| safe_unpackstr_xmalloc(&nodes, &name_len, buffer); |
| safe_unpackstr_xmalloc(&partition, &name_len, buffer); |
| if (partition == NULL) { |
| error("No partition for job %u", job_id); |
| goto unpack_error; |
| } |
| part_ptr = find_part_record (partition); |
| if (part_ptr == NULL) { |
| part_ptr_list = get_part_list(partition); |
| if (part_ptr_list) |
| part_ptr = list_peek(part_ptr_list); |
| } |
| if (part_ptr == NULL) { |
| verbose("Invalid partition (%s) for job_id %u", |
| partition, job_id); |
| /* not fatal error, partition could have been removed, |
			 * reset_job_bitmaps() will clean up this job */
| } |
| |
| safe_unpackstr_xmalloc(&name, &name_len, buffer); |
| safe_unpackstr_xmalloc(&wckey, &name_len, buffer); |
| safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer); |
| safe_unpackstr_xmalloc(&account, &name_len, buffer); |
| safe_unpackstr_xmalloc(&comment, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres_req, &name_len, buffer); |
| safe_unpackstr_xmalloc(&gres_used, &name_len, buffer); |
| safe_unpackstr_xmalloc(&network, &name_len, buffer); |
| safe_unpackstr_xmalloc(&licenses, &name_len, buffer); |
| safe_unpackstr_xmalloc(&mail_user, &name_len, buffer); |
| safe_unpackstr_xmalloc(&resv_name, &name_len, buffer); |
| safe_unpackstr_xmalloc(&batch_host, &name_len, buffer); |
| |
| if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer, |
| protocol_version)) |
| goto unpack_error; |
| if (unpack_job_resources(&job_resources, buffer, |
| protocol_version)) |
| goto unpack_error; |
| |
| safe_unpack16(&ckpt_interval, buffer); |
| if (checkpoint_alloc_jobinfo(&check_job) || |
| checkpoint_unpack_jobinfo(check_job, buffer, |
| protocol_version)) |
| goto unpack_error; |
| |
| safe_unpackstr_array(&spank_job_env, &spank_job_env_size, |
| buffer); |
| |
| if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id, |
| protocol_version) != |
| SLURM_SUCCESS) |
| goto unpack_error; |
| gres_plugin_job_state_log(gres_list, job_id); |
| |
| safe_unpack16(&details, buffer); |
| if ((details == DETAILS_FLAG) && |
| (_load_job_details(job_ptr, buffer, protocol_version))) { |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_SYSTEM; |
| xfree(job_ptr->state_desc); |
| job_ptr->end_time = now; |
| goto unpack_error; |
| } |
| safe_unpack16(&step_flag, buffer); |
| |
| while (step_flag == STEP_FLAG) { |
			/* No need to put these into accounting if they
			 * haven't been already, since all information
			 * will be added when the job finishes.
			 */
| if ((error_code = load_step_state(job_ptr, buffer, |
| protocol_version))) |
| goto unpack_error; |
| safe_unpack16(&step_flag, buffer); |
| } |
| } else { |
| error("_load_job_state: protocol_version " |
| "%hu not supported", protocol_version); |
| goto unpack_error; |
| } |
| |
| if (((job_state & JOB_STATE_BASE) >= JOB_END) || |
| (batch_flag > 2)) { |
| error("Invalid data for job %u: " |
| "job_state=%u batch_flag=%u", |
| job_id, job_state, batch_flag); |
| goto unpack_error; |
| } |
| if (kill_on_node_fail > 1) { |
| error("Invalid data for job %u: kill_on_node_fail=%u", |
| job_id, kill_on_node_fail); |
| goto unpack_error; |
| } |
| |
| if ((priority > 1) && (direct_set_prio == 0)) { |
| highest_prio = MAX(highest_prio, priority); |
| lowest_prio = MIN(lowest_prio, priority); |
| } |
| if (job_id_sequence <= job_id) |
| job_id_sequence = job_id + 1; |
| |
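	/* Copy the unpacked values into the job record. String values are
	 * handed off to the record and the local pointers cleared, so there
	 * is nothing left for the cleanup code to free */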
| xfree(job_ptr->account); |
| job_ptr->account = account; |
| xstrtolower(job_ptr->account); |
| account = NULL; /* reused, nothing left to free */ |
| xfree(job_ptr->alloc_node); |
| job_ptr->alloc_node = alloc_node; |
| alloc_node = NULL; /* reused, nothing left to free */ |
| job_ptr->alloc_resp_port = alloc_resp_port; |
| job_ptr->alloc_sid = alloc_sid; |
| job_ptr->assoc_id = assoc_id; |
| job_ptr->batch_flag = batch_flag; |
| xfree(job_ptr->batch_host); |
| job_ptr->batch_host = batch_host; |
| batch_host = NULL; /* reused, nothing left to free */ |
| xfree(job_ptr->comment); |
| job_ptr->comment = comment; |
| comment = NULL; /* reused, nothing left to free */ |
| xfree(job_ptr->gres); |
| job_ptr->gres = gres; |
| gres = NULL; /* reused, nothing left to free */ |
| xfree(job_ptr->gres_alloc); |
| job_ptr->gres_alloc = gres_alloc; |
| gres_alloc = NULL; /* reused, nothing left to free */ |
| xfree(job_ptr->gres_req); |
| job_ptr->gres_req = gres_req; |
| gres_req = NULL; /* reused, nothing left to free */ |
| xfree(job_ptr->gres_used); |
| job_ptr->gres_used = gres_used; |
| gres_used = NULL; /* reused, nothing left to free */ |
| job_ptr->gres_list = gres_list; |
| job_ptr->direct_set_prio = direct_set_prio; |
| job_ptr->db_index = db_index; |
| job_ptr->derived_ec = derived_ec; |
| job_ptr->end_time = end_time; |
| job_ptr->exit_code = exit_code; |
| job_ptr->group_id = group_id; |
| job_ptr->job_state = job_state; |
| job_ptr->kill_on_node_fail = kill_on_node_fail; |
| xfree(job_ptr->licenses); |
| job_ptr->licenses = licenses; |
| licenses = NULL; /* reused, nothing left to free */ |
| job_ptr->mail_type = mail_type; |
| xfree(job_ptr->mail_user); |
| job_ptr->mail_user = mail_user; |
| mail_user = NULL; /* reused, nothing left to free */ |
| xfree(job_ptr->name); /* in case duplicate record */ |
| job_ptr->name = name; |
| name = NULL; /* reused, nothing left to free */ |
| xfree(job_ptr->wckey); /* in case duplicate record */ |
| job_ptr->wckey = wckey; |
| xstrtolower(job_ptr->wckey); |
| wckey = NULL; /* reused, nothing left to free */ |
| xfree(job_ptr->network); |
| job_ptr->network = network; |
| network = NULL; /* reused, nothing left to free */ |
| job_ptr->next_step_id = next_step_id; |
| xfree(job_ptr->nodes); /* in case duplicate record */ |
| job_ptr->nodes = nodes; |
| nodes = NULL; /* reused, nothing left to free */ |
| if (nodes_completing) { |
| xfree(job_ptr->nodes_completing); |
| job_ptr->nodes_completing = nodes_completing; |
| nodes_completing = NULL; /* reused, nothing left to free */ |
| } |
| job_ptr->other_port = other_port; |
| xfree(job_ptr->partition); |
| job_ptr->partition = partition; |
| partition = NULL; /* reused, nothing left to free */ |
| job_ptr->part_ptr = part_ptr; |
| job_ptr->part_ptr_list = part_ptr_list; |
| job_ptr->pre_sus_time = pre_sus_time; |
| job_ptr->priority = priority; |
| job_ptr->qos_id = qos_id; |
| xfree(job_ptr->resp_host); |
| job_ptr->resp_host = resp_host; |
| resp_host = NULL; /* reused, nothing left to free */ |
| job_ptr->resize_time = resize_time; |
| job_ptr->restart_cnt = restart_cnt; |
| job_ptr->resv_id = resv_id; |
| xfree(job_ptr->resv_name);	/* in case duplicate record */ |
| job_ptr->resv_name = resv_name; |
| resv_name = NULL; /* reused, nothing left to free */ |
| job_ptr->select_jobinfo = select_jobinfo; |
| job_ptr->job_resrcs = job_resources; |
| job_ptr->spank_job_env = spank_job_env; |
| job_ptr->spank_job_env_size = spank_job_env_size; |
| job_ptr->ckpt_interval = ckpt_interval; |
| job_ptr->check_job = check_job; |
| job_ptr->start_time = start_time; |
| job_ptr->state_reason = state_reason; |
| job_ptr->state_desc = state_desc; |
| state_desc = NULL; /* reused, nothing left to free */ |
| job_ptr->suspend_time = suspend_time; |
| job_ptr->time_last_active = now; |
| job_ptr->time_limit = time_limit; |
| job_ptr->time_min = time_min; |
| job_ptr->total_cpus = total_cpus; |
| |
| if (IS_JOB_PENDING(job_ptr)) |
| job_ptr->node_cnt_wag = total_nodes; |
| else |
| job_ptr->total_nodes = total_nodes; |
| |
| job_ptr->cpu_cnt = cpu_cnt; |
| job_ptr->tot_sus_time = tot_sus_time; |
| job_ptr->preempt_time = preempt_time; |
| job_ptr->user_id = user_id; |
| select_g_select_jobinfo_set(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_USER_NAME, &user_id); |
| job_ptr->wait_all_nodes = wait_all_nodes; |
| job_ptr->warn_flags = warn_flags; |
| job_ptr->warn_signal = warn_signal; |
| job_ptr->warn_time = warn_time; |
| job_ptr->limit_set_max_cpus = limit_set_max_cpus; |
| job_ptr->limit_set_max_nodes = limit_set_max_nodes; |
| job_ptr->limit_set_min_cpus = limit_set_min_cpus; |
| job_ptr->limit_set_min_nodes = limit_set_min_nodes; |
| job_ptr->limit_set_pn_min_memory = limit_set_pn_min_memory; |
| job_ptr->limit_set_time = limit_set_time; |
| job_ptr->limit_set_qos = limit_set_qos; |
| job_ptr->req_switch = req_switch; |
| job_ptr->wait4switch = wait4switch; |
| job_ptr->profile = profile; |
| /* This always needs to be initialized to "true". The select |
| plugin will deal with it every time it goes through the |
| logic if req_switch or wait4switch are set. |
| */ |
| job_ptr->best_switch = true; |
| |
| memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t)); |
| |
| /* |
| * For speed and accuracy we will first see if we once had an |
| * association record. If not, look it up by |
| * account, partition and user_id. |
| */ |
| if (job_ptr->assoc_id) |
| assoc_rec.id = job_ptr->assoc_id; |
| else { |
| assoc_rec.acct = job_ptr->account; |
| if (job_ptr->part_ptr) |
| assoc_rec.partition = job_ptr->part_ptr->name; |
| assoc_rec.uid = job_ptr->user_id; |
| } |
| |
| if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| (slurmdb_association_rec_t **) |
| &job_ptr->assoc_ptr, false) && |
| (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) |
| && (!IS_JOB_FINISHED(job_ptr))) { |
| info("Holding job %u with invalid association", job_id); |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = FAIL_ACCOUNT; |
| } else { |
| job_ptr->assoc_id = assoc_rec.id; |
| info("Recovered job %u %u", job_id, job_ptr->assoc_id); |
| |
| /* make sure we have started this job in accounting */ |
| if (!job_ptr->db_index) { |
| debug("starting job %u in accounting", |
| job_ptr->job_id); |
| if (!with_slurmdbd) |
| jobacct_storage_g_job_start( |
| acct_db_conn, job_ptr); |
| if (slurmctld_init_db |
| && IS_JOB_SUSPENDED(job_ptr)) { |
| jobacct_storage_g_job_suspend(acct_db_conn, |
| job_ptr); |
| } |
| } |
| /* make sure we have this job completed in the |
| * database */ |
| if (IS_JOB_FINISHED(job_ptr)) { |
| if (slurmctld_init_db) |
| jobacct_storage_g_job_complete( |
| acct_db_conn, job_ptr); |
| job_finished = 1; |
| } |
| } |
| |
| if (!job_finished && job_ptr->qos_id && |
| (job_ptr->state_reason != FAIL_ACCOUNT)) { |
| memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t)); |
| qos_rec.id = job_ptr->qos_id; |
| job_ptr->qos_ptr = _determine_and_validate_qos( |
| job_ptr->resv_name, job_ptr->assoc_ptr, |
| job_ptr->limit_set_qos, &qos_rec, |
| &qos_error); |
| if ((qos_error != SLURM_SUCCESS) && !job_ptr->limit_set_qos) { |
| info("Holding job %u with invalid qos", job_id); |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = FAIL_QOS; |
| job_ptr->qos_id = 0; |
| } else |
| job_ptr->qos_id = qos_rec.id; |
| } |
| build_node_details(job_ptr, false); /* set node_addr */ |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| error("Incomplete job record"); |
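| /* xfree() is a no-op on a NULL pointer, so everything unpacked so far |
| * can be released unconditionally below. */ |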
| xfree(alloc_node); |
| xfree(account); |
| xfree(batch_host); |
| xfree(comment); |
| xfree(gres); |
| xfree(gres_alloc); |
| xfree(gres_req); |
| xfree(gres_used); |
| xfree(resp_host); |
| xfree(licenses); |
| xfree(mail_user); |
| xfree(name); |
| xfree(nodes); |
| xfree(nodes_completing); |
| xfree(partition); |
| FREE_NULL_LIST(part_ptr_list); |
| xfree(resv_name); |
| for (i=0; i<spank_job_env_size; i++) |
| xfree(spank_job_env[i]); |
| xfree(spank_job_env); |
| xfree(state_desc); |
| xfree(wckey); |
| select_g_select_jobinfo_free(select_jobinfo); |
| checkpoint_free_jobinfo(check_job); |
| if (job_ptr) { |
| if (job_ptr->job_id == 0) |
| job_ptr->job_id = NO_VAL; |
| _purge_job_record(job_ptr->job_id); |
| } |
| return SLURM_FAILURE; |
| } |
| |
| /* |
|  * _dump_job_details - dump the state of a specific job's details to |
|  *	a buffer |
| * IN detail_ptr - pointer to job details for which information is requested |
| * IN/OUT buffer - location to store data, pointers automatically advanced |
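|  * NOTE: The pack order here must stay in sync with the unpack order in |
|  *	_load_job_details() for the matching protocol version |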
| */ |
| void _dump_job_details(struct job_details *detail_ptr, Buf buffer) |
| { |
| pack32(detail_ptr->min_cpus, buffer); |
| pack32(detail_ptr->max_cpus, buffer); |
| pack32(detail_ptr->min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| pack32(detail_ptr->num_tasks, buffer); |
| |
| packstr(detail_ptr->acctg_freq, buffer); |
| pack16(detail_ptr->contiguous, buffer); |
| pack16(detail_ptr->core_spec, buffer); |
| pack16(detail_ptr->cpus_per_task, buffer); |
| pack16(detail_ptr->nice, buffer); |
| pack16(detail_ptr->ntasks_per_node, buffer); |
| pack16(detail_ptr->requeue, buffer); |
| pack16(detail_ptr->task_dist, buffer); |
| |
| pack8(detail_ptr->share_res, buffer); |
| pack8(detail_ptr->whole_node, buffer); |
| |
| packstr(detail_ptr->cpu_bind, buffer); |
| pack16(detail_ptr->cpu_bind_type, buffer); |
| packstr(detail_ptr->mem_bind, buffer); |
| pack16(detail_ptr->mem_bind_type, buffer); |
| pack16(detail_ptr->plane_size, buffer); |
| |
| pack8(detail_ptr->open_mode, buffer); |
| pack8(detail_ptr->overcommit, buffer); |
| pack8(detail_ptr->prolog_running, buffer); |
| |
| pack32(detail_ptr->pn_min_cpus, buffer); |
| pack32(detail_ptr->pn_min_memory, buffer); |
| pack32(detail_ptr->pn_min_tmp_disk, buffer); |
| pack_time(detail_ptr->begin_time, buffer); |
| pack_time(detail_ptr->submit_time, buffer); |
| |
| packstr(detail_ptr->req_nodes, buffer); |
| packstr(detail_ptr->exc_nodes, buffer); |
| packstr(detail_ptr->features, buffer); |
| packstr(detail_ptr->dependency, buffer); |
| packstr(detail_ptr->orig_dependency, buffer); |
| |
| packstr(detail_ptr->std_err, buffer); |
| packstr(detail_ptr->std_in, buffer); |
| packstr(detail_ptr->std_out, buffer); |
| packstr(detail_ptr->work_dir, buffer); |
| packstr(detail_ptr->ckpt_dir, buffer); |
| packstr(detail_ptr->restart_dir, buffer); |
| |
| pack_multi_core_data(detail_ptr->mc_ptr, buffer, |
| SLURM_PROTOCOL_VERSION); |
| packstr_array(detail_ptr->argv, detail_ptr->argc, buffer); |
| packstr_array(detail_ptr->env_sup, detail_ptr->env_cnt, buffer); |
| } |
| |
| /* _load_job_details - Unpack a job's details from the buffer */ |
| static int _load_job_details(struct job_record *job_ptr, Buf buffer, |
| uint16_t protocol_version) |
| { |
| char *acctg_freq = NULL, *req_nodes = NULL, *exc_nodes = NULL; |
| char *features = NULL, *cpu_bind = NULL, *dependency = NULL; |
| char *orig_dependency = NULL, *mem_bind = NULL; |
| char *err = NULL, *in = NULL, *out = NULL, *work_dir = NULL; |
| char *ckpt_dir = NULL, *restart_dir = NULL; |
| char **argv = (char **) NULL, **env_sup = (char **) NULL; |
| uint32_t min_nodes, max_nodes; |
| uint32_t min_cpus = 1, max_cpus = NO_VAL; |
| uint32_t pn_min_cpus, pn_min_memory, pn_min_tmp_disk; |
| uint32_t num_tasks, name_len, argc = 0, env_cnt = 0; |
| uint16_t contiguous, core_spec = 0, nice, ntasks_per_node; |
| uint16_t cpus_per_task, requeue, task_dist; |
| uint16_t cpu_bind_type, mem_bind_type, plane_size; |
| uint8_t open_mode, overcommit, prolog_running; |
| uint8_t share_res, whole_node; |
| time_t begin_time, submit_time; |
| int i; |
| multi_core_data_t *mc_ptr; |
| |
| /* unpack the job's details from the buffer */ |
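| /* Each protocol branch below mirrors the pack order used by that |
| * release; the 2.5/2.6 branches also translate the retired "shared" |
| * field into the newer share_res/whole_node pair. */ |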
| if (protocol_version >= SLURM_14_03_PROTOCOL_VERSION) { |
| safe_unpack32(&min_cpus, buffer); |
| safe_unpack32(&max_cpus, buffer); |
| safe_unpack32(&min_nodes, buffer); |
| safe_unpack32(&max_nodes, buffer); |
| safe_unpack32(&num_tasks, buffer); |
| |
| safe_unpackstr_xmalloc(&acctg_freq, &name_len, buffer); |
| if (acctg_freq && !strcmp(acctg_freq, "65534")) { |
| /* This fixes job state generated by version 2.6.0, |
| * in which a version 2.5 value of NO_VAL was converted |
| * from uint16_t to a string. */ |
| xfree(acctg_freq); |
| } |
| safe_unpack16(&contiguous, buffer); |
| safe_unpack16(&core_spec, buffer); |
| safe_unpack16(&cpus_per_task, buffer); |
| safe_unpack16(&nice, buffer); |
| safe_unpack16(&ntasks_per_node, buffer); |
| safe_unpack16(&requeue, buffer); |
| safe_unpack16(&task_dist, buffer); |
| |
| safe_unpack8(&share_res, buffer); |
| safe_unpack8(&whole_node, buffer); |
| |
| safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer); |
| safe_unpack16(&cpu_bind_type, buffer); |
| safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer); |
| safe_unpack16(&mem_bind_type, buffer); |
| safe_unpack16(&plane_size, buffer); |
| |
| safe_unpack8(&open_mode, buffer); |
| safe_unpack8(&overcommit, buffer); |
| safe_unpack8(&prolog_running, buffer); |
| |
| safe_unpack32(&pn_min_cpus, buffer); |
| safe_unpack32(&pn_min_memory, buffer); |
| safe_unpack32(&pn_min_tmp_disk, buffer); |
| safe_unpack_time(&begin_time, buffer); |
| safe_unpack_time(&submit_time, buffer); |
| |
| safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer); |
| safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer); |
| safe_unpackstr_xmalloc(&features, &name_len, buffer); |
| safe_unpackstr_xmalloc(&dependency, &name_len, buffer); |
| safe_unpackstr_xmalloc(&orig_dependency, &name_len, buffer); |
| |
| safe_unpackstr_xmalloc(&err, &name_len, buffer); |
| safe_unpackstr_xmalloc(&in, &name_len, buffer); |
| safe_unpackstr_xmalloc(&out, &name_len, buffer); |
| safe_unpackstr_xmalloc(&work_dir, &name_len, buffer); |
| safe_unpackstr_xmalloc(&ckpt_dir, &name_len, buffer); |
| safe_unpackstr_xmalloc(&restart_dir, &name_len, buffer); |
| |
| if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version)) |
| goto unpack_error; |
| safe_unpackstr_array(&argv, &argc, buffer); |
| safe_unpackstr_array(&env_sup, &env_cnt, buffer); |
| } else if (protocol_version >= SLURM_2_6_PROTOCOL_VERSION) { |
| uint16_t tmp_uint16; |
| safe_unpack32(&min_cpus, buffer); |
| safe_unpack32(&max_cpus, buffer); |
| safe_unpack32(&min_nodes, buffer); |
| safe_unpack32(&max_nodes, buffer); |
| safe_unpack32(&num_tasks, buffer); |
| |
| safe_unpackstr_xmalloc(&acctg_freq, &name_len, buffer); |
| if (acctg_freq && !strcmp(acctg_freq, "65534")) { |
| /* This fixes job state generated by version 2.6.0, |
| * in which a version 2.5 value of NO_VAL was converted |
| * from uint16_t to a string. */ |
| xfree(acctg_freq); |
| } |
| safe_unpack16(&contiguous, buffer); |
| safe_unpack16(&cpus_per_task, buffer); |
| safe_unpack16(&nice, buffer); |
| safe_unpack16(&ntasks_per_node, buffer); |
| safe_unpack16(&requeue, buffer); |
| safe_unpack16(&tmp_uint16, buffer); |
| if (tmp_uint16 == 0) { |
| share_res = 0; |
| whole_node = 1; |
| } else if ((tmp_uint16 == 1) || (tmp_uint16 == 2)) { |
| share_res = 1; |
| whole_node = 0; |
| } else { |
| share_res = (uint8_t) NO_VAL; |
| whole_node = 0; |
| } |
| safe_unpack16(&task_dist, buffer); |
| |
| safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer); |
| safe_unpack16(&cpu_bind_type, buffer); |
| safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer); |
| safe_unpack16(&mem_bind_type, buffer); |
| safe_unpack16(&plane_size, buffer); |
| |
| safe_unpack8(&open_mode, buffer); |
| safe_unpack8(&overcommit, buffer); |
| safe_unpack8(&prolog_running, buffer); |
| |
| safe_unpack32(&pn_min_cpus, buffer); |
| safe_unpack32(&pn_min_memory, buffer); |
| safe_unpack32(&pn_min_tmp_disk, buffer); |
| safe_unpack_time(&begin_time, buffer); |
| safe_unpack_time(&submit_time, buffer); |
| |
| safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer); |
| safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer); |
| safe_unpackstr_xmalloc(&features, &name_len, buffer); |
| safe_unpackstr_xmalloc(&dependency, &name_len, buffer); |
| safe_unpackstr_xmalloc(&orig_dependency, &name_len, buffer); |
| |
| safe_unpackstr_xmalloc(&err, &name_len, buffer); |
| safe_unpackstr_xmalloc(&in, &name_len, buffer); |
| safe_unpackstr_xmalloc(&out, &name_len, buffer); |
| safe_unpackstr_xmalloc(&work_dir, &name_len, buffer); |
| safe_unpackstr_xmalloc(&ckpt_dir, &name_len, buffer); |
| safe_unpackstr_xmalloc(&restart_dir, &name_len, buffer); |
| |
| if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version)) |
| goto unpack_error; |
| safe_unpackstr_array(&argv, &argc, buffer); |
| safe_unpackstr_array(&env_sup, &env_cnt, buffer); |
| } else if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) { |
| uint16_t tmp_uint16; |
| safe_unpack32(&min_cpus, buffer); |
| safe_unpack32(&max_cpus, buffer); |
| safe_unpack32(&min_nodes, buffer); |
| safe_unpack32(&max_nodes, buffer); |
| safe_unpack32(&num_tasks, buffer); |
| |
| safe_unpack16(&tmp_uint16, buffer); |
| if (tmp_uint16 && (tmp_uint16 != (uint16_t) NO_VAL)) |
| acctg_freq = xstrdup_printf("%u", tmp_uint16); |
| safe_unpack16(&contiguous, buffer); |
| safe_unpack16(&cpus_per_task, buffer); |
| safe_unpack16(&nice, buffer); |
| safe_unpack16(&ntasks_per_node, buffer); |
| safe_unpack16(&requeue, buffer); |
| safe_unpack16(&tmp_uint16, buffer); |
| if (tmp_uint16 == 0) { |
| share_res = 0; |
| whole_node = 1; |
| } else if ((tmp_uint16 == 1) || (tmp_uint16 == 2)) { |
| share_res = 1; |
| whole_node = 0; |
| } else { |
| share_res = (uint8_t) NO_VAL; |
| whole_node = 0; |
| } |
| safe_unpack16(&task_dist, buffer); |
| |
| safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer); |
| safe_unpack16(&cpu_bind_type, buffer); |
| safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer); |
| safe_unpack16(&mem_bind_type, buffer); |
| safe_unpack16(&plane_size, buffer); |
| |
| safe_unpack8(&open_mode, buffer); |
| safe_unpack8(&overcommit, buffer); |
| safe_unpack8(&prolog_running, buffer); |
| |
| safe_unpack32(&pn_min_cpus, buffer); |
| safe_unpack32(&pn_min_memory, buffer); |
| safe_unpack32(&pn_min_tmp_disk, buffer); |
| safe_unpack_time(&begin_time, buffer); |
| safe_unpack_time(&submit_time, buffer); |
| |
| safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer); |
| safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer); |
| safe_unpackstr_xmalloc(&features, &name_len, buffer); |
| safe_unpackstr_xmalloc(&dependency, &name_len, buffer); |
| safe_unpackstr_xmalloc(&orig_dependency, &name_len, buffer); |
| |
| safe_unpackstr_xmalloc(&err, &name_len, buffer); |
| safe_unpackstr_xmalloc(&in, &name_len, buffer); |
| safe_unpackstr_xmalloc(&out, &name_len, buffer); |
| safe_unpackstr_xmalloc(&work_dir, &name_len, buffer); |
| safe_unpackstr_xmalloc(&ckpt_dir, &name_len, buffer); |
| safe_unpackstr_xmalloc(&restart_dir, &name_len, buffer); |
| |
| if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version)) |
| goto unpack_error; |
| safe_unpackstr_array(&argv, &argc, buffer); |
| safe_unpackstr_array(&env_sup, &env_cnt, buffer); |
| } else { |
| error("_load_job_details: protocol_version " |
| "%hu not supported", protocol_version); |
| goto unpack_error; |
| } |
| |
| /* perform validity tests where possible */ |
| if (contiguous > 1) { |
| error("Invalid data for job %u: contiguous=%u", |
| job_ptr->job_id, contiguous); |
| goto unpack_error; |
| } |
| if ((requeue > 1) || (overcommit > 1)) { |
| error("Invalid data for job %u: requeue=%u overcommit=%u", |
| job_ptr->job_id, requeue, overcommit); |
| goto unpack_error; |
| } |
| if (prolog_running > 1) { |
| error("Invalid data for job %u: prolog_running=%u", |
| job_ptr->job_id, prolog_running); |
| goto unpack_error; |
| } |
| |
| /* free any left-over detail data */ |
| xfree(job_ptr->details->acctg_freq); |
| for (i=0; i<job_ptr->details->argc; i++) |
| xfree(job_ptr->details->argv[i]); |
| xfree(job_ptr->details->argv); |
| xfree(job_ptr->details->cpu_bind); |
| xfree(job_ptr->details->dependency); |
| xfree(job_ptr->details->orig_dependency); |
| xfree(job_ptr->details->std_err); |
| for (i=0; i<job_ptr->details->env_cnt; i++) |
| xfree(job_ptr->details->env_sup[i]); |
| xfree(job_ptr->details->env_sup); |
| xfree(job_ptr->details->exc_nodes); |
| xfree(job_ptr->details->features); |
| xfree(job_ptr->details->std_in); |
| xfree(job_ptr->details->mem_bind); |
| xfree(job_ptr->details->std_out); |
| xfree(job_ptr->details->req_nodes); |
| xfree(job_ptr->details->work_dir); |
| xfree(job_ptr->details->ckpt_dir); |
| xfree(job_ptr->details->restart_dir); |
| |
| /* now put the details into the job record */ |
| job_ptr->details->acctg_freq = acctg_freq; |
| job_ptr->details->argc = argc; |
| job_ptr->details->argv = argv; |
| job_ptr->details->begin_time = begin_time; |
| job_ptr->details->contiguous = contiguous; |
| job_ptr->details->core_spec = core_spec; |
| job_ptr->details->cpu_bind = cpu_bind; |
| job_ptr->details->cpu_bind_type = cpu_bind_type; |
| job_ptr->details->cpus_per_task = cpus_per_task; |
| job_ptr->details->dependency = dependency; |
| job_ptr->details->orig_dependency = orig_dependency; |
| job_ptr->details->env_cnt = env_cnt; |
| job_ptr->details->env_sup = env_sup; |
| job_ptr->details->std_err = err; |
| job_ptr->details->exc_nodes = exc_nodes; |
| job_ptr->details->features = features; |
| job_ptr->details->std_in = in; |
| job_ptr->details->pn_min_cpus = pn_min_cpus; |
| job_ptr->details->pn_min_memory = pn_min_memory; |
| job_ptr->details->pn_min_tmp_disk = pn_min_tmp_disk; |
| job_ptr->details->max_cpus = max_cpus; |
| job_ptr->details->max_nodes = max_nodes; |
| job_ptr->details->mc_ptr = mc_ptr; |
| job_ptr->details->mem_bind = mem_bind; |
| job_ptr->details->mem_bind_type = mem_bind_type; |
| job_ptr->details->min_cpus = min_cpus; |
| job_ptr->details->min_nodes = min_nodes; |
| job_ptr->details->nice = nice; |
| job_ptr->details->ntasks_per_node = ntasks_per_node; |
| job_ptr->details->num_tasks = num_tasks; |
| job_ptr->details->open_mode = open_mode; |
| job_ptr->details->std_out = out; |
| job_ptr->details->overcommit = overcommit; |
| job_ptr->details->plane_size = plane_size; |
| job_ptr->details->prolog_running = prolog_running; |
| job_ptr->details->req_nodes = req_nodes; |
| job_ptr->details->requeue = requeue; |
| job_ptr->details->share_res = share_res; |
| job_ptr->details->submit_time = submit_time; |
| job_ptr->details->task_dist = task_dist; |
| job_ptr->details->whole_node = whole_node; |
| job_ptr->details->work_dir = work_dir; |
| job_ptr->details->ckpt_dir = ckpt_dir; |
| job_ptr->details->restart_dir = restart_dir; |
| |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| |
| /* for (i=0; i<argc; i++) |
| xfree(argv[i]); Don't trust this on unpack error */ |
| xfree(acctg_freq); |
| xfree(argv); |
| xfree(cpu_bind); |
| xfree(dependency); |
| xfree(orig_dependency); |
| /* for (i=0; i<env_cnt; i++) |
| xfree(env_sup[i]); Don't trust this on unpack error */ |
| xfree(env_sup); |
| xfree(err); |
| xfree(exc_nodes); |
| xfree(features); |
| xfree(in); |
| xfree(mem_bind); |
| xfree(out); |
| xfree(req_nodes); |
| xfree(work_dir); |
| xfree(ckpt_dir); |
| xfree(restart_dir); |
| return SLURM_FAILURE; |
| } |
| |
| /* _add_job_hash - add a job hash entry for given job record, job_id must |
| * already be set |
| * IN job_ptr - pointer to job record |
| * Globals: hash table updated |
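|  * NOTE: Hash collisions are resolved by chaining through |
|  *	job_ptr->job_next; find_job_record() walks this chain |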
| */ |
| void _add_job_hash(struct job_record *job_ptr) |
| { |
| int inx; |
| |
| inx = JOB_HASH_INX(job_ptr->job_id); |
| job_ptr->job_next = job_hash[inx]; |
| job_hash[inx] = job_ptr; |
| } |
| |
| /* _add_job_array_hash - add a job hash entry for given job record, |
| * array_job_id and array_task_id must already be set |
| * IN job_ptr - pointer to job record |
| * Globals: hash table updated |
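|  * NOTE: Two tables are maintained: job_array_hash_j is keyed by |
|  *	array_job_id alone and job_array_hash_t by array_job_id plus |
|  *	array_task_id; find_job_array_rec() picks a table based on the |
|  *	requested task id |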
| */ |
| void _add_job_array_hash(struct job_record *job_ptr) |
| { |
| int inx; |
| |
| if (job_ptr->array_task_id == NO_VAL) |
| return; /* Not a job array */ |
| |
| inx = JOB_HASH_INX(job_ptr->array_job_id); |
| job_ptr->job_array_next_j = job_array_hash_j[inx]; |
| job_array_hash_j[inx] = job_ptr; |
| |
| inx = JOB_ARRAY_HASH_INX(job_ptr->array_job_id,job_ptr->array_task_id); |
| job_ptr->job_array_next_t = job_array_hash_t[inx]; |
| job_array_hash_t[inx] = job_ptr; |
| } |
| |
| /* |
| * find_job_array_rec - return a pointer to the job record with the given |
| * array_job_id/array_task_id |
| * IN job_id - requested job's id |
| * IN array_task_id - requested job's task id, |
|  *	NO_VAL if none specified (i.e. not a job array) |
| * INFINITE return any task for specified job id |
| * RET pointer to the job's record, NULL on error |
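|  * Illustrative calls (hypothetical job id, not taken from this file): |
|  *	find_job_array_rec(1234, NO_VAL)   - plain lookup of job 1234 |
|  *	find_job_array_rec(1234, INFINITE) - any task of array 1234, |
|  *	                                     preferring an unfinished one |
|  *	find_job_array_rec(1234, 7)        - exactly task 1234_7 |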
| */ |
| extern struct job_record *find_job_array_rec(uint32_t array_job_id, |
| uint32_t array_task_id) |
| { |
| struct job_record *job_ptr, *match_job_ptr = NULL; |
| int inx; |
| |
| if (array_task_id == NO_VAL) |
| return find_job_record(array_job_id); |
| |
| if (array_task_id == INFINITE) { /* find by job ID */ |
| inx = JOB_HASH_INX(array_job_id); |
| job_ptr = job_array_hash_j[inx]; |
| while (job_ptr) { |
| if (job_ptr->array_job_id == array_job_id) { |
| match_job_ptr = job_ptr; |
| if (!IS_JOB_FINISHED(job_ptr)) { |
| return job_ptr; |
| } |
| } |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| return match_job_ptr; |
| } else { /* Find specific task ID */ |
| inx = JOB_ARRAY_HASH_INX(array_job_id, array_task_id); |
| job_ptr = job_array_hash_t[inx]; |
| while (job_ptr) { |
| if ((job_ptr->array_job_id == array_job_id) && |
| (job_ptr->array_task_id == array_task_id)) { |
| return job_ptr; |
| } |
| job_ptr = job_ptr->job_array_next_t; |
| } |
| return NULL; /* None found */ |
| } |
| } |
| |
| /* |
| * find_job_record - return a pointer to the job record with the given job_id |
| * IN job_id - requested job's id |
| * RET pointer to the job's record, NULL on error |
| */ |
| struct job_record *find_job_record(uint32_t job_id) |
| { |
| struct job_record *job_ptr; |
| |
| job_ptr = job_hash[JOB_HASH_INX(job_id)]; |
| while (job_ptr) { |
| if (job_ptr->job_id == job_id) |
| return job_ptr; |
| job_ptr = job_ptr->job_next; |
| } |
| |
| return NULL; |
| } |
| |
| /* rebuild a job's partition name list based upon the contents of its |
| * part_ptr_list */ |
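| /* For example (hypothetical partition names): a pending job whose |
| * part_ptr_list holds partitions "debug" and "batch" ends up with |
| * job_ptr->partition = "debug,batch"; for a running or suspended job the |
| * partition actually in use is always listed first. */ |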
| static void _rebuild_part_name_list(struct job_record *job_ptr) |
| { |
| bool job_active = false, job_pending = false; |
| struct part_record *part_ptr; |
| ListIterator part_iterator; |
| |
| xfree(job_ptr->partition); |
| if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) { |
| job_active = true; |
| job_ptr->partition = xstrdup(job_ptr->part_ptr->name); |
| } else if (IS_JOB_PENDING(job_ptr)) |
| job_pending = true; |
| |
| part_iterator = list_iterator_create(job_ptr->part_ptr_list); |
| while ((part_ptr = (struct part_record *) list_next(part_iterator))) { |
| if (job_pending) { |
| /* Point the pending job's primary partition at a valid one */ |
| job_ptr->part_ptr = part_ptr; |
| job_pending = false; |
| } |
| if (job_active && (part_ptr == job_ptr->part_ptr)) |
| continue; /* already added */ |
| if (job_ptr->partition) |
| xstrcat(job_ptr->partition, ","); |
| xstrcat(job_ptr->partition, part_ptr->name); |
| } |
| list_iterator_destroy(part_iterator); |
| last_job_update = time(NULL); |
| } |
| |
| /* |
|  * kill_job_by_part_name - Given a partition name, deallocate resources for |
| * its jobs and kill them. All jobs associated with this partition |
| * will have their partition pointer cleared. |
| * IN part_name - name of a partition |
| * RET number of jobs associated with this partition |
| */ |
| extern int kill_job_by_part_name(char *part_name) |
| { |
| ListIterator job_iterator, part_iterator; |
| struct job_record *job_ptr; |
| struct part_record *part_ptr, *part2_ptr; |
| int job_count = 0; |
| time_t now = time(NULL); |
| |
| part_ptr = find_part_record (part_name); |
| if (part_ptr == NULL) /* No such partition */ |
| return 0; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| bool pending = false, suspended = false; |
| |
| pending = IS_JOB_PENDING(job_ptr); |
| if (job_ptr->part_ptr_list) { |
| /* Remove this partition from the job's candidate list */ |
| bool rebuild_name_list = false; |
| part_iterator = list_iterator_create(job_ptr-> |
| part_ptr_list); |
| while ((part2_ptr = (struct part_record *) |
| list_next(part_iterator))) { |
| if (part2_ptr != part_ptr) |
| continue; |
| list_remove(part_iterator); |
| rebuild_name_list = true; |
| } |
| list_iterator_destroy(part_iterator); |
| if (rebuild_name_list) { |
| if (list_count(job_ptr->part_ptr_list) > 0) { |
| _rebuild_part_name_list(job_ptr); |
| job_ptr->part_ptr = |
| list_peek(job_ptr-> |
| part_ptr_list); |
| } else { |
| FREE_NULL_LIST(job_ptr->part_ptr_list); |
| } |
| } |
| } |
| |
| if (job_ptr->part_ptr != part_ptr) |
| continue; |
| |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| enum job_states suspend_job_state = job_ptr->job_state; |
| /* we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_ptr->job_state = JOB_CANCELLED; |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_ptr->job_state = suspend_job_state; |
| suspended = true; |
| } |
| if (IS_JOB_RUNNING(job_ptr) || suspended) { |
| job_count++; |
| info("Killing job_id %u on defunct partition %s", |
| job_ptr->job_id, part_name); |
| job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; |
| build_cg_bitmap(job_ptr); |
| job_ptr->exit_code = MAX(job_ptr->exit_code, 1); |
| job_ptr->state_reason = FAIL_DOWN_PARTITION; |
| xfree(job_ptr->state_desc); |
| if (suspended) { |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| } else |
| job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| if (!pending) |
| deallocate_nodes(job_ptr, false, suspended, |
| false); |
| } else if (pending) { |
| job_count++; |
| info("Killing job_id %u on defunct partition %s", |
| job_ptr->job_id, part_name); |
| job_ptr->job_state = JOB_CANCELLED; |
| job_ptr->start_time = now; |
| job_ptr->end_time = now; |
| job_ptr->exit_code = 1; |
| job_completion_logger(job_ptr, false); |
| } |
| job_ptr->part_ptr = NULL; |
| FREE_NULL_LIST(job_ptr->part_ptr_list); |
| } |
| list_iterator_destroy(job_iterator); |
| |
| if (job_count) |
| last_job_update = now; |
| return job_count; |
| } |
| |
| /* |
| * kill_job_by_front_end_name - Given a front end node name, deallocate |
|  *	resources for its jobs and kill them. |
| * IN node_name - name of a front end node |
| * RET number of jobs associated with this front end node |
| * NOTE: Patterned after kill_running_job_by_node_name() |
| */ |
| extern int kill_job_by_front_end_name(char *node_name) |
| { |
| #ifdef HAVE_FRONT_END |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| struct node_record *node_ptr; |
| time_t now = time(NULL); |
| int i, job_count = 0; |
| |
| if (node_name == NULL) |
| fatal("kill_job_by_front_end_name: node_name is NULL"); |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| bool suspended = false; |
| |
| if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr) && |
| !IS_JOB_COMPLETING(job_ptr)) |
| continue; |
| if ((job_ptr->batch_host == NULL) || |
| strcmp(job_ptr->batch_host, node_name)) |
| continue; /* no match on node name */ |
| |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| enum job_states suspend_job_state = job_ptr->job_state; |
| /* we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_ptr->job_state = JOB_CANCELLED; |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_ptr->job_state = suspend_job_state; |
| suspended = true; |
| } |
| if (IS_JOB_COMPLETING(job_ptr)) { |
| job_count++; |
| while ((i = bit_ffs(job_ptr->node_bitmap_cg)) >= 0) { |
| bit_clear(job_ptr->node_bitmap_cg, i); |
| job_update_cpu_cnt(job_ptr, i); |
| if (job_ptr->node_cnt) |
| (job_ptr->node_cnt)--; |
| else { |
| error("node_cnt underflow on JobId=%u", |
| job_ptr->job_id); |
| } |
| if (job_ptr->node_cnt == 0) { |
| job_ptr->job_state &= (~JOB_COMPLETING); |
| delete_step_records(job_ptr); |
| slurm_sched_g_schedule(); |
| } |
| node_ptr = &node_record_table_ptr[i]; |
| if (node_ptr->comp_job_cnt) |
| (node_ptr->comp_job_cnt)--; |
| else { |
| error("Node %s comp_job_cnt underflow, " |
| "JobId=%u", |
| node_ptr->name, job_ptr->job_id); |
| } |
| } |
| } else if (IS_JOB_RUNNING(job_ptr) || suspended) { |
| job_count++; |
| if (job_ptr->batch_flag && job_ptr->details && |
| slurmctld_conf.job_requeue && |
| (job_ptr->details->requeue > 0)) { |
| char requeue_msg[128]; |
| |
| srun_node_fail(job_ptr->job_id, node_name); |
| |
| info("requeue job %u due to failure of node %s", |
| job_ptr->job_id, node_name); |
| set_job_prio(job_ptr); |
| snprintf(requeue_msg, sizeof(requeue_msg), |
| "Job requeued due to failure " |
| "of node %s", |
| node_name); |
| slurm_sched_g_requeue(job_ptr, requeue_msg); |
| job_ptr->time_last_active = now; |
| if (suspended) { |
| job_ptr->end_time = |
| job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(now, |
| job_ptr-> |
| suspend_time); |
| } else |
| job_ptr->end_time = now; |
| |
| /* We want this job to look like it |
| * was terminated in the accounting logs. |
| * Set a new submit time so the restarted |
| * job looks like a new job. */ |
| job_ptr->job_state = JOB_NODE_FAIL; |
| build_cg_bitmap(job_ptr); |
| job_completion_logger(job_ptr, true); |
| deallocate_nodes(job_ptr, false, suspended, |
| false); |
| |
| /* do this after the epilog complete, |
| * setting it here is too early */ |
| //job_ptr->db_index = 0; |
| //job_ptr->details->submit_time = now; |
| |
| job_ptr->job_state = JOB_PENDING; |
| if (job_ptr->node_cnt) |
| job_ptr->job_state |= JOB_COMPLETING; |
| |
| /* restart from periodic checkpoint */ |
| if (job_ptr->ckpt_interval && |
| job_ptr->ckpt_time && |
| job_ptr->details->ckpt_dir) { |
| xfree(job_ptr->details->restart_dir); |
| job_ptr->details->restart_dir = |
| xstrdup (job_ptr->details-> |
| ckpt_dir); |
| xstrfmtcat(job_ptr->details-> |
| restart_dir, |
| "/%u", job_ptr->job_id); |
| } |
| job_ptr->restart_cnt++; |
| /* Since the job completion logger |
| * removes the submit we need to add it |
| * again. */ |
| acct_policy_add_job_submit(job_ptr); |
| |
| if (!job_ptr->node_bitmap_cg || |
| bit_set_count(job_ptr->node_bitmap_cg) == 0) |
| batch_requeue_fini(job_ptr); |
| } else { |
| info("Killing job_id %u on failed node %s", |
| job_ptr->job_id, node_name); |
| srun_node_fail(job_ptr->job_id, node_name); |
| job_ptr->job_state = JOB_NODE_FAIL | |
| JOB_COMPLETING; |
| build_cg_bitmap(job_ptr); |
| job_ptr->exit_code = MAX(job_ptr->exit_code, 1); |
| job_ptr->state_reason = FAIL_DOWN_NODE; |
| xfree(job_ptr->state_desc); |
| if (suspended) { |
| job_ptr->end_time = |
| job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(now, |
| job_ptr->suspend_time); |
| } else |
| job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| deallocate_nodes(job_ptr, false, suspended, |
| false); |
| } |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| |
| if (job_count) |
| last_job_update = now; |
| return job_count; |
| #else |
| return 0; |
| #endif |
| } |
| |
| /* |
|  * partition_in_use - determine whether a partition is in use by a RUNNING, |
|  *	PENDING or SUSPENDED job |
| * IN part_name - name of a partition |
| * RET true if the partition is in use, else false |
| */ |
| extern bool partition_in_use(char *part_name) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| struct part_record *part_ptr; |
| |
| part_ptr = find_part_record (part_name); |
| if (part_ptr == NULL) /* No such partition */ |
| return false; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (job_ptr->part_ptr == part_ptr) { |
| if (!IS_JOB_FINISHED(job_ptr)) { |
| list_iterator_destroy(job_iterator); |
| return true; |
| } |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| return false; |
| } |
| |
| /* |
| * allocated_session_in_use - check if an interactive session is already running |
| * IN new_alloc - allocation (alloc_node:alloc_sid) to test for |
|  * Returns true if an interactive session on the same node:sid is already in use |
| * by a RUNNING, PENDING, or SUSPENDED job. Provides its own locking. |
| */ |
| extern bool allocated_session_in_use(job_desc_msg_t *new_alloc) |
| { |
| ListIterator job_iter; |
| struct job_record *job_ptr; |
| /* Locks: Read job */ |
| slurmctld_lock_t job_read_lock = { |
| NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; |
| |
| if ((new_alloc->script != NULL) || (new_alloc->alloc_node == NULL)) |
| return false; |
| |
| lock_slurmctld(job_read_lock); |
| job_iter = list_iterator_create(job_list); |
| |
| while ((job_ptr = (struct job_record *)list_next(job_iter))) { |
| if (job_ptr->batch_flag || IS_JOB_FINISHED(job_ptr)) |
| continue; |
| if (job_ptr->alloc_node && |
| (strcmp(job_ptr->alloc_node, new_alloc->alloc_node) == 0) && |
| (job_ptr->alloc_sid == new_alloc->alloc_sid)) |
| break; |
| } |
| list_iterator_destroy(job_iter); |
| unlock_slurmctld(job_read_lock); |
| |
| return job_ptr != NULL; |
| } |
| |
| /* |
| * kill_running_job_by_node_name - Given a node name, deallocate RUNNING |
| * or COMPLETING jobs from the node or kill them |
| * IN node_name - name of a node |
| * RET number of killed jobs |
| */ |
| extern int kill_running_job_by_node_name(char *node_name) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| struct node_record *node_ptr; |
| int bit_position; |
| int job_count = 0; |
| time_t now = time(NULL); |
| |
| node_ptr = find_node_record(node_name); |
| if (node_ptr == NULL) /* No such node */ |
| return 0; |
| bit_position = node_ptr - node_record_table_ptr; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| bool suspended = false; |
| if ((job_ptr->node_bitmap == NULL) || |
| (!bit_test(job_ptr->node_bitmap, bit_position))) |
| continue; /* job not on this node */ |
| if (nonstop_ops.node_fail) |
| (nonstop_ops.node_fail)(job_ptr, node_ptr); |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| enum job_states suspend_job_state = job_ptr->job_state; |
| /* we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_ptr->job_state = JOB_CANCELLED; |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_ptr->job_state = suspend_job_state; |
| suspended = true; |
| } |
| |
| if (IS_JOB_COMPLETING(job_ptr)) { |
| if (!bit_test(job_ptr->node_bitmap_cg, bit_position)) |
| continue; |
| job_count++; |
| bit_clear(job_ptr->node_bitmap_cg, bit_position); |
| job_update_cpu_cnt(job_ptr, bit_position); |
| if (job_ptr->node_cnt) |
| (job_ptr->node_cnt)--; |
| else { |
| error("node_cnt underflow on JobId=%u", |
| job_ptr->job_id); |
| } |
| if (job_ptr->node_cnt == 0) { |
| job_ptr->job_state &= (~JOB_COMPLETING); |
| delete_step_records(job_ptr); |
| slurm_sched_g_schedule(); |
| } |
| if (node_ptr->comp_job_cnt) |
| (node_ptr->comp_job_cnt)--; |
| else { |
| error("Node %s comp_job_cnt underflow, " |
| "JobId=%u", |
| node_ptr->name, job_ptr->job_id); |
| } |
| } else if (IS_JOB_RUNNING(job_ptr) || suspended) { |
| job_count++; |
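| /* Three possible outcomes for an affected job: shrink it onto its |
| * remaining nodes, requeue the batch job, or kill it outright. */ |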
| if ((job_ptr->details) && |
| (job_ptr->kill_on_node_fail == 0) && |
| (job_ptr->node_cnt > 1)) { |
| /* keep job running on remaining nodes */ |
| srun_node_fail(job_ptr->job_id, node_name); |
| error("Removing failed node %s from job_id %u", |
| node_name, job_ptr->job_id); |
| job_pre_resize_acctg(job_ptr); |
| kill_step_on_node(job_ptr, node_ptr, true); |
| excise_node_from_job(job_ptr, node_ptr); |
| job_post_resize_acctg(job_ptr); |
| } else if (job_ptr->batch_flag && job_ptr->details && |
| job_ptr->details->requeue) { |
| char requeue_msg[128]; |
| |
| srun_node_fail(job_ptr->job_id, node_name); |
| |
| info("requeue job %u due to failure of node %s", |
| job_ptr->job_id, node_name); |
| snprintf(requeue_msg, sizeof(requeue_msg), |
| "Job requeued due to failure " |
| "of node %s", |
| node_name); |
| slurm_sched_g_requeue(job_ptr, requeue_msg); |
| job_ptr->time_last_active = now; |
| if (suspended) { |
| job_ptr->end_time = |
| job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(now, |
| job_ptr-> |
| suspend_time); |
| } else |
| job_ptr->end_time = now; |
| |
| /* We want this job to look like it |
| * was terminated in the accounting logs. |
| * Set a new submit time so the restarted |
| * job looks like a new job. */ |
| job_ptr->job_state = JOB_NODE_FAIL; |
| build_cg_bitmap(job_ptr); |
| job_completion_logger(job_ptr, true); |
| deallocate_nodes(job_ptr, false, suspended, |
| false); |
| |
| /* do this after the epilog complete, |
| * setting it here is too early */ |
| //job_ptr->db_index = 0; |
| //job_ptr->details->submit_time = now; |
| |
| job_ptr->job_state = JOB_PENDING; |
| if (job_ptr->node_cnt) |
| job_ptr->job_state |= JOB_COMPLETING; |
| |
| /* restart from periodic checkpoint */ |
| if (job_ptr->ckpt_interval && |
| job_ptr->ckpt_time && |
| job_ptr->details->ckpt_dir) { |
| xfree(job_ptr->details->restart_dir); |
| job_ptr->details->restart_dir = |
| xstrdup (job_ptr->details-> |
| ckpt_dir); |
| xstrfmtcat(job_ptr->details-> |
| restart_dir, |
| "/%u", job_ptr->job_id); |
| } |
| job_ptr->restart_cnt++; |
| /* Since the job completion logger |
| * removes the submit we need to add it |
| * again. */ |
| acct_policy_add_job_submit(job_ptr); |
| |
| if (!job_ptr->node_bitmap_cg || |
| bit_set_count(job_ptr->node_bitmap_cg) == 0) |
| batch_requeue_fini(job_ptr); |
| } else { |
| info("Killing job_id %u on failed node %s", |
| job_ptr->job_id, node_name); |
| srun_node_fail(job_ptr->job_id, node_name); |
| job_ptr->job_state = JOB_NODE_FAIL | |
| JOB_COMPLETING; |
| build_cg_bitmap(job_ptr); |
| job_ptr->exit_code = MAX(job_ptr->exit_code, 1); |
| job_ptr->state_reason = FAIL_DOWN_NODE; |
| xfree(job_ptr->state_desc); |
| if (suspended) { |
| job_ptr->end_time = |
| job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(now, |
| job_ptr->suspend_time); |
| } else |
| job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| deallocate_nodes(job_ptr, false, suspended, |
| false); |
| } |
| } |
| |
| } |
| list_iterator_destroy(job_iterator); |
| if (job_count) |
| last_job_update = now; |
| |
| return job_count; |
| } |
| |
| /* Remove one node from a job's allocation */ |
| extern void excise_node_from_job(struct job_record *job_ptr, |
| struct node_record *node_ptr) |
| { |
| int i, orig_pos = -1, new_pos = -1; |
| bitstr_t *orig_bitmap; |
| |
| orig_bitmap = bit_copy(job_ptr->node_bitmap); |
| make_node_idle(node_ptr, job_ptr); /* updates bitmap */ |
| xfree(job_ptr->nodes); |
| job_ptr->nodes = bitmap2node_name(job_ptr->node_bitmap); |
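| /* Compact the node_addr array: orig_pos walks positions in the original |
| * allocation, new_pos positions in the shrunken one, and each surviving |
| * node's address is shifted down to its new slot. */ |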
| for (i=bit_ffs(orig_bitmap); i<node_record_count; i++) { |
| if (!bit_test(orig_bitmap,i)) |
| continue; |
| orig_pos++; |
| if (!bit_test(job_ptr->node_bitmap, i)) |
| continue; |
| new_pos++; |
| if (orig_pos == new_pos) |
| continue; |
| memcpy(&job_ptr->node_addr[new_pos], |
| &job_ptr->node_addr[orig_pos], sizeof(slurm_addr_t)); |
| /* NOTE: The job's allocation in the job_ptr->job_resrcs |
| * data structure is unchanged even after a node allocated |
| * to the job goes DOWN. */ |
| } |
| |
| job_ptr->total_nodes = job_ptr->node_cnt = new_pos + 1; |
| |
| FREE_NULL_BITMAP(orig_bitmap); |
| (void) select_g_job_resized(job_ptr, node_ptr); |
| } |
| |
| /* |
| * dump_job_desc - dump the incoming job submit request message |
| * IN job_specs - job specification from RPC |
| */ |
| void dump_job_desc(job_desc_msg_t * job_specs) |
| { |
| long job_id, time_min; |
| long pn_min_cpus, pn_min_memory, pn_min_tmp_disk, min_cpus; |
| long time_limit, priority, contiguous, nice; |
| long kill_on_node_fail, shared, immediate, wait_all_nodes; |
| long cpus_per_task, requeue, num_tasks, overcommit; |
| long ntasks_per_node, ntasks_per_socket, ntasks_per_core; |
| int core_spec; |
| char *mem_type, buf[100], *signal_flags; |
| |
| if (job_specs == NULL) |
| return; |
| |
| job_id = (job_specs->job_id != NO_VAL) ? |
| (long) job_specs->job_id : -1L; |
| debug3("JobDesc: user_id=%u job_id=%ld partition=%s name=%s", |
| job_specs->user_id, job_id, |
| job_specs->partition, job_specs->name); |
| |
| min_cpus = (job_specs->min_cpus != NO_VAL) ? |
| (long) job_specs->min_cpus : -1L; |
| pn_min_cpus = (job_specs->pn_min_cpus != (uint16_t) NO_VAL) ? |
| (long) job_specs->pn_min_cpus : -1L; |
| core_spec = (job_specs->core_spec != (uint16_t) NO_VAL) ? |
| job_specs->core_spec : -1; |
| debug3(" cpus=%ld-%u pn_min_cpus=%ld core_spec=%d", |
| min_cpus, job_specs->max_cpus, pn_min_cpus, core_spec); |
| |
| debug3(" -N min-[max]: %u-[%u]:%u:%u:%u", |
| job_specs->min_nodes, job_specs->max_nodes, |
| job_specs->sockets_per_node, job_specs->cores_per_socket, |
| job_specs->threads_per_core); |
| |
| if (job_specs->pn_min_memory == NO_VAL) { |
| pn_min_memory = -1L; |
| mem_type = "job"; |
| } else if (job_specs->pn_min_memory & MEM_PER_CPU) { |
| pn_min_memory = (long) (job_specs->pn_min_memory & |
| (~MEM_PER_CPU)); |
| mem_type = "cpu"; |
| } else { |
| pn_min_memory = (long) job_specs->pn_min_memory; |
| mem_type = "job"; |
| } |
| pn_min_tmp_disk = (job_specs->pn_min_tmp_disk != NO_VAL) ? |
| (long) job_specs->pn_min_tmp_disk : -1L; |
| debug3(" pn_min_memory_%s=%ld pn_min_tmp_disk=%ld", |
| mem_type, pn_min_memory, pn_min_tmp_disk); |
| immediate = (job_specs->immediate == 0) ? 0L : 1L; |
| debug3(" immediate=%ld features=%s reservation=%s", |
| immediate, job_specs->features, job_specs->reservation); |
| |
| debug3(" req_nodes=%s exc_nodes=%s gres=%s", |
| job_specs->req_nodes, job_specs->exc_nodes, job_specs->gres); |
| |
| time_limit = (job_specs->time_limit != NO_VAL) ? |
| (long) job_specs->time_limit : -1L; |
| time_min = (job_specs->time_min != NO_VAL) ? |
| (long) job_specs->time_min : time_limit; |
| priority = (job_specs->priority != NO_VAL) ? |
| (long) job_specs->priority : -1L; |
| contiguous = (job_specs->contiguous != (uint16_t) NO_VAL) ? |
| (long) job_specs->contiguous : -1L; |
| shared = (job_specs->shared != (uint16_t) NO_VAL) ? |
| (long) job_specs->shared : -1L; |
| debug3(" time_limit=%ld-%ld priority=%ld contiguous=%ld shared=%ld", |
| time_min, time_limit, priority, contiguous, shared); |
| |
| kill_on_node_fail = (job_specs->kill_on_node_fail != |
| (uint16_t) NO_VAL) ? |
| (long) job_specs->kill_on_node_fail : -1L; |
| if (job_specs->script) /* log has problem with string len & null */ |
| debug3(" kill_on_node_fail=%ld script=%.40s...", |
| kill_on_node_fail, job_specs->script); |
| else |
| debug3(" kill_on_node_fail=%ld script=%s", |
| kill_on_node_fail, job_specs->script); |
| |
| if (job_specs->argc == 1) |
| debug3(" argv=\"%s\"", |
| job_specs->argv[0]); |
| else if (job_specs->argc == 2) |
| debug3(" argv=%s,%s", |
| job_specs->argv[0], |
| job_specs->argv[1]); |
| else if (job_specs->argc > 2) |
| debug3(" argv=%s,%s,%s,...", |
| job_specs->argv[0], |
| job_specs->argv[1], |
| job_specs->argv[2]); |
| |
| if (job_specs->env_size == 1) |
| debug3(" environment=\"%s\"", |
| job_specs->environment[0]); |
| else if (job_specs->env_size == 2) |
| debug3(" environment=%s,%s", |
| job_specs->environment[0], |
| job_specs->environment[1]); |
| else if (job_specs->env_size > 2) |
| debug3(" environment=%s,%s,%s,...", |
| job_specs->environment[0], |
| job_specs->environment[1], |
| job_specs->environment[2]); |
| |
| if (job_specs->spank_job_env_size == 1) |
| debug3(" spank_job_env=\"%s\"", |
| job_specs->spank_job_env[0]); |
| else if (job_specs->spank_job_env_size == 2) |
| debug3(" spank_job_env=%s,%s", |
| job_specs->spank_job_env[0], |
| job_specs->spank_job_env[1]); |
| else if (job_specs->spank_job_env_size > 2) |
| debug3(" spank_job_env=%s,%s,%s,...", |
| job_specs->spank_job_env[0], |
| job_specs->spank_job_env[1], |
| job_specs->spank_job_env[2]); |
| |
| debug3(" stdin=%s stdout=%s stderr=%s", |
| job_specs->std_in, job_specs->std_out, job_specs->std_err); |
| |
| debug3(" work_dir=%s alloc_node:sid=%s:%u", |
| job_specs->work_dir, |
| job_specs->alloc_node, job_specs->alloc_sid); |
| |
| debug3(" resp_host=%s alloc_resp_port=%u other_port=%u", |
| job_specs->resp_host, |
| job_specs->alloc_resp_port, job_specs->other_port); |
| debug3(" dependency=%s account=%s qos=%s comment=%s", |
| job_specs->dependency, job_specs->account, |
| job_specs->qos, job_specs->comment); |
| |
| num_tasks = (job_specs->num_tasks != NO_VAL) ? |
| (long) job_specs->num_tasks : -1L; |
| overcommit = (job_specs->overcommit != (uint8_t) NO_VAL) ? |
| (long) job_specs->overcommit : -1L; |
| nice = (job_specs->nice != (uint16_t) NO_VAL) ? |
| (job_specs->nice - NICE_OFFSET) : 0; |
| debug3(" mail_type=%u mail_user=%s nice=%ld num_tasks=%ld " |
| "open_mode=%u overcommit=%ld acctg_freq=%s", |
| job_specs->mail_type, job_specs->mail_user, nice, num_tasks, |
| job_specs->open_mode, overcommit, job_specs->acctg_freq); |
| |
| slurm_make_time_str(&job_specs->begin_time, buf, sizeof(buf)); |
| cpus_per_task = (job_specs->cpus_per_task != (uint16_t) NO_VAL) ? |
| (long) job_specs->cpus_per_task : -1L; |
| requeue = (job_specs->requeue != (uint16_t) NO_VAL) ? |
| (long) job_specs->requeue : -1L; |
| debug3(" network=%s begin=%s cpus_per_task=%ld requeue=%ld " |
| "licenses=%s", |
| job_specs->network, buf, cpus_per_task, requeue, |
| job_specs->licenses); |
| |
| slurm_make_time_str(&job_specs->end_time, buf, sizeof(buf)); |
| wait_all_nodes = (job_specs->wait_all_nodes != (uint16_t) NO_VAL) ? |
| (long) job_specs->wait_all_nodes : -1L; |
| if (job_specs->warn_flags & KILL_JOB_BATCH) |
| signal_flags = "B:"; |
| else |
| signal_flags = ""; |
| debug3(" end_time=%s signal=%s%u@%u wait_all_nodes=%ld", |
| buf, signal_flags, job_specs->warn_signal, job_specs->warn_time, |
| wait_all_nodes); |
| |
| ntasks_per_node = (job_specs->ntasks_per_node != (uint16_t) NO_VAL) ? |
| (long) job_specs->ntasks_per_node : -1L; |
| ntasks_per_socket = (job_specs->ntasks_per_socket != |
| (uint16_t) NO_VAL) ? |
| (long) job_specs->ntasks_per_socket : -1L; |
| ntasks_per_core = (job_specs->ntasks_per_core != (uint16_t) NO_VAL) ? |
| (long) job_specs->ntasks_per_core : -1L; |
| debug3(" ntasks_per_node=%ld ntasks_per_socket=%ld " |
| "ntasks_per_core=%ld", |
| ntasks_per_node, ntasks_per_socket, ntasks_per_core); |
| |
| debug3(" cpus_bind=%u:%s mem_bind=%u:%s plane_size:%u", |
| job_specs->cpu_bind_type, job_specs->cpu_bind, |
| job_specs->mem_bind_type, job_specs->mem_bind, |
| job_specs->plane_size); |
| debug3(" array_inx=%s", job_specs->array_inx); |
| |
| select_g_select_jobinfo_sprint(job_specs->select_jobinfo, |
| buf, sizeof(buf), SELECT_PRINT_MIXED); |
| if (buf[0] != '\0') |
| debug3(" %s", buf); |
| } |
| |
| |
| /* |
| * init_job_conf - initialize the job configuration tables and values. |
| * this should be called after creating node information, but |
| * before creating any job entries. Pre-existing job entries are |
| * left unchanged. |
| * NOTE: The job hash table size does not change after initial creation. |
| * RET 0 if no error, otherwise an error code |
| * global: last_job_update - time of last job table update |
| * job_list - pointer to global job list |
| */ |
| int init_job_conf(void) |
| { |
| if (job_list == NULL) { |
| job_count = 0; |
| job_list = list_create(_list_delete_job); |
| } |
| |
| last_job_update = time(NULL); |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * rehash_jobs - Create or rebuild the job hash table. |
| * NOTE: run lock_slurmctld before entry: Read config, write job |
| */ |
| extern void rehash_jobs(void) |
| { |
| if (job_hash == NULL) { |
| hash_table_size = slurmctld_conf.max_job_cnt; |
| job_hash = (struct job_record **) |
| xmalloc(hash_table_size * sizeof(struct job_record *)); |
| job_array_hash_j = (struct job_record **) |
| xmalloc(hash_table_size * sizeof(struct job_record *)); |
| job_array_hash_t = (struct job_record **) |
| xmalloc(hash_table_size * sizeof(struct job_record *)); |
| } else if (hash_table_size < (slurmctld_conf.max_job_cnt / 2)) { |
| /* If the MaxJobCount grows by too much, the hash table will |
| * be ineffective without rebuilding. We don't presently bother |
| * to rebuild the hash table, but cut MaxJobCount back as |
| * needed. */ |
| error ("MaxJobCount reset too high, restart slurmctld"); |
| slurmctld_conf.max_job_cnt = hash_table_size; |
| } |
| } |
| |
| /* Create an exact copy of an existing job record for a job array. |
|  * Assumes the job has no resource allocation */ |
| static struct job_record *_job_rec_copy(struct job_record *job_ptr, |
| uint32_t array_task_id) |
| { |
| struct job_record *job_ptr_new = NULL, *save_job_next; |
| struct job_details *job_details, *details_new, *save_details; |
| uint32_t save_job_id; |
| priority_factors_object_t *save_prio_factors; |
| List save_step_list; |
| int error_code = SLURM_SUCCESS; |
| int i; |
| |
| job_ptr_new = create_job_record(&error_code); |
| if (!job_ptr_new) /* MaxJobCount checked when job array submitted */ |
| fatal("job array create_job_record error"); |
| if (error_code != SLURM_SUCCESS) |
| return NULL; |
| |
| /* Set job-specific ID and hash table */ |
| if (_set_job_id(job_ptr_new) != SLURM_SUCCESS) |
| fatal("job array create_job_record error"); |
| if (_copy_job_desc_files(job_ptr->job_id, job_ptr_new->job_id)) { |
| error("%s: failed to create task %u for job %u", |
| __func__, array_task_id, job_ptr->job_id); |
| (void) _purge_job_record(job_ptr_new->job_id); |
| return NULL; |
| } |
| _add_job_hash(job_ptr_new); |
| |
| /* Copy most of original job data. |
| * This could be done in parallel, but performance was worse. */ |
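| /* The shallow memcpy() below would clobber the new record's own identity |
| * and container pointers, so job_id, job_next, details, prio_factors and |
| * step_list are saved first and restored after the copy; heap-allocated |
| * members are then duplicated further below. */ |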
| save_job_id = job_ptr_new->job_id; |
| save_job_next = job_ptr_new->job_next; |
| save_details = job_ptr_new->details; |
| save_prio_factors = job_ptr_new->prio_factors; |
| save_step_list = job_ptr_new->step_list; |
| memcpy(job_ptr_new, job_ptr, sizeof(struct job_record)); |
| job_ptr_new->job_id = save_job_id; |
| job_ptr_new->job_next = save_job_next; |
| job_ptr_new->details = save_details; |
| job_ptr_new->prio_factors = save_prio_factors; |
| job_ptr_new->step_list = save_step_list; |
| |
| job_ptr_new->array_job_id = job_ptr->job_id; |
| job_ptr_new->array_task_id = array_task_id; |
| _add_job_array_hash(job_ptr_new); |
| |
| job_ptr_new->account = xstrdup(job_ptr->account); |
| job_ptr_new->alias_list = xstrdup(job_ptr->alias_list); |
| job_ptr_new->alloc_node = xstrdup(job_ptr->alloc_node); |
| job_ptr_new->batch_host = xstrdup(job_ptr->batch_host); |
| if (job_ptr->check_job) { |
| job_ptr_new->check_job = |
| checkpoint_copy_jobinfo(job_ptr->check_job); |
| } |
| job_ptr_new->comment = xstrdup(job_ptr->comment); |
| /* struct job_details *details; *** NOTE: Copied below */ |
| job_ptr_new->gres = xstrdup(job_ptr->gres); |
| if (job_ptr->gres_list) { |
| job_ptr_new->gres_list = |
| gres_plugin_job_state_dup(job_ptr->gres_list); |
| } |
| job_ptr_new->gres_alloc = xstrdup(job_ptr->gres_alloc); |
| job_ptr_new->gres_req = xstrdup(job_ptr->gres_req); |
| job_ptr_new->gres_used = xstrdup(job_ptr->gres_used); |
| job_ptr_new->licenses = xstrdup(job_ptr->licenses); |
| job_ptr_new->license_list = license_job_copy(job_ptr->license_list); |
| job_ptr_new->mail_user = xstrdup(job_ptr->mail_user); |
| job_ptr_new->name = xstrdup(job_ptr->name); |
| job_ptr_new->network = xstrdup(job_ptr->network); |
| job_ptr_new->nodes = xstrdup(job_ptr->nodes); |
| if (job_ptr->node_cnt && job_ptr->node_addr) { |
| i = sizeof(slurm_addr_t) * job_ptr->node_cnt; |
| job_ptr_new->node_addr = xmalloc(i); |
| memcpy(job_ptr_new->node_addr, job_ptr->node_addr, i); |
| } |
| if (job_ptr->node_bitmap) |
| job_ptr_new->node_bitmap = bit_copy(job_ptr->node_bitmap); |
| if (job_ptr->node_bitmap_cg) |
| job_ptr_new->node_bitmap_cg = bit_copy(job_ptr->node_bitmap_cg); |
| job_ptr_new->nodes_completing = xstrdup(job_ptr->nodes_completing); |
| job_ptr_new->partition = xstrdup(job_ptr->partition); |
| job_ptr_new->part_ptr_list = part_list_copy(job_ptr->part_ptr_list); |
| /* On jobs that are held, the priority_array isn't set up yet, |
| so check that it exists before copying. |
| */ |
| if (job_ptr->part_ptr_list && job_ptr->priority_array) { |
| i = list_count(job_ptr->part_ptr_list) * sizeof(uint32_t); |
| job_ptr_new->priority_array = xmalloc(i); |
| memcpy(job_ptr_new->priority_array, job_ptr->priority_array, i); |
| } |
| job_ptr_new->resv_name = xstrdup(job_ptr->resv_name); |
| job_ptr_new->resp_host = xstrdup(job_ptr->resp_host); |
| if (job_ptr->select_jobinfo) { |
| job_ptr_new->select_jobinfo = |
| select_g_select_jobinfo_copy(job_ptr->select_jobinfo); |
| } |
| if (job_ptr->spank_job_env_size) { |
| job_ptr_new->spank_job_env = |
| xmalloc(sizeof(char *) * |
| (job_ptr->spank_job_env_size + 1)); |
| for (i = 0; i < job_ptr->spank_job_env_size; i++) { |
| job_ptr_new->spank_job_env[i] = |
| xstrdup(job_ptr->spank_job_env[i]); |
| } |
| } |
| job_ptr_new->state_desc = xstrdup(job_ptr->state_desc); |
| job_ptr_new->wckey = xstrdup(job_ptr->wckey); |
| |
| job_details = job_ptr->details; |
| details_new = job_ptr_new->details; |
| memcpy(details_new, job_details, sizeof(struct job_details)); |
| details_new->acctg_freq = xstrdup(job_details->acctg_freq); |
| if (job_details->argc) { |
| details_new->argv = |
| xmalloc(sizeof(char *) * (job_details->argc + 1)); |
| for (i = 0; i < job_details->argc; i++) { |
| details_new->argv[i] = xstrdup(job_details->argv[i]); |
| } |
| } |
| details_new->ckpt_dir = xstrdup(job_details->ckpt_dir); |
| details_new->cpu_bind = xstrdup(job_details->cpu_bind); |
| details_new->depend_list = depended_list_copy(job_details->depend_list); |
| details_new->dependency = xstrdup(job_details->dependency); |
| details_new->orig_dependency = xstrdup(job_details->orig_dependency); |
| if (job_details->env_cnt) { |
| details_new->env_sup = |
| xmalloc(sizeof(char *) * (job_details->env_cnt + 1)); |
| for (i = 0; i < job_details->env_cnt; i++) { |
| details_new->env_sup[i] = |
| xstrdup(job_details->env_sup[i]); |
| } |
| } |
| if (job_details->exc_node_bitmap) { |
| details_new->exc_node_bitmap = |
| bit_copy(job_details->exc_node_bitmap); |
| } |
| details_new->exc_nodes = xstrdup(job_details->exc_nodes); |
| details_new->feature_list = |
| feature_list_copy(job_details->feature_list); |
| details_new->features = xstrdup(job_details->features); |
| if (job_details->mc_ptr) { |
| i = sizeof(multi_core_data_t); |
| details_new->mc_ptr = xmalloc(i); |
| memcpy(details_new->mc_ptr, job_details->mc_ptr, i); |
| } |
| details_new->mem_bind = xstrdup(job_details->mem_bind); |
| if (job_details->req_node_bitmap) { |
| details_new->req_node_bitmap = |
| bit_copy(job_details->req_node_bitmap); |
| } |
| if (job_details->req_node_layout && job_details->req_node_bitmap) { |
| i = bit_set_count(job_details->req_node_bitmap) * |
| sizeof(uint16_t); |
| details_new->req_node_layout = xmalloc(i); |
| memcpy(details_new->req_node_layout, |
| job_details->req_node_layout, i); |
| } |
| details_new->req_nodes = xstrdup(job_details->req_nodes); |
| details_new->restart_dir = xstrdup(job_details->restart_dir); |
| details_new->std_err = xstrdup(job_details->std_err); |
| details_new->std_in = xstrdup(job_details->std_in); |
| details_new->std_out = xstrdup(job_details->std_out); |
| details_new->work_dir = xstrdup(job_details->work_dir); |
| |
| return job_ptr_new; |
| } |
| |
| /* Convert a single job record into an array of job records. |
| * Job record validation is complete, so we only need to duplicate the record |
| * and update job and array ID values */ |
| static void _create_job_array(struct job_record *job_ptr, |
| job_desc_msg_t *job_specs) |
| { |
| struct job_record *job_ptr_new; |
| uint32_t i; |
| int i_first, i_last; |
| |
| if (!job_specs->array_bitmap) |
| return; |
| i_first = bit_ffs(job_specs->array_bitmap); |
| if (i_first == -1) { |
| error("_create_job_array: job %u array_bitmap is empty", |
| job_ptr->job_id); |
| return; |
| } |
| job_ptr->array_job_id = job_ptr->job_id; |
| job_ptr->array_task_id = i_first; |
| _add_job_array_hash(job_ptr); |
| |
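| /* Create one new job record for each additional task ID set in the |
| * array bitmap; the original record keeps the first task ID. */ |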
| i_last = bit_fls(job_specs->array_bitmap); |
| for (i = (i_first + 1); i <= i_last; i++) { |
| if (!bit_test(job_specs->array_bitmap, i)) |
| continue; |
| job_ptr_new = _job_rec_copy(job_ptr, i); |
| if (!job_ptr_new) |
| break; |
| /* Make sure the db_index is zero |
| * for array elements in case the |
| * first element had the index assigned. |
| */ |
| job_ptr_new->db_index = 0; |
| acct_policy_add_job_submit(job_ptr); |
| } |
| } |
| |
| /* |
| * Wrapper for select_nodes() function that will test all valid partitions |
| * for a new job |
| * IN job_ptr - pointer to the job record |
| * IN test_only - if set do not allocate nodes, just confirm they |
| * could be allocated now |
| * IN select_node_bitmap - bitmap of nodes to be used for the |
| * job's resource allocation (not returned if NULL), caller |
| * must free |
| */ |
| static int _select_nodes_parts(struct job_record *job_ptr, bool test_only, |
| bitstr_t **select_node_bitmap) |
| { |
| struct part_record *part_ptr; |
| ListIterator iter; |
| int rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| |
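| /* For a multi-partition request, try each partition in list order and |
| * stop at the first result other than a busy or config error. */ |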
| if (job_ptr->part_ptr_list) { |
| iter = list_iterator_create(job_ptr->part_ptr_list); |
| while ((part_ptr = list_next(iter))) { |
| job_ptr->part_ptr = part_ptr; |
| debug2("Try job %u on next partition %s", |
| job_ptr->job_id, part_ptr->name); |
| if (job_limits_check(&job_ptr, false) != WAIT_NO_REASON) |
| continue; |
| rc = select_nodes(job_ptr, test_only, |
| select_node_bitmap); |
| if ((rc != ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && |
| (rc != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) && |
| (rc != ESLURM_NODES_BUSY)) |
| break; |
| if ((job_ptr->preempt_in_progress) && |
| (rc != ESLURM_NODES_BUSY)) |
| break; |
| } |
| list_iterator_destroy(iter); |
| } else { |
| if (job_limits_check(&job_ptr, false) != WAIT_NO_REASON) |
| test_only = true; |
| rc = select_nodes(job_ptr, test_only, select_node_bitmap); |
| } |
| |
| return rc; |
| } |
| |
| /* |
| * job_allocate - create job_records for the supplied job specification and |
| * allocate nodes for it. |
| * IN job_specs - job specifications |
| * IN immediate - if set then either initiate the job immediately or fail |
| * IN will_run - don't initiate the job if set, just test if it could run |
| * now or later |
| * OUT resp - will run response (includes start location, time, etc.) |
| * IN allocate - resource allocation request only if set, batch job if zero |
| * IN submit_uid - uid of user issuing the request |
| * OUT job_pptr - set to pointer to job record |
| * OUT err_msg - Custom error message to the user, caller to xfree results |
| * RET 0 or an error code. If the job would only be able to execute with |
| * some change in partition configuration then |
| * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned |
| * globals: job_list - pointer to global job list |
| * list_part - global list of partition info |
| * default_part_loc - pointer to default partition |
| * NOTE: lock_slurmctld on entry: Read config, Write job, Write node, Read part |
| */ |
| extern int job_allocate(job_desc_msg_t * job_specs, int immediate, |
| int will_run, will_run_response_msg_t **resp, |
| int allocate, uid_t submit_uid, |
| struct job_record **job_pptr, char **err_msg) |
| { |
| static int defer_sched = -1; |
| int error_code, i; |
| bool no_alloc, top_prio, test_only, too_fragmented, independent; |
| struct job_record *job_ptr; |
| time_t now = time(NULL); |
| |
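| /* Reject the request if it would exceed MaxJobCount; a job array |
| * counts once per task in its array bitmap. */ |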
| if (job_specs->array_bitmap) { |
| i = bit_set_count(job_specs->array_bitmap); |
| if ((job_count + i) >= slurmctld_conf.max_job_cnt) { |
| info("%s: MaxJobCount limit reached (%d + %d >= %u)", |
| __func__, job_count, i, |
| slurmctld_conf.max_job_cnt); |
| return EAGAIN; |
| } |
| } else if (job_count >= slurmctld_conf.max_job_cnt) { |
| info("%s: MaxJobCount limit reached (%u)", |
| __func__, slurmctld_conf.max_job_cnt); |
| return EAGAIN; |
| } |
| |
| error_code = _job_create(job_specs, allocate, will_run, |
| &job_ptr, submit_uid, err_msg); |
| *job_pptr = job_ptr; |
| |
| if (error_code) { |
| if (job_ptr && (immediate || will_run)) { |
| /* this should never really happen here */ |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| } |
| return error_code; |
| } |
| xassert(job_ptr); |
| if (job_specs->array_bitmap) |
| independent = false; |
| else |
| independent = job_independent(job_ptr, will_run); |
| /* priority needs to be calculated after this since we set a |
| * begin time in job_independent and that lets us know if the |
| * job is eligible. |
| */ |
| if (job_ptr->priority == NO_VAL) |
| set_job_prio(job_ptr); |
| |
| if (independent && |
| (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) |
| independent = false; |
| |
| /* Avoid resource fragmentation if important */ |
| if ((submit_uid || (job_specs->req_nodes == NULL)) && |
| independent && job_is_completing()) |
| too_fragmented = true; /* Don't pick nodes for job now */ |
| /* FIXME: Ideally we only want to refuse the request if the |
| * required node list is insufficient to satisfy the job's |
| * processor or node count requirements, but the overhead is |
| * rather high to do that right here. We let requests from |
| * user root proceed if a node list is specified, for |
| * meta-schedulers (e.g. LCRM). */ |
| else |
| too_fragmented = false; |
| |
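| /* SchedulerParameters=defer suppresses scheduling at submit time, so |
| * treat it like fragmentation and leave the job queued. */ |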
| if (defer_sched == -1) { |
| char *sched_params = slurm_get_sched_params(); |
| if (sched_params && strstr(sched_params, "defer")) |
| defer_sched = 1; |
| else |
| defer_sched = 0; |
| xfree(sched_params); |
| } |
| if (defer_sched == 1) |
| too_fragmented = true; |
| |
| if (independent && (!too_fragmented)) |
| top_prio = _top_priority(job_ptr); |
| else |
| top_prio = true; /* don't bother testing, |
| * it is not runnable anyway */ |
| if (immediate && (too_fragmented || (!top_prio) || (!independent))) { |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| if (!independent) |
| return ESLURM_DEPENDENCY; |
| else if (too_fragmented) |
| return ESLURM_FRAGMENTATION; |
| else |
| return ESLURM_NOT_TOP_PRIORITY; |
| } |
| |
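| /* For a "will run" request, report when and where the job could |
| * begin, then discard the temporary job record. */ |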
| if (will_run && resp) { |
| job_desc_msg_t job_desc_msg; |
| int rc; |
| memset(&job_desc_msg, 0, sizeof(job_desc_msg_t)); |
| job_desc_msg.job_id = job_ptr->job_id; |
| rc = job_start_data(&job_desc_msg, resp); |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->start_time = job_ptr->end_time = now; |
| _purge_job_record(job_ptr->job_id); |
| return rc; |
| } |
| |
| test_only = will_run || (allocate == 0); |
| |
| no_alloc = test_only || too_fragmented || |
| (!top_prio) || (!independent) || !avail_front_end(job_ptr); |
| error_code = _select_nodes_parts(job_ptr, no_alloc, NULL); |
| if (!test_only) { |
| last_job_update = now; |
| slurm_sched_g_schedule(); /* work for external scheduler */ |
| } |
| |
| slurmctld_diag_stats.jobs_submitted++; |
| acct_policy_add_job_submit(job_ptr); |
| |
| if ((error_code == ESLURM_NODES_BUSY) || |
| (error_code == ESLURM_JOB_HELD) || |
| (error_code == ESLURM_NODE_NOT_AVAIL) || |
| (error_code == ESLURM_QOS_THRES) || |
| (error_code == ESLURM_ACCOUNTING_POLICY) || |
| (error_code == ESLURM_RESERVATION_NOT_USABLE) || |
| (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) { |
| /* Not fatal error, but job can't be scheduled right now */ |
| if (immediate) { |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| } else { /* job remains queued */ |
| _create_job_array(job_ptr, job_specs); |
| if ((error_code == ESLURM_NODES_BUSY) || |
| (error_code == ESLURM_ACCOUNTING_POLICY)) { |
| error_code = SLURM_SUCCESS; |
| } |
| } |
| return error_code; |
| } |
| |
| if (error_code) { /* fundamental flaw in job request */ |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| return error_code; |
| } |
| |
| if (will_run) { /* job would run, flag job destruction */ |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->start_time = job_ptr->end_time = now; |
| _purge_job_record(job_ptr->job_id); |
| } else if (!with_slurmdbd && !job_ptr->db_index) |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| if (!will_run) { |
| _create_job_array(job_ptr, job_specs); |
| debug2("sched: JobId=%u allocated resources: NodeList=%s", |
| job_ptr->job_id, job_ptr->nodes); |
| rebuild_job_part_list(job_ptr); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * job_fail - terminate a job due to initiation failure |
| * IN job_id - id of the job to be killed |
| * IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.) |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_fail(uint32_t job_id, uint16_t job_state) |
| { |
| struct job_record *job_ptr; |
| time_t now = time(NULL); |
| bool suspended = false; |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| error("job_fail: invalid job id %u", job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| if (IS_JOB_FINISHED(job_ptr)) |
| return ESLURM_ALREADY_DONE; |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| enum job_states suspend_job_state = job_ptr->job_state; |
| /* we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_ptr->job_state = JOB_CANCELLED; |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_ptr->job_state = suspend_job_state; |
| suspended = true; |
| } |
| |
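| /* For a running or suspended job, set the requested failure state |
| * plus COMPLETING and release its allocated nodes. */ |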
| if (IS_JOB_RUNNING(job_ptr) || suspended) { |
| /* No need to signal steps, deallocate kills them */ |
| job_ptr->time_last_active = now; |
| if (suspended) { |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| } else |
| job_ptr->end_time = now; |
| last_job_update = now; |
| job_ptr->job_state = job_state | JOB_COMPLETING; |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_LAUNCH; |
| xfree(job_ptr->state_desc); |
| job_completion_logger(job_ptr, false); |
| if (job_ptr->node_bitmap) { |
| build_cg_bitmap(job_ptr); |
| deallocate_nodes(job_ptr, false, suspended, false); |
| } |
| return SLURM_SUCCESS; |
| } |
| /* All other states */ |
| verbose("job_fail: job %u can't be killed from state=%s", |
| job_id, job_state_string(job_ptr->job_state)); |
| return ESLURM_TRANSITION_STATE_NO_UPDATE; |
| } |
| |
| /* |
| * job_signal - signal the specified job |
| * IN job_id - id of the job to be signaled |
| * IN signal - signal to send, SIGKILL == cancel the job |
| * IN flags - see KILL_JOB_* flags in slurm.h |
| * IN uid - uid of requesting user |
| * IN preempt - true if job being preempted |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_signal(uint32_t job_id, uint16_t signal, uint16_t flags, |
| uid_t uid, bool preempt) |
| { |
| struct job_record *job_ptr; |
| time_t now = time(NULL); |
| uint16_t job_term_state; |
| |
| /* Jobs submitted using Moab command should be cancelled using |
| * Moab command for accurate job records */ |
| if (!wiki_sched_test) { |
| char *sched_type = slurm_get_sched_type(); |
| if (strcmp(sched_type, "sched/wiki") == 0) |
| wiki_sched = true; |
| if (strcmp(sched_type, "sched/wiki2") == 0) { |
| wiki_sched = true; |
| wiki2_sched = true; |
| } |
| xfree(sched_type); |
| wiki_sched_test = true; |
| } |
| |
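| /* Locate the job; a KILL_JOB_ARRAY request against the array's job ID |
| * is expanded into one job_signal() call per array task. */ |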
| job_ptr = find_job_record(job_id); |
| if ((flags & KILL_JOB_ARRAY) && /* signal entire job array */ |
| ((job_ptr == NULL) || (job_ptr->array_task_id != NO_VAL))) { |
| int rc = SLURM_SUCCESS, rc1; |
| ListIterator job_iter; |
| |
| flags &= (~KILL_JOB_ARRAY); |
| job_iter = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iter))) { |
| if ((job_ptr->array_job_id != job_id) || |
| (job_ptr->array_task_id == NO_VAL)) |
| continue; |
| if (IS_JOB_FINISHED(job_ptr)) |
| continue; |
| rc1 = job_signal(job_ptr->job_id, signal, flags, |
| uid, preempt); |
| rc = MAX(rc, rc1); |
| } |
| list_iterator_destroy(job_iter); |
| return rc; |
| } |
| if (job_ptr == NULL) { |
| info("job_signal: invalid job id %u", job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| if ((job_ptr->user_id != uid) && !validate_operator(uid) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
| job_ptr->account)) { |
| error("Security violation, JOB_CANCEL RPC from uid %d", |
| uid); |
| return ESLURM_ACCESS_DENIED; |
| } |
| if (!validate_slurm_user(uid) && (signal == SIGKILL) && |
| job_ptr->part_ptr && |
| (job_ptr->part_ptr->flags & PART_FLAG_ROOT_ONLY) && wiki2_sched) { |
| info("Attempt to cancel Moab job using Slurm command from " |
| "uid %d", uid); |
| return ESLURM_ACCESS_DENIED; |
| } |
| |
| if (IS_JOB_FINISHED(job_ptr)) |
| return ESLURM_ALREADY_DONE; |
| |
| /* let node select plugin do any state-dependent signalling actions */ |
| select_g_job_signal(job_ptr, signal); |
| |
| /* save user ID of the one who requested the job be cancelled */ |
| if (signal == SIGKILL) |
| job_ptr->requid = uid; |
| if (IS_JOB_PENDING(job_ptr) && IS_JOB_COMPLETING(job_ptr) && |
| (signal == SIGKILL)) { |
| if ((job_ptr->job_state & JOB_STATE_BASE) == JOB_PENDING) { |
| /* Prevent job requeue, otherwise preserve state */ |
| job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING; |
| } |
| /* build_cg_bitmap() not needed, job already completing */ |
| verbose("job_signal of requeuing job %u successful", job_id); |
| return SLURM_SUCCESS; |
| } |
| |
| if (IS_JOB_PENDING(job_ptr) && (signal == SIGKILL)) { |
| last_job_update = now; |
| job_ptr->job_state = JOB_CANCELLED; |
| job_ptr->start_time = now; |
| job_ptr->end_time = now; |
| srun_allocate_abort(job_ptr); |
| job_completion_logger(job_ptr, false); |
| verbose("job_signal of pending job %u successful", job_id); |
| return SLURM_SUCCESS; |
| } |
| |
| if (preempt) |
| job_term_state = JOB_PREEMPTED; |
| else |
| job_term_state = JOB_CANCELLED; |
| if (IS_JOB_SUSPENDED(job_ptr) && (signal == SIGKILL)) { |
| last_job_update = now; |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); |
| job_ptr->job_state = job_term_state | JOB_COMPLETING; |
| build_cg_bitmap(job_ptr); |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_completion_logger(job_ptr, false); |
| deallocate_nodes(job_ptr, false, true, preempt); |
| verbose("job_signal %u of suspended job %u successful", |
| signal, job_id); |
| return SLURM_SUCCESS; |
| } |
| |
| if (IS_JOB_RUNNING(job_ptr)) { |
| if (signal == SIGKILL) { |
| /* No need to signal steps, deallocate kills them */ |
| job_ptr->time_last_active = now; |
| job_ptr->end_time = now; |
| last_job_update = now; |
| job_ptr->job_state = job_term_state | JOB_COMPLETING; |
| build_cg_bitmap(job_ptr); |
| job_completion_logger(job_ptr, false); |
| deallocate_nodes(job_ptr, false, false, preempt); |
| } else if (flags & KILL_JOB_BATCH) { |
| if (job_ptr->batch_flag) |
| _signal_batch_job(job_ptr, signal); |
| else |
| return ESLURM_JOB_SCRIPT_MISSING; |
| } else { |
| _signal_job(job_ptr, signal); |
| } |
| verbose("job_signal %u of running job %u successful 0x%x", |
| signal, job_id, job_ptr->job_state); |
| return SLURM_SUCCESS; |
| } |
| |
| verbose("job_signal: job %u can't be sent signal %u from state=%s", |
| job_id, signal, job_state_string(job_ptr->job_state)); |
| return ESLURM_TRANSITION_STATE_NO_UPDATE; |
| } |
| |
| static void |
| _signal_batch_job(struct job_record *job_ptr, uint16_t signal) |
| { |
| bitoff_t i; |
| kill_tasks_msg_t *kill_tasks_msg = NULL; |
| agent_arg_t *agent_args = NULL; |
| uint32_t z; |
| |
| xassert(job_ptr); |
| xassert(job_ptr->batch_host); |
| i = bit_ffs(job_ptr->node_bitmap); |
| if (i < 0) { |
| error("_signal_batch_job JobId=%u lacks assigned nodes", |
| job_ptr->job_id); |
| return; |
| } |
| |
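| /* Build an agent request that delivers the signal to the batch host only */ |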
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| agent_args->msg_type = REQUEST_SIGNAL_TASKS; |
| agent_args->retry = 1; |
| agent_args->node_count = 1; |
| #ifdef HAVE_FRONT_END |
| if (job_ptr->front_end_ptr) |
| agent_args->protocol_version = |
| job_ptr->front_end_ptr->protocol_version; |
| #else |
| struct node_record *node_ptr; |
| if ((node_ptr = find_node_record(job_ptr->batch_host))) |
| agent_args->protocol_version = node_ptr->protocol_version; |
| #endif |
| agent_args->hostlist = hostlist_create(job_ptr->batch_host); |
| kill_tasks_msg = xmalloc(sizeof(kill_tasks_msg_t)); |
| kill_tasks_msg->job_id = job_ptr->job_id; |
| kill_tasks_msg->job_step_id = NO_VAL; |
| /* Encode the KILL_JOB_BATCH flag for |
| * stepd to know if it has to signal only |
| * the batch script. The job was submitted |
| * using the --signal=B:sig sbatch option. |
| */ |
| z = KILL_JOB_BATCH << 24; |
| kill_tasks_msg->signal = z|signal; |
| |
| agent_args->msg_args = kill_tasks_msg; |
| agent_args->node_count = 1;/* slurm/477 be sure to update node_count */ |
| agent_queue_request(agent_args); |
| return; |
| } |
| |
| /* |
| * prolog_complete - note the normal termination of the prolog |
| * IN job_id - id of the job which completed |
| * IN requeue - job should be run again if possible |
| * IN prolog_return_code - prolog's return code, |
| * if set then set job state to FAILED |
| * RET - 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer to global job list |
| * last_job_update - time of last job table update |
| */ |
| extern int prolog_complete(uint32_t job_id, bool requeue, |
| uint32_t prolog_return_code) |
| { |
| struct job_record *job_ptr; |
| |
| debug("completing prolog for job %u", job_id); |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| info("prolog_complete: invalid JobId=%u", job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| if (IS_JOB_COMPLETING(job_ptr)) |
| return SLURM_SUCCESS; |
| |
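| /* A batch_flag greater than one means the job already failed a |
| * requeue attempt, so do not requeue it again. */ |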
| if (requeue && (job_ptr->batch_flag > 1)) { |
| /* Failed one requeue, just kill it */ |
| requeue = 0; |
| if (prolog_return_code == 0) |
| prolog_return_code = 1; |
| error("Prolog launch failure, JobId=%u", job_ptr->job_id); |
| } |
| |
| job_ptr->state_reason = WAIT_NO_REASON; |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * job_complete - note the normal termination of the specified job |
| * IN job_id - id of the job which completed |
| * IN uid - user id of user issuing the RPC |
| * IN requeue - job should be run again if possible |
| * IN node_fail - true if job terminated due to node failure |
| * IN job_return_code - job's return code, if set then set state to FAILED |
| * RET - 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer to global job list |
| * last_job_update - time of last job table update |
| */ |
| extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, |
| bool node_fail, uint32_t job_return_code) |
| { |
| struct node_record *node_ptr; |
| struct job_record *job_ptr; |
| time_t now = time(NULL); |
| uint32_t job_comp_flag = 0; |
| bool suspended = false; |
| int i; |
| int use_cloud = false; |
| |
| info("completing job %u status %d", job_id, job_return_code); |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| info("job_complete: invalid JobId=%u", job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| if (IS_JOB_FINISHED(job_ptr)) { |
| if (job_ptr->exit_code == 0) |
| job_ptr->exit_code = job_return_code; |
| return ESLURM_ALREADY_DONE; |
| } |
| |
| if ((job_ptr->user_id != uid) && !validate_slurm_user(uid)) { |
| error("Security violation, JOB_COMPLETE RPC for job %u " |
| "from uid %u", |
| job_ptr->job_id, (unsigned int) uid); |
| return ESLURM_USER_ID_MISSING; |
| } |
| |
| if (IS_JOB_COMPLETING(job_ptr)) |
| return SLURM_SUCCESS; /* avoid replay */ |
| |
| if (IS_JOB_RUNNING(job_ptr)) |
| job_comp_flag = JOB_COMPLETING; |
| else if (IS_JOB_PENDING(job_ptr)) { |
| job_return_code = NO_VAL; |
| job_ptr->start_time = now; |
| } |
| |
| if ((job_return_code == NO_VAL) && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_PENDING(job_ptr))) { |
| info("Job %u cancelled from interactive user or node failure", |
| job_ptr->job_id); |
| } |
| |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| enum job_states suspend_job_state = job_ptr->job_state; |
| /* we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_ptr->job_state = JOB_CANCELLED; |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_ptr->job_state = suspend_job_state; |
| job_comp_flag = JOB_COMPLETING; |
| suspended = true; |
| } |
| |
| if (requeue && (job_ptr->batch_flag > 1)) { |
| /* Failed one requeue, just kill it */ |
| requeue = 0; |
| if (job_return_code == 0) |
| job_return_code = 1; |
| info("Batch job launch failure, JobId=%u", job_ptr->job_id); |
| } |
| |
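| /* Requeue path: log the old run as terminated for accounting, then |
| * return the batch job to pending with an updated restart count. */ |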
| if (requeue && job_ptr->details && job_ptr->batch_flag) { |
| /* We want this job to look like it |
| * was terminated in the accounting logs. |
| * Set a new submit time so the restarted |
| * job looks like a new job. */ |
| job_ptr->end_time = now; |
| job_ptr->job_state = JOB_NODE_FAIL; |
| job_completion_logger(job_ptr, true); |
| /* do this after the epilog complete, setting it here |
| * is too early */ |
| //job_ptr->db_index = 0; |
| //job_ptr->details->submit_time = now + 1; |
| if (job_ptr->node_bitmap) { |
| i = bit_ffs(job_ptr->node_bitmap); |
| if (i >= 0) { |
| node_ptr = node_record_table_ptr + i; |
| if (IS_NODE_CLOUD(node_ptr)) |
| use_cloud = true; |
| } |
| } |
| if (!use_cloud) |
| job_ptr->batch_flag++; /* only one retry */ |
| job_ptr->restart_cnt++; |
| job_ptr->job_state = JOB_PENDING | job_comp_flag; |
| /* Since the job completion logger removes the job submit |
| * information, we need to add it again. */ |
| acct_policy_add_job_submit(job_ptr); |
| if (node_fail) { |
| info("Requeue JobId=%u due to node failure", |
| job_ptr->job_id); |
| } else { |
| info("Requeue JobId=%u per user/system request", |
| job_ptr->job_id); |
| } |
| } else if (IS_JOB_PENDING(job_ptr) && job_ptr->details && |
| job_ptr->batch_flag) { |
| /* Possible failure mode with DOWN node and job requeue. |
| * The DOWN node might actually respond to the cancel and |
| * take us here. Don't run job_completion_logger |
| * here since this is here to catch duplicate cancels |
| * from slow responding slurmds */ |
| return SLURM_SUCCESS; |
| } else { |
| if (node_fail) { |
| job_ptr->job_state = JOB_NODE_FAIL | job_comp_flag; |
| job_ptr->requid = uid; |
| } else if (job_return_code == NO_VAL) { |
| job_ptr->job_state = JOB_CANCELLED | job_comp_flag; |
| job_ptr->requid = uid; |
| } else if (WIFEXITED(job_return_code) && |
| WEXITSTATUS(job_return_code)) { |
| job_ptr->job_state = JOB_FAILED | job_comp_flag; |
| job_ptr->exit_code = job_return_code; |
| job_ptr->state_reason = FAIL_EXIT_CODE; |
| xfree(job_ptr->state_desc); |
| } else if (job_comp_flag |
| && ((job_ptr->end_time |
| + slurmctld_conf.over_time_limit * 60) < now)) { |
| /* The job ran past its end time plus the configured OverTimeLimit, |
| * so record it as TIMEOUT. */ |
| job_ptr->job_state = JOB_TIMEOUT | job_comp_flag; |
| job_ptr->exit_code = MAX(job_ptr->exit_code, 1); |
| job_ptr->state_reason = FAIL_TIMEOUT; |
| xfree(job_ptr->state_desc); |
| } else { |
| job_ptr->job_state = JOB_COMPLETE | job_comp_flag; |
| job_ptr->exit_code = job_return_code; |
| if (nonstop_ops.job_fini) |
| (nonstop_ops.job_fini)(job_ptr); |
| } |
| |
| if (suspended) { |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| } else |
| job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| } |
| |
| last_job_update = now; |
| job_ptr->time_last_active = now; /* Timer for resending kill RPC */ |
| if (job_comp_flag) { /* job was running */ |
| build_cg_bitmap(job_ptr); |
| deallocate_nodes(job_ptr, false, suspended, false); |
| } |
| info("sched: job_complete for JobId=%u successful, exit code=%u", |
| job_id, job_return_code); |
| return SLURM_SUCCESS; |
| } |
| |
| static int _alt_part_test(struct part_record *part_ptr, |
| struct part_record **part_ptr_new) |
| { |
| struct part_record *alt_part_ptr = NULL; |
| char *alt_name; |
| |
| *part_ptr_new = NULL; |
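| /* Walk the partition's chain of alternates until one that accepts |
| * job submissions is found. */ |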
| if ((part_ptr->state_up & PARTITION_SUBMIT) == 0) { |
| info("_alt_part_test: original partition is not available " |
| "(drain or inactive): %s", part_ptr->name); |
| alt_name = part_ptr->alternate; |
| while (alt_name) { |
| alt_part_ptr = find_part_record(alt_name); |
| if (alt_part_ptr == NULL) { |
| info("_alt_part_test: invalid alternate " |
| "partition name specified: %s", alt_name); |
| return ESLURM_INVALID_PARTITION_NAME; |
| } |
| if (alt_part_ptr == part_ptr) { |
| info("_alt_part_test: no valid alternate " |
| "partition is available"); |
| return ESLURM_PARTITION_NOT_AVAIL; |
| } |
| if (alt_part_ptr->state_up & PARTITION_SUBMIT) |
| break; |
| /* Try next alternate in the sequence */ |
| alt_name = alt_part_ptr->alternate; |
| } |
| if (alt_name == NULL) { |
| info("_alt_part_test: no valid alternate partition is " |
| "available"); |
| return ESLURM_PARTITION_NOT_AVAIL; |
| } |
| *part_ptr_new = alt_part_ptr; |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| /* Test if this job can use this partition */ |
| static int _part_access_check(struct part_record *part_ptr, |
| job_desc_msg_t * job_desc, bitstr_t *req_bitmap, |
| uid_t submit_uid, slurmdb_qos_rec_t *qos_ptr, |
| char *acct) |
| { |
| uint32_t total_nodes; |
| size_t resv_name_leng = 0; |
| int rc = SLURM_SUCCESS; |
| |
| if (job_desc->reservation != NULL) { |
| resv_name_leng = strlen(job_desc->reservation); |
| } |
| |
| if ((part_ptr->flags & PART_FLAG_REQ_RESV) && |
| ((job_desc->reservation == NULL) || |
| (resv_name_leng == 0))) { |
| info("_part_access_check: uid %u access to partition %s " |
| "denied, requires reservation", |
| (unsigned int) submit_uid, part_ptr->name); |
| return ESLURM_ACCESS_DENIED; |
| } |
| |
| if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && (submit_uid != 0) && |
| (submit_uid != slurmctld_conf.slurm_user_id)) { |
| info("_part_access_check: uid %u access to partition %s " |
| "denied, not root", |
| (unsigned int) submit_uid, part_ptr->name); |
| return ESLURM_ACCESS_DENIED; |
| } |
| |
| if ((job_desc->user_id == 0) && (part_ptr->flags & PART_FLAG_NO_ROOT)) { |
| error("_part_access_check: Security violation, SUBMIT_JOB for " |
| "user root disabled"); |
| return ESLURM_USER_ID_MISSING; |
| } |
| |
| if (validate_group(part_ptr, job_desc->user_id) == 0) { |
| info("_part_access_check: uid %u access to partition %s " |
| "denied, bad group", |
| (unsigned int) job_desc->user_id, part_ptr->name); |
| return ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP; |
| } |
| |
| if (validate_alloc_node(part_ptr, job_desc->alloc_node) == 0) { |
| info("_part_access_check: uid %u access to partition %s " |
| "denied, bad allocating node: %s", |
| (unsigned int) job_desc->user_id, part_ptr->name, |
| job_desc->alloc_node); |
| return ESLURM_ACCESS_DENIED; |
| } |
| |
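| /* Enforce partition-level CPU and node count ceilings for partitions |
| * that are available for scheduling. */ |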
| if ((part_ptr->state_up & PARTITION_SCHED) && |
| (job_desc->min_cpus != NO_VAL) && |
| (job_desc->min_cpus > part_ptr->total_cpus)) { |
| info("_part_access_check: Job requested too many cpus (%u) of " |
| "partition %s(%u)", |
| job_desc->min_cpus, part_ptr->name, |
| part_ptr->total_cpus); |
| return ESLURM_TOO_MANY_REQUESTED_CPUS; |
| } |
| |
| total_nodes = part_ptr->total_nodes; |
| select_g_alter_node_cnt(SELECT_APPLY_NODE_MAX_OFFSET, &total_nodes); |
| if ((part_ptr->state_up & PARTITION_SCHED) && |
| (job_desc->min_nodes != NO_VAL) && |
| (job_desc->min_nodes > total_nodes)) { |
| info("_part_access_check: Job requested too many nodes (%u) " |
| "of partition %s(%u)", |
| job_desc->min_nodes, part_ptr->name, total_nodes); |
| return ESLURM_INVALID_NODE_COUNT; |
| } |
| |
| if (req_bitmap && !bit_super_set(req_bitmap, part_ptr->node_bitmap)) { |
| info("_part_access_check: requested nodes %s not in " |
| "partition %s", job_desc->req_nodes, part_ptr->name); |
| return ESLURM_REQUESTED_NODES_NOT_IN_PARTITION; |
| } |
| |
| if (slurmctld_conf.enforce_part_limits) { |
| if ((rc = part_policy_valid_acct(part_ptr, acct)) |
| != SLURM_SUCCESS) |
| goto fini; |
| |
| if ((rc = part_policy_valid_qos(part_ptr, qos_ptr)) |
| != SLURM_SUCCESS) |
| goto fini; |
| } |
| |
| fini: |
| return rc; |
| } |
| |
| static int _get_job_parts(job_desc_msg_t * job_desc, |
| struct part_record **part_pptr, |
| List *part_pptr_list) |
| { |
| struct part_record *part_ptr = NULL, *part_ptr_new = NULL; |
| List part_ptr_list = NULL; |
| int rc = SLURM_SUCCESS; |
| |
| /* Identify partition(s) and set pointer(s) to their struct */ |
| if (job_desc->partition) { |
| part_ptr = find_part_record(job_desc->partition); |
| if (part_ptr == NULL) { |
| part_ptr_list = get_part_list(job_desc->partition); |
| if (part_ptr_list) |
| part_ptr = list_peek(part_ptr_list); |
| } |
| if (part_ptr == NULL) { |
| info("_valid_job_part: invalid partition specified: %s", |
| job_desc->partition); |
| return ESLURM_INVALID_PARTITION_NAME; |
| } |
| } else { |
| if (default_part_loc == NULL) { |
| error("_valid_job_part: default partition not set"); |
| return ESLURM_DEFAULT_PARTITION_NOT_SET; |
| } |
| part_ptr = default_part_loc; |
| job_desc->partition = xstrdup(part_ptr->name); |
| } |
| |
| /* Change partition pointer(s) to alternates as needed */ |
| if (part_ptr_list) { |
| int fail_rc = SLURM_SUCCESS; |
| struct part_record *part_ptr_tmp; |
| bool rebuild_name_list = false; |
| ListIterator iter = list_iterator_create(part_ptr_list); |
| |
| while ((part_ptr_tmp = list_next(iter))) { |
| rc = _alt_part_test(part_ptr_tmp, &part_ptr_new); |
| |
| if (rc == SLURM_SUCCESS && part_ptr_new) |
| part_ptr_tmp = part_ptr_new; |
| else if (rc != SLURM_SUCCESS) { |
| fail_rc = rc; |
| list_remove(iter); |
| rebuild_name_list = true; |
| continue; |
| } |
| if (part_ptr_new) { |
| list_insert(iter, part_ptr_new); |
| list_remove(iter); |
| rebuild_name_list = true; |
| } |
| } |
| list_iterator_destroy(iter); |
| if (list_is_empty(part_ptr_list)) { |
| if (fail_rc != SLURM_SUCCESS) |
| rc = fail_rc; |
| else |
| rc = ESLURM_PARTITION_NOT_AVAIL; |
| goto fini; |
| } |
| rc = SLURM_SUCCESS; /* At least some partition usable */ |
| if (rebuild_name_list) { |
| part_ptr = NULL; |
| xfree(job_desc->partition); |
| iter = list_iterator_create(part_ptr_list); |
| while ((part_ptr_tmp = list_next(iter))) { |
| if (job_desc->partition) |
| xstrcat(job_desc->partition, ","); |
| else |
| part_ptr = part_ptr_tmp; |
| xstrcat(job_desc->partition, |
| part_ptr_tmp->name); |
| } |
| list_iterator_destroy(iter); |
| } |
| } else { |
| rc = _alt_part_test(part_ptr, &part_ptr_new); |
| if (rc != SLURM_SUCCESS) |
| goto fini; |
| if (part_ptr_new) { |
| part_ptr = part_ptr_new; |
| xfree(job_desc->partition); |
| job_desc->partition = xstrdup(part_ptr->name); |
| } |
| } |
| |
| *part_pptr = part_ptr; |
| *part_pptr_list = part_ptr_list; |
| part_ptr_list = NULL; |
| fini: |
| return rc; |
| } |
| |
| static int _valid_job_part(job_desc_msg_t * job_desc, |
| uid_t submit_uid, bitstr_t *req_bitmap, |
| struct part_record **part_pptr, |
| List part_ptr_list, |
| slurmdb_association_rec_t *assoc_ptr, |
| slurmdb_qos_rec_t *qos_ptr) |
| { |
| int rc = SLURM_SUCCESS; |
| struct part_record *part_ptr = *part_pptr, *part_ptr_tmp; |
| slurmdb_association_rec_t assoc_rec; |
| uint32_t min_nodes_orig = INFINITE, max_nodes_orig = 1; |
| uint32_t max_time = 0; |
| |
| /* Change partition pointer(s) to alternates as needed */ |
| if (part_ptr_list) { |
| int fail_rc = SLURM_SUCCESS; |
| bool rebuild_name_list = false; |
| ListIterator iter = list_iterator_create(part_ptr_list); |
| |
| while ((part_ptr_tmp = (struct part_record *)list_next(iter))) { |
| /* FIXME: When dealing with multiple partitions we |
| * currently can't deal with partition based |
| * associations. |
| */ |
| memset(&assoc_rec, 0, |
| sizeof(slurmdb_association_rec_t)); |
| if (assoc_ptr) { |
| assoc_rec.acct = assoc_ptr->acct; |
| assoc_rec.partition = part_ptr_tmp->name; |
| assoc_rec.uid = job_desc->user_id; |
| |
| assoc_mgr_fill_in_assoc( |
| acct_db_conn, &assoc_rec, |
| accounting_enforce, NULL, false); |
| } |
| |
| if (assoc_ptr && assoc_rec.id != assoc_ptr->id) { |
| info("_valid_job_part: can't check multiple " |
| "partitions with partition based " |
| "associations"); |
| rc = SLURM_ERROR; |
| } else |
| rc = _part_access_check(part_ptr_tmp, job_desc, |
| req_bitmap, submit_uid, |
| qos_ptr, assoc_ptr ? |
| assoc_ptr->acct : NULL); |
| |
| if (rc != SLURM_SUCCESS) { |
| fail_rc = rc; |
| list_remove(iter); |
| rebuild_name_list = true; |
| continue; |
| } |
| |
| min_nodes_orig = MIN(min_nodes_orig, |
| part_ptr_tmp->min_nodes_orig); |
| max_nodes_orig = MAX(max_nodes_orig, |
| part_ptr_tmp->max_nodes_orig); |
| max_time = MAX(max_time, part_ptr_tmp->max_time); |
| } |
| list_iterator_destroy(iter); |
| if (list_is_empty(part_ptr_list)) { |
| if (fail_rc != SLURM_SUCCESS) |
| rc = fail_rc; |
| else |
| rc = ESLURM_PARTITION_NOT_AVAIL; |
| goto fini; |
| } |
| rc = SLURM_SUCCESS; /* At least some partition usable */ |
| if (rebuild_name_list) { |
| *part_pptr = part_ptr = NULL; |
| xfree(job_desc->partition); |
| iter = list_iterator_create(part_ptr_list); |
| while ((part_ptr_tmp = list_next(iter))) { |
| if (job_desc->partition) |
| xstrcat(job_desc->partition, ","); |
| else |
| *part_pptr = part_ptr = part_ptr_tmp; |
| xstrcat(job_desc->partition, |
| part_ptr_tmp->name); |
| } |
| list_iterator_destroy(iter); |
| } |
| } else { |
| min_nodes_orig = part_ptr->min_nodes_orig; |
| max_nodes_orig = part_ptr->max_nodes_orig; |
| max_time = part_ptr->max_time; |
| rc = _part_access_check(part_ptr, job_desc, req_bitmap, |
| submit_uid, qos_ptr, |
| assoc_ptr ? assoc_ptr->acct : NULL); |
| if (rc != SLURM_SUCCESS) |
| goto fini; |
| } |
| |
| /* Validate job limits against partition limits */ |
| if (job_desc->min_nodes == NO_VAL) { |
| /* Avoid setting the job request to 0 nodes if the |
| * user didn't ask for 0. */ |
| if (!min_nodes_orig) |
| job_desc->min_nodes = 1; |
| else |
| job_desc->min_nodes = min_nodes_orig; |
| } else if ((job_desc->min_nodes > max_nodes_orig) && |
| slurmctld_conf.enforce_part_limits && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags & |
| QOS_FLAG_PART_MAX_NODE)))) { |
| info("_valid_job_part: job's min nodes greater than " |
| "partition's max nodes (%u > %u)", |
| job_desc->min_nodes, max_nodes_orig); |
| rc = ESLURM_INVALID_NODE_COUNT; |
| goto fini; |
| } else if ((job_desc->min_nodes < min_nodes_orig) && |
| ((job_desc->max_nodes == NO_VAL) || |
| (job_desc->max_nodes >= min_nodes_orig))) { |
| job_desc->min_nodes = min_nodes_orig; |
| } |
| |
| if ((job_desc->max_nodes != NO_VAL) && |
| slurmctld_conf.enforce_part_limits && |
| (job_desc->max_nodes < min_nodes_orig) && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags |
| & QOS_FLAG_PART_MIN_NODE)))) { |
| info("_valid_job_part: job's max nodes less than partition's " |
| "min nodes (%u < %u)", |
| job_desc->max_nodes, min_nodes_orig); |
| rc = ESLURM_INVALID_NODE_COUNT; |
| goto fini; |
| } |
| #ifndef HAVE_FRONT_END |
| if ((job_desc->min_nodes == 0) && (job_desc->script == NULL)) { |
| info("_valid_job_part: min_nodes==0 for non-batch job"); |
| rc = ESLURM_INVALID_NODE_COUNT; |
| goto fini; |
| } |
| #endif |
| |
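| /* Apply the partition default time limit and test the job's time |
| * limits against the partition maximum, honoring QOS override flags. */ |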
| if ((job_desc->time_limit == NO_VAL) && |
| (part_ptr->default_time == 0)) { |
| info("_valid_job_part: job's default time is 0"); |
| rc = ESLURM_INVALID_TIME_LIMIT; |
| goto fini; |
| } |
| |
| if ((job_desc->time_limit == NO_VAL) && |
| (part_ptr->default_time != NO_VAL)) |
| job_desc->time_limit = part_ptr->default_time; |
| |
| if ((job_desc->time_min != NO_VAL) && |
| (job_desc->time_min > max_time) && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags & |
| QOS_FLAG_PART_TIME_LIMIT)))) { |
| info("_valid_job_part: job's min time greater than " |
| "partition's (%u > %u)", |
| job_desc->time_min, max_time); |
| rc = ESLURM_INVALID_TIME_LIMIT; |
| goto fini; |
| } |
| if ((job_desc->time_limit != NO_VAL) && |
| (job_desc->time_limit > max_time) && |
| (job_desc->time_min == NO_VAL) && |
| slurmctld_conf.enforce_part_limits && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags & |
| QOS_FLAG_PART_TIME_LIMIT)))) { |
| info("_valid_job_part: job's time limit greater than " |
| "partition's (%u > %u)", |
| job_desc->time_limit, max_time); |
| rc = ESLURM_INVALID_TIME_LIMIT; |
| goto fini; |
| } |
| if ((job_desc->time_min != NO_VAL) && |
| (job_desc->time_min > job_desc->time_limit) && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags & |
| QOS_FLAG_PART_TIME_LIMIT)))) { |
| info("_valid_job_part: job's min_time greater time limit " |
| "(%u > %u)", |
| job_desc->time_min, job_desc->time_limit); |
| rc = ESLURM_INVALID_TIME_LIMIT; |
| goto fini; |
| } |
| |
| fini: |
| return rc; |
| } |
| |
| /* |
| * job_limits_check - check the limits specified for the job. |
| * IN job_ptr - pointer to job table entry. |
| * IN check_min_time - if true test job's minimum time limit, |
| * otherwise test maximum time limit |
| * RET WAIT_NO_REASON on success, fail status otherwise. |
| */ |
| extern int job_limits_check(struct job_record **job_pptr, bool check_min_time) |
| { |
| struct job_details *detail_ptr; |
| enum job_state_reason fail_reason; |
| struct part_record *part_ptr = NULL; |
| struct job_record *job_ptr = NULL; |
| slurmdb_qos_rec_t *qos_ptr; |
| slurmdb_association_rec_t *assoc_ptr; |
| uint32_t job_min_nodes, job_max_nodes; |
| uint32_t part_min_nodes, part_max_nodes; |
| uint32_t time_check; |
| #ifdef HAVE_BG |
| static uint16_t cpus_per_node = 0; |
| if (!cpus_per_node) |
| select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT, |
| &cpus_per_node); |
| #endif |
| job_ptr = *job_pptr; |
| detail_ptr = job_ptr->details; |
| part_ptr = job_ptr->part_ptr; |
| qos_ptr = job_ptr->qos_ptr; |
| assoc_ptr = job_ptr->assoc_ptr; |
| if (!detail_ptr) { /* To prevent CLANG error */ |
| fatal("job %u has NULL details_ptr", job_ptr->job_id); |
| return WAIT_NO_REASON; |
| } |
| |
| #ifdef HAVE_BG |
| job_min_nodes = detail_ptr->min_cpus / cpus_per_node; |
| job_max_nodes = detail_ptr->max_cpus / cpus_per_node; |
| part_min_nodes = part_ptr->min_nodes_orig; |
| part_max_nodes = part_ptr->max_nodes_orig; |
| #else |
| job_min_nodes = detail_ptr->min_nodes; |
| job_max_nodes = detail_ptr->max_nodes; |
| part_min_nodes = part_ptr->min_nodes; |
| part_max_nodes = part_ptr->max_nodes; |
| #endif |
| |
| fail_reason = WAIT_NO_REASON; |
| |
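| /* Select the limit to test: the job's minimum time if requested, |
| * otherwise its full time limit. */ |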
| if (check_min_time && job_ptr->time_min) |
| time_check = job_ptr->time_min; |
| else |
| time_check = job_ptr->time_limit; |
| if ((job_min_nodes > part_max_nodes) && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags |
| & QOS_FLAG_PART_MAX_NODE)))) { |
| debug2("Job %u requested too many nodes (%u) of " |
| "partition %s(MaxNodes %u)", |
| job_ptr->job_id, job_min_nodes, |
| part_ptr->name, part_max_nodes); |
| fail_reason = WAIT_PART_NODE_LIMIT; |
| } else if ((job_max_nodes != 0) && /* no max_nodes for job */ |
| ((job_max_nodes < part_min_nodes) && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags & |
| QOS_FLAG_PART_MIN_NODE))))) { |
| debug2("Job %u requested too few nodes (%u) of " |
| "partition %s(MinNodes %u)", |
| job_ptr->job_id, job_max_nodes, |
| part_ptr->name, part_min_nodes); |
| fail_reason = WAIT_PART_NODE_LIMIT; |
| } else if (part_ptr->state_up == PARTITION_DOWN) { |
| debug2("Job %u requested down partition %s", |
| job_ptr->job_id, part_ptr->name); |
| fail_reason = WAIT_PART_DOWN; |
| } else if (part_ptr->state_up == PARTITION_INACTIVE) { |
| debug2("Job %u requested inactive partition %s", |
| job_ptr->job_id, part_ptr->name); |
| fail_reason = WAIT_PART_INACTIVE; |
| } else if ((time_check != NO_VAL) && |
| (time_check > part_ptr->max_time) && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags & |
| QOS_FLAG_PART_TIME_LIMIT)))) { |
| info("Job %u exceeds partition time limit (%u > %u)", |
| job_ptr->job_id, time_check, part_ptr->max_time); |
| fail_reason = WAIT_PART_TIME_LIMIT; |
| } else if (qos_ptr && assoc_ptr && |
| (qos_ptr->flags & QOS_FLAG_ENFORCE_USAGE_THRES) && |
| (!fuzzy_equal(qos_ptr->usage_thres, NO_VAL))) { |
| if (!job_ptr->prio_factors) { |
| job_ptr->prio_factors = |
| xmalloc(sizeof(priority_factors_object_t)); |
| } |
| if (!job_ptr->prio_factors->priority_fs) { |
| if (fuzzy_equal(assoc_ptr->usage->usage_efctv, NO_VAL)) |
| priority_g_set_assoc_usage(assoc_ptr); |
| job_ptr->prio_factors->priority_fs = |
| priority_g_calc_fs_factor( |
| assoc_ptr->usage->usage_efctv, |
| (long double)assoc_ptr->usage-> |
| shares_norm); |
| } |
| if (job_ptr->prio_factors->priority_fs < qos_ptr->usage_thres){ |
| debug2("Job %u exceeds usage threashold", |
| job_ptr->job_id); |
| fail_reason = WAIT_QOS_THRES; |
| } |
| } |
| |
| return (fail_reason); |
| } |
| |
| /* |
| * _job_create - create a job table record for the supplied specifications. |
| * This performs only basic tests for request validity (access to |
| * partition, nodes count in partition, and sufficient processors in |
| * partition). |
| * IN job_specs - job specifications |
| * IN allocate - resource allocation request if set rather than job submit |
| * IN will_run - job is not to be created, test of validity only |
| * OUT job_pptr - pointer to the job (NULL on error) |
| * OUT err_msg - Error message for user |
| * RET 0 on success, otherwise ESLURM error code. If the job would only be |
| * able to execute with some change in partition configuration then |
| * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned |
| */ |
| static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, |
| struct job_record **job_pptr, uid_t submit_uid, |
| char **err_msg) |
| { |
| static int launch_type_poe = -1; |
| int error_code = SLURM_SUCCESS, i, qos_error; |
| struct part_record *part_ptr = NULL; |
| List part_ptr_list = NULL; |
| bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL; |
| struct job_record *job_ptr = NULL; |
| slurmdb_association_rec_t assoc_rec, *assoc_ptr = NULL; |
| List license_list = NULL; |
| bool valid; |
| slurmdb_qos_rec_t qos_rec, *qos_ptr; |
| uint32_t user_submit_priority; |
| static uint32_t node_scaling = 1; |
| static uint32_t cpus_per_mp = 1; |
| acct_policy_limit_set_t acct_policy_limit_set; |
| |
| #ifdef HAVE_BG |
| uint16_t geo[SYSTEM_DIMENSIONS]; |
| uint16_t reboot; |
| uint16_t rotate; |
| uint16_t conn_type[SYSTEM_DIMENSIONS]; |
| static bool sub_mp_system = 0; |
| |
| if (node_scaling == 1) { |
| select_g_alter_node_cnt(SELECT_GET_NODE_SCALING, |
| &node_scaling); |
| select_g_alter_node_cnt(SELECT_GET_MP_CPU_CNT, |
| &cpus_per_mp); |
| if (node_scaling < 512) |
| sub_mp_system = 1; |
| } |
| #endif |
| |
| if (select_serial == -1) { |
| if (strcmp(slurmctld_conf.select_type, "select/serial")) |
| select_serial = 0; |
| else |
| select_serial = 1; |
| } |
| |
| memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set_t)); |
| |
| *job_pptr = (struct job_record *) NULL; |
| /* |
| * Check user permission for negative 'nice' and non-0 priority values |
| * (both restricted to SlurmUser) before running the job_submit plugin. |
| */ |
| if ((submit_uid != 0) && (submit_uid != slurmctld_conf.slurm_user_id)) { |
| if (job_desc->priority != 0) |
| job_desc->priority = NO_VAL; |
| if (job_desc->nice < NICE_OFFSET) |
| job_desc->nice = NICE_OFFSET; |
| } |
| user_submit_priority = job_desc->priority; |
| |
| error_code = job_submit_plugin_submit(job_desc, (uint32_t) submit_uid, |
| err_msg); |
| if (error_code != SLURM_SUCCESS) |
| return error_code; |
| |
| /* ensure that selected nodes are in this partition */ |
| if (job_desc->req_nodes) { |
| error_code = node_name2bitmap(job_desc->req_nodes, false, |
| &req_bitmap); |
| if (error_code) { |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto cleanup_fail; |
| } |
| if ((job_desc->contiguous != (uint16_t) NO_VAL) && |
| (job_desc->contiguous)) |
| bit_fill_gaps(req_bitmap); |
| i = bit_set_count(req_bitmap); |
| if (i > job_desc->min_nodes) |
| job_desc->min_nodes = i * node_scaling; |
| if (i > job_desc->min_cpus) |
| job_desc->min_cpus = i * cpus_per_mp; |
| if (job_desc->max_nodes && |
| (job_desc->min_nodes > job_desc->max_nodes)) { |
| #if 0 |
| info("_job_create: max node count less than required " |
| "hostlist size for user %u", job_desc->user_id); |
| job_desc->max_nodes = job_desc->min_nodes; |
| #else |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| goto cleanup_fail; |
| #endif |
| } |
| } |
| #ifdef HAVE_ALPS_CRAY |
| if ((job_desc->max_nodes == 0) && (job_desc->script == NULL)) { |
| #else |
| if (job_desc->max_nodes == 0) { |
| #endif |
| info("_job_create: max_nodes == 0"); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| goto cleanup_fail; |
| } |
| |
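| /* Resolve the requested partition name(s), or the default partition, |
| * into partition record pointer(s). */ |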
| error_code = _get_job_parts(job_desc, &part_ptr, &part_ptr_list); |
| if (error_code != SLURM_SUCCESS) |
| goto cleanup_fail; |
| |
| |
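| /* Look up the (user, account, partition) association used for |
| * accounting and limit enforcement. */ |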
| memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t)); |
| assoc_rec.acct = job_desc->account; |
| assoc_rec.partition = part_ptr->name; |
| assoc_rec.uid = job_desc->user_id; |
| /* Checks are done later to validate assoc_ptr, so we don't |
| need to lock outside of fill_in_assoc. |
| */ |
| if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, &assoc_ptr, false)) { |
| info("_job_create: invalid account or partition for user %u, " |
| "account '%s', and partition '%s'", |
| job_desc->user_id, assoc_rec.acct, assoc_rec.partition); |
| error_code = ESLURM_INVALID_ACCOUNT; |
| goto cleanup_fail; |
| } else if (association_based_accounting && |
| !assoc_ptr && |
| !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) { |
| /* If not enforcing associations we want to look for the |
| * default account and use it to avoid getting trash in the |
| * accounting records. */ |
| assoc_rec.acct = NULL; |
| assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, &assoc_ptr, false); |
| if (assoc_ptr) { |
| info("_job_create: account '%s' has no association " |
| "for user %u using default account '%s'", |
| job_desc->account, job_desc->user_id, |
| assoc_rec.acct); |
| xfree(job_desc->account); |
| } |
| } |
| |
| if (job_desc->account == NULL) |
| job_desc->account = xstrdup(assoc_rec.acct); |
| |
| /* This must be done after we have the assoc_ptr set */ |
| memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t)); |
| qos_rec.name = job_desc->qos; |
| if (wiki_sched && job_desc->comment && |
| strstr(job_desc->comment, "QOS:")) { |
| if (strstr(job_desc->comment, "FLAGS:PREEMPTOR")) |
| qos_rec.name = "expedite"; |
| else if (strstr(job_desc->comment, "FLAGS:PREEMPTEE")) |
| qos_rec.name = "standby"; |
| } |
| |
| qos_ptr = _determine_and_validate_qos( |
| job_desc->reservation, assoc_ptr, false, &qos_rec, &qos_error); |
| |
| if (qos_error != SLURM_SUCCESS) { |
| error_code = qos_error; |
| goto cleanup_fail; |
| } |
| |
| error_code = _valid_job_part(job_desc, submit_uid, req_bitmap, |
| &part_ptr, part_ptr_list, |
| assoc_ptr, qos_ptr); |
| |
| if (error_code != SLURM_SUCCESS) |
| goto cleanup_fail; |
| |
| if ((error_code = _validate_job_desc(job_desc, allocate, submit_uid, |
| part_ptr, part_ptr_list))) { |
| goto cleanup_fail; |
| } |
| |
| if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) && |
| (!acct_policy_validate(job_desc, part_ptr, |
| assoc_ptr, qos_ptr, NULL, |
| &acct_policy_limit_set, 0))) { |
| info("_job_create: exceeded association/qos's limit " |
| "for user %u", job_desc->user_id); |
| error_code = ESLURM_ACCOUNTING_POLICY; |
| goto cleanup_fail; |
| } |
| |
| /* This needs to be done after the association acct policy check since |
| * it looks at unaltered nodes for bluegene systems |
| */ |
| debug3("before alteration asking for nodes %u-%u cpus %u-%u", |
| job_desc->min_nodes, job_desc->max_nodes, |
| job_desc->min_cpus, job_desc->max_cpus); |
| if (select_g_alter_node_cnt(SELECT_SET_NODE_CNT, job_desc) |
| != SLURM_SUCCESS) { |
| error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| goto cleanup_fail; |
| } |
| |
| debug3("after alteration asking for nodes %u-%u cpus %u-%u", |
| job_desc->min_nodes, job_desc->max_nodes, |
| job_desc->min_cpus, job_desc->max_cpus); |
| |
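| /* Validate any excluded node list and verify that it does not |
| * overlap the required node list. */ |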
| if (job_desc->exc_nodes) { |
| error_code = node_name2bitmap(job_desc->exc_nodes, false, |
| &exc_bitmap); |
| if (error_code) { |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto cleanup_fail; |
| } |
| } |
| if (exc_bitmap && req_bitmap) { |
| bitstr_t *tmp_bitmap = NULL; |
| bitoff_t first_set; |
| tmp_bitmap = bit_copy(exc_bitmap); |
| bit_and(tmp_bitmap, req_bitmap); |
| first_set = bit_ffs(tmp_bitmap); |
| FREE_NULL_BITMAP(tmp_bitmap); |
| if (first_set != -1) { |
| info("Job's required and excluded node lists overlap"); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto cleanup_fail; |
| } |
| } |
| |
| if (job_desc->min_nodes == NO_VAL) |
| job_desc->min_nodes = 1; |
| |
| #ifdef HAVE_BG |
| select_g_select_jobinfo_get(job_desc->select_jobinfo, |
| SELECT_JOBDATA_GEOMETRY, &geo); |
| if (geo[0] == (uint16_t) NO_VAL) { |
| for (i=0; i<SYSTEM_DIMENSIONS; i++) |
| geo[i] = 0; |
| select_g_select_jobinfo_set(job_desc->select_jobinfo, |
| SELECT_JOBDATA_GEOMETRY, &geo); |
| } else if (geo[0] != 0) { |
| uint32_t i, tot = 1; |
| for (i=0; i<SYSTEM_DIMENSIONS; i++) |
| tot *= geo[i]; |
| if (job_desc->min_nodes > tot) { |
| info("MinNodes(%d) > GeometryNodes(%d)", |
| job_desc->min_nodes, tot); |
| error_code = ESLURM_TOO_MANY_REQUESTED_CPUS; |
| goto cleanup_fail; |
| } |
| job_desc->min_nodes = tot; |
| } |
| select_g_select_jobinfo_get(job_desc->select_jobinfo, |
| SELECT_JOBDATA_REBOOT, &reboot); |
| if (reboot == (uint16_t) NO_VAL) { |
| reboot = 0; /* default is no reboot */ |
| select_g_select_jobinfo_set(job_desc->select_jobinfo, |
| SELECT_JOBDATA_REBOOT, &reboot); |
| } |
| select_g_select_jobinfo_get(job_desc->select_jobinfo, |
| SELECT_JOBDATA_ROTATE, &rotate); |
| if (rotate == (uint16_t) NO_VAL) { |
| rotate = 1; /* default is to rotate */ |
| select_g_select_jobinfo_set(job_desc->select_jobinfo, |
| SELECT_JOBDATA_ROTATE, &rotate); |
| } |
| select_g_select_jobinfo_get(job_desc->select_jobinfo, |
| SELECT_JOBDATA_CONN_TYPE, &conn_type); |
| |
| if ((conn_type[0] != (uint16_t) NO_VAL) |
| && (((conn_type[0] >= SELECT_SMALL) |
| && ((job_desc->min_cpus >= cpus_per_mp) && !sub_mp_system)) |
| || (!sub_mp_system |
| && ((conn_type[0] == SELECT_TORUS) |
| || (conn_type[0] == SELECT_MESH)) |
| && (job_desc->min_cpus < cpus_per_mp)))) { |
| /* check to make sure we have a valid conn_type with |
| * the cpu count */ |
| info("Job's cpu count at %u makes our conn_type " |
| "of '%s' invalid.", |
| job_desc->min_cpus, conn_type_string(conn_type[0])); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| goto cleanup_fail; |
| } |
| |
| /* make sure we reset all the NO_VAL's to NAV's */ |
| for (i=0; i<SYSTEM_DIMENSIONS; i++) { |
| if (conn_type[i] == (uint16_t)NO_VAL) |
| conn_type[i] = SELECT_NAV; |
| } |
| select_g_select_jobinfo_set(job_desc->select_jobinfo, |
| SELECT_JOBDATA_CONN_TYPE, |
| &conn_type); |
| |
| #endif |
| |
| if (job_desc->max_nodes == NO_VAL) |
| job_desc->max_nodes = 0; |
| |
| if (job_desc->max_nodes && |
| (job_desc->max_nodes < job_desc->min_nodes)) { |
| info("_job_create: Job's max_nodes(%u) < min_nodes(%u)", |
| job_desc->max_nodes, job_desc->min_nodes); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| goto cleanup_fail; |
| } |
| |
| license_list = license_validate(job_desc->licenses, &valid); |
| if (!valid) { |
| info("Job's requested licenses are invalid: %s", |
| job_desc->licenses); |
| error_code = ESLURM_INVALID_LICENSES; |
| goto cleanup_fail; |
| } |
| |
| if ((error_code = _copy_job_desc_to_job_record(job_desc, |
| job_pptr, |
| &req_bitmap, |
| &exc_bitmap))) { |
| if (error_code == SLURM_ERROR) |
| error_code = ESLURM_ERROR_ON_DESC_TO_RECORD_COPY; |
| goto cleanup_fail; |
| } |
| job_ptr = *job_pptr; |
| job_ptr->part_ptr = part_ptr; |
| job_ptr->part_ptr_list = part_ptr_list; |
| |
| part_ptr_list = NULL; |
| if ((error_code = checkpoint_alloc_jobinfo(&(job_ptr->check_job)))) { |
| error("Failed to allocate checkpoint info for job"); |
| goto cleanup_fail; |
| } |
| |
| job_ptr->limit_set_max_cpus = acct_policy_limit_set.max_cpus; |
| job_ptr->limit_set_max_nodes = acct_policy_limit_set.max_nodes; |
| job_ptr->limit_set_min_cpus = acct_policy_limit_set.min_cpus; |
| job_ptr->limit_set_min_nodes = acct_policy_limit_set.min_nodes; |
| job_ptr->limit_set_pn_min_memory = acct_policy_limit_set.pn_min_memory; |
| job_ptr->limit_set_time = acct_policy_limit_set.time; |
| job_ptr->limit_set_qos = acct_policy_limit_set.qos; |
| |
| job_ptr->assoc_id = assoc_rec.id; |
| job_ptr->assoc_ptr = (void *) assoc_ptr; |
| job_ptr->qos_ptr = (void *) qos_ptr; |
| job_ptr->qos_id = qos_rec.id; |
| |
| if (launch_type_poe == -1) { |
| char *launch_type = slurm_get_launch_type(); |
| if (!strcmp(launch_type, "launch/poe")) |
| launch_type_poe = 1; |
| else |
| launch_type_poe = 0; |
| xfree(launch_type); |
| } |
| if (launch_type_poe == 1) |
| job_ptr->next_step_id = 1; |
| |
| /* |
| * Permission for altering priority was confirmed above. The job_submit |
| * plugin may have set the priority directly or put the job on hold. If |
| * the priority is not given, we will figure it out later after we see |
| * if the job is eligible or not. So we want NO_VAL if not set. |
| */ |
| job_ptr->priority = job_desc->priority; |
| if (job_ptr->priority == 0) { |
| if (user_submit_priority == 0) |
| job_ptr->state_reason = WAIT_HELD_USER; |
| else |
| job_ptr->state_reason = WAIT_HELD; |
| } else if (job_ptr->priority != NO_VAL) { |
| job_ptr->direct_set_prio = 1; |
| } |
| |
| error_code = update_job_dependency(job_ptr, job_desc->dependency); |
| if (error_code != SLURM_SUCCESS) |
| goto cleanup_fail; |
| job_ptr->details->orig_dependency = xstrdup(job_ptr->details-> |
| dependency); |
| |
| if (build_feature_list(job_ptr)) { |
| error_code = ESLURM_INVALID_FEATURE; |
| goto cleanup_fail; |
| } |
| /* NOTE: If this job is being used to expand another job, this job's |
| * gres_list has already been filled in by update_job_dependency() with |
| * a copy of the gres_list of the job being expanded */ |
| if ((job_ptr->details->expanding_jobid == 0) && |
| gres_plugin_job_state_validate(job_ptr->gres, &job_ptr->gres_list)){ |
| error_code = ESLURM_INVALID_GRES; |
| goto cleanup_fail; |
| } |
| gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id); |
| |
| if ((error_code = validate_job_resv(job_ptr))) |
| goto cleanup_fail; |
| |
| if (job_desc->script |
| && (!will_run)) { /* don't bother with copy if just a test */ |
| if ((error_code = _copy_job_desc_to_file(job_desc, |
| job_ptr->job_id))) { |
| error_code = ESLURM_WRITING_TO_FILE; |
| goto cleanup_fail; |
| } |
| job_ptr->batch_flag = 1; |
| } else |
| job_ptr->batch_flag = 0; |
| |
| job_ptr->license_list = license_list; |
| license_list = NULL; |
| |
| if (job_desc->req_switch != NO_VAL) { /* Max # of switches */ |
| job_ptr->req_switch = job_desc->req_switch; |
| if (job_desc->wait4switch != NO_VAL) { |
| job_ptr->wait4switch = |
| _max_switch_wait(job_desc->wait4switch); |
| } else |
| job_ptr->wait4switch = _max_switch_wait(INFINITE); |
| } |
| job_ptr->best_switch = true; |
| |
| FREE_NULL_LIST(license_list); |
| FREE_NULL_BITMAP(req_bitmap); |
| FREE_NULL_BITMAP(exc_bitmap); |
| return error_code; |
| |
| cleanup_fail: |
| if (job_ptr) { |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_SYSTEM; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = time(NULL); |
| _purge_job_record(job_ptr->job_id); |
| *job_pptr = (struct job_record *) NULL; |
| } |
| FREE_NULL_LIST(license_list); |
| FREE_NULL_LIST(part_ptr_list); |
| FREE_NULL_BITMAP(req_bitmap); |
| FREE_NULL_BITMAP(exc_bitmap); |
| return error_code; |
| } |
| |
| static int _test_strlen(char *test_str, char *str_name, int max_str_len) |
| { |
| int i = 0; |
| |
| if (test_str) |
| i = strlen(test_str); |
| if (i > max_str_len) { |
| info("job_create_request: strlen(%s) too big (%d > %d)", |
| str_name, i, max_str_len); |
| return ESLURM_PATHNAME_TOO_LONG; |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| /* For each token in a comma delimited job array expression set the matching |
| * bitmap entry */ |
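| /* Accepted token forms (illustrative): "7" (a single task), "2-5" (an |
| * inclusive range), or "0-15:4" (a range with a step), mirroring the |
| * parsing logic below. */ |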
| static bool _parse_array_tok(char *tok, bitstr_t *array_bitmap, uint32_t max) |
| { |
| char *end_ptr = NULL; |
| int i, first, last, step = 1; |
| |
| first = strtol(tok, &end_ptr, 10); |
| if (first < 0) |
| return false; |
| if (end_ptr[0] == '-') { |
| last = strtol(end_ptr + 1, &end_ptr, 10); |
| if (end_ptr[0] == ':') { |
| step = strtol(end_ptr + 1, &end_ptr, 10); |
| if (end_ptr[0] != '\0') |
| return false; |
| if (step <= 0) |
| return false; |
| } else if (end_ptr[0] != '\0') { |
| return false; |
| } |
| if (last < first) |
| return false; |
| } else if (end_ptr[0] != '\0') { |
| return false; |
| } else { |
| last = first; |
| } |
| |
| if (last >= max) |
| return false; |
| |
| for (i = first; i <= last; i += step) { |
| bit_set(array_bitmap, i); |
| } |
| |
| return true; |
| } |
| |
| /* Translate a job array expression into the equivalent bitmap */ |
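| /* For example (illustrative), an expression such as "1,4-7,10-20:5" is |
| * split on commas and each token is handed to _parse_array_tok() above. */ |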
| static bool _valid_array_inx(job_desc_msg_t *job_desc) |
| { |
| slurm_ctl_conf_t *conf; |
| bool valid = true; |
| uint32_t max_array_size; |
| char *tmp, *tok, *last = NULL; |
| |
| FREE_NULL_BITMAP(job_desc->array_bitmap); |
| if (!job_desc->array_inx || !job_desc->array_inx[0]) |
| return true; |
| if (!job_desc->script || !job_desc->script[0]) |
| return false; |
| |
| conf = slurm_conf_lock(); |
| max_array_size = conf->max_array_sz; |
| slurm_conf_unlock(); |
| if (max_array_size == 0) { |
| verbose("Job arrays disabled, MaxArraySize=0"); |
| return false; |
| } |
| |
| /* We have a job array request */ |
| job_desc->immediate = 0; /* Disable immediate option */ |
| job_desc->array_bitmap = bit_alloc(max_array_size); |
| |
| tmp = xstrdup(job_desc->array_inx); |
| tok = strtok_r(tmp, ",", &last); |
| while (tok && valid) { |
| valid = _parse_array_tok(tok, job_desc->array_bitmap, |
| max_array_size); |
| tok = strtok_r(NULL, ",", &last); |
| } |
| xfree(tmp); |
| |
| return valid; |
| } |
| |
| /* Perform some size checks on strings we store to prevent a |
| * malicious user from filling slurmctld's memory |
| * RET 0 or error code */ |
| extern int validate_job_create_req(job_desc_msg_t * job_desc) |
| { |
| if (_test_strlen(job_desc->account, "account", 1024) || |
| _test_strlen(job_desc->alloc_node, "alloc_node", 1024) || |
| _test_strlen(job_desc->array_inx, "array_inx", 1024 * 4) || |
| _test_strlen(job_desc->blrtsimage, "blrtsimage", 1024) || |
| _test_strlen(job_desc->ckpt_dir, "ckpt_dir", 1024) || |
| _test_strlen(job_desc->comment, "comment", 1024) || |
| _test_strlen(job_desc->cpu_bind, "cpu_bind", 1024) || |
| _test_strlen(job_desc->dependency, "dependency", 1024*128) || |
| _test_strlen(job_desc->exc_nodes, "exc_nodes", 1024*64) || |
| _test_strlen(job_desc->features, "features", 1024) || |
| _test_strlen(job_desc->gres, "gres", 1024) || |
| _test_strlen(job_desc->licenses, "licenses", 1024) || |
| _test_strlen(job_desc->linuximage, "linuximage", 1024) || |
| _test_strlen(job_desc->mail_user, "mail_user", 1024) || |
| _test_strlen(job_desc->mem_bind, "mem_bind", 1024) || |
| _test_strlen(job_desc->mloaderimage, "mloaderimage", 1024) || |
| _test_strlen(job_desc->name, "name", 1024) || |
| _test_strlen(job_desc->network, "network", 1024) || |
| _test_strlen(job_desc->partition, "partition", 1024) || |
| _test_strlen(job_desc->qos, "qos", 1024) || |
| _test_strlen(job_desc->ramdiskimage, "ramdiskimage", 1024) || |
| _test_strlen(job_desc->req_nodes, "req_nodes", 1024*64) || |
| _test_strlen(job_desc->reservation, "reservation", 1024) || |
| _test_strlen(job_desc->script, "script", 1024 * 1024 * 4) || |
| _test_strlen(job_desc->std_err, "std_err", MAXPATHLEN) || |
| _test_strlen(job_desc->std_in, "std_in", MAXPATHLEN) || |
| _test_strlen(job_desc->std_out, "std_out", MAXPATHLEN) || |
| _test_strlen(job_desc->wckey, "wckey", 1024) || |
| _test_strlen(job_desc->work_dir, "work_dir", MAXPATHLEN)) |
| return ESLURM_PATHNAME_TOO_LONG; |
| |
| if (!_valid_array_inx(job_desc)) |
| return ESLURM_INVALID_ARRAY; |
| |
| /* Make sure anything that may be put in the database will be |
| * lower case */ |
| xstrtolower(job_desc->account); |
| xstrtolower(job_desc->wckey); |
| |
| /* Basic validation of some parameters */ |
| if (job_desc->req_nodes) { |
| hostlist_t hl; |
| uint32_t host_cnt; |
| hl = hostlist_create(job_desc->req_nodes); |
| if (hl == NULL) { |
| /* likely a badly formatted hostlist */ |
| error("create_job_record: bad hostlist"); |
| return ESLURM_INVALID_NODE_NAME; |
| } |
| host_cnt = hostlist_count(hl); |
| hostlist_destroy(hl); |
| if ((job_desc->min_nodes == NO_VAL) || |
| (job_desc->min_nodes < host_cnt)) |
| job_desc->min_nodes = host_cnt; |
| } |
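| /* Scale the task count up to ntasks_per_node * min_nodes when both are |
| * known; e.g. (illustrative) ntasks_per_node=4 on 3 nodes raises |
| * num_tasks to at least 12. */ |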
| if ((job_desc->ntasks_per_node != (uint16_t) NO_VAL) && |
| (job_desc->min_nodes != NO_VAL) && |
| (job_desc->num_tasks != NO_VAL)) { |
| uint32_t ntasks = job_desc->ntasks_per_node * |
| job_desc->min_nodes; |
| job_desc->num_tasks = MAX(job_desc->num_tasks, ntasks); |
| } |
| if ((job_desc->min_cpus != NO_VAL) && |
| (job_desc->min_nodes != NO_VAL) && |
| (job_desc->min_cpus < job_desc->min_nodes) && |
| (job_desc->max_cpus >= job_desc->min_nodes)) |
| job_desc->min_cpus = job_desc->min_nodes; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* _copy_job_desc_to_file - copy the job script and environment from the RPC |
| * structure into a file */ |
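| /* The resulting layout (per the code below) is |
| * <StateSaveLocation>/job.<job_id>/environment |
| * <StateSaveLocation>/job.<job_id>/script */ |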
| static int |
| _copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id) |
| { |
| int error_code = 0; |
| char *dir_name, job_dir[32], *file_name; |
| DEF_TIMERS; |
| |
| START_TIMER; |
| /* Create state_save_location directory */ |
| dir_name = slurm_get_state_save_location(); |
| |
| /* Create job_id specific directory */ |
| sprintf(job_dir, "/job.%u", job_id); |
| xstrcat(dir_name, job_dir); |
| if (mkdir(dir_name, 0700)) { |
| if (!slurmctld_primary && (errno == EEXIST)) { |
| error("Apparent duplicate job ID %u. Two primary " |
| "slurmctld daemons might currently be active", |
| job_id); |
| } |
| error("mkdir(%s) error %m", dir_name); |
| xfree(dir_name); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| |
| /* Create environment file, and write data to it */ |
| file_name = xstrdup(dir_name); |
| xstrcat(file_name, "/environment"); |
| error_code = _write_data_array_to_file(file_name, |
| job_desc->environment, |
| job_desc->env_size); |
| xfree(file_name); |
| |
| if (error_code == 0) { |
| /* Create script file */ |
| file_name = xstrdup(dir_name); |
| xstrcat(file_name, "/script"); |
| error_code = _write_data_to_file(file_name, job_desc->script); |
| xfree(file_name); |
| } |
| |
| xfree(dir_name); |
| END_TIMER2("_copy_job_desc_to_file"); |
| return error_code; |
| } |
| |
| /* Return true if the specified job ID already has a batch directory so |
| * that a different job ID can be used. This is to help limit damage from |
| * split-brain, where two slurmctld daemons are running as primary. */ |
| static bool _dup_job_file_test(uint32_t job_id) |
| { |
| char *dir_name_src, job_dir[32]; |
| struct stat buf; |
| int rc; |
| |
| dir_name_src = slurm_get_state_save_location(); |
| sprintf(job_dir, "/job.%u", job_id); |
| xstrcat(dir_name_src, job_dir); |
| rc = stat(dir_name_src, &buf); |
| xfree(dir_name_src); |
| if (rc == 0) { |
| error("Vestigial state files for job %u, but no job record. " |
| "this may be the result of two slurmctld running in " |
| "primary mode", job_id); |
| return true; |
| } |
| return false; |
| } |
| |
| /* _copy_job_desc_files - create copies of a job script and environment files */ |
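| /* A hard link() is attempted first (cheap when both directories are on |
| * the same file system); _copy_job_file() is used as a fallback if the |
| * link fails. */ |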
| static int |
| _copy_job_desc_files(uint32_t job_id_src, uint32_t job_id_dest) |
| { |
| int error_code = SLURM_SUCCESS; |
| char *dir_name_src, *dir_name_dest, job_dir[32]; |
| char *file_name_src, *file_name_dest; |
| |
| /* Create state_save_location directory */ |
| dir_name_src = slurm_get_state_save_location(); |
| dir_name_dest = xstrdup(dir_name_src); |
| |
| /* Create job_id_dest specific directory */ |
| sprintf(job_dir, "/job.%u", job_id_dest); |
| xstrcat(dir_name_dest, job_dir); |
| if (mkdir(dir_name_dest, 0700)) { |
| if (!slurmctld_primary && (errno == EEXIST)) { |
| error("Apparent duplicate job ID %u. Two primary " |
| "slurmctld daemons might currently be active", |
| job_id_dest); |
| } |
| error("mkdir(%s) error %m", dir_name_dest); |
| xfree(dir_name_src); |
| xfree(dir_name_dest); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| |
| /* Identify job_id_src specific directory */ |
| sprintf(job_dir, "/job.%u", job_id_src); |
| xstrcat(dir_name_src, job_dir); |
| |
| file_name_src = xstrdup(dir_name_src); |
| file_name_dest = xstrdup(dir_name_dest); |
| xstrcat(file_name_src, "/environment"); |
| xstrcat(file_name_dest, "/environment"); |
| error_code = link(file_name_src, file_name_dest); |
| if (error_code < 0) { |
| error("%s: link() failed %m copy files src %s dest %s", |
| __func__, file_name_src, file_name_dest); |
| error_code = _copy_job_file(file_name_src, file_name_dest); |
| if (error_code < 0) { |
| error("%s: failed copy files %m src %s dst %s", |
| __func__, file_name_src, file_name_dest); |
| } |
| } |
| xfree(file_name_src); |
| xfree(file_name_dest); |
| |
| if (error_code == 0) { |
| file_name_src = xstrdup(dir_name_src); |
| file_name_dest = xstrdup(dir_name_dest); |
| xstrcat(file_name_src, "/script"); |
| xstrcat(file_name_dest, "/script"); |
| error_code = link(file_name_src, file_name_dest); |
| if (error_code < 0) { |
| error("%s: link() failed %m copy files src %s dest %s", |
| __func__, file_name_src, file_name_dest); |
| error_code = _copy_job_file(file_name_src, file_name_dest); |
| if (error_code < 0) { |
| error("%s: failed copy files %m src %s dst %s", |
| __func__, file_name_src, file_name_dest); |
| } |
| } |
| xfree(file_name_src); |
| xfree(file_name_dest); |
| } |
| |
| xfree(dir_name_src); |
| xfree(dir_name_dest); |
| return error_code; |
| } |
| |
| /* |
| * Create file with specified name and write the supplied data array to it |
| * IN file_name - file to create and write to |
| * IN data - array of pointers to strings (e.g. env) |
| * IN size - number of elements in data |
| */ |
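| /* On-disk format (as written below): a leading uint32_t element count |
| * followed by the strings back to back, each with its terminating NUL. */ |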
| static int |
| _write_data_array_to_file(char *file_name, char **data, uint32_t size) |
| { |
| int fd, i, pos, nwrite, amount; |
| |
| fd = creat(file_name, 0600); |
| if (fd < 0) { |
| error("Error creating file %s, %m", file_name); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| |
| amount = write(fd, &size, sizeof(uint32_t)); |
| if (amount < sizeof(uint32_t)) { |
| error("Error writing file %s, %m", file_name); |
| close(fd); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| |
| if (data == NULL) { |
| close(fd); |
| return SLURM_SUCCESS; |
| } |
| |
| for (i = 0; i < size; i++) { |
| nwrite = strlen(data[i]) + 1; |
| pos = 0; |
| while (nwrite > 0) { |
| amount = write(fd, &data[i][pos], nwrite); |
| if ((amount < 0) && (errno != EINTR)) { |
| error("Error writing file %s, %m", |
| file_name); |
| close(fd); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| nwrite -= amount; |
| pos += amount; |
| } |
| } |
| |
| close(fd); |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Create file with specified name and write the supplied data string to it |
| * IN file_name - file to create and write to |
| * IN data - pointer to string |
| */ |
| static int _write_data_to_file(char *file_name, char *data) |
| { |
| int fd, pos, nwrite, amount; |
| |
| if (data == NULL) { |
| (void) unlink(file_name); |
| return SLURM_SUCCESS; |
| } |
| |
| fd = creat(file_name, 0700); |
| if (fd < 0) { |
| error("Error creating file %s, %m", file_name); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| |
| nwrite = strlen(data) + 1; |
| pos = 0; |
| while (nwrite > 0) { |
| amount = write(fd, &data[pos], nwrite); |
| if ((amount < 0) && (errno != EINTR)) { |
| error("Error writing file %s, %m", file_name); |
| close(fd); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| nwrite -= amount; |
| pos += amount; |
| } |
| close(fd); |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * get_job_env - return the environment variables and their count for a |
| * given job |
| * IN job_ptr - pointer to job for which data is required |
| * OUT env_size - number of elements in the returned array |
| * RET pointer to array of string pointers containing environment variables |
| */ |
| char **get_job_env(struct job_record *job_ptr, uint32_t * env_size) |
| { |
| char *file_name, **environment = NULL; |
| int cc; |
| |
| file_name = slurm_get_state_save_location(); |
| xstrfmtcat(file_name, "/job.%u/environment", |
| job_ptr->job_id); |
| |
| cc = _read_data_array_from_file(file_name, |
| &environment, |
| env_size, |
| job_ptr); |
| if (cc < 0) { |
| xfree(file_name); |
| return NULL; |
| } |
| |
| xfree(file_name); |
| return environment; |
| } |
| |
| /* |
| * get_job_script - return the script for a given job |
| * IN job_ptr - pointer to job for which data is required |
| * RET pointer to string containing the job script |
| */ |
| char *get_job_script(struct job_record *job_ptr) |
| { |
| char *script = NULL; |
| |
| if (job_ptr->batch_flag) { |
| char *file_name = slurm_get_state_save_location(); |
| char job_dir[30]; |
| |
| sprintf(job_dir, "/job.%u/script", job_ptr->job_id); |
| xstrcat(file_name, job_dir); |
| |
| _read_data_from_file(file_name, &script); |
| |
| xfree(file_name); |
| } |
| return script; |
| } |
| |
| /* |
| * Read a collection of strings from a file |
| * IN file_name - file to read from |
| * OUT data - pointer to array of pointers to strings (e.g. env), |
| * must be xfreed when no longer needed |
| * OUT size - number of elements in data |
| * IN job_ptr - job |
| * NOTE: The output format of this must be identical with _xduparray2() |
| */ |
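| /* The file layout read here matches _write_data_array_to_file(): a |
| * leading uint32_t record count followed by NUL-terminated strings. */ |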
| static int |
| _read_data_array_from_file(char *file_name, char ***data, uint32_t * size, |
| struct job_record *job_ptr) |
| { |
| int fd, pos, buf_size, amount, i, j; |
| char *buffer, **array_ptr; |
| uint32_t rec_cnt; |
| |
| xassert(file_name); |
| xassert(data); |
| xassert(size); |
| *data = NULL; |
| *size = 0; |
| |
| fd = open(file_name, 0); |
| if (fd < 0) { |
| error("Error opening file %s, %m", file_name); |
| return -1; |
| } |
| |
| amount = read(fd, &rec_cnt, sizeof(uint32_t)); |
| if (amount < sizeof(uint32_t)) { |
| if (amount != 0) /* incomplete write */ |
| error("Error reading file %s, %m", file_name); |
| else |
| verbose("File %s has zero size", file_name); |
| close(fd); |
| return -1; |
| } |
| |
| if (rec_cnt >= INT_MAX) { |
| error("%s: unreasonable record counter %d in file %s", |
| __func__, rec_cnt, file_name); |
| close(fd); |
| return -1; |
| } |
| |
| if (rec_cnt == 0) { |
| *data = NULL; |
| *size = 0; |
| close(fd); |
| return 0; |
| } |
| |
| pos = 0; |
| buf_size = BUF_SIZE; |
| buffer = xmalloc(buf_size); |
| while (1) { |
| amount = read(fd, &buffer[pos], BUF_SIZE); |
| if (amount < 0) { |
| error("Error reading file %s, %m", file_name); |
| xfree(buffer); |
| close(fd); |
| return -1; |
| } |
| pos += amount; |
| if (amount < BUF_SIZE) /* end of file */ |
| break; |
| buf_size += amount; |
| xrealloc(buffer, buf_size); |
| } |
| close(fd); |
| |
| /* Allocate extra space for supplemental environment variables |
| * as set by Moab */ |
| if (job_ptr->details->env_cnt) { |
| for (j = 0; j < job_ptr->details->env_cnt; j++) |
| pos += (strlen(job_ptr->details->env_sup[j]) + 1); |
| xrealloc(buffer, pos); |
| } |
| |
| /* We have all the data, now let's compute the pointers */ |
| array_ptr = xmalloc(sizeof(char *) * |
| (rec_cnt + job_ptr->details->env_cnt)); |
| for (i = 0, pos = 0; i < rec_cnt; i++) { |
| array_ptr[i] = &buffer[pos]; |
| pos += strlen(&buffer[pos]) + 1; |
| if ((pos > buf_size) && ((i + 1) < rec_cnt)) { |
| error("Bad environment file %s", file_name); |
| rec_cnt = i; |
| break; |
| } |
| } |
| |
| /* Add supplemental environment variables for Moab */ |
| if (job_ptr->details->env_cnt) { |
| char *tmp_chr; |
| int env_len, name_len; |
| for (j = 0; j < job_ptr->details->env_cnt; j++) { |
| tmp_chr = strchr(job_ptr->details->env_sup[j], '='); |
| if (tmp_chr == NULL) { |
| error("Invalid supplemental environment " |
| "variable: %s", |
| job_ptr->details->env_sup[j]); |
| continue; |
| } |
| env_len = strlen(job_ptr->details->env_sup[j]) + 1; |
| name_len = tmp_chr - job_ptr->details->env_sup[j] + 1; |
| /* search for duplicate */ |
| for (i = 0; i < rec_cnt; i++) { |
| if (strncmp(array_ptr[i], |
| job_ptr->details->env_sup[j], |
| name_len)) { |
| continue; |
| } |
| /* over-write duplicate */ |
| memcpy(&buffer[pos], |
| job_ptr->details->env_sup[j], env_len); |
| array_ptr[i] = &buffer[pos]; |
| pos += env_len; |
| break; |
| } |
| if (i >= rec_cnt) { /* add env to array end */ |
| memcpy(&buffer[pos], |
| job_ptr->details->env_sup[j], env_len); |
| array_ptr[rec_cnt++] = &buffer[pos]; |
| pos += env_len; |
| } |
| } |
| } |
| |
| *size = rec_cnt; |
| *data = array_ptr; |
| return 0; |
| } |
| |
| /* |
| * Read a string from a file |
| * IN file_name - file to read from |
| * OUT data - pointer to string |
| * must be xfreed when no longer needed |
| */ |
| static void _read_data_from_file(char *file_name, char **data) |
| { |
| int fd, pos, buf_size, amount; |
| char *buffer; |
| |
| xassert(file_name); |
| xassert(data); |
| *data = NULL; |
| |
| fd = open(file_name, 0); |
| if (fd < 0) { |
| error("Error opening file %s, %m", file_name); |
| return; |
| } |
| |
| pos = 0; |
| buf_size = BUF_SIZE; |
| buffer = xmalloc(buf_size); |
| while (1) { |
| amount = read(fd, &buffer[pos], BUF_SIZE); |
| if (amount < 0) { |
| error("Error reading file %s, %m", file_name); |
| xfree(buffer); |
| close(fd); |
| return; |
| } |
| if (amount < BUF_SIZE) /* end of file */ |
| break; |
| pos += amount; |
| buf_size += amount; |
| xrealloc(buffer, buf_size); |
| } |
| |
| *data = buffer; |
| close(fd); |
| return; |
| } |
| |
| /* Given a job request, return a multi_core_data struct. |
| * Returns NULL if no values set in the job/step request */ |
| static multi_core_data_t * |
| _set_multi_core_data(job_desc_msg_t * job_desc) |
| { |
| multi_core_data_t * mc_ptr; |
| |
| if ((job_desc->sockets_per_node == (uint16_t) NO_VAL) && |
| (job_desc->cores_per_socket == (uint16_t) NO_VAL) && |
| (job_desc->threads_per_core == (uint16_t) NO_VAL) && |
| (job_desc->ntasks_per_socket == (uint16_t) NO_VAL) && |
| (job_desc->ntasks_per_core == (uint16_t) NO_VAL) && |
| (job_desc->plane_size == (uint16_t) NO_VAL)) |
| return NULL; |
| |
| mc_ptr = xmalloc(sizeof(multi_core_data_t)); |
| mc_ptr->sockets_per_node = job_desc->sockets_per_node; |
| mc_ptr->cores_per_socket = job_desc->cores_per_socket; |
| mc_ptr->threads_per_core = job_desc->threads_per_core; |
| if (job_desc->ntasks_per_socket != (uint16_t) NO_VAL) |
| mc_ptr->ntasks_per_socket = job_desc->ntasks_per_socket; |
| else |
| mc_ptr->ntasks_per_socket = (uint16_t) INFINITE; |
| if (job_desc->ntasks_per_core != (uint16_t) NO_VAL) |
| mc_ptr->ntasks_per_core = job_desc->ntasks_per_core; |
| else if (slurmctld_conf.select_type_param & CR_ONE_TASK_PER_CORE) |
| mc_ptr->ntasks_per_core = 1; |
| else |
| mc_ptr->ntasks_per_core = (uint16_t) INFINITE; |
| if (job_desc->plane_size != (uint16_t) NO_VAL) |
| mc_ptr->plane_size = job_desc->plane_size; |
| else |
| mc_ptr->plane_size = 0; |
| |
| return mc_ptr; |
| } |
| |
| /* _copy_job_desc_to_job_record - copy the job descriptor from the RPC |
| * structure into the actual slurmctld job record */ |
| static int |
| _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, |
| struct job_record **job_rec_ptr, |
| bitstr_t ** req_bitmap, |
| bitstr_t ** exc_bitmap) |
| { |
| int error_code; |
| struct job_details *detail_ptr; |
| struct job_record *job_ptr; |
| |
| if (slurm_get_track_wckey()) { |
| if (!job_desc->wckey) { |
| /* get the default wckey for this user since none was |
| * given */ |
| slurmdb_user_rec_t user_rec; |
| memset(&user_rec, 0, sizeof(slurmdb_user_rec_t)); |
| user_rec.uid = job_desc->user_id; |
| assoc_mgr_fill_in_user(acct_db_conn, &user_rec, |
| accounting_enforce, NULL); |
| if (user_rec.default_wckey) |
| job_desc->wckey = xstrdup_printf( |
| "*%s", user_rec.default_wckey); |
| else if (!(accounting_enforce & |
| ACCOUNTING_ENFORCE_WCKEYS)) |
| job_desc->wckey = xstrdup("*"); |
| else { |
| error("Job didn't specify wckey and user " |
| "%d has no default.", job_desc->user_id); |
| return ESLURM_INVALID_WCKEY; |
| } |
| } else if (job_desc->wckey) { |
| slurmdb_wckey_rec_t wckey_rec, *wckey_ptr = NULL; |
| |
| memset(&wckey_rec, 0, sizeof(slurmdb_wckey_rec_t)); |
| wckey_rec.uid = job_desc->user_id; |
| wckey_rec.name = job_desc->wckey; |
| |
| if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec, |
| accounting_enforce, |
| &wckey_ptr)) { |
| if (accounting_enforce & |
| ACCOUNTING_ENFORCE_WCKEYS) { |
| error("_copy_job_desc_to_job_record: " |
| "invalid wckey '%s' for user %u.", |
| wckey_rec.name, |
| job_desc->user_id); |
| return ESLURM_INVALID_WCKEY; |
| } |
| } |
| } else if (accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS) { |
| /* This should never happen */ |
| info("_copy_job_desc_to_job_record: no wckey was given " |
| "for job submit."); |
| return ESLURM_INVALID_WCKEY; |
| } |
| } |
| |
| job_ptr = create_job_record(&error_code); |
| if (error_code) |
| return error_code; |
| |
| job_ptr->partition = xstrdup(job_desc->partition); |
| if (job_desc->profile != ACCT_GATHER_PROFILE_NOT_SET) |
| job_ptr->profile = job_desc->profile; |
| |
| if (job_desc->job_id != NO_VAL) { /* already confirmed unique */ |
| job_ptr->job_id = job_desc->job_id; |
| } else { |
| error_code = _set_job_id(job_ptr); |
| if (error_code) |
| return error_code; |
| } |
| |
| if (job_desc->name) |
| job_ptr->name = xstrdup(job_desc->name); |
| if (job_desc->wckey) |
| job_ptr->wckey = xstrdup(job_desc->wckey); |
| |
| _add_job_hash(job_ptr); |
| |
| job_ptr->user_id = (uid_t) job_desc->user_id; |
| job_ptr->group_id = (gid_t) job_desc->group_id; |
| job_ptr->job_state = JOB_PENDING; |
| job_ptr->time_limit = job_desc->time_limit; |
| if (job_desc->time_min != NO_VAL) |
| job_ptr->time_min = job_desc->time_min; |
| job_ptr->alloc_sid = job_desc->alloc_sid; |
| job_ptr->alloc_node = xstrdup(job_desc->alloc_node); |
| job_ptr->account = xstrdup(job_desc->account); |
| job_ptr->gres = xstrdup(job_desc->gres); |
| job_ptr->network = xstrdup(job_desc->network); |
| job_ptr->resv_name = xstrdup(job_desc->reservation); |
| job_ptr->comment = xstrdup(job_desc->comment); |
| if (!wiki_sched_test) { |
| char *sched_type = slurm_get_sched_type(); |
| if (strcmp(sched_type, "sched/wiki") == 0) |
| wiki_sched = true; |
| if (strcmp(sched_type, "sched/wiki2") == 0) { |
| wiki_sched = true; |
| wiki2_sched = true; |
| } |
| xfree(sched_type); |
| wiki_sched_test = true; |
| } |
| |
| if (job_desc->kill_on_node_fail != (uint16_t) NO_VAL) |
| job_ptr->kill_on_node_fail = job_desc->kill_on_node_fail; |
| |
| job_ptr->resp_host = xstrdup(job_desc->resp_host); |
| job_ptr->alloc_resp_port = job_desc->alloc_resp_port; |
| job_ptr->other_port = job_desc->other_port; |
| job_ptr->time_last_active = time(NULL); |
| job_ptr->cr_enabled = 0; |
| job_ptr->derived_ec = 0; |
| |
| job_ptr->licenses = xstrdup(job_desc->licenses); |
| job_ptr->mail_type = job_desc->mail_type; |
| job_ptr->mail_user = xstrdup(job_desc->mail_user); |
| |
| job_ptr->ckpt_interval = job_desc->ckpt_interval; |
| job_ptr->spank_job_env = job_desc->spank_job_env; |
| job_ptr->spank_job_env_size = job_desc->spank_job_env_size; |
| job_desc->spank_job_env = (char **) NULL; /* nothing left to free */ |
| job_desc->spank_job_env_size = 0; /* nothing left to free */ |
| |
| if (job_desc->wait_all_nodes == (uint16_t) NO_VAL) |
| job_ptr->wait_all_nodes = DEFAULT_WAIT_ALL_NODES; |
| else |
| job_ptr->wait_all_nodes = job_desc->wait_all_nodes; |
| job_ptr->warn_flags = job_desc->warn_flags; |
| job_ptr->warn_signal = job_desc->warn_signal; |
| job_ptr->warn_time = job_desc->warn_time; |
| |
| detail_ptr = job_ptr->details; |
| detail_ptr->argc = job_desc->argc; |
| detail_ptr->argv = job_desc->argv; |
| job_desc->argv = (char **) NULL; /* nothing left to free */ |
| job_desc->argc = 0; /* nothing left to free */ |
| detail_ptr->acctg_freq = xstrdup(job_desc->acctg_freq); |
| detail_ptr->cpu_bind_type = job_desc->cpu_bind_type; |
| detail_ptr->cpu_bind = xstrdup(job_desc->cpu_bind); |
| detail_ptr->nice = job_desc->nice; |
| detail_ptr->open_mode = job_desc->open_mode; |
| detail_ptr->min_cpus = job_desc->min_cpus; |
| detail_ptr->max_cpus = job_desc->max_cpus; |
| detail_ptr->min_nodes = job_desc->min_nodes; |
| detail_ptr->max_nodes = job_desc->max_nodes; |
| if (job_desc->req_nodes) { |
| detail_ptr->req_nodes = |
| _copy_nodelist_no_dup(job_desc->req_nodes); |
| detail_ptr->req_node_bitmap = *req_bitmap; |
| detail_ptr->req_node_layout = NULL; /* Layout specified at |
| * start time */ |
| *req_bitmap = NULL; /* Reused nothing left to free */ |
| } |
| if (job_desc->exc_nodes) { |
| detail_ptr->exc_nodes = |
| _copy_nodelist_no_dup(job_desc->exc_nodes); |
| detail_ptr->exc_node_bitmap = *exc_bitmap; |
| *exc_bitmap = NULL; /* Reused nothing left to free */ |
| } |
| if (job_desc->features) |
| detail_ptr->features = xstrdup(job_desc->features); |
| if ((job_desc->shared == 0) && (select_serial == 0)) { |
| detail_ptr->share_res = 0; |
| detail_ptr->whole_node = 1; |
| } else if (job_desc->shared == 1) { |
| detail_ptr->share_res = 1; |
| detail_ptr->whole_node = 0; |
| } else { |
| detail_ptr->share_res = (uint8_t) NO_VAL; |
| detail_ptr->whole_node = 0; |
| } |
| if (job_desc->contiguous != (uint16_t) NO_VAL) |
| detail_ptr->contiguous = job_desc->contiguous; |
| if (job_desc->core_spec != (uint16_t) NO_VAL) { |
| detail_ptr->core_spec = job_desc->core_spec; |
| if (job_desc->core_spec) |
| detail_ptr->whole_node = 1; |
| } |
| if (job_desc->task_dist != (uint16_t) NO_VAL) |
| detail_ptr->task_dist = job_desc->task_dist; |
| if (job_desc->cpus_per_task != (uint16_t) NO_VAL) |
| detail_ptr->cpus_per_task = MAX(job_desc->cpus_per_task, 1); |
| else |
| detail_ptr->cpus_per_task = 1; |
| if (job_desc->pn_min_cpus != (uint16_t) NO_VAL) |
| detail_ptr->pn_min_cpus = job_desc->pn_min_cpus; |
| if (job_desc->overcommit != (uint8_t) NO_VAL) |
| detail_ptr->overcommit = job_desc->overcommit; |
| if (job_desc->ntasks_per_node != (uint16_t) NO_VAL) { |
| detail_ptr->ntasks_per_node = job_desc->ntasks_per_node; |
| if (detail_ptr->overcommit == 0) { |
| detail_ptr->pn_min_cpus = |
| MAX(detail_ptr->pn_min_cpus, |
| (detail_ptr->cpus_per_task * |
| detail_ptr->ntasks_per_node)); |
| } |
| } else { |
| detail_ptr->pn_min_cpus = MAX(detail_ptr->pn_min_cpus, |
| detail_ptr->cpus_per_task); |
| } |
| if (job_desc->requeue != (uint16_t) NO_VAL) |
| detail_ptr->requeue = MIN(job_desc->requeue, 1); |
| else |
| detail_ptr->requeue = slurmctld_conf.job_requeue; |
| if (job_desc->pn_min_memory != NO_VAL) |
| detail_ptr->pn_min_memory = job_desc->pn_min_memory; |
| if (job_desc->pn_min_tmp_disk != NO_VAL) |
| detail_ptr->pn_min_tmp_disk = job_desc->pn_min_tmp_disk; |
| if (job_desc->num_tasks != NO_VAL) |
| detail_ptr->num_tasks = job_desc->num_tasks; |
| if (job_desc->std_err) |
| detail_ptr->std_err = xstrdup(job_desc->std_err); |
| if (job_desc->std_in) |
| detail_ptr->std_in = xstrdup(job_desc->std_in); |
| if (job_desc->std_out) |
| detail_ptr->std_out = xstrdup(job_desc->std_out); |
| if (job_desc->work_dir) |
| detail_ptr->work_dir = xstrdup(job_desc->work_dir); |
| if (job_desc->begin_time > time(NULL)) |
| detail_ptr->begin_time = job_desc->begin_time; |
| job_ptr->select_jobinfo = |
| select_g_select_jobinfo_copy(job_desc->select_jobinfo); |
| select_g_select_jobinfo_set(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_USER_NAME, |
| &job_ptr->user_id); |
| |
| select_g_select_jobinfo_set(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_NETWORK, |
| job_ptr->network); |
| |
| if (job_desc->ckpt_dir) |
| detail_ptr->ckpt_dir = xstrdup(job_desc->ckpt_dir); |
| else |
| detail_ptr->ckpt_dir = xstrdup(detail_ptr->work_dir); |
| |
| /* The priority needs to be set after this since we don't have |
| * an association rec yet |
| */ |
| |
| detail_ptr->mc_ptr = _set_multi_core_data(job_desc); |
| *job_rec_ptr = job_ptr; |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * _copy_nodelist_no_dup - Take a node_list string and convert it to an |
| * expression without duplicate names. For example, we want to convert |
| * a user's request for nodes "lx1,lx2,lx1,lx3" to "lx[1-3]" |
| * node_list IN - string describing a list of nodes |
| * RET a compact node expression, must be xfreed by the user |
| */ |
| static char *_copy_nodelist_no_dup(char *node_list) |
| { |
| char *buf; |
| |
| hostlist_t hl = hostlist_create(node_list); |
| if (hl == NULL) |
| return NULL; |
| hostlist_uniq(hl); |
| buf = hostlist_ranged_string_xmalloc(hl); |
| hostlist_destroy(hl); |
| |
| return buf; |
| } |
| |
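| /* Validate a job's per-node or per-CPU memory request against the |
| * partition's MaxMemPerCPU (or the cluster-wide default). May adjust |
| * cpus_per_task, pn_min_cpus and pn_min_memory to satisfy the limit. |
| * RET true if the request is, or has been made, acceptable */ |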
| static bool _valid_pn_min_mem(job_desc_msg_t * job_desc_msg, |
| struct part_record *part_ptr) |
| { |
| uint32_t job_mem_limit = job_desc_msg->pn_min_memory; |
| uint32_t sys_mem_limit; |
| uint16_t cpus_per_node; |
| |
| if (part_ptr && part_ptr->max_mem_per_cpu) |
| sys_mem_limit = part_ptr->max_mem_per_cpu; |
| else |
| sys_mem_limit = slurmctld_conf.max_mem_per_cpu; |
| |
| if ((sys_mem_limit == 0) || (sys_mem_limit == MEM_PER_CPU)) |
| return true; |
| |
| if ((job_mem_limit & MEM_PER_CPU) && (sys_mem_limit & MEM_PER_CPU)) { |
| uint32_t mem_ratio; |
| job_mem_limit &= (~MEM_PER_CPU); |
| sys_mem_limit &= (~MEM_PER_CPU); |
| if (job_mem_limit <= sys_mem_limit) |
| return true; |
| mem_ratio = (job_mem_limit + sys_mem_limit - 1); |
| mem_ratio /= sys_mem_limit; |
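| /* Illustrative example: a request of 4000 MB per CPU against a 1000 MB |
| * MaxMemPerCPU limit gives mem_ratio = 4, so cpus_per_task is scaled up |
| * by 4 and pn_min_memory drops to 1000 MB per CPU, preserving the |
| * job's total memory request. */ |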
| debug("increasing cpus_per_task and decreasing mem_per_cpu by " |
| "factor of %u based upon mem_per_cpu limits", mem_ratio); |
| if (job_desc_msg->cpus_per_task == (uint16_t) NO_VAL) |
| job_desc_msg->cpus_per_task = mem_ratio; |
| else |
| job_desc_msg->cpus_per_task *= mem_ratio; |
| job_desc_msg->pn_min_memory = ((job_mem_limit + mem_ratio - 1) / |
| mem_ratio) | MEM_PER_CPU; |
| return true; |
| } |
| |
| if (((job_mem_limit & MEM_PER_CPU) == 0) && |
| ((sys_mem_limit & MEM_PER_CPU) == 0)) { |
| if (job_mem_limit <= sys_mem_limit) |
| return true; |
| return false; |
| } |
| |
| /* Our size is per CPU and limit per node or vice-versa. |
| * CPU count may vary by node, but we don't have a good |
| * way to identify specific nodes for the job at this |
| * point, so just pick the first node as a basis for enforcing |
| * MaxMemPerCPU and convert both numbers to per-node values. */ |
| if (slurmctld_conf.fast_schedule) |
| cpus_per_node = node_record_table_ptr[0].config_ptr->cpus; |
| else |
| cpus_per_node = node_record_table_ptr[0].cpus; |
| if (job_desc_msg->min_cpus != NO_VAL) |
| cpus_per_node = MIN(cpus_per_node, job_desc_msg->min_cpus); |
| if (job_mem_limit & MEM_PER_CPU) { |
| job_mem_limit &= (~MEM_PER_CPU); |
| job_mem_limit *= cpus_per_node; |
| } else { |
| uint32_t min_cpus; |
| sys_mem_limit &= (~MEM_PER_CPU); |
| min_cpus = (job_mem_limit + sys_mem_limit - 1) / sys_mem_limit; |
| if ((job_desc_msg->pn_min_cpus == (uint16_t) NO_VAL) || |
| (job_desc_msg->pn_min_cpus < min_cpus)) { |
| debug("Setting job's pn_min_cpus to %u due to memory " |
| "limit", min_cpus); |
| job_desc_msg->pn_min_cpus = min_cpus; |
| sys_mem_limit *= min_cpus; |
| } else { |
| sys_mem_limit *= cpus_per_node; |
| } |
| } |
| if (job_mem_limit <= sys_mem_limit) |
| return true; |
| return false; |
| } |
| |
| /* |
| * job_time_limit - terminate jobs which have exceeded their time limit |
| * global: job_list - pointer to the global job list |
| * last_job_update - time of last job table update |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| void job_time_limit(void) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| time_t now = time(NULL); |
| time_t old = now - ((slurmctld_conf.inactive_limit * 4 / 3) + |
| slurmctld_conf.msg_timeout + 1); |
| time_t over_run; |
| int resv_status = 0; |
| |
| if (slurmctld_conf.over_time_limit == (uint16_t) INFINITE) |
| over_run = now - (365 * 24 * 60 * 60); /* one year */ |
| else |
| over_run = now - (slurmctld_conf.over_time_limit * 60); |
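| /* For example (illustrative), OverTimeLimit=5 lets a job run five |
| * minutes past its end_time before it is treated as timed out below. */ |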
| |
| begin_job_resv_check(); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr =(struct job_record *) list_next(job_iterator))) { |
| xassert (job_ptr->magic == JOB_MAGIC); |
| |
| if (IS_JOB_CONFIGURING(job_ptr)) { |
| if (!IS_JOB_RUNNING(job_ptr) || |
| (bit_overlap(job_ptr->node_bitmap, |
| power_node_bitmap) == 0)) { |
| info("Configuration for job %u is complete", |
| job_ptr->job_id); |
| job_ptr->job_state &= (~JOB_CONFIGURING); |
| } |
| } |
| |
| /* This needs to be near the top of the loop since it checks every |
| * running, suspended and pending job */ |
| resv_status = job_resv_check(job_ptr); |
| |
| if (job_ptr->preempt_time && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { |
| if ((job_ptr->warn_time) && |
| (job_ptr->warn_time + PERIODIC_TIMEOUT + now >= |
| job_ptr->end_time)) { |
| debug("Warning signal %u to job %u ", |
| job_ptr->warn_signal, job_ptr->job_id); |
| (void) job_signal(job_ptr->job_id, |
| job_ptr->warn_signal, |
| job_ptr->warn_flags, 0, |
| false); |
| job_ptr->warn_signal = 0; |
| job_ptr->warn_time = 0; |
| } |
| if (job_ptr->end_time <= now) { |
| last_job_update = now; |
| info("Preemption GraceTime reached JobId=%u", |
| job_ptr->job_id); |
| _job_timed_out(job_ptr); |
| job_ptr->job_state = JOB_PREEMPTED | |
| JOB_COMPLETING; |
| xfree(job_ptr->state_desc); |
| } |
| continue; |
| } |
| |
| if (!IS_JOB_RUNNING(job_ptr)) |
| continue; |
| |
| if (slurmctld_conf.inactive_limit && |
| (job_ptr->batch_flag == 0) && |
| (job_ptr->time_last_active <= old) && |
| (job_ptr->other_port) && |
| (job_ptr->part_ptr) && |
| (!(job_ptr->part_ptr->flags & PART_FLAG_ROOT_ONLY))) { |
| /* job inactive, kill it */ |
| info("Inactivity time limit reached for JobId=%u", |
| job_ptr->job_id); |
| _job_timed_out(job_ptr); |
| job_ptr->state_reason = FAIL_INACTIVE_LIMIT; |
| xfree(job_ptr->state_desc); |
| continue; |
| } |
| if (job_ptr->time_limit != INFINITE) { |
| if ((job_ptr->warn_time) && |
| (job_ptr->warn_time + PERIODIC_TIMEOUT + now >= |
| job_ptr->end_time)) { |
| debug("Warning signal %u to job %u ", |
| job_ptr->warn_signal, job_ptr->job_id); |
| (void) job_signal(job_ptr->job_id, |
| job_ptr->warn_signal, |
| job_ptr->warn_flags, 0, |
| false); |
| job_ptr->warn_signal = 0; |
| job_ptr->warn_time = 0; |
| } |
| if (job_ptr->end_time <= over_run) { |
| last_job_update = now; |
| info("Time limit exhausted for JobId=%u", |
| job_ptr->job_id); |
| _job_timed_out(job_ptr); |
| job_ptr->state_reason = FAIL_TIMEOUT; |
| xfree(job_ptr->state_desc); |
| continue; |
| } |
| } |
| |
| if (resv_status != SLURM_SUCCESS) { |
| last_job_update = now; |
| info("Reservation ended for JobId=%u", |
| job_ptr->job_id); |
| _job_timed_out(job_ptr); |
| job_ptr->state_reason = FAIL_TIMEOUT; |
| xfree(job_ptr->state_desc); |
| continue; |
| } |
| |
| /* check if any individual job steps have exceeded |
| * their time limit */ |
| if (job_ptr->step_list && |
| (list_count(job_ptr->step_list) > 0)) |
| check_job_step_time_limit(job_ptr, now); |
| |
| acct_policy_job_time_out(job_ptr); |
| |
| if (job_ptr->state_reason == FAIL_TIMEOUT) { |
| last_job_update = now; |
| _job_timed_out(job_ptr); |
| xfree(job_ptr->state_desc); |
| continue; |
| } |
| |
| /* Give srun command warning message about pending timeout */ |
| if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2)) |
| srun_timeout (job_ptr); |
| } |
| list_iterator_destroy(job_iterator); |
| fini_job_resv_check(); |
| } |
| |
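| /* Decrement a job's cpu_cnt (and total_cpus while the job is resizing) |
| * to account for the CPUs on the node at node_inx being released from |
| * the allocation. RET SLURM_SUCCESS or SLURM_ERROR on underflow */ |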
| extern int job_update_cpu_cnt(struct job_record *job_ptr, int node_inx) |
| { |
| int cnt, offset, rc = SLURM_SUCCESS; |
| |
| xassert(job_ptr); |
| |
| #ifdef HAVE_BG |
| /* This function doesn't apply to a bluegene system since the |
| * cpu count isn't set up on that system. */ |
| return SLURM_SUCCESS; |
| #endif |
| if (job_ptr->details->whole_node) { |
| /* Since we are allocating whole nodes, don't rely on |
| * the job_resrcs count since it could be lower because the |
| * node could have used only 1 thread per core. |
| */ |
| struct node_record *node_ptr = |
| node_record_table_ptr + node_inx; |
| if (slurmctld_conf.fast_schedule) |
| cnt = node_ptr->config_ptr->cpus; |
| else |
| cnt = node_ptr->cpus; |
| } else { |
| if ((offset = job_resources_node_inx_to_cpu_inx( |
| job_ptr->job_resrcs, node_inx)) < 0) { |
| error("job_update_cpu_cnt: problem getting " |
| "offset of job %u", |
| job_ptr->job_id); |
| job_ptr->cpu_cnt = 0; |
| return SLURM_ERROR; |
| } |
| |
| cnt = job_ptr->job_resrcs->cpus[offset]; |
| } |
| if (cnt > job_ptr->cpu_cnt) { |
| error("job_update_cpu_cnt: cpu_cnt underflow on job_id %u", |
| job_ptr->job_id); |
| job_ptr->cpu_cnt = 0; |
| rc = SLURM_ERROR; |
| } else |
| job_ptr->cpu_cnt -= cnt; |
| |
| if (IS_JOB_RESIZING(job_ptr)) { |
| if (cnt > job_ptr->total_cpus) { |
| error("job_update_cpu_cnt: total_cpus " |
| "underflow on job_id %u", |
| job_ptr->job_id); |
| job_ptr->total_cpus = 0; |
| rc = SLURM_ERROR; |
| } else |
| job_ptr->total_cpus -= cnt; |
| } |
| return rc; |
| } |
| |
| /* Terminate a job that has exhausted its time limit */ |
| static void _job_timed_out(struct job_record *job_ptr) |
| { |
| xassert(job_ptr); |
| |
| srun_timeout(job_ptr); |
| if (job_ptr->details) { |
| time_t now = time(NULL); |
| job_ptr->end_time = now; |
| job_ptr->time_last_active = now; |
| job_ptr->job_state = JOB_TIMEOUT | JOB_COMPLETING; |
| build_cg_bitmap(job_ptr); |
| job_ptr->exit_code = MAX(job_ptr->exit_code, 1); |
| job_completion_logger(job_ptr, false); |
| deallocate_nodes(job_ptr, true, false, false); |
| } else |
| job_signal(job_ptr->job_id, SIGKILL, 0, 0, false); |
| return; |
| } |
| |
| /* _validate_job_desc - validate that a job descriptor for job submit or |
| * allocate has valid data, set values to defaults as required |
| * IN/OUT job_desc_msg - pointer to job descriptor, modified as needed |
| * IN allocate - if clear, the job is to be queued; if set, allocate resources now |
| * IN submit_uid - uid from which the request originated |
| */ |
| static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate, |
| uid_t submit_uid, struct part_record *part_ptr, |
| List part_list) |
| { |
| if ((job_desc_msg->min_cpus == NO_VAL) && |
| (job_desc_msg->min_nodes == NO_VAL) && |
| (job_desc_msg->req_nodes == NULL)) { |
| info("Job specified no min_cpus, min_nodes or req_nodes"); |
| return ESLURM_JOB_MISSING_SIZE_SPECIFICATION; |
| } |
| if ((allocate == SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0) && |
| (job_desc_msg->script == NULL)) { |
| info("_validate_job_desc: job failed to specify Script"); |
| return ESLURM_JOB_SCRIPT_MISSING; |
| } |
| if (job_desc_msg->user_id == NO_VAL) { |
| info("_validate_job_desc: job failed to specify User"); |
| return ESLURM_USER_ID_MISSING; |
| } |
| if ( job_desc_msg->group_id == NO_VAL ) { |
| debug("_validate_job_desc: job failed to specify group"); |
| job_desc_msg->group_id = 0; /* uses user default */ |
| } |
| if (job_desc_msg->contiguous == (uint16_t) NO_VAL) |
| job_desc_msg->contiguous = 0; |
| if (job_desc_msg->core_spec == (uint16_t) NO_VAL) |
| job_desc_msg->core_spec = 0; |
| |
| if (job_desc_msg->task_dist == (uint16_t) NO_VAL) { |
| /* not typically set by salloc or sbatch */ |
| job_desc_msg->task_dist = SLURM_DIST_CYCLIC; |
| } |
| if (job_desc_msg->plane_size == (uint16_t) NO_VAL) |
| job_desc_msg->plane_size = 0; |
| |
| if (job_desc_msg->kill_on_node_fail == (uint16_t) NO_VAL) |
| job_desc_msg->kill_on_node_fail = 1; |
| |
| if (job_desc_msg->job_id != NO_VAL) { |
| struct job_record *dup_job_ptr; |
| if ((submit_uid != 0) && |
| (submit_uid != slurmctld_conf.slurm_user_id)) { |
| info("attempt by uid %u to set job_id", submit_uid); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| if (job_desc_msg->job_id == 0) { |
| info("attempt by uid %u to set zero job_id", |
| submit_uid); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| dup_job_ptr = find_job_record((uint32_t) job_desc_msg->job_id); |
| if (dup_job_ptr && |
| (!(IS_JOB_COMPLETED(dup_job_ptr)))) { |
| info("attempt re-use active job_id %u", |
| job_desc_msg->job_id); |
| return ESLURM_DUPLICATE_JOB_ID; |
| } |
| if (dup_job_ptr) /* Purge the record for re-use */ |
| _purge_job_record(job_desc_msg->job_id); |
| } |
| |
| |
| if (job_desc_msg->nice == (uint16_t) NO_VAL) |
| job_desc_msg->nice = NICE_OFFSET; |
| |
| if (job_desc_msg->pn_min_memory == NO_VAL) { |
| /* Default memory limit is DefMemPerCPU (if set) or no limit */ |
| if (part_ptr && part_ptr->def_mem_per_cpu) { |
| job_desc_msg->pn_min_memory = |
| part_ptr->def_mem_per_cpu; |
| } else { |
| job_desc_msg->pn_min_memory = |
| slurmctld_conf.def_mem_per_cpu; |
| } |
| } else if (!_validate_min_mem_partition(job_desc_msg, part_ptr, part_list)) |
| return ESLURM_INVALID_TASK_MEMORY; |
| |
| /* Validate a job's accounting frequency, if specified */ |
| if (acct_gather_check_acct_freq_task( |
| job_desc_msg->pn_min_memory, job_desc_msg->acctg_freq)) |
| return ESLURMD_INVALID_ACCT_FREQ; |
| |
| if (job_desc_msg->min_nodes == NO_VAL) |
| job_desc_msg->min_nodes = 1; /* default node count of 1 */ |
| if (job_desc_msg->min_cpus == NO_VAL) |
| job_desc_msg->min_cpus = job_desc_msg->min_nodes; |
| |
| if ((job_desc_msg->pn_min_cpus == (uint16_t) NO_VAL) || |
| (job_desc_msg->pn_min_cpus == 0)) |
| job_desc_msg->pn_min_cpus = 1; /* default 1 cpu per node */ |
| if (job_desc_msg->pn_min_tmp_disk == NO_VAL) |
| job_desc_msg->pn_min_tmp_disk = 0;/* default 0MB disk per node */ |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* _validate_min_mem_partition() |
| * Traverse the list of partitions and invoke the |
| * function validating the job memory specification. |
| */ |
| static bool |
| _validate_min_mem_partition(job_desc_msg_t *job_desc_msg, |
| struct part_record *part_ptr, List part_list) |
| { |
| ListIterator iter; |
| struct part_record *part; |
| bool cc; |
| |
| if (part_list == NULL) |
| return _valid_pn_min_mem(job_desc_msg, part_ptr); |
| |
| cc = false; |
| iter = list_iterator_create(part_list); |
| while ((part = list_next(iter))) { |
| if ((cc = _valid_pn_min_mem(job_desc_msg, part))) |
| break; |
| } |
| list_iterator_destroy(iter); |
| |
| return cc; |
| } |
| |
| /* |
| * _list_delete_job - delete a job record and its corresponding job_details, |
| * see common/list.h for documentation |
| * IN job_entry - pointer to job_record to delete |
| * global: job_list - pointer to global job list |
| * job_count - count of job list entries |
| * job_hash - hash table into job records |
| */ |
| static void _list_delete_job(void *job_entry) |
| { |
| struct job_record *job_ptr = (struct job_record *) job_entry; |
| struct job_record **job_pptr; |
| int i; |
| |
| xassert(job_entry); |
| xassert (job_ptr->magic == JOB_MAGIC); |
| job_ptr->magic = 0; /* make sure we don't delete record twice */ |
| |
| /* Remove the record from job hash table */ |
| job_pptr = &job_hash[JOB_HASH_INX(job_ptr->job_id)]; |
| while ((*job_pptr != NULL) && |
| ((job_ptr = *job_pptr) != (struct job_record *) job_entry)) { |
| job_pptr = &job_ptr->job_next; |
| } |
| if (job_pptr == NULL) { |
| fatal("job hash error"); |
| return; /* Fix CLANG false positive error */ |
| } |
| *job_pptr = job_ptr->job_next; |
| |
| /* Remove the record from job array hash tables, if applicable */ |
| if (job_ptr->array_task_id != NO_VAL) { |
| job_pptr = &job_array_hash_j[ |
| JOB_HASH_INX(job_ptr->array_job_id)]; |
| while ((*job_pptr != NULL) && |
| ((job_ptr = *job_pptr) != |
| (struct job_record *) job_entry)) { |
| job_pptr = &job_ptr->job_array_next_j; |
| } |
| if (job_pptr == NULL) { |
| fatal("job array hash error"); |
| return; /* Fix CLANG false positive error */ |
| } |
| *job_pptr = job_ptr->job_array_next_j; |
| |
| job_pptr = &job_array_hash_t[ |
| JOB_ARRAY_HASH_INX(job_ptr->array_job_id, |
| job_ptr->array_task_id)]; |
| while ((*job_pptr != NULL) && |
| ((job_ptr = *job_pptr) != |
| (struct job_record *) job_entry)) { |
| job_pptr = &job_ptr->job_array_next_t; |
| } |
| if (job_pptr == NULL) { |
| fatal("job array, task ID hash error"); |
| return; /* Fix CLANG false positive error */ |
| } |
| *job_pptr = job_ptr->job_array_next_t; |
| } |
| |
| /* |
| * NOTE: Anything you free here also needs to be allocated memory copied |
| * when a job array is created in _job_rec_copy() above |
| */ |
| delete_job_details(job_ptr); |
| xfree(job_ptr->account); |
| xfree(job_ptr->alias_list); |
| xfree(job_ptr->alloc_node); |
| xfree(job_ptr->batch_host); |
| xfree(job_ptr->comment); |
| xfree(job_ptr->gres); |
| xfree(job_ptr->gres_alloc); |
| xfree(job_ptr->gres_req); |
| xfree(job_ptr->gres_used); |
| FREE_NULL_LIST(job_ptr->gres_list); |
| xfree(job_ptr->licenses); |
| FREE_NULL_LIST(job_ptr->license_list); |
| xfree(job_ptr->mail_user); |
| xfree(job_ptr->name); |
| xfree(job_ptr->network); |
| xfree(job_ptr->node_addr); |
| FREE_NULL_BITMAP(job_ptr->node_bitmap); |
| FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); |
| xfree(job_ptr->nodes); |
| xfree(job_ptr->nodes_completing); |
| xfree(job_ptr->partition); |
| FREE_NULL_LIST(job_ptr->part_ptr_list); |
| xfree(job_ptr->priority_array); |
| slurm_destroy_priority_factors_object(job_ptr->prio_factors); |
| xfree(job_ptr->resp_host); |
| xfree(job_ptr->resv_name); |
| free_job_resources(&job_ptr->job_resrcs); |
| for (i=0; i<job_ptr->spank_job_env_size; i++) |
| xfree(job_ptr->spank_job_env[i]); |
| xfree(job_ptr->spank_job_env); |
| xfree(job_ptr->state_desc); |
| if (job_ptr->step_list) { |
| delete_step_records(job_ptr); |
| list_destroy(job_ptr->step_list); |
| } |
| /* select_jobinfo is used in delete_step_records so free it |
| afterwards */ |
| select_g_select_jobinfo_free(job_ptr->select_jobinfo); |
| xfree(job_ptr->wckey); |
| job_count--; |
| xfree(job_ptr); |
| } |
| |
| |
| /* |
| * _list_find_job_id - find specific job_id entry in the job list, |
| * see common/list.h for documentation, key is job_id_ptr |
| * global- job_list - the global job list |
| */ |
| static int _list_find_job_id(void *job_entry, void *key) |
| { |
| uint32_t *job_id_ptr = (uint32_t *) key; |
| |
| if (((struct job_record *) job_entry)->job_id == *job_id_ptr) |
| return 1; |
| else |
| return 0; |
| } |
| |
| |
| /* |
| * _list_find_job_old - find old entries in the job list, |
| * see common/list.h for documentation, key is ignored |
| * global- job_list - the global job list |
| */ |
| static int _list_find_job_old(void *job_entry, void *key) |
| { |
| time_t kill_age, min_age, now = time(NULL); |
| struct job_record *job_ptr = (struct job_record *)job_entry; |
| uint16_t cleaning = 0; |
| |
| if (IS_JOB_COMPLETING(job_ptr)) { |
| kill_age = now - (slurmctld_conf.kill_wait + |
| 2 * slurm_get_msg_timeout()); |
| if (job_ptr->time_last_active < kill_age) { |
| job_ptr->time_last_active = now; |
| re_kill_job(job_ptr); |
| } |
| return 0; /* Job still completing */ |
| } |
| |
| if (job_ptr->epilog_running) |
| return 0; /* EpilogSlurmctld still running */ |
| |
| if (slurmctld_conf.min_job_age == 0) |
| return 0; /* No job record purging */ |
| |
| min_age = now - slurmctld_conf.min_job_age; |
| if (job_ptr->end_time > min_age) |
| return 0; /* Too new to purge */ |
| |
| if (!(IS_JOB_FINISHED(job_ptr))) |
| return 0; /* Job still active */ |
| |
| if (job_ptr->step_list && list_count(job_ptr->step_list)) { |
| debug("Job %u still has %d active steps", |
| job_ptr->job_id, list_count(job_ptr->step_list)); |
| return 0; /* steps are still active */ |
| } |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_CLEANING, |
| &cleaning); |
| if (cleaning) |
| return 0; /* Job hasn't finished yet */ |
| |
| /* If we don't have a db_index by now and we are running with |
| slurmdbd, let's put it on the list to be handled later when |
| it comes back up since we won't get another chance. |
| */ |
| if (with_slurmdbd && !job_ptr->db_index) |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| return 1; /* Purge the job */ |
| } |
| |
| /* Determine if ALL partitions associated with a job are hidden */ |
| static bool _all_parts_hidden(struct job_record *job_ptr) |
| { |
| bool rc; |
| ListIterator part_iterator; |
| struct part_record *part_ptr; |
| |
| if (job_ptr->part_ptr_list) { |
| rc = true; |
| part_iterator = list_iterator_create(job_ptr->part_ptr_list); |
| while ((part_ptr = (struct part_record *) |
| list_next(part_iterator))) { |
| if (!(part_ptr->flags & PART_FLAG_HIDDEN)) { |
| rc = false; |
| break; |
| } |
| } |
| list_iterator_destroy(part_iterator); |
| return rc; |
| } |
| |
| if ((job_ptr->part_ptr) && |
| (job_ptr->part_ptr->flags & PART_FLAG_HIDDEN)) |
| return true; |
| return false; |
| } |
| |
| /* Determine if a given job should be seen by a specific user */ |
| static bool _hide_job(struct job_record *job_ptr, uid_t uid) |
| { |
| if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) && |
| (job_ptr->user_id != uid) && !validate_operator(uid) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account)) |
| return true; |
| return false; |
| } |
| |
| /* |
| * pack_all_jobs - dump all job information for all jobs in |
| * machine independent form (for network transmission) |
| * OUT buffer_ptr - the pointer is set to the allocated buffer. |
| * OUT buffer_size - set to size of the buffer in bytes |
| * IN show_flags - job filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * IN filter_uid - pack only jobs belonging to this user if not NO_VAL |
| * global: job_list - global list of job records |
| * NOTE: the buffer at *buffer_ptr must be xfreed by the caller |
| * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| extern void pack_all_jobs(char **buffer_ptr, int *buffer_size, |
| uint16_t show_flags, uid_t uid, uint32_t filter_uid, |
| uint16_t protocol_version) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| uint32_t jobs_packed = 0, tmp_offset; |
| Buf buffer; |
| time_t min_age = 0, now = time(NULL); |
| |
| buffer_ptr[0] = NULL; |
| *buffer_size = 0; |
| |
| buffer = init_buf(BUF_SIZE); |
| |
| /* write message body header : size and time */ |
| /* put in a place holder job record count of 0 for now */ |
| pack32(jobs_packed, buffer); |
| pack_time(now, buffer); |
| |
| if (slurmctld_conf.min_job_age > 0) |
| min_age = now - slurmctld_conf.min_job_age; |
| |
| /* write individual job records */ |
| part_filter_set(uid); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| xassert (job_ptr->magic == JOB_MAGIC); |
| |
| if (((show_flags & SHOW_ALL) == 0) && (uid != 0) && |
| _all_parts_hidden(job_ptr)) |
| continue; |
| |
| if (_hide_job(job_ptr, uid)) |
| continue; |
| |
| if ((min_age > 0) && (job_ptr->end_time < min_age) && |
| (! IS_JOB_COMPLETING(job_ptr)) && IS_JOB_FINISHED(job_ptr)) |
| continue; /* job ready for purging, don't dump */ |
| |
| if ((filter_uid != NO_VAL) && (filter_uid != job_ptr->user_id)) |
| continue; |
| |
| pack_job(job_ptr, show_flags, buffer, protocol_version, uid); |
| jobs_packed++; |
| } |
| part_filter_clear(); |
| list_iterator_destroy(job_iterator); |
| |
| /* put the real record count in the message body header */ |
| tmp_offset = get_buf_offset(buffer); |
| set_buf_offset(buffer, 0); |
| pack32(jobs_packed, buffer); |
| set_buf_offset(buffer, tmp_offset); |
| |
| *buffer_size = get_buf_offset(buffer); |
| buffer_ptr[0] = xfer_buf_data(buffer); |
| } |
| |
| /* |
| * pack_one_job - dump information for one job in |
| * machine independent form (for network transmission) |
| * OUT buffer_ptr - the pointer is set to the allocated buffer. |
| * OUT buffer_size - set to size of the buffer in bytes |
| * IN job_id - ID of job that we want info for |
| * IN show_flags - job filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * NOTE: the buffer at *buffer_ptr must be xfreed by the caller |
| * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| extern int pack_one_job(char **buffer_ptr, int *buffer_size, |
| uint32_t job_id, uint16_t show_flags, uid_t uid, |
| uint16_t protocol_version) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| uint32_t jobs_packed = 0, tmp_offset; |
| Buf buffer; |
| |
| buffer_ptr[0] = NULL; |
| *buffer_size = 0; |
| |
| buffer = init_buf(BUF_SIZE); |
| |
| /* write message body header : size and time */ |
| /* put in a place holder job record count of 0 for now */ |
| pack32(jobs_packed, buffer); |
| pack_time(time(NULL), buffer); |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr && (job_ptr->array_task_id == NO_VAL)) { |
| if (!_hide_job(job_ptr, uid)) { |
| pack_job(job_ptr, show_flags, buffer, protocol_version, |
| uid); |
| jobs_packed++; |
| } |
| } else { |
| /* Job ID not found. It could reference a job array. */ |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) |
| list_next(job_iterator))) { |
| if ((job_ptr->job_id != job_id) && |
| ((job_ptr->array_task_id == NO_VAL) || |
| (job_ptr->array_job_id != job_id))) |
| continue; |
| |
| if (_hide_job(job_ptr, uid)) |
| break; |
| |
| pack_job(job_ptr, show_flags, buffer, protocol_version, |
| uid); |
| jobs_packed++; |
| } |
| list_iterator_destroy(job_iterator); |
| } |
| |
| if (jobs_packed == 0) { |
| free_buf(buffer); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| /* put the real record count in the message body header */ |
| tmp_offset = get_buf_offset(buffer); |
| set_buf_offset(buffer, 0); |
| pack32(jobs_packed, buffer); |
| set_buf_offset(buffer, tmp_offset); |
| |
| *buffer_size = get_buf_offset(buffer); |
| buffer_ptr[0] = xfer_buf_data(buffer); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * pack_job - dump all configuration information about a specific job in |
| * machine independent form (for network transmission) |
| * IN dump_job_ptr - pointer to job for which information is requested |
| * IN show_flags - job filtering options |
| * IN/OUT buffer - buffer in which data is placed, pointers automatically |
| * updated |
| * IN uid - user requesting the data |
| * NOTE: change _unpack_job_info_members() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer, |
| uint16_t protocol_version, uid_t uid) |
| { |
| struct job_details *detail_ptr; |
| time_t begin_time = 0; |
| char *nodelist = NULL; |
| assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, |
| READ_LOCK, NO_LOCK, NO_LOCK }; |
| |
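	/* Each protocol version branch below must pack fields in exactly the
	 * order expected by the matching unpack logic in
	 * common/slurm_protocol_pack.c for that version */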
| if (protocol_version >= SLURM_14_03_PROTOCOL_VERSION) { |
| detail_ptr = dump_job_ptr->details; |
| pack32(dump_job_ptr->array_job_id, buffer); |
| pack32(dump_job_ptr->array_task_id, buffer); |
| pack32(dump_job_ptr->assoc_id, buffer); |
| pack32(dump_job_ptr->job_id, buffer); |
| pack32(dump_job_ptr->user_id, buffer); |
| pack32(dump_job_ptr->group_id, buffer); |
| pack32(dump_job_ptr->profile, buffer); |
| |
| pack16(dump_job_ptr->job_state, buffer); |
| pack16(dump_job_ptr->batch_flag, buffer); |
| pack16(dump_job_ptr->state_reason, buffer); |
| pack16(dump_job_ptr->restart_cnt, buffer); |
| pack16(show_flags, buffer); |
| |
| pack32(dump_job_ptr->alloc_sid, buffer); |
| if ((dump_job_ptr->time_limit == NO_VAL) |
| && dump_job_ptr->part_ptr) |
| pack32(dump_job_ptr->part_ptr->max_time, buffer); |
| else |
| pack32(dump_job_ptr->time_limit, buffer); |
| pack32(dump_job_ptr->time_min, buffer); |
| |
| if (dump_job_ptr->details) { |
| pack16(dump_job_ptr->details->nice, buffer); |
| pack_time(dump_job_ptr->details->submit_time, buffer); |
| /* Earliest possible begin time */ |
| begin_time = dump_job_ptr->details->begin_time; |
| } else { |
| pack16(0, buffer); |
| pack_time((time_t) 0, buffer); |
| } |
| |
| pack_time(begin_time, buffer); |
| /* Actual or expected start time */ |
| if ((dump_job_ptr->start_time) || (begin_time <= time(NULL))) |
| pack_time(dump_job_ptr->start_time, buffer); |
| else /* earliest start time in the future */ |
| pack_time(begin_time, buffer); |
| |
| pack_time(dump_job_ptr->end_time, buffer); |
| pack_time(dump_job_ptr->suspend_time, buffer); |
| pack_time(dump_job_ptr->pre_sus_time, buffer); |
| pack_time(dump_job_ptr->resize_time, buffer); |
| pack_time(dump_job_ptr->preempt_time, buffer); |
| pack32(dump_job_ptr->priority, buffer); |
| |
| /* Only send the allocated nodelist since we are only sending |
| * the number of cpus and nodes that are currently allocated. */ |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| packstr(dump_job_ptr->nodes, buffer); |
| else { |
| nodelist = |
| bitmap2node_name(dump_job_ptr->node_bitmap_cg); |
| packstr(nodelist, buffer); |
| xfree(nodelist); |
| } |
| |
| if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr) |
| packstr(dump_job_ptr->part_ptr->name, buffer); |
| else |
| packstr(dump_job_ptr->partition, buffer); |
| packstr(dump_job_ptr->account, buffer); |
| packstr(dump_job_ptr->network, buffer); |
| packstr(dump_job_ptr->comment, buffer); |
| packstr(dump_job_ptr->gres, buffer); |
| packstr(dump_job_ptr->batch_host, buffer); |
| if (!IS_JOB_COMPLETED(dump_job_ptr) && |
| (show_flags & SHOW_DETAIL2) && |
| ((dump_job_ptr->user_id == (uint32_t) uid) || |
| validate_slurm_user(uid))) { |
| char *batch_script = get_job_script(dump_job_ptr); |
| packstr(batch_script, buffer); |
| xfree(batch_script); |
| } else { |
| packnull(buffer); |
| } |
| |
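		/* Hold the assoc_mgr read lock while translating qos_id into
		 * a QOS name via assoc_mgr_qos_list */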
| assoc_mgr_lock(&locks); |
| if (assoc_mgr_qos_list) { |
| packstr(slurmdb_qos_str(assoc_mgr_qos_list, |
| dump_job_ptr->qos_id), buffer); |
| } else |
| packnull(buffer); |
| assoc_mgr_unlock(&locks); |
| |
| packstr(dump_job_ptr->licenses, buffer); |
| packstr(dump_job_ptr->state_desc, buffer); |
| packstr(dump_job_ptr->resv_name, buffer); |
| |
| pack32(dump_job_ptr->exit_code, buffer); |
| pack32(dump_job_ptr->derived_ec, buffer); |
| |
| if (show_flags & SHOW_DETAIL) { |
| pack_job_resources(dump_job_ptr->job_resrcs, buffer, |
| protocol_version); |
| } else { |
| uint32_t empty = NO_VAL; |
| pack32(empty, buffer); |
| } |
| |
| packstr(dump_job_ptr->name, buffer); |
| packstr(dump_job_ptr->wckey, buffer); |
| pack32(dump_job_ptr->req_switch, buffer); |
| pack32(dump_job_ptr->wait4switch, buffer); |
| |
| packstr(dump_job_ptr->alloc_node, buffer); |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| pack_bit_fmt(dump_job_ptr->node_bitmap, buffer); |
| else |
| pack_bit_fmt(dump_job_ptr->node_bitmap_cg, buffer); |
| |
| select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo, |
| buffer, protocol_version); |
| |
| /* A few details are always dumped here */ |
| _pack_default_job_details(dump_job_ptr, buffer, |
| protocol_version); |
| |
| /* other job details are only dumped until the job starts |
| * running (at which time they become meaningless) */ |
| if (detail_ptr) |
| _pack_pending_job_details(detail_ptr, buffer, |
| protocol_version); |
| else |
| _pack_pending_job_details(NULL, buffer, |
| protocol_version); |
| } else if (protocol_version >= SLURM_2_6_PROTOCOL_VERSION) { |
| pack32(dump_job_ptr->array_job_id, buffer); |
| pack16((uint16_t) dump_job_ptr->array_task_id, buffer); |
| pack32(dump_job_ptr->assoc_id, buffer); |
| pack32(dump_job_ptr->job_id, buffer); |
| pack32(dump_job_ptr->user_id, buffer); |
| pack32(dump_job_ptr->group_id, buffer); |
| pack32(dump_job_ptr->profile, buffer); |
| |
| pack16(dump_job_ptr->job_state, buffer); |
| pack16(dump_job_ptr->batch_flag, buffer); |
| pack16(dump_job_ptr->state_reason, buffer); |
| pack16(dump_job_ptr->restart_cnt, buffer); |
| pack16(show_flags, buffer); |
| |
| pack32(dump_job_ptr->alloc_sid, buffer); |
| if ((dump_job_ptr->time_limit == NO_VAL) |
| && dump_job_ptr->part_ptr) |
| pack32(dump_job_ptr->part_ptr->max_time, buffer); |
| else |
| pack32(dump_job_ptr->time_limit, buffer); |
| pack32(dump_job_ptr->time_min, buffer); |
| |
| if (dump_job_ptr->details) { |
| pack16(dump_job_ptr->details->nice, buffer); |
| pack_time(dump_job_ptr->details->submit_time, buffer); |
| /* Earliest possible begin time */ |
| begin_time = dump_job_ptr->details->begin_time; |
| } else { |
| pack16(0, buffer); |
| pack_time((time_t) 0, buffer); |
| } |
| |
| pack_time(begin_time, buffer); |
| /* Actual or expected start time */ |
| if ((dump_job_ptr->start_time) || (begin_time <= time(NULL))) |
| pack_time(dump_job_ptr->start_time, buffer); |
| else /* earliest start time in the future */ |
| pack_time(begin_time, buffer); |
| |
| pack_time(dump_job_ptr->end_time, buffer); |
| pack_time(dump_job_ptr->suspend_time, buffer); |
| pack_time(dump_job_ptr->pre_sus_time, buffer); |
| pack_time(dump_job_ptr->resize_time, buffer); |
| pack_time(dump_job_ptr->preempt_time, buffer); |
| pack32(dump_job_ptr->priority, buffer); |
| |
| /* Only send the allocated nodelist since we are only sending |
| * the number of cpus and nodes that are currently allocated. */ |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| packstr(dump_job_ptr->nodes, buffer); |
| else { |
| nodelist = |
| bitmap2node_name(dump_job_ptr->node_bitmap_cg); |
| packstr(nodelist, buffer); |
| xfree(nodelist); |
| } |
| |
| if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr) |
| packstr(dump_job_ptr->part_ptr->name, buffer); |
| else |
| packstr(dump_job_ptr->partition, buffer); |
| packstr(dump_job_ptr->account, buffer); |
| packstr(dump_job_ptr->network, buffer); |
| packstr(dump_job_ptr->comment, buffer); |
| packstr(dump_job_ptr->gres, buffer); |
| packstr(dump_job_ptr->batch_host, buffer); |
| if (!IS_JOB_COMPLETED(dump_job_ptr) && |
| (show_flags & SHOW_DETAIL2) && |
| ((dump_job_ptr->user_id == (uint32_t) uid) || |
| validate_slurm_user(uid))) { |
| char *batch_script = get_job_script(dump_job_ptr); |
| packstr(batch_script, buffer); |
| xfree(batch_script); |
| } else { |
| packnull(buffer); |
| } |
| |
| assoc_mgr_lock(&locks); |
| if (assoc_mgr_qos_list) { |
| packstr(slurmdb_qos_str(assoc_mgr_qos_list, |
| dump_job_ptr->qos_id), buffer); |
| } else |
| packnull(buffer); |
| assoc_mgr_unlock(&locks); |
| |
| packstr(dump_job_ptr->licenses, buffer); |
| packstr(dump_job_ptr->state_desc, buffer); |
| packstr(dump_job_ptr->resv_name, buffer); |
| |
| pack32(dump_job_ptr->exit_code, buffer); |
| pack32(dump_job_ptr->derived_ec, buffer); |
| |
| if (show_flags & SHOW_DETAIL) { |
| pack_job_resources(dump_job_ptr->job_resrcs, buffer, |
| protocol_version); |
| } else { |
| uint32_t empty = NO_VAL; |
| pack32(empty, buffer); |
| } |
| |
| packstr(dump_job_ptr->name, buffer); |
| packstr(dump_job_ptr->wckey, buffer); |
| pack32(dump_job_ptr->req_switch, buffer); |
| pack32(dump_job_ptr->wait4switch, buffer); |
| |
| packstr(dump_job_ptr->alloc_node, buffer); |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| pack_bit_fmt(dump_job_ptr->node_bitmap, buffer); |
| else |
| pack_bit_fmt(dump_job_ptr->node_bitmap_cg, buffer); |
| |
| select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo, |
| buffer, protocol_version); |
| |
| detail_ptr = dump_job_ptr->details; |
| /* A few details are always dumped here */ |
| _pack_default_job_details(dump_job_ptr, buffer, |
| protocol_version); |
| |
| /* other job details are only dumped until the job starts |
| * running (at which time they become meaningless) */ |
| if (detail_ptr) |
| _pack_pending_job_details(detail_ptr, buffer, |
| protocol_version); |
| else |
| _pack_pending_job_details(NULL, buffer, |
| protocol_version); |
| } else if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) { |
| pack32(dump_job_ptr->assoc_id, buffer); |
| pack32(dump_job_ptr->job_id, buffer); |
| pack32(dump_job_ptr->user_id, buffer); |
| pack32(dump_job_ptr->group_id, buffer); |
| |
| pack16(dump_job_ptr->job_state, buffer); |
| pack16(dump_job_ptr->batch_flag, buffer); |
| pack16(dump_job_ptr->state_reason, buffer); |
| pack16(dump_job_ptr->restart_cnt, buffer); |
| pack16(show_flags, buffer); |
| |
| pack32(dump_job_ptr->alloc_sid, buffer); |
| if ((dump_job_ptr->time_limit == NO_VAL) |
| && dump_job_ptr->part_ptr) |
| pack32(dump_job_ptr->part_ptr->max_time, buffer); |
| else |
| pack32(dump_job_ptr->time_limit, buffer); |
| pack32(dump_job_ptr->time_min, buffer); |
| |
| if (dump_job_ptr->details) { |
| pack16(dump_job_ptr->details->nice, buffer); |
| pack_time(dump_job_ptr->details->submit_time, buffer); |
| /* Earliest possible begin time */ |
| begin_time = dump_job_ptr->details->begin_time; |
| } else { |
| pack16(0, buffer); |
| pack_time((time_t) 0, buffer); |
| } |
| |
| pack_time(begin_time, buffer); |
| /* Actual or expected start time */ |
| if ((dump_job_ptr->start_time) || (begin_time <= time(NULL))) |
| pack_time(dump_job_ptr->start_time, buffer); |
| else /* earliest start time in the future */ |
| pack_time(begin_time, buffer); |
| |
| pack_time(dump_job_ptr->end_time, buffer); |
| pack_time(dump_job_ptr->suspend_time, buffer); |
| pack_time(dump_job_ptr->pre_sus_time, buffer); |
| pack_time(dump_job_ptr->resize_time, buffer); |
| pack_time(dump_job_ptr->preempt_time, buffer); |
| pack32(dump_job_ptr->priority, buffer); |
| |
| /* Only send the allocated nodelist since we are only sending |
| * the number of cpus and nodes that are currently allocated. */ |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| packstr(dump_job_ptr->nodes, buffer); |
| else { |
| nodelist = |
| bitmap2node_name(dump_job_ptr->node_bitmap_cg); |
| packstr(nodelist, buffer); |
| xfree(nodelist); |
| } |
| |
| if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr) |
| packstr(dump_job_ptr->part_ptr->name, buffer); |
| else |
| packstr(dump_job_ptr->partition, buffer); |
| packstr(dump_job_ptr->account, buffer); |
| packstr(dump_job_ptr->network, buffer); |
| packstr(dump_job_ptr->comment, buffer); |
| packstr(dump_job_ptr->gres, buffer); |
| packstr(dump_job_ptr->batch_host, buffer); |
| if (!IS_JOB_COMPLETED(dump_job_ptr) && |
| (show_flags & SHOW_DETAIL2) && |
| ((dump_job_ptr->user_id == (uint32_t) uid) || |
| validate_slurm_user(uid))) { |
| char *batch_script = get_job_script(dump_job_ptr); |
| packstr(batch_script, buffer); |
| xfree(batch_script); |
| } else { |
| packnull(buffer); |
| } |
| |
| assoc_mgr_lock(&locks); |
| if (assoc_mgr_qos_list) { |
| packstr(slurmdb_qos_str(assoc_mgr_qos_list, |
| dump_job_ptr->qos_id), buffer); |
| } else |
| packnull(buffer); |
| assoc_mgr_unlock(&locks); |
| |
| packstr(dump_job_ptr->licenses, buffer); |
| packstr(dump_job_ptr->state_desc, buffer); |
| packstr(dump_job_ptr->resv_name, buffer); |
| |
| pack32(dump_job_ptr->exit_code, buffer); |
| pack32(dump_job_ptr->derived_ec, buffer); |
| |
| if (show_flags & SHOW_DETAIL) { |
| pack_job_resources(dump_job_ptr->job_resrcs, buffer, |
| protocol_version); |
| } else { |
| uint32_t empty = NO_VAL; |
| pack32(empty, buffer); |
| } |
| |
| packstr(dump_job_ptr->name, buffer); |
| packstr(dump_job_ptr->wckey, buffer); |
| pack32(dump_job_ptr->req_switch, buffer); |
| pack32(dump_job_ptr->wait4switch, buffer); |
| |
| packstr(dump_job_ptr->alloc_node, buffer); |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| pack_bit_fmt(dump_job_ptr->node_bitmap, buffer); |
| else |
| pack_bit_fmt(dump_job_ptr->node_bitmap_cg, buffer); |
| |
| select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo, |
| buffer, protocol_version); |
| |
| detail_ptr = dump_job_ptr->details; |
| /* A few details are always dumped here */ |
| _pack_default_job_details(dump_job_ptr, buffer, |
| protocol_version); |
| |
| /* other job details are only dumped until the job starts |
| * running (at which time they become meaningless) */ |
| if (detail_ptr) |
| _pack_pending_job_details(detail_ptr, buffer, |
| protocol_version); |
| else |
| _pack_pending_job_details(NULL, buffer, |
| protocol_version); |
| } else { |
| error("pack_job: protocol_version " |
| "%hu not supported", protocol_version); |
| } |
| } |
| |
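/*
 * _find_node_config - find the maximum CPU and core counts on any node in
 *	the system; used by _pack_default_job_details() to estimate minimum
 *	node counts for pending jobs
 * OUT cpu_cnt_ptr  - maximum CPUs on any node
 * OUT core_cnt_ptr - maximum cores on any node
 */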
| static void _find_node_config(int *cpu_cnt_ptr, int *core_cnt_ptr) |
| { |
| int i, max_cpu_cnt = 1, max_core_cnt = 1; |
| struct node_record *node_ptr = node_record_table_ptr; |
| |
| for (i = 0; i < node_record_count; i++, node_ptr++) { |
| #ifndef HAVE_BG |
| if (slurmctld_conf.fast_schedule) { |
| /* Only data from config_record used for scheduling */ |
| max_cpu_cnt = MAX(max_cpu_cnt, |
| node_ptr->config_ptr->cpus); |
| max_core_cnt = MAX(max_core_cnt, |
| node_ptr->config_ptr->cores); |
| } else { |
| #endif |
| /* Individual node data used for scheduling */ |
| max_cpu_cnt = MAX(max_cpu_cnt, node_ptr->cpus); |
| max_core_cnt = MAX(max_core_cnt, node_ptr->cores); |
| #ifndef HAVE_BG |
| } |
| #endif |
| } |
| *cpu_cnt_ptr = max_cpu_cnt; |
| *core_cnt_ptr = max_core_cnt; |
| } |
| |
| /* pack default job details for "get_job_info" RPC */ |
| static void _pack_default_job_details(struct job_record *job_ptr, |
| Buf buffer, uint16_t protocol_version) |
| { |
| static int max_cpu_cnt = -1, max_core_cnt = -1; |
| int i; |
| struct job_details *detail_ptr = job_ptr->details; |
| char *cmd_line = NULL; |
| char *tmp = NULL; |
| uint32_t len = 0; |
| uint16_t shared = 0; |
| |
| if (!detail_ptr) |
| shared = (uint16_t) NO_VAL; |
| else if (detail_ptr->share_res == 1) /* User --share */ |
| shared = 1; |
| else if ((detail_ptr->share_res == 0) || |
| (detail_ptr->whole_node == 1)) /* User --exclusive */ |
| shared = 0; |
| else if (job_ptr->part_ptr) { |
| /* Report shared status based upon latest partition info */ |
| if ((job_ptr->part_ptr->max_share & SHARED_FORCE) && |
| ((job_ptr->part_ptr->max_share & (~SHARED_FORCE)) > 1)) |
| shared = 1; /* Partition Shared=force */ |
| else if (job_ptr->part_ptr->max_share == 0) |
| shared = 0; /* Partition Shared=exclusive */ |
| else |
| shared = 0; /* Part Shared=yes or no */ |
| } else |
| shared = (uint16_t) NO_VAL; /* No user or partition info */ |
| |
| if (max_cpu_cnt == -1) |
| _find_node_config(&max_cpu_cnt, &max_core_cnt); |
| |
| if (protocol_version >= SLURM_2_6_PROTOCOL_VERSION) { |
| if (detail_ptr) { |
| packstr(detail_ptr->features, buffer); |
| packstr(detail_ptr->work_dir, buffer); |
| packstr(detail_ptr->dependency, buffer); |
| |
| if (detail_ptr->argv) { |
| /* Determine size needed for a string |
| * containing all arguments */ |
| for (i=0; detail_ptr->argv[i]; i++) { |
| len += strlen(detail_ptr->argv[i]); |
| } |
| len += i; |
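				/* i-1 separating spaces plus the terminating
				 * NUL account for the i extra bytes */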
| |
| cmd_line = xmalloc(len*sizeof(char)); |
| tmp = cmd_line; |
| for (i=0; detail_ptr->argv[i]; i++) { |
| if (i != 0) { |
| *tmp = ' '; |
| tmp++; |
| } |
| strcpy(tmp,detail_ptr->argv[i]); |
| tmp += strlen(detail_ptr->argv[i]); |
| } |
| packstr(cmd_line, buffer); |
| xfree(cmd_line); |
| } else |
| packnull(buffer); |
| |
| if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) { |
| pack32(job_ptr->cpu_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->total_cpus && |
| !IS_JOB_PENDING(job_ptr)) { |
| /* If job is PENDING ignore total_cpus, |
| * which may have been set by previous run |
| * followed by job requeue. */ |
| pack32(job_ptr->total_cpus, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else { |
| pack32(detail_ptr->min_cpus, buffer); |
| if (detail_ptr->max_cpus != NO_VAL) |
| pack32(detail_ptr->max_cpus, buffer); |
| else |
| pack32((uint32_t) 0, buffer); |
| |
| } |
| |
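			/* Pack the node count range. For jobs without an
			 * allocation, estimate the minimum node count from
			 * the task count and the per-node CPU/core maxima
			 * found by _find_node_config(). */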
| if (IS_JOB_COMPLETING(job_ptr) && job_ptr->node_cnt) { |
| pack32(job_ptr->node_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->total_nodes) { |
| pack32(job_ptr->total_nodes, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->node_cnt_wag) { |
				/* This should catch everything else, but
				 * just in case this is 0 (e.g. at startup)
				 * we keep the rest of this if/else chain
				 * around.
				 */
| pack32(job_ptr->node_cnt_wag, buffer); |
| pack32((uint32_t) detail_ptr->max_nodes, |
| buffer); |
| } else if (detail_ptr->ntasks_per_node) { |
| /* min_nodes based upon task count and ntasks |
| * per node */ |
| uint32_t min_nodes; |
| min_nodes = detail_ptr->num_tasks / |
| detail_ptr->ntasks_per_node; |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| pack32(min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } else if (detail_ptr->cpus_per_task > 1) { |
| /* min_nodes based upon task count and cpus |
| * per task */ |
| uint32_t min_cpus, min_nodes; |
| min_cpus = detail_ptr->num_tasks * |
| detail_ptr->cpus_per_task; |
| min_nodes = min_cpus + max_cpu_cnt - 1; |
| min_nodes /= max_cpu_cnt; |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| pack32(min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } else if (detail_ptr->mc_ptr && |
| detail_ptr->mc_ptr->ntasks_per_core && |
| (detail_ptr->mc_ptr->ntasks_per_core |
| != (uint16_t)INFINITE)) { |
| /* min_nodes based upon task count and ntasks |
| * per core */ |
| uint32_t min_cores, min_nodes; |
| min_cores = detail_ptr->num_tasks + |
| detail_ptr->mc_ptr->ntasks_per_core |
| - 1; |
| min_cores /= detail_ptr->mc_ptr->ntasks_per_core; |
| |
| min_nodes = min_cores + max_core_cnt - 1; |
| min_nodes /= max_core_cnt; |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| pack32(min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } else { |
| /* min_nodes based upon task count only */ |
| uint32_t min_nodes; |
| min_nodes = detail_ptr->num_tasks + |
| max_cpu_cnt - 1; |
| min_nodes /= max_cpu_cnt; |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| pack32(min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } |
| pack16(detail_ptr->requeue, buffer); |
| pack16(detail_ptr->ntasks_per_node, buffer); |
| pack16(shared, buffer); |
| } else { |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| |
| if (job_ptr->total_cpus) |
| pack32(job_ptr->total_cpus, buffer); |
| else |
| pack32(job_ptr->cpu_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| |
| pack32(job_ptr->node_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| } |
| } else if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) { |
| if (detail_ptr) { |
| packstr(detail_ptr->features, buffer); |
| packstr(detail_ptr->work_dir, buffer); |
| packstr(detail_ptr->dependency, buffer); |
| |
| if (detail_ptr->argv) { |
| /* Determine size needed for a string |
| * containing all arguments */ |
| for (i=0; detail_ptr->argv[i]; i++) { |
| len += strlen(detail_ptr->argv[i]); |
| } |
| len += i; |
| |
| cmd_line = xmalloc(len*sizeof(char)); |
| tmp = cmd_line; |
| for (i=0; detail_ptr->argv[i]; i++) { |
| if (i != 0) { |
| *tmp = ' '; |
| tmp++; |
| } |
| strcpy(tmp,detail_ptr->argv[i]); |
| tmp += strlen(detail_ptr->argv[i]); |
| } |
| packstr(cmd_line, buffer); |
| xfree(cmd_line); |
| } else |
| packnull(buffer); |
| |
| if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) { |
| pack32(job_ptr->cpu_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->total_cpus) { |
| pack32(job_ptr->total_cpus, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else { |
| pack32(detail_ptr->min_cpus, buffer); |
| if (detail_ptr->max_cpus != NO_VAL) |
| pack32(detail_ptr->max_cpus, buffer); |
| else |
| pack32((uint32_t) 0, buffer); |
| |
| } |
| if (IS_JOB_COMPLETING(job_ptr) && job_ptr->node_cnt) { |
| pack32(job_ptr->node_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->total_nodes) { |
| pack32(job_ptr->total_nodes, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else { |
| pack32(detail_ptr->min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } |
| pack16(detail_ptr->requeue, buffer); |
| pack16(shared, buffer); |
| } else { |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| |
| if (job_ptr->total_cpus) |
| pack32(job_ptr->total_cpus, buffer); |
| else |
| pack32(job_ptr->cpu_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| |
| pack32(job_ptr->node_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| } |
| } else { |
| error("_pack_default_job_details: protocol_version " |
| "%hu not supported", protocol_version); |
| } |
| } |
| |
| /* pack pending job details for "get_job_info" RPC */ |
| static void _pack_pending_job_details(struct job_details *detail_ptr, |
| Buf buffer, uint16_t protocol_version) |
| { |
| if (protocol_version >= SLURM_14_03_PROTOCOL_VERSION) { |
| if (detail_ptr) { |
| pack16(detail_ptr->contiguous, buffer); |
| pack16(detail_ptr->core_spec, buffer); |
| pack16(detail_ptr->cpus_per_task, buffer); |
| pack16(detail_ptr->pn_min_cpus, buffer); |
| |
| pack32(detail_ptr->pn_min_memory, buffer); |
| pack32(detail_ptr->pn_min_tmp_disk, buffer); |
| |
| packstr(detail_ptr->req_nodes, buffer); |
| pack_bit_fmt(detail_ptr->req_node_bitmap, buffer); |
| /* detail_ptr->req_node_layout is not packed */ |
| packstr(detail_ptr->exc_nodes, buffer); |
| pack_bit_fmt(detail_ptr->exc_node_bitmap, buffer); |
| |
| packstr(detail_ptr->std_err, buffer); |
| packstr(detail_ptr->std_in, buffer); |
| packstr(detail_ptr->std_out, buffer); |
| |
| pack_multi_core_data(detail_ptr->mc_ptr, buffer, |
| protocol_version); |
| } else { |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| |
| pack32((uint32_t) 0, buffer); |
| pack32((uint32_t) 0, buffer); |
| |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| |
| pack_multi_core_data(NULL, buffer, protocol_version); |
| } |
| } else if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) { |
| if (detail_ptr) { |
| pack16(detail_ptr->contiguous, buffer); |
| pack16(detail_ptr->cpus_per_task, buffer); |
| pack16(detail_ptr->pn_min_cpus, buffer); |
| |
| pack32(detail_ptr->pn_min_memory, buffer); |
| pack32(detail_ptr->pn_min_tmp_disk, buffer); |
| |
| packstr(detail_ptr->req_nodes, buffer); |
| pack_bit_fmt(detail_ptr->req_node_bitmap, buffer); |
| /* detail_ptr->req_node_layout is not packed */ |
| packstr(detail_ptr->exc_nodes, buffer); |
| pack_bit_fmt(detail_ptr->exc_node_bitmap, buffer); |
| |
| pack_multi_core_data(detail_ptr->mc_ptr, buffer, |
| protocol_version); |
| } else { |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| |
| pack32((uint32_t) 0, buffer); |
| pack32((uint32_t) 0, buffer); |
| |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| |
| pack_multi_core_data(NULL, buffer, protocol_version); |
| } |
| } else { |
| error("_pack_pending_job_details: protocol_version " |
| "%hu not supported", protocol_version); |
| } |
| } |
| |
| /* |
| * purge_old_job - purge old job records. |
| * The jobs must have completed at least MIN_JOB_AGE minutes ago. |
| * Test job dependencies, handle after_ok, after_not_ok before |
| * purging any jobs. |
| * NOTE: READ lock slurmctld config and WRITE lock jobs before entry |
| */ |
| void purge_old_job(void) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| time_t now = time(NULL); |
| int i; |
| |
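	/* First pass: cancel any pending job whose dependency can never be
	 * satisfied, rather than leaving it pending indefinitely */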
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| continue; |
| if (test_job_dependency(job_ptr) == 2) { |
| info("Job dependency can't be satisfied, cancelling " |
| "job %u", job_ptr->job_id); |
| job_ptr->job_state = JOB_CANCELLED; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = now; |
| job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| last_job_update = now; |
| srun_allocate_abort(job_ptr); |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| |
| i = list_delete_all(job_list, &_list_find_job_old, ""); |
| if (i) { |
| debug2("purge_old_job: purged %d old job records", i); |
| /* last_job_update = now; don't worry about state save */ |
| } |
| } |
| |
| |
| /* |
| * _purge_job_record - purge specific job record |
| * IN job_id - job_id of job record to be purged |
 * RET int - count of jobs purged
| * global: job_list - global job table |
| */ |
| static int _purge_job_record(uint32_t job_id) |
| { |
| return list_delete_all(job_list, &_list_find_job_id, (void *) &job_id); |
| } |
| |
| |
| /* |
| * reset_job_bitmaps - reestablish bitmaps for existing jobs. |
| * this should be called after rebuilding node information, |
| * but before using any job entries. |
| * global: last_job_update - time of last job table update |
| * job_list - pointer to global job list |
| */ |
| void reset_job_bitmaps(void) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| struct part_record *part_ptr; |
| List part_ptr_list = NULL; |
| bool job_fail = false; |
| time_t now = time(NULL); |
| bool gang_flag = false; |
| static uint32_t cr_flag = NO_VAL; |
| |
| xassert(job_list); |
| |
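	/* Determine once whether a consumable resource select plugin or gang
	 * scheduling is in use; the job_resrcs validation below is only
	 * needed in those cases */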
| if (cr_flag == NO_VAL) { |
| cr_flag = 0; /* call is no-op for select/linear and bluegene */ |
| if (select_g_get_info_from_plugin(SELECT_CR_PLUGIN, |
| NULL, &cr_flag)) { |
| cr_flag = NO_VAL; /* error */ |
| } |
| |
| } |
| if (slurm_get_preempt_mode() == PREEMPT_MODE_GANG) |
| gang_flag = true; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| xassert (job_ptr->magic == JOB_MAGIC); |
| job_fail = false; |
| |
| if (job_ptr->partition == NULL) { |
| error("No partition for job_id %u", job_ptr->job_id); |
| part_ptr = NULL; |
| job_fail = true; |
| } else { |
| part_ptr = find_part_record(job_ptr->partition); |
| if (part_ptr == NULL) { |
| part_ptr_list = get_part_list(job_ptr-> |
| partition); |
| if (part_ptr_list) |
| part_ptr = list_peek(part_ptr_list); |
| } |
| if (part_ptr == NULL) { |
| error("Invalid partition (%s) for job %u", |
| job_ptr->partition, job_ptr->job_id); |
| job_fail = true; |
| } |
| } |
| job_ptr->part_ptr = part_ptr; |
| FREE_NULL_LIST(job_ptr->part_ptr_list); |
| if (part_ptr_list) { |
| job_ptr->part_ptr_list = part_ptr_list; |
| part_ptr_list = NULL; /* clear for next job */ |
| } |
| |
| FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); |
| if (job_ptr->nodes_completing && |
| node_name2bitmap(job_ptr->nodes_completing, |
| false, &job_ptr->node_bitmap_cg)) { |
| error("Invalid nodes (%s) for job_id %u", |
| job_ptr->nodes_completing, |
| job_ptr->job_id); |
| job_fail = true; |
| } |
| FREE_NULL_BITMAP(job_ptr->node_bitmap); |
| if (job_ptr->nodes && |
| node_name2bitmap(job_ptr->nodes, false, |
| &job_ptr->node_bitmap) && !job_fail) { |
| error("Invalid nodes (%s) for job_id %u", |
| job_ptr->nodes, job_ptr->job_id); |
| job_fail = true; |
| } |
| if (reset_node_bitmap(job_ptr->job_resrcs, job_ptr->job_id)) |
| job_fail = true; |
| if (!job_fail && !IS_JOB_FINISHED(job_ptr) && |
| job_ptr->job_resrcs && (cr_flag || gang_flag) && |
| valid_job_resources(job_ptr->job_resrcs, |
| node_record_table_ptr, |
| slurmctld_conf.fast_schedule)) { |
| error("Aborting JobID %u due to change in socket/core " |
| "configuration of allocated nodes", |
| job_ptr->job_id); |
| job_fail = true; |
| } |
| _reset_step_bitmaps(job_ptr); |
| |
| /* Do not increase the job->node_cnt for |
| * completed jobs. |
| */ |
| if (! IS_JOB_COMPLETED(job_ptr)) |
| build_node_details(job_ptr, false); /* set node_addr */ |
| |
| if (_reset_detail_bitmaps(job_ptr)) |
| job_fail = true; |
| |
| if (job_fail) { |
| if (IS_JOB_PENDING(job_ptr)) { |
| job_ptr->start_time = |
| job_ptr->end_time = time(NULL); |
| job_ptr->job_state = JOB_NODE_FAIL; |
| } else if (IS_JOB_RUNNING(job_ptr)) { |
| job_ptr->end_time = time(NULL); |
| job_ptr->job_state = JOB_NODE_FAIL | |
| JOB_COMPLETING; |
| build_cg_bitmap(job_ptr); |
| } else if (IS_JOB_SUSPENDED(job_ptr)) { |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->job_state = JOB_NODE_FAIL | |
| JOB_COMPLETING; |
| build_cg_bitmap(job_ptr); |
| job_ptr->tot_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| jobacct_storage_g_job_suspend(acct_db_conn, |
| job_ptr); |
| } |
| job_ptr->exit_code = MAX(job_ptr->exit_code, 1); |
| job_ptr->state_reason = FAIL_DOWN_NODE; |
| xfree(job_ptr->state_desc); |
| job_completion_logger(job_ptr, false); |
| if (job_ptr->job_state == JOB_NODE_FAIL) { |
| /* build_cg_bitmap() may clear JOB_COMPLETING */ |
| epilog_slurmctld(job_ptr); |
| } |
| } |
| } |
| |
| list_iterator_reset(job_iterator); |
| /* This will reinitialize the select plugin database, which |
| * we can only do after ALL job's states and bitmaps are set |
| * (i.e. it needs to be in this second loop) */ |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) { |
| error("select_g_select_nodeinfo_set(%u): %m", |
| job_ptr->job_id); |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| |
| last_job_update = now; |
| } |
| |
| static int _reset_detail_bitmaps(struct job_record *job_ptr) |
| { |
| if (job_ptr->details == NULL) |
| return SLURM_SUCCESS; |
| |
| FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); |
| xfree(job_ptr->details->req_node_layout); /* layout info is lost |
| * but should be re-generated |
| * at job start time */ |
| if ((job_ptr->details->req_nodes) && |
| (node_name2bitmap(job_ptr->details->req_nodes, false, |
| &job_ptr->details->req_node_bitmap))) { |
| error("Invalid req_nodes (%s) for job_id %u", |
| job_ptr->details->req_nodes, job_ptr->job_id); |
| return SLURM_ERROR; |
| } |
| |
| FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); |
| if ((job_ptr->details->exc_nodes) && |
| (node_name2bitmap(job_ptr->details->exc_nodes, true, |
| &job_ptr->details->exc_node_bitmap))) { |
| error("Invalid exc_nodes (%s) for job_id %u", |
| job_ptr->details->exc_nodes, job_ptr->job_id); |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static void _reset_step_bitmaps(struct job_record *job_ptr) |
| { |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| if (step_ptr->state < JOB_RUNNING) |
| continue; |
| FREE_NULL_BITMAP(step_ptr->step_node_bitmap); |
| if (step_ptr->step_layout && |
| step_ptr->step_layout->node_list && |
| (node_name2bitmap(step_ptr->step_layout->node_list, false, |
| &step_ptr->step_node_bitmap))) { |
| error("Invalid step_node_list (%s) for step_id %u.%u", |
| step_ptr->step_layout->node_list, |
| job_ptr->job_id, step_ptr->step_id); |
| delete_step_record (job_ptr, step_ptr->step_id); |
| } |
| if ((step_ptr->step_node_bitmap == NULL) && |
| (step_ptr->batch_step == 0)) { |
| error("Missing node_list for step_id %u.%u", |
| job_ptr->job_id, step_ptr->step_id); |
| delete_step_record (job_ptr, step_ptr->step_id); |
| } |
| } |
| |
| list_iterator_destroy (step_iterator); |
| return; |
| } |
| |
| /* update first assigned job id as needed on reconfigure |
 * NOTE: READ lock slurmctld config before entry */
| void reset_first_job_id(void) |
| { |
| job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id); |
| } |
| |
| /* |
| * get_next_job_id - return the job_id to be used by default for |
| * the next job |
| */ |
| extern uint32_t get_next_job_id(void) |
| { |
| uint32_t next_id; |
| |
| job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id); |
| next_id = job_id_sequence + 1; |
| if (next_id >= slurmctld_conf.max_job_id) |
| next_id = slurmctld_conf.first_job_id; |
| return next_id; |
| } |
| |
| /* |
 * _set_job_id - set a default job_id, ensure that it is unique
| * IN job_ptr - pointer to the job_record |
| */ |
| static int _set_job_id(struct job_record *job_ptr) |
| { |
| int i; |
| uint32_t new_id, max_jobs; |
| |
| xassert(job_ptr); |
| xassert (job_ptr->magic == JOB_MAGIC); |
| |
| job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id); |
| max_jobs = slurmctld_conf.max_job_id - slurmctld_conf.first_job_id; |
| |
	/* Ensure no conflict in job id if we roll over 32 bits */
| for (i = 0; i < max_jobs; i++) { |
| if (++job_id_sequence >= slurmctld_conf.max_job_id) |
| job_id_sequence = slurmctld_conf.first_job_id; |
| new_id = job_id_sequence; |
| if (find_job_record(new_id)) |
| continue; |
| if (_dup_job_file_test(new_id)) |
| continue; |
| job_ptr->job_id = new_id; |
| return SLURM_SUCCESS; |
| } |
| error("We have exhausted our supply of valid job id values. " |
| "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id, |
| slurmctld_conf.max_job_id); |
| job_ptr->job_id = NO_VAL; |
| return EAGAIN; |
| } |
| |
| |
| /* |
| * set_job_prio - set a default job priority |
| * IN job_ptr - pointer to the job_record |
| */ |
| extern void set_job_prio(struct job_record *job_ptr) |
| { |
| uint32_t relative_prio; |
| |
| xassert(job_ptr); |
| xassert (job_ptr->magic == JOB_MAGIC); |
| |
| if (IS_JOB_FINISHED(job_ptr)) |
| return; |
| job_ptr->priority = slurm_sched_g_initial_priority(lowest_prio, |
| job_ptr); |
| if ((job_ptr->priority == 0) || (job_ptr->direct_set_prio)) |
| return; |
| |
| relative_prio = job_ptr->priority; |
| if (job_ptr->details && (job_ptr->details->nice != NICE_OFFSET)) { |
| int offset = job_ptr->details->nice; |
| offset -= NICE_OFFSET; |
| relative_prio += offset; |
| } |
| lowest_prio = MIN(relative_prio, lowest_prio); |
| } |
| |
| /* After recovering job state, if using priority/basic then we increment the |
| * priorities of all jobs to avoid decrementing the base down to zero */ |
| extern void sync_job_priorities(void) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| uint32_t prio_boost = 0; |
| |
| if ((highest_prio != 0) && (highest_prio < TOP_PRIORITY)) |
| prio_boost = TOP_PRIORITY - highest_prio; |
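	/* Only boost priorities when running priority/basic and the gap
	 * below TOP_PRIORITY has grown large */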
| if (strcmp(slurmctld_conf.priority_type, "priority/basic") || |
| (prio_boost < 1000000)) |
| return; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if ((job_ptr->priority) && (job_ptr->direct_set_prio == 0)) |
| job_ptr->priority += prio_boost; |
| } |
| list_iterator_destroy(job_iterator); |
| lowest_prio += prio_boost; |
| } |
| |
| /* |
| * _top_priority - determine if any other job has a higher priority than the |
| * specified job |
| * IN job_ptr - pointer to selected job |
| * RET true if selected job has highest priority |
| */ |
| static bool _top_priority(struct job_record *job_ptr) |
| { |
| struct job_details *detail_ptr = job_ptr->details; |
| bool top; |
| |
| #ifdef HAVE_BG |
| static uint16_t static_part = (uint16_t)NO_VAL; |
| int rc = SLURM_SUCCESS; |
| |
| /* On BlueGene with static partitioning, we don't want to delay |
| * jobs based upon priority since jobs of different sizes can |
| * execute on different sets of nodes. While sched/backfill would |
| * eventually start the job if delayed here based upon priority, |
| * that could delay the initiation of a job by a few seconds. */ |
| if (static_part == (uint16_t)NO_VAL) { |
| /* Since this never changes we can just set it once |
| and not look at it again. */ |
| rc = select_g_get_info_from_plugin(SELECT_STATIC_PART, job_ptr, |
| &static_part); |
| } |
| if ((rc == SLURM_SUCCESS) && (static_part == 1)) |
| return true; |
| #endif |
| |
| if (job_ptr->priority == 0) /* user held */ |
| top = false; |
| else { |
| ListIterator job_iterator; |
| struct job_record *job_ptr2; |
| |
| top = true; /* assume top priority until found otherwise */ |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr2 = (struct job_record *) |
| list_next(job_iterator))) { |
| if (job_ptr2 == job_ptr) |
| continue; |
| if (!IS_JOB_PENDING(job_ptr2)) |
| continue; |
| if (IS_JOB_COMPLETING(job_ptr2)) { |
| /* Job is hung in pending & completing state, |
| * indicative of job requeue */ |
| continue; |
| } |
| if (!acct_policy_job_runnable_state(job_ptr2) || |
| !misc_policy_job_runnable_state(job_ptr2) || |
| !part_policy_job_runnable_state(job_ptr2) || |
| !job_independent(job_ptr2, 0)) |
| continue; |
| if ((job_ptr2->resv_name && (!job_ptr->resv_name)) || |
| ((!job_ptr2->resv_name) && job_ptr->resv_name)) |
| continue; /* different reservation */ |
| if (job_ptr2->resv_name && job_ptr->resv_name && |
| (!strcmp(job_ptr2->resv_name, |
| job_ptr->resv_name))) { |
| /* same reservation */ |
| if (job_ptr2->priority <= job_ptr->priority) |
| continue; |
| top = false; |
| break; |
| } |
| if (job_ptr2->part_ptr == job_ptr->part_ptr) { |
| /* same partition */ |
| if (job_ptr2->priority <= job_ptr->priority) |
| continue; |
| top = false; |
| break; |
| } |
| if (bit_overlap(job_ptr->part_ptr->node_bitmap, |
| job_ptr2->part_ptr->node_bitmap) == 0) |
| continue; /* no node overlap in partitions */ |
| if ((job_ptr2->part_ptr->priority > |
| job_ptr ->part_ptr->priority) || |
| ((job_ptr2->part_ptr->priority == |
| job_ptr ->part_ptr->priority) && |
| (job_ptr2->priority > job_ptr->priority))) { |
| top = false; |
| break; |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| } |
| |
| if ((!top) && detail_ptr) { /* not top prio */ |
| if (job_ptr->priority == 0) { /* user/admin hold */ |
| if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS |
| && (job_ptr->state_reason != WAIT_HELD) |
| && (job_ptr->state_reason != WAIT_HELD_USER)) { |
| job_ptr->state_reason = WAIT_HELD; |
| xfree(job_ptr->state_desc); |
| } |
| } else if (job_ptr->state_reason == WAIT_NO_REASON) { |
| job_ptr->state_reason = WAIT_PRIORITY; |
| xfree(job_ptr->state_desc); |
| } |
| } |
| return top; |
| } |
| |
| static void _merge_job_licenses(struct job_record *shrink_job_ptr, |
| struct job_record *expand_job_ptr) |
| { |
| xassert(shrink_job_ptr); |
| xassert(expand_job_ptr); |
| |
| if (!shrink_job_ptr->licenses) /* No licenses to add */ |
| return; |
| |
| if (!expand_job_ptr->licenses) { /* Just transfer licenses */ |
| expand_job_ptr->licenses = shrink_job_ptr->licenses; |
| shrink_job_ptr->licenses = NULL; |
| FREE_NULL_LIST(expand_job_ptr->license_list); |
| expand_job_ptr->license_list = shrink_job_ptr->license_list; |
| shrink_job_ptr->license_list = NULL; |
| return; |
| } |
| |
| /* Merge the license information into expanding job */ |
| xstrcat(expand_job_ptr->licenses, ","); |
| xstrcat(expand_job_ptr->licenses, shrink_job_ptr->licenses); |
| xfree(shrink_job_ptr->licenses); |
| FREE_NULL_LIST(expand_job_ptr->license_list); |
| FREE_NULL_LIST(shrink_job_ptr->license_list); |
| license_job_merge(expand_job_ptr); |
| return; |
| } |
| |
| /* |
| * update_job - update a job's parameters per the supplied specifications |
| * IN job_specs - a job's specification |
| * IN uid - uid of user issuing RPC |
| * RET returns an error code from slurm_errno.h |
| * global: job_list - global list of job entries |
| * last_job_update - time of last job table update |
| */ |
| int update_job(job_desc_msg_t * job_specs, uid_t uid) |
| { |
| int error_code = SLURM_SUCCESS; |
| enum job_state_reason fail_reason; |
| bool authorized = false, admin = false; |
| uint32_t save_min_nodes = 0, save_max_nodes = 0; |
| uint32_t save_min_cpus = 0, save_max_cpus = 0; |
| struct job_record *job_ptr; |
| struct job_details *detail_ptr; |
| struct part_record *tmp_part_ptr; |
| bitstr_t *exc_bitmap = NULL, *req_bitmap = NULL; |
| time_t now = time(NULL); |
| multi_core_data_t *mc_ptr = NULL; |
| bool update_accounting = false; |
| acct_policy_limit_set_t acct_policy_limit_set; |
| |
| #ifdef HAVE_BG |
| uint16_t conn_type[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL}; |
| uint16_t reboot = (uint16_t) NO_VAL; |
| uint16_t rotate = (uint16_t) NO_VAL; |
| uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL}; |
| char *image = NULL; |
| static uint32_t cpus_per_mp = 0; |
| static uint16_t cpus_per_node = 0; |
| |
| if (!cpus_per_mp) |
| select_g_alter_node_cnt(SELECT_GET_MP_CPU_CNT, &cpus_per_mp); |
| if (!cpus_per_node) |
| select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT, |
| &cpus_per_node); |
| #endif |
| memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set_t)); |
| |
| /* Make sure anything that may be put in the database will be |
| * lower case */ |
| xstrtolower(job_specs->account); |
| xstrtolower(job_specs->wckey); |
| |
| job_ptr = find_job_record(job_specs->job_id); |
| if (job_ptr == NULL) { |
| error("update_job: job_id %u does not exist.", |
| job_specs->job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| error_code = job_submit_plugin_modify(job_specs, job_ptr, |
| (uint32_t) uid); |
| if (error_code != SLURM_SUCCESS) |
| return error_code; |
| |
| admin = validate_operator(uid); |
| authorized = admin || assoc_mgr_is_user_acct_coord( |
| acct_db_conn, uid, job_ptr->account); |
| if ((job_ptr->user_id != uid) && !authorized) { |
| error("Security violation, JOB_UPDATE RPC from uid %d", |
| uid); |
| return ESLURM_USER_ID_MISSING; |
| } |
| |
| if (!wiki_sched_test) { |
| char *sched_type = slurm_get_sched_type(); |
| if (strcmp(sched_type, "sched/wiki") == 0) |
| wiki_sched = true; |
| if (strcmp(sched_type, "sched/wiki2") == 0) { |
| wiki_sched = true; |
| wiki2_sched = true; |
| } |
| xfree(sched_type); |
| wiki_sched_test = true; |
| } |
| detail_ptr = job_ptr->details; |
| if (detail_ptr) |
| mc_ptr = detail_ptr->mc_ptr; |
| last_job_update = now; |
| |
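	/* Each updatable field in job_specs is processed independently below;
	 * on error, processing jumps to the "fini" label with error_code
	 * set */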
| if (job_specs->account |
| && !xstrcmp(job_specs->account, job_ptr->account)) { |
		debug("sched: update_job: new account identical to "
		      "old account for job %u", job_ptr->job_id);
| xfree(job_specs->account); |
| } |
| |
| if (job_specs->account) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_DISABLED; |
| else { |
| int rc = update_job_account("update_job", job_ptr, |
| job_specs->account); |
| if (rc != SLURM_SUCCESS) |
| error_code = rc; |
| else |
| update_accounting = true; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->exc_nodes) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (job_specs->exc_nodes[0] == '\0') { |
| xfree(detail_ptr->exc_nodes); |
| FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap); |
| } else { |
| if (node_name2bitmap(job_specs->exc_nodes, false, |
| &exc_bitmap)) { |
| error("sched: Invalid node list for " |
| "job_update: %s",job_specs->exc_nodes); |
| FREE_NULL_BITMAP(exc_bitmap); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| } |
| if (exc_bitmap) { |
| xfree(detail_ptr->exc_nodes); |
| detail_ptr->exc_nodes = |
| job_specs->exc_nodes; |
| FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap); |
| detail_ptr->exc_node_bitmap = exc_bitmap; |
| info("sched: update_job: setting exc_nodes to " |
| "%s for job_id %u", job_specs->exc_nodes, |
| job_specs->job_id); |
| job_specs->exc_nodes = NULL; |
| } |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| #ifndef HAVE_BG |
| if (job_specs->req_nodes && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { |
		/* Use req_nodes to change the nodes associated with a running
		 * job, for lack of another field in the job request to use */
| if ((job_specs->req_nodes[0] == '\0') || |
| node_name2bitmap(job_specs->req_nodes,false, &req_bitmap) || |
| !bit_super_set(req_bitmap, job_ptr->node_bitmap) || |
| job_ptr->details->expanding_jobid) { |
| info("sched: Invalid node list (%s) for job %u update", |
| job_specs->req_nodes, job_specs->job_id); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto fini; |
| } else if (req_bitmap) { |
| int i, i_first, i_last; |
| struct node_record *node_ptr; |
| info("sched: update_job: setting nodes to %s for " |
| "job_id %u", |
| job_specs->req_nodes, job_specs->job_id); |
| job_pre_resize_acctg(job_ptr); |
| i_first = bit_ffs(job_ptr->node_bitmap); |
| i_last = bit_fls(job_ptr->node_bitmap); |
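			/* Release every currently allocated node that is not
			 * in the new req_nodes list */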
| for (i=i_first; i<=i_last; i++) { |
| if (bit_test(req_bitmap, i) || |
| !bit_test(job_ptr->node_bitmap, i)) |
| continue; |
| node_ptr = node_record_table_ptr + i; |
| kill_step_on_node(job_ptr, node_ptr, false); |
| excise_node_from_job(job_ptr, node_ptr); |
| } |
| job_post_resize_acctg(job_ptr); |
			/* Since job_post_resize_acctg() restarts the job's
			 * accounting records, don't do it again. */
| update_accounting = false; |
| } else { |
| update_accounting = true; |
| } |
| FREE_NULL_BITMAP(req_bitmap); |
| xfree(job_specs->req_nodes); |
| } |
| #endif |
| |
| if (job_specs->req_nodes) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (job_specs->req_nodes[0] == '\0') { |
| xfree(detail_ptr->req_nodes); |
| FREE_NULL_BITMAP(detail_ptr->req_node_bitmap); |
| xfree(detail_ptr->req_node_layout); |
| } else { |
| if (node_name2bitmap(job_specs->req_nodes, false, |
| &req_bitmap)) { |
| info("sched: Invalid node list for " |
| "job_update: %s", job_specs->req_nodes); |
| FREE_NULL_BITMAP(req_bitmap); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| } |
| if (req_bitmap) { |
| xfree(detail_ptr->req_nodes); |
| detail_ptr->req_nodes = |
| job_specs->req_nodes; |
| FREE_NULL_BITMAP(detail_ptr->req_node_bitmap); |
| xfree(detail_ptr->req_node_layout); |
| detail_ptr->req_node_bitmap = req_bitmap; |
| info("sched: update_job: setting req_nodes to " |
| "%s for job_id %u", job_specs->req_nodes, |
| job_specs->job_id); |
| job_specs->req_nodes = NULL; |
| } |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->min_nodes == INFINITE) { |
| /* Used by scontrol just to get current configuration info */ |
| job_specs->min_nodes = NO_VAL; |
| } |
| #if defined(HAVE_BG) || defined(HAVE_ALPS_CRAY) |
| if ((job_specs->min_nodes != NO_VAL) && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { |
| #else |
| if ((job_specs->min_nodes != NO_VAL) && |
| (job_specs->min_nodes > job_ptr->node_cnt) && |
| !select_g_job_expand_allow() && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { |
| #endif |
| info("Change of size for job %u not supported", |
| job_specs->job_id); |
| error_code = ESLURM_NOT_SUPPORTED; |
| goto fini; |
| } |
| |
| if (job_specs->req_switch != NO_VAL) { |
| job_ptr->req_switch = job_specs->req_switch; |
| info("Change of switches to %u job %u", |
| job_specs->req_switch, job_specs->job_id); |
| } |
| if (job_specs->wait4switch != NO_VAL) { |
| job_ptr->wait4switch = _max_switch_wait(job_specs->wait4switch); |
| info("Change of switch wait to %u secs job %u", |
| job_ptr->wait4switch, job_specs->job_id); |
| } |
| |
| if (job_specs->partition |
| && !xstrcmp(job_specs->partition, job_ptr->partition)) { |
		debug("sched: update_job: new partition identical to "
		      "old partition for job %u", job_ptr->job_id);
| xfree(job_specs->partition); |
| } |
| |
| if (job_specs->partition) { |
| List part_ptr_list = NULL; |
| bool old_res = false; |
| |
| if (!IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_DISABLED; |
| goto fini; |
| } |
| |
| if (job_specs->min_nodes == NO_VAL) { |
| #ifdef HAVE_BG |
| select_g_select_jobinfo_get( |
| job_ptr->select_jobinfo, |
| SELECT_JOBDATA_NODE_CNT, |
| &job_specs->min_nodes); |
| #else |
| job_specs->min_nodes = detail_ptr->min_nodes; |
| #endif |
| } |
| if ((job_specs->max_nodes == NO_VAL) && |
| (detail_ptr->max_nodes != 0)) { |
| #ifdef HAVE_BG |
| select_g_select_jobinfo_get( |
| job_ptr->select_jobinfo, |
| SELECT_JOBDATA_NODE_CNT, |
| &job_specs->max_nodes); |
| #else |
| job_specs->max_nodes = detail_ptr->max_nodes; |
| #endif |
| } |
| |
| if ((job_specs->time_min == NO_VAL) && |
| (job_ptr->time_min != 0)) |
| job_specs->time_min = job_ptr->time_min; |
| if (job_specs->time_limit == NO_VAL) |
| job_specs->time_limit = job_ptr->time_limit; |
| if (!job_specs->reservation |
| || job_specs->reservation[0] == '\0') { |
			/* just in case the reservation is '\0' */
| xfree(job_specs->reservation); |
| job_specs->reservation = job_ptr->resv_name; |
| old_res = true; |
| } |
| |
| error_code = _get_job_parts(job_specs, |
| &tmp_part_ptr, &part_ptr_list); |
| |
| if (error_code != SLURM_SUCCESS) |
| ; |
| else if ((tmp_part_ptr->state_up & PARTITION_SUBMIT) == 0) |
| error_code = ESLURM_PARTITION_NOT_AVAIL; |
| else { |
| slurmdb_association_rec_t assoc_rec; |
| memset(&assoc_rec, 0, |
| sizeof(slurmdb_association_rec_t)); |
| assoc_rec.acct = job_ptr->account; |
| assoc_rec.partition = tmp_part_ptr->name; |
| assoc_rec.uid = job_ptr->user_id; |
| if (assoc_mgr_fill_in_assoc( |
| acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| (slurmdb_association_rec_t **) |
| &job_ptr->assoc_ptr, false)) { |
| info("job_update: invalid account %s " |
| "for job %u", |
| job_specs->account, job_ptr->job_id); |
| error_code = ESLURM_INVALID_ACCOUNT; |
| /* Let update proceed. Note there is an invalid |
| * association ID for accounting purposes */ |
| } else |
| job_ptr->assoc_id = assoc_rec.id; |
| |
| error_code = _valid_job_part( |
| job_specs, uid, |
| job_ptr->details->req_node_bitmap, |
| &tmp_part_ptr, part_ptr_list, |
| job_ptr->assoc_ptr, job_ptr->qos_ptr); |
| |
| xfree(job_ptr->partition); |
| job_ptr->partition = xstrdup(job_specs->partition); |
| job_ptr->part_ptr = tmp_part_ptr; |
| xfree(job_ptr->priority_array); /* Rebuilt in plugin */ |
| FREE_NULL_LIST(job_ptr->part_ptr_list); |
| job_ptr->part_ptr_list = part_ptr_list; |
| part_ptr_list = NULL; /* nothing to free */ |
| info("update_job: setting partition to %s for " |
| "job_id %u", job_specs->partition, |
| job_specs->job_id); |
| update_accounting = true; |
| } |
| FREE_NULL_LIST(part_ptr_list); /* error clean-up */ |
| |
| if (old_res) |
| job_specs->reservation = NULL; |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| } |
| |
| /* Always do this last just in case the assoc_ptr changed */ |
| if (job_specs->comment && wiki_sched && !validate_slurm_user(uid)) { |
| /* User must use Moab command to change job comment */ |
| error("Attempt to change comment for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } else if (job_specs->comment) { |
| xfree(job_ptr->comment); |
| job_ptr->comment = job_specs->comment; |
| job_specs->comment = NULL; /* Nothing left to free */ |
| info("update_job: setting comment to %s for job_id %u", |
| job_ptr->comment, job_specs->job_id); |
| |
| if (wiki_sched && strstr(job_ptr->comment, "QOS:")) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_DISABLED; |
| else { |
| slurmdb_qos_rec_t qos_rec; |
| slurmdb_qos_rec_t *new_qos_ptr; |
| char *resv_name; |
| if (job_specs->reservation |
| && job_specs->reservation[0] != '\0') |
| resv_name = job_specs->reservation; |
| else |
| resv_name = job_ptr->resv_name; |
| |
| memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t)); |
| if (strstr(job_ptr->comment, |
| "FLAGS:PREEMPTOR")) |
| qos_rec.name = "expedite"; |
| else if (strstr(job_ptr->comment, |
| "FLAGS:PREEMPTEE")) |
| qos_rec.name = "standby"; |
| |
| new_qos_ptr = _determine_and_validate_qos( |
| resv_name, job_ptr->assoc_ptr, |
| authorized, &qos_rec, &error_code); |
| if (error_code == SLURM_SUCCESS) { |
					info("update_job: setting qos to %s "
					     "for job_id %u",
					     qos_rec.name, job_specs->job_id);
| if (job_ptr->qos_id != qos_rec.id) { |
| job_ptr->qos_id = qos_rec.id; |
| job_ptr->qos_ptr = new_qos_ptr; |
| if (authorized) |
| job_ptr->limit_set_qos = |
| ADMIN_SET_LIMIT; |
| else |
| job_ptr->limit_set_qos |
| = 0; |
| update_accounting = true; |
| } else |
						debug("sched: update_job: "
						      "new qos identical to "
						      "old qos for job %u",
						      job_ptr->job_id);
| } |
| } |
| } |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->qos) { |
| |
| if (!authorized && !IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_DISABLED; |
| else { |
| slurmdb_qos_rec_t qos_rec; |
| slurmdb_qos_rec_t *new_qos_ptr; |
| char *resv_name; |
| |
| if (job_specs->reservation |
| && job_specs->reservation[0] != '\0') |
| resv_name = job_specs->reservation; |
| else |
| resv_name = job_ptr->resv_name; |
| |
| memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t)); |
| qos_rec.name = job_specs->qos; |
| |
| new_qos_ptr = _determine_and_validate_qos( |
| resv_name, job_ptr->assoc_ptr, |
| authorized, &qos_rec, &error_code); |
| if (error_code == SLURM_SUCCESS) { |
| info("update_job: setting qos to %s " |
| "for job_id %u", |
| job_specs->qos, job_specs->job_id); |
| if (job_ptr->qos_id != qos_rec.id) { |
| job_ptr->qos_id = qos_rec.id; |
| job_ptr->qos_ptr = new_qos_ptr; |
| if (authorized) |
| job_ptr->limit_set_qos = |
| ADMIN_SET_LIMIT; |
| else |
| job_ptr->limit_set_qos = 0; |
| update_accounting = true; |
| } else |
					debug("sched: update_job: new qos "
					      "identical to old qos for job %u",
					      job_ptr->job_id);
| } |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
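	/* Unless the requester is authorized (operator or account
	 * coordinator), re-validate the updated request against
	 * association/QOS limits when limit enforcement is enabled;
	 * limits set by authorized users are flagged ADMIN_SET_LIMIT below */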
| if (!authorized && (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) { |
| if (!acct_policy_validate(job_specs, job_ptr->part_ptr, |
| job_ptr->assoc_ptr, job_ptr->qos_ptr, |
| NULL, &acct_policy_limit_set, 1)) { |
| info("update_job: exceeded association's cpu, node, " |
| "memory or time limit for user %u", |
| job_specs->user_id); |
| error_code = ESLURM_ACCOUNTING_POLICY; |
| goto fini; |
| } |
| |
		/* A limit previously imposed by the accounting policy may
		 * have since been removed, so clear it here. */
| if (!acct_policy_limit_set.max_cpus |
| && (job_ptr->limit_set_max_cpus == 1)) |
| job_ptr->details->max_cpus = NO_VAL; |
| |
| if (!acct_policy_limit_set.max_nodes |
| && (job_ptr->limit_set_max_nodes == 1)) |
| job_ptr->details->max_nodes = NO_VAL; |
| |
| if (!acct_policy_limit_set.time |
| && (job_ptr->limit_set_time == 1)) |
| job_ptr->time_limit = NO_VAL; |
| |
| if (job_ptr->limit_set_max_cpus != ADMIN_SET_LIMIT) |
| job_ptr->limit_set_max_cpus = |
| acct_policy_limit_set.max_cpus; |
| if (job_ptr->limit_set_max_nodes != ADMIN_SET_LIMIT) |
| job_ptr->limit_set_max_nodes = |
| acct_policy_limit_set.max_nodes; |
| if (job_ptr->limit_set_time != ADMIN_SET_LIMIT) |
| job_ptr->limit_set_time = acct_policy_limit_set.time; |
| } else if (authorized) { |
| acct_policy_limit_set.max_cpus = ADMIN_SET_LIMIT; |
| acct_policy_limit_set.max_nodes = ADMIN_SET_LIMIT; |
| acct_policy_limit_set.min_cpus = ADMIN_SET_LIMIT; |
| acct_policy_limit_set.min_nodes = ADMIN_SET_LIMIT; |
| acct_policy_limit_set.pn_min_memory = ADMIN_SET_LIMIT; |
| acct_policy_limit_set.time = ADMIN_SET_LIMIT; |
| acct_policy_limit_set.qos = ADMIN_SET_LIMIT; |
| } |
| |
| |
| /* This needs to be done after the association acct policy check since |
| * it looks at unaltered nodes for bluegene systems |
| */ |
| debug3("update before alteration asking for nodes %u-%u cpus %u-%u", |
| job_specs->min_nodes, job_specs->max_nodes, |
| job_specs->min_cpus, job_specs->max_cpus); |
| if (select_g_alter_node_cnt(SELECT_SET_NODE_CNT, job_specs) |
| != SLURM_SUCCESS) { |
| error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| goto fini; |
| } |
| debug3("update after alteration asking for nodes %u-%u cpus %u-%u", |
| job_specs->min_nodes, job_specs->max_nodes, |
| job_specs->min_cpus, job_specs->max_cpus); |
| |
| /* Reset min and max cpu counts as needed, ensure consistency */ |
| if (job_specs->min_cpus != NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (job_specs->min_cpus < 1) |
| error_code = ESLURM_INVALID_CPU_COUNT; |
| else { |
| save_min_cpus = detail_ptr->min_cpus; |
| detail_ptr->min_cpus = job_specs->min_cpus; |
| } |
| } |
| if (job_specs->max_cpus != NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else { |
| save_max_cpus = detail_ptr->max_cpus; |
| detail_ptr->max_cpus = job_specs->max_cpus; |
| } |
| } |
| if ((save_min_cpus || save_max_cpus) && detail_ptr->max_cpus && |
| (detail_ptr->max_cpus < detail_ptr->min_cpus)) { |
| error_code = ESLURM_INVALID_CPU_COUNT; |
| if (save_min_cpus) { |
| detail_ptr->min_cpus = save_min_cpus; |
| save_min_cpus = 0; |
| } |
| if (save_max_cpus) { |
| detail_ptr->max_cpus = save_max_cpus; |
| save_max_cpus = 0; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (save_min_cpus && (detail_ptr->min_cpus != save_min_cpus)) { |
| #ifdef HAVE_BG |
| uint32_t node_cnt = detail_ptr->min_cpus; |
| if (cpus_per_node) |
| node_cnt /= cpus_per_node; |
| /* Ensure that accounting is set up correctly */ |
| select_g_select_jobinfo_set(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_NODE_CNT, |
| &node_cnt); |
| /* Reset geo since changing this makes any geo |
| * potentially invalid */ |
| select_g_select_jobinfo_set(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_GEOMETRY, |
| geometry); |
| #endif |
| info("update_job: setting min_cpus from " |
| "%u to %u for job_id %u", |
| save_min_cpus, detail_ptr->min_cpus, job_specs->job_id); |
| job_ptr->limit_set_min_cpus = acct_policy_limit_set.min_cpus; |
| update_accounting = true; |
| } |
| if (save_max_cpus && (detail_ptr->max_cpus != save_max_cpus)) { |
| info("update_job: setting max_cpus from " |
| "%u to %u for job_id %u", |
| save_max_cpus, detail_ptr->max_cpus, job_specs->job_id); |
| /* Always use the acct_policy_limit_set.* values since, if set by a |
| * super user, they will be set correctly */ |
| job_ptr->limit_set_max_cpus = acct_policy_limit_set.max_cpus; |
| update_accounting = true; |
| } |
| |
| if ((job_specs->pn_min_cpus != (uint16_t) NO_VAL) && |
| (job_specs->pn_min_cpus != 0)) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (authorized |
| || (detail_ptr->pn_min_cpus |
| > job_specs->pn_min_cpus)) { |
| detail_ptr->pn_min_cpus = job_specs->pn_min_cpus; |
| info("update_job: setting pn_min_cpus to %u for " |
| "job_id %u", job_specs->pn_min_cpus, |
| job_specs->job_id); |
| } else { |
| error("Attempt to increase pn_min_cpus for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->num_tasks != NO_VAL) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_DISABLED; |
| else if (job_specs->num_tasks < 1) |
| error_code = ESLURM_BAD_TASK_COUNT; |
| else { |
| #ifdef HAVE_BG |
| uint32_t node_cnt = job_specs->num_tasks; |
| if (cpus_per_node) |
| node_cnt /= cpus_per_node; |
| /* This is only set up so accounting is set up |
| correctly */ |
| select_g_select_jobinfo_set(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_NODE_CNT, |
| &node_cnt); |
| #endif |
| detail_ptr->num_tasks = job_specs->num_tasks; |
| info("update_job: setting num_tasks to %u for " |
| "job_id %u", job_specs->num_tasks, |
| job_specs->job_id); |
| if (detail_ptr->cpus_per_task) { |
| uint32_t new_cpus = detail_ptr->num_tasks |
| / detail_ptr->cpus_per_task; |
| if ((new_cpus < detail_ptr->min_cpus) || |
| (!detail_ptr->overcommit && |
| (new_cpus > detail_ptr->min_cpus))) { |
| detail_ptr->min_cpus = new_cpus; |
| detail_ptr->max_cpus = new_cpus; |
| info("update_job: setting " |
| "min_cpus to %u for " |
| "job_id %u", detail_ptr->min_cpus, |
| job_specs->job_id); |
| /* Always use the |
| * acct_policy_limit_set.* values |
| * since, if set by a super |
| * user, they will be set correctly */ |
| job_ptr->limit_set_min_cpus = |
| acct_policy_limit_set.min_cpus; |
| job_ptr->limit_set_max_cpus = |
| acct_policy_limit_set.max_cpus; |
| } |
| } |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| /* Reset min and max node counts as needed, ensure consistency */ |
| if (job_specs->min_nodes != NO_VAL) { |
| if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) |
| ; /* shrink running job, processed later */ |
| else if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (job_specs->min_nodes < 1) { |
| info("update_job: min_nodes < 1 for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| } else { |
| /* Resize of pending job */ |
| save_min_nodes = detail_ptr->min_nodes; |
| detail_ptr->min_nodes = job_specs->min_nodes; |
| } |
| } |
| if (job_specs->max_nodes != NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else { |
| save_max_nodes = detail_ptr->max_nodes; |
| detail_ptr->max_nodes = job_specs->max_nodes; |
| } |
| } |
| if ((save_min_nodes || save_max_nodes) && detail_ptr->max_nodes && |
| (detail_ptr->max_nodes < detail_ptr->min_nodes)) { |
| info("update_job: max_nodes < min_nodes (%u < %u) for job %u", |
| detail_ptr->max_nodes, detail_ptr->min_nodes, |
| job_specs->job_id); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| if (save_min_nodes) { |
| detail_ptr->min_nodes = save_min_nodes; |
| save_min_nodes = 0; |
| } |
| if (save_max_nodes) { |
| detail_ptr->max_nodes = save_max_nodes; |
| save_max_nodes = 0; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (save_min_nodes && (save_min_nodes != detail_ptr->min_nodes)) { |
| info("update_job: setting min_nodes from " |
| "%u to %u for job_id %u", |
| save_min_nodes, detail_ptr->min_nodes, job_specs->job_id); |
| job_ptr->limit_set_min_nodes = acct_policy_limit_set.min_nodes; |
| update_accounting = true; |
| } |
| if (save_max_nodes && (save_max_nodes != detail_ptr->max_nodes)) { |
| info("update_job: setting max_nodes from " |
| "%u to %u for job_id %u", |
| save_max_nodes, detail_ptr->max_nodes, job_specs->job_id); |
| /* Always use the acct_policy_limit_set.* values since, if set by a |
| * super user, they will be set correctly */ |
| job_ptr->limit_set_max_nodes = acct_policy_limit_set.max_nodes; |
| update_accounting = true; |
| } |
| |
| if (job_specs->time_limit != NO_VAL) { |
| if (IS_JOB_FINISHED(job_ptr) || job_ptr->preempt_time) |
| error_code = ESLURM_DISABLED; |
| else if (job_ptr->time_limit == job_specs->time_limit) { |
| debug("sched: update_job: new time limit identical to " |
| "old time limit %u", job_specs->job_id); |
| } else if (authorized || |
| (job_ptr->time_limit > job_specs->time_limit)) { |
| time_t old_time = job_ptr->time_limit; |
| if (old_time == INFINITE) /* one year in mins */ |
| old_time = (365 * 24 * 60); |
| acct_policy_alter_job(job_ptr, job_specs->time_limit); |
| job_ptr->time_limit = job_specs->time_limit; |
| if (IS_JOB_RUNNING(job_ptr) || |
| IS_JOB_SUSPENDED(job_ptr)) { |
| if (job_ptr->preempt_time) { |
| ; /* Preemption in progress */ |
| } else if (job_ptr->time_limit == INFINITE) { |
| /* Set end time in one year */ |
| job_ptr->end_time = now + |
| (365 * 24 * 60 * 60); |
| } else { |
| /* Update end_time based upon change |
| * to preserve suspend time info */ |
| job_ptr->end_time = job_ptr->end_time + |
| ((job_ptr->time_limit - |
| old_time) * 60); |
| } |
| if (job_ptr->end_time < now) |
| job_ptr->end_time = now; |
| if (IS_JOB_RUNNING(job_ptr) && |
| (list_is_empty(job_ptr->step_list) == 0)) { |
| _xmit_new_end_time(job_ptr); |
| } |
| } |
| info("sched: update_job: setting time_limit to %u for " |
| "job_id %u", job_specs->time_limit, |
| job_specs->job_id); |
| /* Always use the acct_policy_limit_set.* values |
| * since, if set by a super user, they will be set correctly */ |
| job_ptr->limit_set_time = acct_policy_limit_set.time; |
| update_accounting = true; |
| } else if (IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr && |
| (job_ptr->part_ptr->max_time >= |
| job_specs->time_limit)) { |
| job_ptr->time_limit = job_specs->time_limit; |
| info("sched: update_job: setting time_limit to %u for " |
| "job_id %u", job_specs->time_limit, |
| job_specs->job_id); |
| /* Always use the acct_policy_limit_set.* values |
| * since, if set by a super user, they will be set correctly */ |
| job_ptr->limit_set_time = acct_policy_limit_set.time; |
| update_accounting = true; |
| } else { |
| info("sched: Attempt to increase time limit for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if ((job_specs->time_min != NO_VAL) && IS_JOB_PENDING(job_ptr)) { |
| if (job_specs->time_min > job_ptr->time_limit) { |
| info("update_job: attempt to set TimeMin > TimeLimit " |
| "(%u > %u)", |
| job_specs->time_min, job_ptr->time_limit); |
| error_code = ESLURM_INVALID_TIME_LIMIT; |
| } else if (job_ptr->time_min != job_specs->time_min) { |
| job_ptr->time_min = job_specs->time_min; |
| info("update_job: setting TimeMin to %u for job_id %u", |
| job_specs->time_min, job_specs->job_id); |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->end_time) { |
| if (!IS_JOB_RUNNING(job_ptr) || job_ptr->preempt_time) { |
| /* We may want to use this for deadline scheduling |
| * at some point in the future. For now only reset |
| * the time limit of running jobs. */ |
| error_code = ESLURM_DISABLED; |
| } else if (job_specs->end_time < now) { |
| error_code = ESLURM_INVALID_TIME_VALUE; |
| } else if (authorized || |
| (job_ptr->end_time > job_specs->end_time)) { |
| int delta_t = job_specs->end_time - job_ptr->end_time; |
| job_ptr->end_time = job_specs->end_time; |
| job_ptr->time_limit += (delta_t+30)/60; /* Sec->min */ |
| info("sched: update_job: setting time_limit to %u for " |
| "job_id %u", job_ptr->time_limit, |
| job_specs->job_id); |
| /* Always use the acct_policy_limit_set.* values |
| * since, if set by a super user, they will be set correctly */ |
| job_ptr->limit_set_time = acct_policy_limit_set.time; |
| update_accounting = true; |
| } else { |
| info("sched: Attempt to extend end time for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->reservation |
| && !xstrcmp(job_specs->reservation, job_ptr->resv_name)) { |
| debug("sched: update_job: new reservation identical to " |
| "old reservation %u", job_ptr->job_id); |
| xfree(job_specs->reservation); |
| } |
| |
| /* this needs to be after partition and qos checks */ |
| if (job_specs->reservation) { |
| if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr)) { |
| error_code = ESLURM_DISABLED; |
| } else { |
| int rc; |
| char *save_resv_name = job_ptr->resv_name; |
| slurmctld_resv_t *save_resv_ptr = job_ptr->resv_ptr; |
| |
| job_ptr->resv_name = job_specs->reservation; |
| job_specs->reservation = NULL; /* Nothing to free */ |
| rc = validate_job_resv(job_ptr); |
| /* Make sure this job isn't using a partition |
| or qos that requires it to be in a |
| reservation. |
| */ |
| if (rc == SLURM_SUCCESS && !job_ptr->resv_name) { |
| struct part_record *part_ptr = |
| job_ptr->part_ptr; |
| slurmdb_qos_rec_t *qos_ptr = |
| (slurmdb_qos_rec_t *)job_ptr->qos_ptr; |
| |
| if (part_ptr |
| && part_ptr->flags & PART_FLAG_REQ_RESV) |
| rc = ESLURM_ACCESS_DENIED; |
| |
| if (qos_ptr |
| && qos_ptr->flags & QOS_FLAG_REQ_RESV) |
| rc = ESLURM_INVALID_QOS; |
| } |
| |
| if (rc == SLURM_SUCCESS) { |
| info("sched: update_job: setting reservation " |
| "to %s for job_id %u", job_ptr->resv_name, |
| job_ptr->job_id); |
| xfree(save_resv_name); |
| update_accounting = true; |
| } else { |
| /* Restore reservation info */ |
| job_specs->reservation = job_ptr->resv_name; |
| job_ptr->resv_name = save_resv_name; |
| job_ptr->resv_ptr = save_resv_ptr; |
| error_code = rc; |
| } |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if ((job_specs->requeue != (uint16_t) NO_VAL) && detail_ptr) { |
| detail_ptr->requeue = MIN(job_specs->requeue, 1); |
| info("sched: update_job: setting requeue to %u for job_id %u", |
| job_specs->requeue, job_specs->job_id); |
| } |
| |
| if (job_specs->priority != NO_VAL) { |
| /* If we are doing time slicing we could update the |
| priority of the job while running to give better |
| position (larger time slices) than competing jobs |
| */ |
| if (IS_JOB_FINISHED(job_ptr) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (job_ptr->priority == job_specs->priority) { |
| debug("update_job: setting priority to current value"); |
| if ((job_ptr->priority == 0) && |
| (job_ptr->user_id != uid) && authorized) { |
| /* Authorized user can change from user hold |
| * to admin hold or admin hold to user hold */ |
| if (job_specs->alloc_sid == ALLOC_SID_USER_HOLD) |
| job_ptr->state_reason = WAIT_HELD_USER; |
| else |
| job_ptr->state_reason = WAIT_HELD; |
| } |
| } else if ((job_ptr->priority == 0) && |
| (job_specs->priority == INFINITE) && |
| (authorized || |
| (job_ptr->state_reason == WAIT_HELD_USER))) { |
| job_ptr->direct_set_prio = 0; |
| set_job_prio(job_ptr); |
| info("sched: update_job: releasing hold for job_id %u", |
| job_specs->job_id); |
| job_ptr->state_reason = WAIT_NO_REASON; |
| job_ptr->job_state &= ~JOB_SPECIAL_EXIT; |
| job_ptr->exit_code = 0; |
| xfree(job_ptr->state_desc); |
| } else if ((job_ptr->priority == 0) && |
| (job_specs->priority != INFINITE)) { |
| info("ignore priority reset request on held job %u", |
| job_specs->job_id); |
| } else if (authorized || |
| (job_ptr->priority > job_specs->priority)) { |
| if (job_specs->priority != 0) |
| job_ptr->details->nice = NICE_OFFSET; |
| if (job_specs->priority == INFINITE) { |
| job_ptr->direct_set_prio = 0; |
| set_job_prio(job_ptr); |
| } else { |
| job_ptr->direct_set_prio = 1; |
| job_ptr->priority = job_specs->priority; |
| } |
| info("sched: update_job: setting priority to %u for " |
| "job_id %u", job_ptr->priority, |
| job_specs->job_id); |
| update_accounting = true; |
| if (job_ptr->priority == 0) { |
| if ((job_ptr->user_id == uid) || |
| (job_specs->alloc_sid == |
| ALLOC_SID_USER_HOLD)) { |
| job_ptr->state_reason = WAIT_HELD_USER; |
| } else |
| job_ptr->state_reason = WAIT_HELD; |
| xfree(job_ptr->state_desc); |
| } |
| } else if (job_specs->priority == INFINITE |
| && job_ptr->state_reason != WAIT_HELD_USER) { |
| /* If the job was already released ignore another |
| * release request. |
| */ |
| debug("%s: job %d already release ignoring request", |
| __func__, job_ptr->job_id); |
| } else { |
| error("sched: Attempt to modify priority for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->nice != (uint16_t) NO_VAL) { |
| if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (job_ptr->details && |
| (job_ptr->details->nice == job_specs->nice)) |
| debug("sched: update_job: new nice identical to " |
| "old nice %u", job_ptr->job_id); |
| else if (authorized || (job_specs->nice >= NICE_OFFSET)) { |
| int64_t new_prio = job_ptr->priority; |
| new_prio += job_ptr->details->nice; |
| new_prio -= job_specs->nice; |
| job_ptr->priority = MAX(new_prio, 2); |
| job_ptr->details->nice = job_specs->nice; |
| info("sched: update_job: setting priority to %u for " |
| "job_id %u", job_ptr->priority, |
| job_specs->job_id); |
| update_accounting = true; |
| } else { |
| error("sched: Attempt to modify nice for " |
| "job %u", job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->pn_min_memory != NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (job_specs->pn_min_memory |
| == detail_ptr->pn_min_memory) |
| debug("sched: update_job: new memory limit identical " |
| "to old limit for job %u", job_specs->job_id); |
| else if (authorized) { |
| char *entity; |
| if (job_specs->pn_min_memory & MEM_PER_CPU) |
| entity = "cpu"; |
| else |
| entity = "job"; |
| |
| detail_ptr->pn_min_memory = job_specs->pn_min_memory; |
| info("sched: update_job: setting min_memory_%s to %u " |
| "for job_id %u", entity, |
| (job_specs->pn_min_memory & (~MEM_PER_CPU)), |
| job_specs->job_id); |
| /* Always use the acct_policy_limit_set.* values |
| * since, if set by a super user, they will be set correctly */ |
| job_ptr->limit_set_pn_min_memory = |
| acct_policy_limit_set.pn_min_memory; |
| } else { |
| error("sched: Attempt to modify pn_min_memory for " |
| "job %u", job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->pn_min_tmp_disk != NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (authorized |
| || (detail_ptr->pn_min_tmp_disk |
| > job_specs->pn_min_tmp_disk)) { |
| detail_ptr->pn_min_tmp_disk = |
| job_specs->pn_min_tmp_disk; |
| info("sched: update_job: setting job_min_tmp_disk to " |
| "%u for job_id %u", job_specs->pn_min_tmp_disk, |
| job_specs->job_id); |
| } else { |
| |
| error("sched: Attempt to modify pn_min_tmp_disk " |
| "for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->sockets_per_node != (uint16_t) NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) { |
| error_code = ESLURM_DISABLED; |
| goto fini; |
| } else { |
| mc_ptr->sockets_per_node = job_specs->sockets_per_node; |
| info("sched: update_job: setting sockets_per_node to " |
| "%u for job_id %u", job_specs->sockets_per_node, |
| job_specs->job_id); |
| } |
| } |
| |
| if (job_specs->cores_per_socket != (uint16_t) NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) { |
| error_code = ESLURM_DISABLED; |
| goto fini; |
| } else { |
| mc_ptr->cores_per_socket = job_specs->cores_per_socket; |
| info("sched: update_job: setting cores_per_socket to " |
| "%u for job_id %u", job_specs->cores_per_socket, |
| job_specs->job_id); |
| } |
| } |
| |
| if ((job_specs->threads_per_core != (uint16_t) NO_VAL)) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) { |
| error_code = ESLURM_DISABLED; |
| goto fini; |
| } else { |
| mc_ptr->threads_per_core = job_specs->threads_per_core; |
| info("sched: update_job: setting threads_per_core to " |
| "%u for job_id %u", job_specs->threads_per_core, |
| job_specs->job_id); |
| } |
| } |
| |
| if (job_specs->shared != (uint16_t) NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { |
| error_code = ESLURM_DISABLED; |
| } else if (!authorized) { |
| error("sched: Attempt to change sharing for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } else { |
| if (job_specs->shared) { |
| detail_ptr->share_res = 1; |
| detail_ptr->whole_node = 0; |
| } else { |
| detail_ptr->share_res = 0; |
| } |
| info("sched: update_job: setting shared to %u for " |
| "job_id %u", |
| job_specs->shared, job_specs->job_id); |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->contiguous != (uint16_t) NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (authorized |
| || (detail_ptr->contiguous > job_specs->contiguous)) { |
| detail_ptr->contiguous = job_specs->contiguous; |
| info("sched: update_job: setting contiguous to %u " |
| "for job_id %u", job_specs->contiguous, |
| job_specs->job_id); |
| } else { |
| error("sched: Attempt to add contiguous for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->core_spec != (uint16_t) NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (authorized) { |
| detail_ptr->core_spec = job_specs->core_spec; |
| info("sched: update_job: setting core_spec to %u " |
| "for job_id %u", job_specs->core_spec, |
| job_specs->job_id); |
| } else { |
| error("sched: Attempt to add core_spec for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->features) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (job_specs->features[0] != '\0') { |
| char *old_features = detail_ptr->features; |
| List old_list = detail_ptr->feature_list; |
| detail_ptr->features = job_specs->features; |
| detail_ptr->feature_list = NULL; |
| if (build_feature_list(job_ptr)) { |
| info("sched: update_job: invalid features" |
| "(%s) for job_id %u", |
| job_specs->features, job_specs->job_id); |
| if (detail_ptr->feature_list) |
| list_destroy(detail_ptr->feature_list); |
| detail_ptr->features = old_features; |
| detail_ptr->feature_list = old_list; |
| error_code = ESLURM_INVALID_FEATURE; |
| } else { |
| info("sched: update_job: setting features to " |
| "%s for job_id %u", |
| job_specs->features, job_specs->job_id); |
| xfree(old_features); |
| if (old_list) |
| list_destroy(old_list); |
| job_specs->features = NULL; |
| } |
| } else { |
| info("sched: update_job: cleared features for job %u", |
| job_specs->job_id); |
| xfree(detail_ptr->features); |
| if (detail_ptr->feature_list) { |
| list_destroy(detail_ptr->feature_list); |
| detail_ptr->feature_list = NULL; |
| } |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->gres) { |
| List tmp_gres_list = NULL; |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) || |
| (detail_ptr->expanding_jobid != 0)) { |
| error_code = ESLURM_DISABLED; |
| } else if (job_specs->gres[0] == '\0') { |
| info("sched: update_job: cleared gres for job %u", |
| job_specs->job_id); |
| xfree(job_ptr->gres); |
| FREE_NULL_LIST(job_ptr->gres_list); |
| } else if (gres_plugin_job_state_validate(job_specs->gres, |
| &tmp_gres_list)) { |
| info("sched: update_job: invalid gres %s for job %u", |
| job_specs->gres, job_specs->job_id); |
| error_code = ESLURM_INVALID_GRES; |
| FREE_NULL_LIST(tmp_gres_list); |
| } else { |
| info("sched: update_job: setting gres to " |
| "%s for job_id %u", |
| job_specs->gres, job_specs->job_id); |
| xfree(job_ptr->gres); |
| job_ptr->gres = job_specs->gres; |
| job_specs->gres = NULL; |
| FREE_NULL_LIST(job_ptr->gres_list); |
| job_ptr->gres_list = tmp_gres_list; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->name |
| && !xstrcmp(job_specs->name, job_ptr->name)) { |
| debug("sched: update_job: new name identical to " |
| "old name %u", job_ptr->job_id); |
| xfree(job_specs->name); |
| } |
| |
| if (job_specs->name) { |
| if (IS_JOB_FINISHED(job_ptr)) { |
| error_code = ESLURM_DISABLED; |
| goto fini; |
| } else { |
| xfree(job_ptr->name); |
| job_ptr->name = job_specs->name; |
| job_specs->name = NULL; |
| |
| info("sched: update_job: setting name to %s for " |
| "job_id %u", job_ptr->name, job_specs->job_id); |
| update_accounting = true; |
| } |
| } |
| |
| if (job_specs->std_out) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_DISABLED; |
| else if (detail_ptr) { |
| xfree(detail_ptr->std_out); |
| detail_ptr->std_out = job_specs->std_out; |
| job_specs->std_out = NULL; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->wckey |
| && !xstrcmp(job_specs->wckey, job_ptr->wckey)) { |
| debug("sched: update_job: new wckey identical to " |
| "old wckey %u", job_ptr->job_id); |
| xfree(job_specs->wckey); |
| } |
| |
| if (job_specs->wckey) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_DISABLED; |
| else { |
| int rc = update_job_wckey("update_job", |
| job_ptr, |
| job_specs->wckey); |
| if (rc != SLURM_SUCCESS) |
| error_code = rc; |
| else |
| update_accounting = true; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if ((job_specs->min_nodes != NO_VAL) && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { |
| /* Use req_nodes to change the nodes associated with a running |
| * job, for lack of another field in the job request to use */ |
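| /* Illustrative example (hypothetical job IDs, user-side view): a job |
| * submitted with "--dependency=expand:100" and later updated with |
| * "scontrol update JobId=101 NumNodes=0" is intended to reach the |
| * merge path below, donating its resources to job 100. */ |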
| if ((job_specs->min_nodes == 0) && (job_ptr->node_cnt > 0) && |
| job_ptr->details && job_ptr->details->expanding_jobid) { |
| struct job_record *expand_job_ptr; |
| bitstr_t *orig_job_node_bitmap; |
| |
| expand_job_ptr = find_job_record(job_ptr->details-> |
| expanding_jobid); |
| if (expand_job_ptr == NULL) { |
| info("Invalid node count (%u) for job %u " |
| "update, job %u to expand not found", |
| job_specs->min_nodes, job_specs->job_id, |
| job_ptr->details->expanding_jobid); |
| error_code = ESLURM_INVALID_JOB_ID; |
| goto fini; |
| } |
| if (IS_JOB_SUSPENDED(job_ptr) || |
| IS_JOB_SUSPENDED(expand_job_ptr)) { |
| info("Can not expand job %u from job %u, " |
| "job is suspended", |
| expand_job_ptr->job_id, job_ptr->job_id); |
| error_code = ESLURM_JOB_SUSPENDED; |
| goto fini; |
| } |
| if ((job_ptr->step_list != NULL) && |
| (list_count(job_ptr->step_list) != 0)) { |
| info("Attempt to merge job %u with active " |
| "steps into job %u", |
| job_specs->job_id, |
| job_ptr->details->expanding_jobid); |
| error_code = ESLURMD_STEP_EXISTS; |
| goto fini; |
| } |
| info("sched: killing job %u and moving all resources " |
| "to job %u", job_specs->job_id, |
| expand_job_ptr->job_id); |
| job_pre_resize_acctg(job_ptr); |
| job_pre_resize_acctg(expand_job_ptr); |
| _send_job_kill(job_ptr); |
| |
| xassert(job_ptr->job_resrcs); |
| xassert(job_ptr->job_resrcs->node_bitmap); |
| orig_job_node_bitmap = bit_copy(expand_job_ptr-> |
| job_resrcs-> |
| node_bitmap); |
| error_code = select_g_job_expand(job_ptr, |
| expand_job_ptr); |
| if (error_code == SLURM_SUCCESS) { |
| _merge_job_licenses(job_ptr, expand_job_ptr); |
| rebuild_step_bitmaps(expand_job_ptr, |
| orig_job_node_bitmap); |
| } |
| bit_free(orig_job_node_bitmap); |
| job_post_resize_acctg(job_ptr); |
| job_post_resize_acctg(expand_job_ptr); |
| /* Since job_post_resize_acctg will restart things, |
| * don't do it again. */ |
| update_accounting = false; |
| if (error_code) |
| goto fini; |
| } else if ((job_specs->min_nodes == 0) || |
| (job_specs->min_nodes > job_ptr->node_cnt) || |
| job_ptr->details->expanding_jobid) { |
| info("sched: Invalid node count (%u) for job %u update", |
| job_specs->min_nodes, job_specs->job_id); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| goto fini; |
| } else if (job_specs->min_nodes == job_ptr->node_cnt) { |
| debug2("No change in node count update for job %u", |
| job_specs->job_id); |
| } else { |
| int i, i_first, i_last, total; |
| struct node_record *node_ptr; |
| info("sched: update_job: set node count to %u for " |
| "job_id %u", |
| job_specs->min_nodes, job_specs->job_id); |
| job_pre_resize_acctg(job_ptr); |
| i_first = bit_ffs(job_ptr->node_bitmap); |
| i_last = bit_fls(job_ptr->node_bitmap); |
| for (i=i_first, total=0; i<=i_last; i++) { |
| if (!bit_test(job_ptr->node_bitmap, i)) |
| continue; |
| if (++total <= job_specs->min_nodes) |
| continue; |
| node_ptr = node_record_table_ptr + i; |
| kill_step_on_node(job_ptr, node_ptr, false); |
| excise_node_from_job(job_ptr, node_ptr); |
| } |
| job_post_resize_acctg(job_ptr); |
| info("sched: update_job: set nodes to %s for " |
| "job_id %u", |
| job_ptr->nodes, job_specs->job_id); |
| /* Since job_post_resize_acctg will restart |
| * things, don't do it again. */ |
| update_accounting = false; |
| } |
| } |
| |
| if (job_specs->ntasks_per_node != (uint16_t) NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else if (authorized) { |
| detail_ptr->ntasks_per_node = |
| job_specs->ntasks_per_node; |
| info("sched: update_job: setting ntasks_per_node to %u" |
| " for job_id %u", job_specs->ntasks_per_node, |
| job_specs->job_id); |
| } else { |
| error("sched: Not super user: ignore ntasks_oper_node " |
| "change for job %u", job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->dependency) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) |
| error_code = ESLURM_DISABLED; |
| else { |
| int rc; |
| rc = update_job_dependency(job_ptr, |
| job_specs->dependency); |
| if (rc != SLURM_SUCCESS) |
| error_code = rc; |
| else { |
| job_ptr->details->orig_dependency = |
| xstrdup(job_ptr->details->dependency); |
| info("sched: update_job: setting dependency to " |
| "%s for job_id %u", |
| job_ptr->details->dependency, |
| job_ptr->job_id); |
| } |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_specs->begin_time) { |
| if (IS_JOB_PENDING(job_ptr) && detail_ptr) { |
| char time_str[32]; |
| /* Make sure this time is current; it does accounting no |
| * good to claim this job could have started before |
| * now */ |
| if (job_specs->begin_time < now) |
| job_specs->begin_time = now; |
| |
| if (detail_ptr->begin_time != job_specs->begin_time) { |
| detail_ptr->begin_time = job_specs->begin_time; |
| update_accounting = true; |
| slurm_make_time_str(&detail_ptr->begin_time, |
| time_str, sizeof(time_str)); |
| info("sched: update_job: setting begin " |
| "to %s for job_id %u", |
| time_str, job_ptr->job_id); |
| } else |
| debug("sched: update_job: new begin time " |
| "identical to old begin time %u", |
| job_ptr->job_id); |
| } else { |
| error_code = ESLURM_DISABLED; |
| goto fini; |
| } |
| } |
| |
| if (job_specs->licenses) { |
| List license_list; |
| bool valid; |
| |
| license_list = license_validate(job_specs->licenses, &valid); |
| if (!valid) { |
| info("sched: update_job: invalid licenses: %s", |
| job_specs->licenses); |
| error_code = ESLURM_INVALID_LICENSES; |
| } else if (IS_JOB_PENDING(job_ptr)) { |
| FREE_NULL_LIST(job_ptr->license_list); |
| job_ptr->license_list = license_list; |
| info("sched: update_job: changing licenses from '%s' " |
| "to '%s' for pending job %u", |
| job_ptr->licenses, job_specs->licenses, |
| job_ptr->job_id); |
| xfree(job_ptr->licenses); |
| job_ptr->licenses = job_specs->licenses; |
| job_specs->licenses = NULL; /* nothing to free */ |
| } else if (IS_JOB_RUNNING(job_ptr) && |
| (authorized || (license_list == NULL))) { |
| /* NOTE: This can result in oversubscription of |
| * licenses */ |
| license_job_return(job_ptr); |
| FREE_NULL_LIST(job_ptr->license_list); |
| job_ptr->license_list = license_list; |
| info("sched: update_job: changing licenses from '%s' " |
| "to '%s' for running job %u", |
| job_ptr->licenses, job_specs->licenses, |
| job_ptr->job_id); |
| xfree(job_ptr->licenses); |
| job_ptr->licenses = job_specs->licenses; |
| job_specs->licenses = NULL; /* nothing to free */ |
| license_job_get(job_ptr); |
| } else { |
| /* licenses are valid, but job state or user not |
| * allowed to make changes */ |
| info("sched: update_job: could not change licenses " |
| "for job %u", job_ptr->job_id); |
| error_code = ESLURM_DISABLED; |
| FREE_NULL_LIST(license_list); |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| fail_reason = job_limits_check(&job_ptr, false); |
| if (fail_reason != WAIT_NO_REASON) { |
| if (fail_reason == WAIT_QOS_THRES) |
| error_code = ESLURM_QOS_THRES; |
| else if ((fail_reason == WAIT_PART_TIME_LIMIT) || |
| (fail_reason == WAIT_PART_NODE_LIMIT) || |
| (fail_reason == WAIT_PART_DOWN) || |
| (fail_reason == WAIT_HELD)) |
| error_code = SLURM_SUCCESS; |
| else |
| error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| if ((job_ptr->state_reason != WAIT_HELD) && |
| (job_ptr->state_reason != WAIT_HELD_USER)) { |
| job_ptr->state_reason = fail_reason; |
| xfree(job_ptr->state_desc); |
| } |
| return error_code; |
| } else if ((job_ptr->state_reason != WAIT_HELD) && |
| (job_ptr->state_reason != WAIT_HELD_USER)) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| } |
| |
| #ifdef HAVE_BG |
| select_g_select_jobinfo_get(job_specs->select_jobinfo, |
| SELECT_JOBDATA_CONN_TYPE, &conn_type); |
| if (conn_type[0] != (uint16_t) NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_DISABLED; |
| else { |
| char *conn_type_char = conn_type_string_full(conn_type); |
| if ((conn_type[0] >= SELECT_SMALL) |
| && (detail_ptr->min_cpus >= cpus_per_mp)) { |
| info("update_job: could not change " |
| "conn_type to '%s' because cpu " |
| "count is %u for job %u making " |
| "the conn_type invalid.", |
| conn_type_char, |
| detail_ptr->min_cpus, |
| job_ptr->job_id); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| } else if (((conn_type[0] == SELECT_TORUS) |
| || (conn_type[0] == SELECT_MESH)) |
| && (detail_ptr->min_cpus < cpus_per_mp)) { |
| info("update_job: could not change " |
| "conn_type to '%s' because cpu " |
| "count is %u for job %u making " |
| "the conn_type invalid.", |
| conn_type_char, |
| detail_ptr->min_cpus, |
| job_ptr->job_id); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| } else { |
| info("update_job: setting conn_type to '%s' " |
| "for jobid %u", |
| conn_type_char, |
| job_ptr->job_id); |
| select_g_select_jobinfo_set( |
| job_ptr->select_jobinfo, |
| SELECT_JOBDATA_CONN_TYPE, &conn_type); |
| } |
| xfree(conn_type_char); |
| } |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| /* check to make sure we didn't mess up the processor count */ |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_CONN_TYPE, &conn_type); |
| if (detail_ptr && |
| (((conn_type[0] >= SELECT_SMALL) |
| && (detail_ptr->min_cpus >= cpus_per_mp)) |
| || (((conn_type[0] == SELECT_TORUS)|| (conn_type[0] == SELECT_MESH)) |
| && (detail_ptr->min_cpus < cpus_per_mp)))) { |
| char *conn_type_char = conn_type_string_full(conn_type); |
| info("update_job: With cpu count at %u our conn_type " |
| "of '%s' is invalid for job %u.", |
| detail_ptr->min_cpus, |
| conn_type_char, |
| job_ptr->job_id); |
| xfree(conn_type_char); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| goto fini; |
| } |
| |
| select_g_select_jobinfo_get(job_specs->select_jobinfo, |
| SELECT_JOBDATA_ROTATE, &rotate); |
| if (rotate != (uint16_t) NO_VAL) { |
| if (!IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_DISABLED; |
| goto fini; |
| } else { |
| info("sched: update_job: setting rotate to %u for " |
| "jobid %u", rotate, job_ptr->job_id); |
| select_g_select_jobinfo_set( |
| job_ptr->select_jobinfo, |
| SELECT_JOBDATA_ROTATE, &rotate); |
| } |
| } |
| |
| select_g_select_jobinfo_get(job_specs->select_jobinfo, |
| SELECT_JOBDATA_REBOOT, &reboot); |
| if (reboot != (uint16_t) NO_VAL) { |
| if (!IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_DISABLED; |
| goto fini; |
| } else { |
| info("sched: update_job: setting reboot to %u for " |
| "jobid %u", reboot, job_ptr->job_id); |
| select_g_select_jobinfo_set( |
| job_ptr->select_jobinfo, |
| SELECT_JOBDATA_REBOOT, &reboot); |
| } |
| } |
| |
| select_g_select_jobinfo_get(job_specs->select_jobinfo, |
| SELECT_JOBDATA_GEOMETRY, geometry); |
| if (geometry[0] != (uint16_t) NO_VAL) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_DISABLED; |
| else if (authorized) { |
| uint32_t i, tot = 1; |
| for (i=0; i<SYSTEM_DIMENSIONS; i++) |
| tot *= geometry[i]; |
| info("sched: update_job: setting geometry to %ux%ux%u" |
| " min_nodes=%u for jobid %u", |
| geometry[0], geometry[1], |
| geometry[2], tot, job_ptr->job_id); |
| select_g_select_jobinfo_set(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_GEOMETRY, |
| geometry); |
| detail_ptr->min_nodes = tot; |
| } else { |
| error("sched: Attempt to change geometry for job %u", |
| job_specs->job_id); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| select_g_select_jobinfo_get(job_specs->select_jobinfo, |
| SELECT_JOBDATA_BLRTS_IMAGE, &image); |
| if (image) { |
| if (!IS_JOB_PENDING(job_ptr)) { |
| xfree(image); |
| error_code = ESLURM_DISABLED; |
| goto fini; |
| } else { |
| info("sched: update_job: setting BlrtsImage to %s " |
| "for jobid %u", image, job_ptr->job_id); |
| select_g_select_jobinfo_set( |
| job_ptr->select_jobinfo, |
| SELECT_JOBDATA_BLRTS_IMAGE, |
| image); |
| } |
| xfree(image); |
| } |
| select_g_select_jobinfo_get(job_specs->select_jobinfo, |
| SELECT_JOBDATA_LINUX_IMAGE, &image); |
| if (image) { |
| if (!IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_DISABLED; |
| xfree(image); |
| goto fini; |
| } else { |
| info("sched: update_job: setting LinuxImage to %s " |
| "for jobid %u", image, job_ptr->job_id); |
| select_g_select_jobinfo_set( |
| job_ptr->select_jobinfo, |
| SELECT_JOBDATA_LINUX_IMAGE, image); |
| } |
| xfree(image); |
| } |
| select_g_select_jobinfo_get(job_specs->select_jobinfo, |
| SELECT_JOBDATA_MLOADER_IMAGE, &image); |
| if (image) { |
| if (!IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_DISABLED; |
| xfree(image); |
| goto fini; |
| } else { |
| info("sched: update_job: setting MloaderImage to %s " |
| "for jobid %u", image, job_ptr->job_id); |
| select_g_select_jobinfo_set( |
| job_ptr->select_jobinfo, |
| SELECT_JOBDATA_MLOADER_IMAGE, |
| image); |
| } |
| xfree(image); |
| } |
| select_g_select_jobinfo_get(job_specs->select_jobinfo, |
| SELECT_JOBDATA_RAMDISK_IMAGE, &image); |
| if (image) { |
| if (!IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_DISABLED; |
| xfree(image); |
| goto fini; |
| } else { |
| info("sched: update_job: setting RamdiskImage to %s " |
| "for jobid %u", image, job_ptr->job_id); |
| select_g_select_jobinfo_set( |
| job_ptr->select_jobinfo, |
| SELECT_JOBDATA_RAMDISK_IMAGE, |
| image); |
| } |
| xfree(image); |
| } |
| #endif |
| |
| if (job_specs->network) { |
| xfree(job_ptr->network); |
| if (!strlen(job_specs->network) |
| || !strcmp(job_specs->network, "none")) { |
| info("sched: update_job: clearing Network option " |
| "for jobid %u", job_ptr->job_id); |
| |
| } else { |
| job_ptr->network = xstrdup(job_specs->network); |
| info("sched: update_job: setting Network to %s " |
| "for jobid %u", job_ptr->network, job_ptr->job_id); |
| select_g_select_jobinfo_set( |
| job_ptr->select_jobinfo, |
| SELECT_JOBDATA_NETWORK, |
| job_ptr->network); |
| } |
| } |
| |
| fini: |
| if (update_accounting) { |
| info("updating accounting"); |
| if (job_ptr->details && job_ptr->details->begin_time) { |
| /* Update job record in accounting to reflect |
| * changes */ |
| jobacct_storage_g_job_start(acct_db_conn, |
| job_ptr); |
| } |
| } |
| |
| /* If job update is successful and priority is calculated (not only |
| * based upon job submit order), recalculate the job priority, since |
| * many factors of an update may affect priority considerations. |
| * If job has a hold then do nothing */ |
| if ((error_code == SLURM_SUCCESS) && (job_ptr->priority != 0) && |
| strcmp(slurmctld_conf.priority_type, "priority/basic")) |
| set_job_prio(job_ptr); |
| |
| return error_code; |
| } |
| |
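| /* _send_job_kill - queue a REQUEST_TERMINATE_JOB agent request for every |
| * node still allocated to the job (or for the job's front end node on |
| * front-end systems) */ |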
| static void _send_job_kill(struct job_record *job_ptr) |
| { |
| kill_job_msg_t *kill_job = NULL; |
| agent_arg_t *agent_args = NULL; |
| #ifdef HAVE_FRONT_END |
| front_end_record_t *front_end_ptr; |
| #else |
| int i; |
| struct node_record *node_ptr; |
| #endif |
| |
| if (select_serial == -1) { |
| if (strcmp(slurmctld_conf.select_type, "select/serial")) |
| select_serial = 0; |
| else |
| select_serial = 1; |
| } |
| |
| xassert(job_ptr); |
| xassert(job_ptr->details); |
| |
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| agent_args->msg_type = REQUEST_TERMINATE_JOB; |
| agent_args->retry = 0; /* re_kill_job() resends as needed */ |
| agent_args->hostlist = hostlist_create(NULL); |
| kill_job = xmalloc(sizeof(kill_job_msg_t)); |
| last_node_update = time(NULL); |
| kill_job->job_id = job_ptr->job_id; |
| kill_job->step_id = NO_VAL; |
| kill_job->job_state = job_ptr->job_state; |
| kill_job->job_uid = job_ptr->user_id; |
| kill_job->nodes = xstrdup(job_ptr->nodes); |
| kill_job->time = time(NULL); |
| kill_job->start_time = job_ptr->start_time; |
| kill_job->select_jobinfo = select_g_select_jobinfo_copy( |
| job_ptr->select_jobinfo); |
| kill_job->spank_job_env = xduparray(job_ptr->spank_job_env_size, |
| job_ptr->spank_job_env); |
| kill_job->spank_job_env_size = job_ptr->spank_job_env_size; |
| |
| #ifdef HAVE_FRONT_END |
| if (job_ptr->batch_host && |
| (front_end_ptr = job_ptr->front_end_ptr)) { |
| agent_args->protocol_version = front_end_ptr->protocol_version; |
| hostlist_push_host(agent_args->hostlist, job_ptr->batch_host); |
| agent_args->node_count++; |
| } |
| #else |
| if (!job_ptr->node_bitmap_cg) |
| build_cg_bitmap(job_ptr); |
| agent_args->protocol_version = SLURM_PROTOCOL_VERSION; |
| for (i = 0, node_ptr = node_record_table_ptr; |
| i < node_record_count; i++, node_ptr++) { |
| if (!bit_test(job_ptr->node_bitmap_cg, i)) |
| continue; |
| if (agent_args->protocol_version > node_ptr->protocol_version) |
| agent_args->protocol_version = |
| node_ptr->protocol_version; |
| hostlist_push_host(agent_args->hostlist, node_ptr->name); |
| agent_args->node_count++; |
| } |
| #endif |
| if (agent_args->node_count == 0) { |
| if ((job_ptr->details->expanding_jobid == 0) && |
| (select_serial == 0)) { |
| error("%s: job %u allocated no nodes to be killed on", |
| __func__, job_ptr->job_id); |
| } |
| xfree(kill_job->nodes); |
| xfree(kill_job); |
| hostlist_destroy(agent_args->hostlist); |
| xfree(agent_args); |
| return; |
| } |
| |
| agent_args->msg_args = kill_job; |
| agent_queue_request(agent_args); |
| return; |
| } |
| |
| /* Record accounting information for a job immediately before changing size */ |
| extern void job_pre_resize_acctg(struct job_record *job_ptr) |
| { |
| /* If we don't have a db_index yet, start this job record in the |
| database now, since when running with the slurmDBD the job may not |
| have been recorded yet. |
| */ |
| if (!job_ptr->db_index) |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| job_ptr->job_state |= JOB_RESIZING; |
| job_ptr->resize_time = time(NULL); |
| /* NOTE: job_completion_logger() calls |
| * acct_policy_remove_job_submit() */ |
| job_completion_logger(job_ptr, false); |
| |
| /* This doesn't happen in job_completion_logger, but gets |
| * added back in with job_post_resize_acctg so remove it here. */ |
| acct_policy_job_fini(job_ptr); |
| |
| /* NOTE: The RESIZING FLAG needs to be cleared with |
| job_post_resize_acctg */ |
| } |
| |
| /* Record accounting information for a job immediately after changing size */ |
| extern void job_post_resize_acctg(struct job_record *job_ptr) |
| { |
| /* NOTE: The RESIZING FLAG needs to be set with |
| job_pre_resize_acctg; the assert is here to make sure we |
| code it that way. */ |
| xassert(IS_JOB_RESIZING(job_ptr)); |
| acct_policy_add_job_submit(job_ptr); |
| acct_policy_job_begin(job_ptr); |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| job_ptr->job_state &= (~JOB_RESIZING); |
| } |
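| |
| /* Usage sketch (illustrative only): the two routines above are meant to |
| * bracket any change to the size of a running job, e.g. |
| * |
| * job_pre_resize_acctg(job_ptr); (sets JOB_RESIZING) |
| * ... grow or shrink the allocation ... |
| * job_post_resize_acctg(job_ptr); (clears JOB_RESIZING) |
| * |
| * as is done when a running job's node count is reduced earlier in |
| * this file. */ |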
| |
| /* |
| * validate_jobs_on_node - validate that any jobs that should be on the node |
| * are actually running; if not, clean up the job records and/or node |
| * records. Call this function after validate_node_specs() sets the node |
| * state properly. |
| * IN reg_msg - node registration message |
| */ |
| extern void |
| validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg) |
| { |
| int i, node_inx, jobs_on_node; |
| struct node_record *node_ptr; |
| struct job_record *job_ptr; |
| struct step_record *step_ptr; |
| time_t now = time(NULL); |
| |
| node_ptr = find_node_record(reg_msg->node_name); |
| if (node_ptr == NULL) { |
| error("slurmd registered on unknown node %s", |
| reg_msg->node_name); |
| return; |
| } |
| |
| if (reg_msg->energy) |
| memcpy(node_ptr->energy, reg_msg->energy, |
| sizeof(acct_gather_energy_t)); |
| |
| if (node_ptr->up_time > reg_msg->up_time) { |
| verbose("Node %s rebooted %u secs ago", |
| reg_msg->node_name, reg_msg->up_time); |
| } |
| |
| if (reg_msg->up_time <= now) { |
| node_ptr->up_time = reg_msg->up_time; |
| node_ptr->boot_time = now - reg_msg->up_time; |
| node_ptr->slurmd_start_time = reg_msg->slurmd_start_time; |
| } else { |
| error("Node up_time is invalid: %u>%u", reg_msg->up_time, |
| (uint32_t) now); |
| } |
| |
| node_inx = node_ptr - node_record_table_ptr; |
| |
| /* Check that jobs running are really supposed to be there */ |
| for (i = 0; i < reg_msg->job_count; i++) { |
| if ( (reg_msg->job_id[i] >= MIN_NOALLOC_JOBID) && |
| (reg_msg->job_id[i] <= MAX_NOALLOC_JOBID) ) { |
| info("NoAllocate job %u.%u reported on node %s", |
| reg_msg->job_id[i], reg_msg->step_id[i], |
| reg_msg->node_name); |
| continue; |
| } |
| |
| job_ptr = find_job_record(reg_msg->job_id[i]); |
| if (job_ptr == NULL) { |
| error("Orphan job %u.%u reported on node %s", |
| reg_msg->job_id[i], reg_msg->step_id[i], |
| reg_msg->node_name); |
| abort_job_on_node(reg_msg->job_id[i], |
| job_ptr, node_ptr->name); |
| } |
| |
| else if (IS_JOB_RUNNING(job_ptr) || |
| IS_JOB_SUSPENDED(job_ptr)) { |
| if (bit_test(job_ptr->node_bitmap, node_inx)) { |
| debug3("Registered job %u.%u on node %s ", |
| reg_msg->job_id[i], |
| reg_msg->step_id[i], |
| reg_msg->node_name); |
| if ((job_ptr->batch_flag) && |
| (node_inx == bit_ffs( |
| job_ptr->node_bitmap))) { |
| /* NOTE: Used for purging defunct |
| * batch jobs */ |
| job_ptr->time_last_active = now; |
| } |
| step_ptr = find_step_record(job_ptr, |
| reg_msg-> |
| step_id[i]); |
| if (step_ptr) |
| step_ptr->time_last_active = now; |
| } else { |
| /* Typically indicates a job requeue and |
| * restart on other nodes. A node from the |
| * original allocation just responded here. */ |
| error("Registered job %u.%u on wrong node %s ", |
| reg_msg->job_id[i], |
| reg_msg->step_id[i], |
| reg_msg->node_name); |
| info("%s: job nodes %s count %d inx %d", |
| __func__, job_ptr->nodes, |
| job_ptr->node_cnt, node_inx); |
| abort_job_on_node(reg_msg->job_id[i], job_ptr, |
| node_ptr->name); |
| } |
| } |
| |
| else if (IS_JOB_COMPLETING(job_ptr)) { |
| /* Re-send kill request as needed, |
| * not necessarily an error */ |
| kill_job_on_node(reg_msg->job_id[i], job_ptr, |
| node_ptr); |
| } |
| |
| |
| else if (IS_JOB_PENDING(job_ptr)) { |
| /* Typically indicates a job requeue and the hung |
| * slurmd that went DOWN is now responding */ |
| error("Registered PENDING job %u.%u on node %s ", |
| reg_msg->job_id[i], reg_msg->step_id[i], |
| reg_msg->node_name); |
| abort_job_on_node(reg_msg->job_id[i], |
| job_ptr, node_ptr->name); |
| } |
| |
| else if (difftime(now, job_ptr->end_time) < |
| slurm_get_msg_timeout()) { /* Race condition */ |
| debug("Registered newly completed job %u.%u on %s", |
| reg_msg->job_id[i], reg_msg->step_id[i], |
| node_ptr->name); |
| } |
| |
| else { /* else job is supposed to be done */ |
| error("Registered job %u.%u in state %s on node %s ", |
| reg_msg->job_id[i], reg_msg->step_id[i], |
| job_state_string(job_ptr->job_state), |
| reg_msg->node_name); |
| kill_job_on_node(reg_msg->job_id[i], job_ptr, |
| node_ptr); |
| } |
| } |
| |
| jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt; |
| if (jobs_on_node) |
| _purge_missing_jobs(node_inx, now); |
| |
| if (jobs_on_node != reg_msg->job_count) { |
| /* slurmd will not know of a job unless the job has |
| * steps active at registration time, so this is not |
| * an error condition; slurmd is reporting steps |
| * rather than jobs */ |
| debug3("resetting job_count on node %s from %u to %d", |
| reg_msg->node_name, reg_msg->job_count, jobs_on_node); |
| reg_msg->job_count = jobs_on_node; |
| } |
| |
| return; |
| } |
| |
| /* Purge any batch job that should have its script running on node |
| * node_inx, but is not. Allow BatchStartTimeout + ResumeTimeout seconds |
| * for startup. |
| * |
| * Purge all job steps that were started before the node was last booted. |
| * |
| * Also notify srun if any job steps should be active on this node |
| * but are not found. */ |
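| /* Worked example of the cutoff arithmetic (illustrative values only): |
| * with BatchStartTimeout=10, MessageTimeout=10 and ResumeTimeout=300, a |
| * batch script must have shown activity within the last 20 seconds, or |
| * within the last 320 seconds if the node was just powered up; anything |
| * older is treated as missing and the job is completed or requeued. */ |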
| static void _purge_missing_jobs(int node_inx, time_t now) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| struct node_record *node_ptr = node_record_table_ptr + node_inx; |
| uint16_t batch_start_timeout = slurm_get_batch_start_timeout(); |
| uint16_t msg_timeout = slurm_get_msg_timeout(); |
| uint16_t resume_timeout = slurm_get_resume_timeout(); |
| uint32_t suspend_time = slurm_get_suspend_time(); |
| time_t batch_startup_time, node_boot_time = (time_t) 0, startup_time; |
| |
| if (node_ptr->boot_time > (msg_timeout + 5)) { |
| /* allow for message timeout and other delays */ |
| node_boot_time = node_ptr->boot_time - (msg_timeout + 5); |
| } |
| batch_startup_time = now - batch_start_timeout; |
| batch_startup_time -= msg_timeout; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| bool job_active = IS_JOB_RUNNING(job_ptr) || |
| IS_JOB_SUSPENDED(job_ptr); |
| |
| if ((!job_active) || |
| (!bit_test(job_ptr->node_bitmap, node_inx))) |
| continue; |
| if ((job_ptr->batch_flag != 0) && |
| (suspend_time != 0) /* power mgmt on */ && |
| (job_ptr->start_time < node_boot_time)) { |
| startup_time = batch_startup_time - resume_timeout; |
| } else |
| startup_time = batch_startup_time; |
| |
| if ((job_ptr->batch_flag != 0) && |
| (job_ptr->time_last_active < startup_time) && |
| (job_ptr->start_time < startup_time) && |
| (node_inx == bit_ffs(job_ptr->node_bitmap))) { |
| bool requeue = false; |
| if ((job_ptr->start_time < node_ptr->boot_time) && |
| (job_ptr->details && job_ptr->details->requeue)) |
| requeue = true; |
| info("Batch JobId=%u missing from node 0 (not found " |
| "BatchStartTime after startup)", job_ptr->job_id); |
| job_ptr->exit_code = 1; |
| job_complete(job_ptr->job_id, 0, requeue, true, NO_VAL); |
| } else { |
| _notify_srun_missing_step(job_ptr, node_inx, |
| now, node_boot_time); |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| } |
| |
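| /* For each job step that should be running on the given node but was not |
| * reported by slurmd: notify srun when the step has an I/O address to |
| * verify against or, if the node rebooted after the step started, signal |
| * the step's tasks so the step can be cleaned up */ |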
| static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx, |
| time_t now, time_t node_boot_time) |
| { |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| char *node_name = node_record_table_ptr[node_inx].name; |
| |
| xassert(job_ptr); |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| if (step_ptr->state != JOB_RUNNING) |
| continue; |
| if (!bit_test(step_ptr->step_node_bitmap, node_inx)) |
| continue; |
| if (step_ptr->time_last_active >= now) { |
| /* Back up timer in case more than one node |
| * registration happens at this same time. |
| * We don't want this node's registration |
| * to count toward a different node's |
| * registration message. */ |
| step_ptr->time_last_active = now - 1; |
| } else if (step_ptr->host && step_ptr->port) { |
| /* srun may be able to verify step exists on |
| * this node using I/O sockets and kill the |
| * job as needed */ |
| srun_step_missing(step_ptr, node_name); |
| } else if ((step_ptr->start_time < node_boot_time) && |
| (step_ptr->no_kill == 0)) { |
| /* There is a risk that the job step's tasks completed |
| * on this node before its reboot, but that should be |
| * very rare and there is no srun to work with (POE) */ |
| info("Node %s rebooted, killing missing step %u.%u", |
| node_name, job_ptr->job_id, step_ptr->step_id); |
| signal_step_tasks_on_node(node_name, step_ptr, SIGKILL, |
| REQUEST_TERMINATE_TASKS); |
| } |
| } |
| list_iterator_destroy (step_iterator); |
| } |
| |
| /* |
| * abort_job_on_node - Kill the specific job_id on a specific node, |
| * the request is not processed immediately, but queued. |
| * This is to prevent a flood of pthreads if slurmctld restarts |
| * without saved state and slurmd daemons register with a |
| * multitude of running jobs. Slurmctld will not recognize |
| * these jobs and use this function to kill them - one |
| * agent request per node as they register. |
| * IN job_id - id of the job to be killed |
| * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned) |
| * IN node_name - name of the node on which the job resides |
| */ |
| extern void |
| abort_job_on_node(uint32_t job_id, struct job_record *job_ptr, char *node_name) |
| { |
| agent_arg_t *agent_info; |
| kill_job_msg_t *kill_req; |
| |
| kill_req = xmalloc(sizeof(kill_job_msg_t)); |
| kill_req->job_id = job_id; |
| kill_req->step_id = NO_VAL; |
| kill_req->time = time(NULL); |
| kill_req->nodes = xstrdup(node_name); |
| if (job_ptr) { /* NULL if unknown */ |
| kill_req->start_time = job_ptr->start_time; |
| kill_req->select_jobinfo = |
| select_g_select_jobinfo_copy(job_ptr->select_jobinfo); |
| kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size, |
| job_ptr->spank_job_env); |
| kill_req->spank_job_env_size = job_ptr->spank_job_env_size; |
| } else { |
| /* kill_req->start_time = 0; Default value */ |
| } |
| |
| agent_info = xmalloc(sizeof(agent_arg_t)); |
| agent_info->node_count = 1; |
| agent_info->retry = 0; |
| agent_info->hostlist = hostlist_create(node_name); |
| #ifdef HAVE_FRONT_END |
| if (job_ptr && job_ptr->front_end_ptr) |
| agent_info->protocol_version = |
| job_ptr->front_end_ptr->protocol_version; |
| debug("Aborting job %u on front end node %s", job_id, node_name); |
| #else |
| struct node_record *node_ptr; |
| if ((node_ptr = find_node_record(node_name))) |
| agent_info->protocol_version = node_ptr->protocol_version; |
| |
| debug("Aborting job %u on node %s", job_id, node_name); |
| #endif |
| agent_info->msg_type = REQUEST_ABORT_JOB; |
| agent_info->msg_args = kill_req; |
| |
| agent_queue_request(agent_info); |
| } |
| |
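| /* Note: abort_job_on_node() above queues a REQUEST_ABORT_JOB message, |
| * while kill_job_on_node() below queues REQUEST_TERMINATE_JOB; both are |
| * delivered asynchronously through the agent. */ |
| |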
| /* |
| * kill_job_on_node - Kill the specific job_id on a specific node. |
| * IN job_id - id of the job to be killed |
| * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned) |
| * IN node_ptr - pointer to the node on which the job resides |
| */ |
| extern void |
| kill_job_on_node(uint32_t job_id, struct job_record *job_ptr, |
| struct node_record *node_ptr) |
| { |
| agent_arg_t *agent_info; |
| kill_job_msg_t *kill_req; |
| |
| kill_req = xmalloc(sizeof(kill_job_msg_t)); |
| kill_req->job_id = job_id; |
| kill_req->step_id = NO_VAL; |
| kill_req->time = time(NULL); |
| kill_req->nodes = xstrdup(node_ptr->name); |
| /* Only dereference job_ptr when the job is known */ |
| if (job_ptr) { /* NULL if unknown */ |
| kill_req->start_time = job_ptr->start_time; |
| kill_req->select_jobinfo = |
| select_g_select_jobinfo_copy(job_ptr->select_jobinfo); |
| kill_req->job_state = job_ptr->job_state; |
| kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size, |
| job_ptr->spank_job_env); |
| kill_req->spank_job_env_size = job_ptr->spank_job_env_size; |
| } |
| |
| agent_info = xmalloc(sizeof(agent_arg_t)); |
| agent_info->node_count = 1; |
| agent_info->retry = 0; |
| #ifdef HAVE_FRONT_END |
| xassert(job_ptr->batch_host); |
| if (job_ptr->front_end_ptr) |
| agent_info->protocol_version = |
| job_ptr->front_end_ptr->protocol_version; |
| agent_info->hostlist = hostlist_create(job_ptr->batch_host); |
| debug("Killing job %u on front end node %s", job_id, |
| job_ptr->batch_host); |
| #else |
| agent_info->protocol_version = node_ptr->protocol_version; |
| agent_info->hostlist = hostlist_create(node_ptr->name); |
| debug("Killing job %u on node %s", job_id, node_ptr->name); |
| #endif |
| agent_info->msg_type = REQUEST_TERMINATE_JOB; |
| agent_info->msg_args = kill_req; |
| |
| agent_queue_request(agent_info); |
| } |
| |
| |
| /* |
| * job_alloc_info - get details about an existing job allocation |
| * IN uid - job issuing the code |
| * IN job_id - ID of job for which info is requested |
| * OUT job_pptr - set to pointer to job record |
| */ |
| extern int |
| job_alloc_info(uint32_t uid, uint32_t job_id, struct job_record **job_pptr) |
| { |
| struct job_record *job_ptr; |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) |
| return ESLURM_INVALID_JOB_ID; |
| if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) && |
| (job_ptr->user_id != uid) && !validate_operator(uid) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account)) |
| return ESLURM_ACCESS_DENIED; |
| if (IS_JOB_PENDING(job_ptr)) |
| return ESLURM_JOB_PENDING; |
| if (IS_JOB_FINISHED(job_ptr)) |
| return ESLURM_ALREADY_DONE; |
| |
| if (job_ptr->alias_list && !strcmp(job_ptr->alias_list, "TBD") && |
| job_ptr->node_bitmap && |
| (bit_overlap(power_node_bitmap, job_ptr->node_bitmap) == 0)) { |
| set_job_alias_list(job_ptr); |
| } |
| |
| *job_pptr = job_ptr; |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Synchronize the batch jobs in the system with their files. |
| * All pending batch jobs must have script and environment files; |
| * no other jobs should have such files. |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| int sync_job_files(void) |
| { |
| List batch_dirs; |
| |
| if (!slurmctld_primary) /* Don't purge files from backup slurmctld */ |
| return SLURM_SUCCESS; |
| |
| batch_dirs = list_create(_del_batch_list_rec); |
| _get_batch_job_dir_ids(batch_dirs); |
| _validate_job_files(batch_dirs); |
| _remove_defunct_batch_dirs(batch_dirs); |
| list_destroy(batch_dirs); |
| return SLURM_SUCCESS; |
| } |
| |
| /* Append to the batch_dirs list the job_id associated with |
| * every batch job directory in existence |
| * NOTE: READ lock_slurmctld config before entry |
| */ |
| static void _get_batch_job_dir_ids(List batch_dirs) |
| { |
| DIR *f_dir; |
| struct dirent *dir_ent; |
| long long_job_id; |
| uint32_t *job_id_ptr; |
| char *endptr; |
| |
| xassert(slurmctld_conf.state_save_location); |
| f_dir = opendir(slurmctld_conf.state_save_location); |
| if (!f_dir) { |
| error("opendir(%s): %m", |
| slurmctld_conf.state_save_location); |
| return; |
| } |
| |
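| /* State save directories are named "job.<id>"; parse the numeric |
| * suffix of each entry to recover the job ID it belongs to. */ |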
| while ((dir_ent = readdir(f_dir))) { |
| if (strncmp("job.#", dir_ent->d_name, 4)) |
| continue; |
| long_job_id = strtol(&dir_ent->d_name[4], &endptr, 10); |
| if ((long_job_id == 0) || (endptr[0] != '\0')) |
| continue; |
| debug3("found batch directory for job_id %ld", long_job_id); |
| job_id_ptr = xmalloc(sizeof(uint32_t)); |
| *job_id_ptr = long_job_id; |
| list_append (batch_dirs, job_id_ptr); |
| } |
| |
| closedir(f_dir); |
| } |
| |
| /* All pending batch jobs must have a batch_dir entry, |
| * otherwise we flag them as FAILED and don't schedule them. |
| * If the batch_dir entry exists for a PENDING or RUNNING batch job, |
| * remove it from the list (of directories to be deleted) */ |
| static void _validate_job_files(List batch_dirs) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| int del_cnt; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (!job_ptr->batch_flag) |
| continue; |
| /* Want to keep this job's files */ |
| del_cnt = list_delete_all(batch_dirs, _find_batch_dir, |
| &(job_ptr->job_id)); |
| if ((del_cnt == 0) && IS_JOB_PENDING(job_ptr)) { |
| error("Script for job %u lost, state set to FAILED", |
| job_ptr->job_id); |
| job_ptr->job_state = JOB_FAILED; |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_SYSTEM; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = time(NULL); |
| job_completion_logger(job_ptr, false); |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| } |
| |
| /* List matching function, see common/list.h */ |
| static int _find_batch_dir(void *x, void *key) |
| { |
| uint32_t *key1 = x; |
| uint32_t *key2 = key; |
| return (int)(*key1 == *key2); |
| } |
| /* List entry deletion function, see common/list.h */ |
| static void _del_batch_list_rec(void *x) |
| { |
| xfree(x); |
| } |
| |
| /* Remove all batch_dir entries in the list |
| * NOTE: READ lock_slurmctld config before entry */ |
| static void _remove_defunct_batch_dirs(List batch_dirs) |
| { |
| ListIterator batch_dir_inx; |
| uint32_t *job_id_ptr; |
| |
| batch_dir_inx = list_iterator_create(batch_dirs); |
| while ((job_id_ptr = list_next(batch_dir_inx))) { |
| info("Purging files for defunct batch job %u", |
| *job_id_ptr); |
| _delete_job_desc_files(*job_id_ptr); |
| } |
| list_iterator_destroy(batch_dir_inx); |
| } |
| |
| /* |
| * _xmit_new_end_time |
| * Tell all slurmd daemons associated with a job of its new end time |
| * IN job_ptr - pointer to terminating job |
| * globals: node_record_count - number of nodes in the system |
| * node_record_table_ptr - pointer to global node table |
| */ |
| static void |
| _xmit_new_end_time(struct job_record *job_ptr) |
| { |
| #ifndef HAVE_FRONT_END |
| int i; |
| #endif |
| job_time_msg_t *job_time_msg_ptr; |
| agent_arg_t *agent_args; |
| |
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| agent_args->msg_type = REQUEST_UPDATE_JOB_TIME; |
| agent_args->retry = 1; |
| agent_args->hostlist = hostlist_create(NULL); |
| job_time_msg_ptr = xmalloc(sizeof(job_time_msg_t)); |
| job_time_msg_ptr->job_id = job_ptr->job_id; |
| job_time_msg_ptr->expiration_time = job_ptr->end_time; |
| |
| #ifdef HAVE_FRONT_END |
| xassert(job_ptr->batch_host); |
| if (job_ptr->front_end_ptr) |
| agent_args->protocol_version = |
| job_ptr->front_end_ptr->protocol_version; |
| hostlist_push_host(agent_args->hostlist, job_ptr->batch_host); |
| agent_args->node_count = 1; |
| #else |
| agent_args->protocol_version = SLURM_PROTOCOL_VERSION; |
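| /* Walk the job's node bitmap: add every allocated node to the host |
| * list and lower the message protocol version to the oldest slurmd |
| * version present so all nodes can unpack the RPC. */ |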
| for (i = 0; i < node_record_count; i++) { |
| if (bit_test(job_ptr->node_bitmap, i) == 0) |
| continue; |
| if (agent_args->protocol_version > |
| node_record_table_ptr[i].protocol_version) |
| agent_args->protocol_version = |
| node_record_table_ptr[i].protocol_version; |
| hostlist_push_host(agent_args->hostlist, |
| node_record_table_ptr[i].name); |
| agent_args->node_count++; |
| } |
| #endif |
| |
| agent_args->msg_args = job_time_msg_ptr; |
| agent_queue_request(agent_args); |
| return; |
| } |
| |
| |
| /* |
| * job_epilog_complete - Note the completion of the epilog script for a |
| * given job |
| * IN job_id - id of the job for which the epilog was executed |
| * IN node_name - name of the node on which the epilog was executed |
| * IN return_code - return code from epilog script |
| * RET true if job is COMPLETED, otherwise false |
| */ |
| extern bool job_epilog_complete(uint32_t job_id, char *node_name, |
| uint32_t return_code) |
| { |
| #ifdef HAVE_FRONT_END |
| int i; |
| #endif |
| struct job_record *job_ptr = find_job_record(job_id); |
| struct node_record *node_ptr; |
| |
| if (job_ptr == NULL) |
| return true; |
| |
| /* There is a potential race condition this handles. |
| * If slurmctld cold-starts while slurmd keeps running, |
| * slurmd could notify slurmctld of a job epilog completion |
| * before getting synced up with slurmctld state. If |
| * a new job arrives and the job_id is reused, we |
| * could try to note the termination of a job that |
| * hasn't really started. Very rare obviously. */ |
| if ((IS_JOB_PENDING(job_ptr) && (!IS_JOB_COMPLETING(job_ptr))) || |
| (job_ptr->node_bitmap == NULL)) { |
| #ifndef HAVE_FRONT_END |
| uint16_t base_state = NODE_STATE_UNKNOWN; |
| node_ptr = find_node_record(node_name); |
| if (node_ptr) |
| base_state = node_ptr->node_state & NODE_STATE_BASE; |
| if (base_state == NODE_STATE_DOWN) { |
| debug("Epilog complete response for job %u from DOWN " |
| "node %s", job_id, node_name); |
| } else if (job_ptr->restart_cnt) { |
| /* Duplicate epilog complete can be due to race |
| * condition, especially with select/serial */ |
| debug("Duplicate epilog complete response for job %u", |
| job_id); |
| } else { |
| error("Epilog complete response for non-running job " |
| "%u, slurmctld and slurmd out of sync", job_id); |
| } |
| #endif |
| return false; |
| } |
| |
| #ifdef HAVE_FRONT_END |
| xassert(job_ptr->batch_host); |
| /* If there is a bad epilog error don't down the frontend |
| node. If needed (not on a bluegene) the nodes in use by |
| the job will be downed below. |
| */ |
| if (return_code) |
| error("Epilog error for job %u on %s", |
| job_ptr->job_id, job_ptr->batch_host); |
| |
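| /* Each front end node tracks how many of its jobs are still |
| * completing; clear its COMPLETING flag once the last one is done. */ |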
| if (job_ptr->front_end_ptr && IS_JOB_COMPLETING(job_ptr)) { |
| front_end_record_t *front_end_ptr = job_ptr->front_end_ptr; |
| if (front_end_ptr->job_cnt_comp) |
| front_end_ptr->job_cnt_comp--; |
| else { |
| error("job_cnt_comp underflow for for job %u on " |
| "front end %s", |
| job_ptr->job_id, front_end_ptr->name); |
| } |
| if (front_end_ptr->job_cnt_comp == 0) |
| front_end_ptr->node_state &= (~NODE_STATE_COMPLETING); |
| } |
| |
| if ((job_ptr->total_nodes == 0) && IS_JOB_COMPLETING(job_ptr)) { |
| /* Job resources moved into another job and |
| * tasks already killed */ |
| front_end_record_t *front_end_ptr = job_ptr->front_end_ptr; |
| if (front_end_ptr) |
| front_end_ptr->node_state &= (~NODE_STATE_COMPLETING); |
| } else { |
| for (i = 0; i < node_record_count; i++) { |
| if (!bit_test(job_ptr->node_bitmap, i)) |
| continue; |
| node_ptr = &node_record_table_ptr[i]; |
| #ifndef HAVE_BG |
| /* If this is a bluegene system we do not want to mark |
| * the entire midplane down if we have an epilog error. |
| * This would most likely kill other jobs sharing that |
| * midplane and that is not what we want. */ |
| if (return_code) { |
| static uint32_t slurm_user_id = NO_VAL; |
| if (slurm_user_id == NO_VAL) |
| slurm_user_id=slurm_get_slurm_user_id(); |
| drain_nodes(node_ptr->name, "Epilog error", |
| slurm_user_id); |
| } |
| #endif |
| /* Change job from completing to completed */ |
| make_node_idle(node_ptr, job_ptr); |
| } |
| } |
| #else |
| if (return_code) { |
| error("Epilog error on %s, draining the node", node_name); |
| drain_nodes(node_name, "Epilog error", |
| slurm_get_slurm_user_id()); |
| } |
| /* Change job from completing to completed */ |
| node_ptr = find_node_record(node_name); |
| if (node_ptr) |
| make_node_idle(node_ptr, job_ptr); |
| #endif |
| |
| step_epilog_complete(job_ptr, node_name); |
| /* nodes_completing is out of date, rebuild when next saved */ |
| xfree(job_ptr->nodes_completing); |
| if (!IS_JOB_COMPLETING(job_ptr)) { /* COMPLETED */ |
| batch_requeue_fini(job_ptr); |
| return true; |
| } else |
| return false; |
| } |
| |
| /* Complete a batch job requeue logic after all steps complete so that |
| * subsequent jobs appear in a separate accounting record. */ |
| void batch_requeue_fini(struct job_record *job_ptr) |
| { |
| if (IS_JOB_COMPLETING(job_ptr) || |
| !IS_JOB_PENDING(job_ptr) || !job_ptr->batch_flag) |
| return; |
| |
| info("requeue batch job %u", job_ptr->job_id); |
| |
| /* Clear everything so this appears to be a new job and then restart |
| * it in accounting. */ |
| job_ptr->start_time = 0; |
| job_ptr->end_time = 0; |
| job_ptr->total_cpus = 0; |
| job_ptr->pre_sus_time = 0; |
| job_ptr->suspend_time = 0; |
| job_ptr->tot_sus_time = 0; |
| /* Current code (<= 2.1) has it so we start the new job with the next |
| * step id. This could be used when restarting to figure out which |
| * step the previous run of this job stopped on. */ |
| //job_ptr->next_step_id = 0; |
| |
| job_ptr->node_cnt = 0; |
| #ifdef HAVE_BG |
| select_g_select_jobinfo_set(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_BLOCK_ID, "unassigned"); |
| /* If on a bluegene system we want to remove the job_resrcs so |
| * we don't get an error message about them already existing |
| * when the job goes to run again. */ |
| free_job_resources(&job_ptr->job_resrcs); |
| #endif |
| xfree(job_ptr->nodes); |
| xfree(job_ptr->nodes_completing); |
| FREE_NULL_BITMAP(job_ptr->node_bitmap); |
| FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); |
| if (job_ptr->details) { |
| time_t now = time(NULL); |
| /* the time stamp on the new batch launch credential must be |
| * larger than the time stamp on the revoke request. Also the |
| * I/O must be all cleared out and the named socket purged, |
| * so delay for at least ten seconds. */ |
| if (job_ptr->details->begin_time <= now) |
| job_ptr->details->begin_time = now + 10; |
| |
| /* Since this could happen on a launch we need to make sure the |
| * submit isn't the same as the last submit so put now + 1 so |
| * we get different records in the database */ |
| if (now == job_ptr->details->submit_time) |
| now++; |
| job_ptr->details->submit_time = now; |
| } |
| |
| /* Reset this after the batch step has finished or the batch step |
| * information will be attributed to the next run of the job. */ |
| job_ptr->db_index = 0; |
| if (!with_slurmdbd) |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| } |
| |
| |
| /* job_fini - free all memory associated with job records */ |
| void job_fini (void) |
| { |
| if (job_list) { |
| list_destroy(job_list); |
| job_list = NULL; |
| } |
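| /* The hash tables only index records owned by job_list, so freeing |
| * the table arrays themselves is sufficient here. */ |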
| xfree(job_hash); |
| xfree(job_array_hash_j); |
| xfree(job_array_hash_t); |
| } |
| |
| /* log the completion of the specified job */ |
| extern void job_completion_logger(struct job_record *job_ptr, bool requeue) |
| { |
| int base_state; |
| |
| xassert(job_ptr); |
| |
| acct_policy_remove_job_submit(job_ptr); |
| |
| if (!IS_JOB_RESIZING(job_ptr)) { |
| /* Remove configuring state just to make sure it isn't there |
| * since it will throw off displays of the job. */ |
| job_ptr->job_state &= (~JOB_CONFIGURING); |
| |
| /* make sure all parts of the job are notified */ |
| srun_job_complete(job_ptr); |
| |
| /* mail out notifications of completion */ |
| base_state = job_ptr->job_state & JOB_STATE_BASE; |
| if ((base_state == JOB_COMPLETE) || |
| (base_state == JOB_CANCELLED)) { |
| if (requeue && (job_ptr->mail_type & MAIL_JOB_REQUEUE)) |
| mail_job_info(job_ptr, MAIL_JOB_REQUEUE); |
| if (!requeue && (job_ptr->mail_type & MAIL_JOB_END)) |
| mail_job_info(job_ptr, MAIL_JOB_END); |
| } else { /* JOB_FAILED, JOB_TIMEOUT, etc. */ |
| if (job_ptr->mail_type & MAIL_JOB_FAIL) |
| mail_job_info(job_ptr, MAIL_JOB_FAIL); |
| else if (job_ptr->mail_type & MAIL_JOB_END) |
| mail_job_info(job_ptr, MAIL_JOB_END); |
| } |
| } |
| |
| g_slurm_jobcomp_write(job_ptr); |
| |
| /* When starting the resized job everything is taken care of |
| * elsewhere, so don't call it here. */ |
| if (IS_JOB_RESIZING(job_ptr)) |
| return; |
| |
| if (!job_ptr->assoc_id) { |
| slurmdb_association_rec_t assoc_rec; |
| /* In case accounting enabled after starting the job */ |
| memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t)); |
| assoc_rec.acct = job_ptr->account; |
| if (job_ptr->part_ptr) |
| assoc_rec.partition = job_ptr->part_ptr->name; |
| assoc_rec.uid = job_ptr->user_id; |
| |
| if (!(assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| (slurmdb_association_rec_t **) |
| &job_ptr->assoc_ptr, false))) { |
| job_ptr->assoc_id = assoc_rec.id; |
| /* we have to call job start again because the |
| * associd does not get updated in job complete */ |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| } |
| } |
| |
| if (!with_slurmdbd && !job_ptr->db_index) |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| jobacct_storage_g_job_complete(acct_db_conn, job_ptr); |
| } |
| |
| /* |
| * job_independent - determine if this job is dependent upon another job |
| * or if the job's scheduled begin time is in the future |
| * IN job_ptr - pointer to job being tested |
| * RET - true if job no longer must be deferred for another job |
| */ |
| extern bool job_independent(struct job_record *job_ptr, int will_run) |
| { |
| struct job_details *detail_ptr = job_ptr->details; |
| time_t now = time(NULL); |
| int depend_rc; |
| |
| if ((job_ptr->state_reason == WAIT_HELD) || |
| (job_ptr->state_reason == WAIT_HELD_USER)) |
| return false; |
| |
| /* Test dependencies first so we can cancel jobs before dependent |
| * job records get purged (e.g. afterok, afternotok) */ |
| depend_rc = test_job_dependency(job_ptr); |
| if (depend_rc == 1) { |
| job_ptr->state_reason = WAIT_DEPENDENCY; |
| xfree(job_ptr->state_desc); |
| return false; |
| } else if (depend_rc == 2) { |
| time_t now = time(NULL); |
| info("Job dependency can't be satisfied, cancelling job %u", |
| job_ptr->job_id); |
| job_ptr->job_state = JOB_CANCELLED; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = now; |
| job_ptr->end_time = now; |
| srun_allocate_abort(job_ptr); |
| job_completion_logger(job_ptr, false); |
| return false; |
| } |
| |
| if (detail_ptr && (detail_ptr->begin_time > now)) { |
| job_ptr->state_reason = WAIT_TIME; |
| xfree(job_ptr->state_desc); |
| return false; /* not yet time */ |
| } |
| |
| if (job_test_resv_now(job_ptr) != SLURM_SUCCESS) { |
| job_ptr->state_reason = WAIT_RESERVATION; |
| xfree(job_ptr->state_desc); |
| return false; /* not yet time */ |
| } |
| |
| /* Job is eligible to start now */ |
| if (job_ptr->state_reason == WAIT_DEPENDENCY) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| } |
| if ((detail_ptr && (detail_ptr->begin_time == 0) && |
| (job_ptr->priority != 0))) { |
| detail_ptr->begin_time = now; |
| } else if (job_ptr->state_reason == WAIT_TIME) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| } |
| return true; |
| } |
| |
| /* |
| * determine if job is ready to execute per the node select plugin |
| * IN job_id - job to test |
| * OUT ready - 1 if job is ready to execute, 0 otherwise |
| * RET SLURM error code |
| */ |
| extern int job_node_ready(uint32_t job_id, int *ready) |
| { |
| int rc; |
| struct job_record *job_ptr; |
| xassert(ready); |
| |
| *ready = 0; |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) |
| return ESLURM_INVALID_JOB_ID; |
| |
| /* Always call select_g_job_ready() so that select/bluegene can |
| * test and update block state information. */ |
| rc = select_g_job_ready(job_ptr); |
| if (rc == READY_JOB_FATAL) |
| return ESLURM_INVALID_PARTITION_NAME; |
| if (rc == READY_JOB_ERROR) |
| return EAGAIN; |
| if (rc) |
| rc = READY_NODE_STATE; |
| |
| if (job_ptr->details && job_ptr->details->prolog_running) |
| rc &= (~READY_NODE_STATE); |
| |
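| /* The result is a bit mask: READY_NODE_STATE if the allocated nodes |
| * are ready for use and READY_JOB_STATE if the job itself is running |
| * or suspended. */ |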
| if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) |
| rc |= READY_JOB_STATE; |
| if ((rc == (READY_NODE_STATE | READY_JOB_STATE)) && |
| job_ptr->alias_list && !strcmp(job_ptr->alias_list, "TBD") && |
| job_ptr->node_bitmap && |
| (bit_overlap(power_node_bitmap, job_ptr->node_bitmap) == 0)) { |
| set_job_alias_list(job_ptr); |
| } |
| |
| *ready = rc; |
| return SLURM_SUCCESS; |
| } |
| |
| /* Send specified signal to all steps associated with a job */ |
| static void _signal_job(struct job_record *job_ptr, int signal) |
| { |
| #ifndef HAVE_FRONT_END |
| int i; |
| #endif |
| agent_arg_t *agent_args = NULL; |
| signal_job_msg_t *signal_job_msg = NULL; |
| static int notify_srun_static = -1; |
| int notify_srun = 0; |
| |
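| /* Determine once (cached in a static) whether the configured launch |
| * plugin requires signals to be routed through srun rather than |
| * delivered directly by slurmd. */ |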
| if (notify_srun_static == -1) { |
| char *launch_type = slurm_get_launch_type(); |
| /* do this for all but slurm (poe, aprun, etc...) */ |
| if (strcmp(launch_type, "launch/slurm")) |
| notify_srun_static = 1; |
| else |
| notify_srun_static = 0; |
| xfree(launch_type); |
| } |
| |
| #ifdef HAVE_FRONT_END |
| /* On a front end system always notify_srun instead of slurmd */ |
| if (notify_srun_static) |
| notify_srun = 1; |
| #else |
| /* For launch/poe all signals are forwarded by srun to poe to tasks |
| * except SIGSTOP/SIGCONT, which are used for job preemption. In that |
| * case the slurmd must directly suspend tasks and switch resources. */ |
| if (notify_srun_static && (signal != SIGSTOP) && (signal != SIGCONT)) |
| notify_srun = 1; |
| #endif |
| |
| if (notify_srun) { |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| step_iterator = list_iterator_create(job_ptr->step_list); |
| while ((step_ptr = list_next(step_iterator))) { |
| /* Since we have already checked the uid, |
| * we can send this signal as uid 0. */ |
| job_step_signal(job_ptr->job_id, step_ptr->step_id, |
| signal, 0); |
| } |
| list_iterator_destroy (step_iterator); |
| |
| return; |
| } |
| |
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| agent_args->msg_type = REQUEST_SIGNAL_JOB; |
| agent_args->retry = 1; |
| agent_args->hostlist = hostlist_create(NULL); |
| signal_job_msg = xmalloc(sizeof(signal_job_msg_t)); |
| signal_job_msg->job_id = job_ptr->job_id; |
| signal_job_msg->signal = signal; |
| |
| #ifdef HAVE_FRONT_END |
| xassert(job_ptr->batch_host); |
| if (job_ptr->front_end_ptr) |
| agent_args->protocol_version = |
| job_ptr->front_end_ptr->protocol_version; |
| hostlist_push_host(agent_args->hostlist, job_ptr->batch_host); |
| agent_args->node_count = 1; |
| #else |
| agent_args->protocol_version = SLURM_PROTOCOL_VERSION; |
| for (i = 0; i < node_record_count; i++) { |
| if (bit_test(job_ptr->node_bitmap, i) == 0) |
| continue; |
| if (agent_args->protocol_version > |
| node_record_table_ptr[i].protocol_version) |
| agent_args->protocol_version = |
| node_record_table_ptr[i].protocol_version; |
| hostlist_push_host(agent_args->hostlist, |
| node_record_table_ptr[i].name); |
| agent_args->node_count++; |
| } |
| #endif |
| |
| if (agent_args->node_count == 0) { |
| xfree(signal_job_msg); |
| xfree(agent_args); |
| return; |
| } |
| |
| agent_args->msg_args = signal_job_msg; |
| agent_queue_request(agent_args); |
| return; |
| } |
| |
| static void *_switch_suspend_info(struct job_record *job_ptr) |
| { |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| void *switch_suspend_info = NULL; |
| |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| if (step_ptr->state != JOB_RUNNING) |
| continue; |
| switch_g_job_suspend_info_get(step_ptr->switch_job, |
| &switch_suspend_info); |
| } |
| list_iterator_destroy (step_iterator); |
| |
| return switch_suspend_info; |
| } |
| |
| /* Send suspend request to slurmd of all nodes associated with a job |
| * job_ptr IN - job to be suspended or resumed |
| * op IN - SUSPEND_JOB or RESUME_JOB |
| * indf_susp IN - set if job is being suspended indefinitely by user |
| * or admin, otherwise suspended for gang scheduling |
| */ |
| static void _suspend_job(struct job_record *job_ptr, uint16_t op, |
| bool indf_susp) |
| { |
| #ifndef HAVE_FRONT_END |
| int i; |
| #endif |
| agent_arg_t *agent_args; |
| suspend_int_msg_t *sus_ptr; |
| |
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| agent_args->msg_type = REQUEST_SUSPEND_INT; |
| agent_args->retry = 0; /* don't resend, gang scheduler |
| * sched/wiki or sched/wiki2 can |
| * quickly induce huge backlog |
| * of agent.c RPCs */ |
| agent_args->hostlist = hostlist_create(NULL); |
| sus_ptr = xmalloc(sizeof(suspend_int_msg_t)); |
| sus_ptr->job_core_spec = job_ptr->details->core_spec; |
| sus_ptr->job_id = job_ptr->job_id; |
| sus_ptr->op = op; |
| sus_ptr->indf_susp = indf_susp; |
| sus_ptr->switch_info = _switch_suspend_info(job_ptr); |
| |
| #ifdef HAVE_FRONT_END |
| xassert(job_ptr->batch_host); |
| if (job_ptr->front_end_ptr) |
| agent_args->protocol_version = |
| job_ptr->front_end_ptr->protocol_version; |
| hostlist_push_host(agent_args->hostlist, job_ptr->batch_host); |
| agent_args->node_count = 1; |
| #else |
| agent_args->protocol_version = SLURM_PROTOCOL_VERSION; |
| for (i = 0; i < node_record_count; i++) { |
| if (bit_test(job_ptr->node_bitmap, i) == 0) |
| continue; |
| if (agent_args->protocol_version > |
| node_record_table_ptr[i].protocol_version) |
| agent_args->protocol_version = |
| node_record_table_ptr[i].protocol_version; |
| hostlist_push_host(agent_args->hostlist, |
| node_record_table_ptr[i].name); |
| agent_args->node_count++; |
| } |
| #endif |
| |
| if (agent_args->node_count == 0) { |
| slurm_free_suspend_int_msg(sus_ptr); |
| xfree(agent_args); |
| return; |
| } |
| |
| agent_args->msg_args = sus_ptr; |
| agent_queue_request(agent_args); |
| return; |
| } |
| |
| /* |
| * Specified job is being suspended, release allocated nodes |
| * job_ptr IN - job to be suspended |
| * indf_susp IN - set if job is being suspended indefinitely by user |
| * or admin, otherwise suspended for gang scheduling |
| */ |
| static int _suspend_job_nodes(struct job_record *job_ptr, bool indf_susp) |
| { |
| int i, rc = SLURM_SUCCESS; |
| struct node_record *node_ptr = node_record_table_ptr; |
| uint16_t node_flags; |
| time_t now = time(NULL); |
| |
| if ((rc = select_g_job_suspend(job_ptr, indf_susp)) != SLURM_SUCCESS) |
| return rc; |
| |
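| /* For each node allocated to the job, move it from the running to the |
| * suspended job count and update its state and scheduling bitmaps. */ |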
| for (i=0; i<node_record_count; i++, node_ptr++) { |
| if (bit_test(job_ptr->node_bitmap, i) == 0) |
| continue; |
| |
| node_ptr->sus_job_cnt++; |
| if (node_ptr->run_job_cnt) |
| (node_ptr->run_job_cnt)--; |
| else { |
| error("Node %s run_job_cnt underflow", |
| node_ptr->name); |
| } |
| if (job_ptr->details && (job_ptr->details->share_res == 0)) { |
| if (node_ptr->no_share_job_cnt) |
| (node_ptr->no_share_job_cnt)--; |
| else { |
| error("Node %s no_share_job_cnt " |
| "underflow", node_ptr->name); |
| } |
| if (node_ptr->no_share_job_cnt == 0) |
| bit_set(share_node_bitmap, i); |
| } |
| node_flags = node_ptr->node_state & NODE_STATE_FLAGS; |
| if ((node_ptr->run_job_cnt == 0) && |
| (node_ptr->comp_job_cnt == 0)) { |
| bit_set(idle_node_bitmap, i); |
| } |
| if (IS_NODE_DOWN(node_ptr)) { |
| debug3("_suspend_job_nodes: Node %s left DOWN", |
| node_ptr->name); |
| } else if (node_ptr->run_job_cnt) { |
| node_ptr->node_state = NODE_STATE_ALLOCATED | |
| node_flags; |
| } else { |
| node_ptr->node_state = NODE_STATE_IDLE | node_flags; |
| node_ptr->last_idle = now; |
| } |
| } |
| last_job_update = last_node_update = now; |
| return rc; |
| } |
| |
| /* |
| * Specified job is being resumed, re-allocate the nodes |
| * job_ptr IN - job to be resumed |
| * indf_susp IN - set if job is being resumed from indefinite suspend by user |
| * or admin, otherwise resume from gang scheduling |
| */ |
| static int _resume_job_nodes(struct job_record *job_ptr, bool indf_susp) |
| { |
| int i, rc = SLURM_SUCCESS; |
| struct node_record *node_ptr = node_record_table_ptr; |
| uint16_t node_flags; |
| |
| if ((rc = select_g_job_resume(job_ptr, indf_susp)) != SLURM_SUCCESS) |
| return rc; |
| |
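| /* First verify that no allocated node is DOWN; resuming a job onto a |
| * DOWN node would fail. */ |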
| for (i=0; i<node_record_count; i++, node_ptr++) { |
| if (bit_test(job_ptr->node_bitmap, i) == 0) |
| continue; |
| if (IS_NODE_DOWN(node_ptr)) |
| return SLURM_ERROR; |
| } |
| |
| node_ptr = node_record_table_ptr; |
| for (i=0; i<node_record_count; i++, node_ptr++) { |
| if (bit_test(job_ptr->node_bitmap, i) == 0) |
| continue; |
| |
| if (node_ptr->sus_job_cnt) |
| (node_ptr->sus_job_cnt)--; |
| else { |
| error("Node %s sus_job_cnt underflow", |
| node_ptr->name); |
| } |
| node_ptr->run_job_cnt++; |
| if (job_ptr->details && |
| (job_ptr->details->share_res == 0)) { |
| node_ptr->no_share_job_cnt++; |
| if (node_ptr->no_share_job_cnt) |
| bit_clear(share_node_bitmap, i); |
| } |
| bit_clear(idle_node_bitmap, i); |
| node_flags = node_ptr->node_state & NODE_STATE_FLAGS; |
| node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags; |
| } |
| last_job_update = last_node_update = time(NULL); |
| return rc; |
| } |
| |
| static int _job_suspend_switch_test(struct job_record *job_ptr) |
| { |
| int rc = SLURM_SUCCESS; |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| |
| step_iterator = list_iterator_create(job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| if (step_ptr->state != JOB_RUNNING) |
| continue; |
| rc = switch_g_job_suspend_test(step_ptr->switch_job); |
| if (rc != SLURM_SUCCESS) |
| break; |
| } |
| list_iterator_destroy (step_iterator); |
| |
| return rc; |
| } |
| |
| /* |
| * job_suspend - perform some suspend/resume operation |
| * IN sus_ptr - suspend/resume request message |
| * IN uid - user id of the user issuing the RPC |
| * IN conn_fd - file descriptor on which to send reply, |
| * -1 if none |
| * indf_susp IN - set if job is being suspended indefinitely by user or admin |
| * and we should clear its priority, otherwise suspended |
| * temporarily for gang scheduling |
| * IN protocol_version - slurm protocol version of client |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, |
| slurm_fd_t conn_fd, bool indf_susp, |
| uint16_t protocol_version) |
| { |
| int rc = SLURM_SUCCESS; |
| time_t now = time(NULL); |
| struct job_record *job_ptr = NULL; |
| slurm_msg_t resp_msg; |
| return_code_msg_t rc_msg; |
| #ifdef HAVE_BG |
| rc = ESLURM_NOT_SUPPORTED; |
| #endif |
| if (rc) |
| goto reply; |
| |
| /* find the job */ |
| job_ptr = find_job_record (sus_ptr->job_id); |
| if (job_ptr == NULL) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| |
| /* validate the request */ |
| if ((uid != 0) && (uid != getuid())) { |
| rc = ESLURM_ACCESS_DENIED; |
| goto reply; |
| } |
| if (IS_JOB_PENDING(job_ptr)) { |
| rc = ESLURM_JOB_PENDING; |
| goto reply; |
| } |
| if (IS_JOB_FINISHED(job_ptr)) { |
| rc = ESLURM_ALREADY_DONE; |
| goto reply; |
| } |
| if ((sus_ptr->op == SUSPEND_JOB) && |
| (_job_suspend_switch_test(job_ptr) != SLURM_SUCCESS)) { |
| rc = ESLURM_NOT_SUPPORTED; |
| goto reply; |
| } |
| |
| /* Notify salloc/srun of suspend/resume */ |
| srun_job_suspend(job_ptr, sus_ptr->op); |
| |
| /* perform the operation */ |
| if (sus_ptr->op == SUSPEND_JOB) { |
| if (!IS_JOB_RUNNING(job_ptr)) { |
| rc = ESLURM_DISABLED; |
| goto reply; |
| } |
| rc = _suspend_job_nodes(job_ptr, indf_susp); |
| if (rc != SLURM_SUCCESS) |
| goto reply; |
| _suspend_job(job_ptr, sus_ptr->op, indf_susp); |
| job_ptr->job_state = JOB_SUSPENDED; |
| if (indf_susp) |
| job_ptr->priority = 0; |
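| /* Accumulate the run time used before this suspension so the |
| * remaining time limit can be computed when the job is resumed. */ |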
| if (job_ptr->suspend_time) { |
| job_ptr->pre_sus_time += |
| difftime(now, |
| job_ptr->suspend_time); |
| } else { |
| job_ptr->pre_sus_time += |
| difftime(now, |
| job_ptr->start_time); |
| } |
| suspend_job_step(job_ptr); |
| } else if (sus_ptr->op == RESUME_JOB) { |
| if (!IS_JOB_SUSPENDED(job_ptr)) { |
| rc = ESLURM_DISABLED; |
| goto reply; |
| } |
| rc = _resume_job_nodes(job_ptr, indf_susp); |
| if (rc != SLURM_SUCCESS) |
| goto reply; |
| _suspend_job(job_ptr, sus_ptr->op, indf_susp); |
| if (job_ptr->priority == 0) |
| set_job_prio(job_ptr); |
| job_ptr->job_state = JOB_RUNNING; |
| job_ptr->tot_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| if (!wiki_sched_test) { |
| char *sched_type = slurm_get_sched_type(); |
| if (strcmp(sched_type, "sched/wiki") == 0) |
| wiki_sched = true; |
| if (strcmp(sched_type, "sched/wiki2") == 0) { |
| wiki_sched = true; |
| wiki2_sched = true; |
| } |
| xfree(sched_type); |
| wiki_sched_test = true; |
| } |
| if ((job_ptr->time_limit != INFINITE) && (!wiki2_sched) && |
| (!job_ptr->preempt_time)) { |
| debug3("Job %u resumed, updating end_time", |
| job_ptr->job_id); |
| job_ptr->end_time = now + |
| (job_ptr->time_limit * 60) |
| - job_ptr->pre_sus_time; |
| } |
| resume_job_step(job_ptr); |
| } |
| |
| job_ptr->time_last_active = now; |
| job_ptr->suspend_time = now; |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| |
| reply: |
| if (conn_fd >= 0) { |
| slurm_msg_t_init(&resp_msg); |
| resp_msg.protocol_version = protocol_version; |
| resp_msg.msg_type = RESPONSE_SLURM_RC; |
| rc_msg.return_code = rc; |
| resp_msg.data = &rc_msg; |
| slurm_send_node_msg(conn_fd, &resp_msg); |
| } |
| return rc; |
| } |
| |
| /* |
| * job_requeue - Requeue a running or pending batch job |
| * IN uid - user id of user issuing the RPC |
| * IN job_id - id of the job to be requeued |
| * IN conn_fd - file descriptor on which to send reply |
| * IN protocol_version - slurm protocol version of client |
| * IN preempt - true if job being preempted |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_requeue(uid_t uid, |
| uint32_t job_id, |
| slurm_fd_t conn_fd, |
| uint16_t protocol_version, |
| bool preempt) |
| { |
| int rc = SLURM_SUCCESS; |
| struct job_record *job_ptr = NULL; |
| bool suspended = false; |
| slurm_msg_t resp_msg; |
| return_code_msg_t rc_msg; |
| time_t now = time(NULL); |
| bool is_running; |
| |
| /* find the job */ |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| |
| /* validate the request */ |
| if ((uid != job_ptr->user_id) && !validate_operator(uid) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
| job_ptr->account)) { |
| rc = ESLURM_ACCESS_DENIED; |
| goto reply; |
| } |
| |
| /* If the partition was removed don't allow the job to be |
| * requeued. If it doesn't have details then something is very |
| * wrong, and if the job is not marked as requeue-able don't requeue it. |
| */ |
| if (!job_ptr->part_ptr || !job_ptr->details |
| || !job_ptr->details->requeue) { |
| rc = ESLURM_DISABLED; |
| goto reply; |
| } |
| |
| /* If the job is in the process of completing, |
| * return SLURM_SUCCESS; the caller will record |
| * the request. |
| */ |
| if (IS_JOB_COMPLETING(job_ptr)) { |
| goto reply; |
| } |
| |
| /* If the job is already pending do nothing |
| * and simply report that back to the library. |
| */ |
| if (IS_JOB_PENDING(job_ptr)) { |
| rc = ESLURM_JOB_PENDING; |
| goto reply; |
| } |
| |
| if (job_ptr->batch_flag == 0) { |
| debug("Job-requeue can only be done for batch jobs"); |
| rc = ESLURM_BATCH_ONLY; |
| goto reply; |
| } |
| |
| slurm_sched_g_requeue(job_ptr, "Job requeued by user/admin"); |
| last_job_update = now; |
| |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| enum job_states suspend_job_state = job_ptr->job_state; |
| /* we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_ptr->job_state = JOB_REQUEUE; |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_ptr->job_state = suspend_job_state; |
| suspended = true; |
| } |
| |
| job_ptr->time_last_active = now; |
| if (suspended) |
| job_ptr->end_time = job_ptr->suspend_time; |
| else |
| job_ptr->end_time = now; |
| |
| /* Save the state of the job so that |
| * we deallocate the nodes if it is |
| * in a running or suspended state. |
| */ |
| is_running = false; |
| if (IS_JOB_SUSPENDED(job_ptr) |
| || IS_JOB_RUNNING(job_ptr)) |
| is_running = true; |
| |
| /* We want this job to have the requeued state in the |
| * accounting logs. Set a new submit time so the restarted |
| * job looks like a new job. */ |
| job_ptr->job_state = JOB_REQUEUE; |
| build_cg_bitmap(job_ptr); |
| job_completion_logger(job_ptr, true); |
| |
| /* Deallocate resources only if the job |
| * has some. |
| */ |
| if (is_running) |
| deallocate_nodes(job_ptr, false, suspended, preempt); |
| |
| xfree(job_ptr->details->req_node_layout); |
| |
| /* do this after the epilog complete, setting it here is too early */ |
| //job_ptr->db_index = 0; |
| //job_ptr->details->submit_time = now; |
| |
| job_ptr->job_state = JOB_PENDING; |
| if (job_ptr->node_cnt) |
| job_ptr->job_state |= JOB_COMPLETING; |
| |
| job_ptr->pre_sus_time = (time_t) 0; |
| job_ptr->suspend_time = (time_t) 0; |
| job_ptr->tot_sus_time = (time_t) 0; |
| |
| job_ptr->restart_cnt++; |
| /* Since the job completion logger removes the submit we need |
| * to add it again. */ |
| acct_policy_add_job_submit(job_ptr); |
| |
| reply: |
| if (conn_fd >= 0) { |
| slurm_msg_t_init(&resp_msg); |
| resp_msg.protocol_version = protocol_version; |
| resp_msg.msg_type = RESPONSE_SLURM_RC; |
| rc_msg.return_code = rc; |
| resp_msg.data = &rc_msg; |
| slurm_send_node_msg(conn_fd, &resp_msg); |
| } |
| return rc; |
| } |
| |
| /* |
| * job_end_time - Process JOB_END_TIME |
| * IN time_req_msg - job end time request |
| * OUT timeout_msg - job timeout response to be sent |
| * RET SLURM_SUCCESS or an error code |
| */ |
| extern int job_end_time(job_alloc_info_msg_t *time_req_msg, |
| srun_timeout_msg_t *timeout_msg) |
| { |
| struct job_record *job_ptr; |
| xassert(timeout_msg); |
| |
| job_ptr = find_job_record(time_req_msg->job_id); |
| if (!job_ptr) |
| return ESLURM_INVALID_JOB_ID; |
| |
| timeout_msg->job_id = time_req_msg->job_id; |
| timeout_msg->step_id = NO_VAL; |
| timeout_msg->timeout = job_ptr->end_time; |
| return SLURM_SUCCESS; |
| } |
| |
| /* Reset nodes_completing field for all jobs. |
| * Job write lock must be set before calling. */ |
| extern void update_job_nodes_completing(void) |
| { |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| |
| if (!job_list) |
| return; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if ((!IS_JOB_COMPLETING(job_ptr)) || |
| (job_ptr->node_bitmap == NULL)) |
| continue; |
| xfree(job_ptr->nodes_completing); |
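| /* Prefer the completing-node bitmap when available since it reflects |
| * only the nodes still in the COMPLETING state. */ |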
| if (job_ptr->node_bitmap_cg) { |
| job_ptr->nodes_completing = |
| bitmap2node_name(job_ptr->node_bitmap_cg); |
| } else { |
| job_ptr->nodes_completing = |
| bitmap2node_name(job_ptr->node_bitmap); |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| } |
| |
| /* |
| * job_hold_by_assoc_id - Hold all pending jobs with a given |
| * association ID. This happens when an association is deleted (e.g. when |
| * a user is removed from the association database). |
| * RET count of held jobs |
| */ |
| extern int job_hold_by_assoc_id(uint32_t assoc_id) |
| { |
| int cnt = 0; |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| /* Write lock on jobs */ |
| slurmctld_lock_t job_write_lock = |
| { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; |
| |
| if (!job_list) |
| return cnt; |
| |
| lock_slurmctld(job_write_lock); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (job_ptr->assoc_id != assoc_id) |
| continue; |
| |
| /* move up to the parent that should still exist */ |
| if (job_ptr->assoc_ptr) { |
| /* Force a start record so the association isn't |
| * lost, since there could be some delay |
| * in the start of the job when running with |
| * the slurmdbd. |
| */ |
| if (!job_ptr->db_index) { |
| jobacct_storage_g_job_start(acct_db_conn, |
| job_ptr); |
| } |
| |
| job_ptr->assoc_ptr = |
| ((slurmdb_association_rec_t *) |
| job_ptr->assoc_ptr)->usage->parent_assoc_ptr; |
| if (job_ptr->assoc_ptr) |
| job_ptr->assoc_id = |
| ((slurmdb_association_rec_t *) |
| job_ptr->assoc_ptr)->id; |
| } |
| |
| if (IS_JOB_FINISHED(job_ptr)) |
| continue; |
| |
| info("Association deleted, holding job %u", |
| job_ptr->job_id); |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = FAIL_ACCOUNT; |
| cnt++; |
| } |
| list_iterator_destroy(job_iterator); |
| unlock_slurmctld(job_write_lock); |
| return cnt; |
| } |
| |
| /* |
| * job_hold_by_qos_id - Hold all pending jobs with a given |
| * QOS ID. This happens when a QOS is deleted (e.g. when |
| * a QOS is removed from the association database). |
| * RET count of held jobs |
| */ |
| extern int job_hold_by_qos_id(uint32_t qos_id) |
| { |
| int cnt = 0; |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| /* Write lock on jobs */ |
| slurmctld_lock_t job_write_lock = |
| { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; |
| |
| if (!job_list) |
| return cnt; |
| |
| lock_slurmctld(job_write_lock); |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (job_ptr->qos_id != qos_id) |
| continue; |
| |
| /* move up to the parent that should still exist */ |
| if (job_ptr->qos_ptr) { |
| /* Force a start record so the job's accounting information |
| * isn't lost, since there could be some delay |
| * in the start of the job when running with |
| * the slurmdbd. |
| */ |
| if (!job_ptr->db_index) { |
| jobacct_storage_g_job_start(acct_db_conn, |
| job_ptr); |
| } |
| job_ptr->qos_ptr = NULL; |
| } |
| |
| if (IS_JOB_FINISHED(job_ptr)) |
| continue; |
| |
| info("QOS deleted, holding job %u", job_ptr->job_id); |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = FAIL_QOS; |
| cnt++; |
| } |
| list_iterator_destroy(job_iterator); |
| unlock_slurmctld(job_write_lock); |
| return cnt; |
| } |
| |
| /* |
| * Modify the account associated with a pending job |
| * IN module - where this is called from |
| * IN job_ptr - pointer to job which should be modified |
| * IN new_account - desired account name |
| * RET SLURM_SUCCESS or error code |
| */ |
| extern int update_job_account(char *module, struct job_record *job_ptr, |
| char *new_account) |
| { |
| slurmdb_association_rec_t assoc_rec; |
| |
| if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) { |
| info("%s: attempt to modify account for non-pending " |
| "job_id %u", module, job_ptr->job_id); |
| return ESLURM_DISABLED; |
| } |
| |
| |
| memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t)); |
| assoc_rec.acct = new_account; |
| if (job_ptr->part_ptr) |
| assoc_rec.partition = job_ptr->part_ptr->name; |
| assoc_rec.uid = job_ptr->user_id; |
| if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| (slurmdb_association_rec_t **) |
| &job_ptr->assoc_ptr, false)) { |
| info("%s: invalid account %s for job_id %u", |
| module, new_account, job_ptr->job_id); |
| return ESLURM_INVALID_ACCOUNT; |
| } else if (association_based_accounting && |
| !job_ptr->assoc_ptr && |
| !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) { |
| /* if not enforcing associations we want to look for |
| * the default account and use it to avoid getting |
| * trash in the accounting records. |
| */ |
| assoc_rec.acct = NULL; |
| assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| (slurmdb_association_rec_t **) |
| &job_ptr->assoc_ptr, false); |
| if (!job_ptr->assoc_ptr) { |
| debug("%s: we didn't have an association for account " |
| "'%s' and user '%u', and we can't seem to find " |
| "a default one either. Keeping new account " |
| "'%s'. This will produce trash in accounting. " |
| "If this is not what you desire please put " |
| "AccountStorageEnforce=associations " |
| "in your slurm.conf " |
| "file.", module, new_account, |
| job_ptr->user_id, new_account); |
| assoc_rec.acct = new_account; |
| } |
| } |
| |
| xfree(job_ptr->account); |
| if (assoc_rec.acct && assoc_rec.acct[0] != '\0') { |
| job_ptr->account = xstrdup(assoc_rec.acct); |
| info("%s: setting account to %s for job_id %u", |
| module, assoc_rec.acct, job_ptr->job_id); |
| } else { |
| info("%s: cleared account for job_id %u", |
| module, job_ptr->job_id); |
| } |
| job_ptr->assoc_id = assoc_rec.id; |
| |
| last_job_update = time(NULL); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Modify the wckey associated with a pending job |
| * IN module - where this is called from |
| * IN job_ptr - pointer to job which should be modified |
| * IN new_wckey - desired wckey name |
| * RET SLURM_SUCCESS or error code |
| */ |
| extern int update_job_wckey(char *module, struct job_record *job_ptr, |
| char *new_wckey) |
| { |
| slurmdb_wckey_rec_t wckey_rec, *wckey_ptr; |
| |
| if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) { |
| info("%s: attempt to modify account for non-pending " |
| "job_id %u", module, job_ptr->job_id); |
| return ESLURM_DISABLED; |
| } |
| |
| memset(&wckey_rec, 0, sizeof(slurmdb_wckey_rec_t)); |
| wckey_rec.uid = job_ptr->user_id; |
| wckey_rec.name = new_wckey; |
| if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec, |
| accounting_enforce, &wckey_ptr)) { |
| info("%s: invalid wckey %s for job_id %u", |
| module, new_wckey, job_ptr->job_id); |
| return ESLURM_INVALID_WCKEY; |
| } else if (association_based_accounting |
| && !wckey_ptr |
| && !(accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS)) { |
| /* if not enforcing wckeys we want to look for |
| the default wckey and use it to avoid getting |
| trash in the accounting records. |
| */ |
| wckey_rec.name = NULL; |
| assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec, |
| accounting_enforce, &wckey_ptr); |
| if (!wckey_ptr) { |
| debug("%s: we didn't have a wckey record for wckey " |
| "'%s' and user '%u', and we can't seem to find " |
| "a default one either. Setting it anyway. " |
| "This will produce trash in accounting. " |
| "If this is not what you desire please put " |
| "AccountStorageEnforce=wckeys in your slurm.conf " |
| "file.", module, new_wckey, |
| job_ptr->user_id); |
| wckey_rec.name = new_wckey; |
| } |
| } |
| |
| xfree(job_ptr->wckey); |
| if (wckey_rec.name && wckey_rec.name[0] != '\0') { |
| job_ptr->wckey = xstrdup(wckey_rec.name); |
| info("%s: setting wckey to %s for job_id %u", |
| module, wckey_rec.name, job_ptr->job_id); |
| } else { |
| info("%s: cleared wckey for job_id %u", |
| module, job_ptr->job_id); |
| } |
| |
| last_job_update = time(NULL); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int send_jobs_to_accounting(void) |
| { |
| ListIterator itr = NULL; |
| struct job_record *job_ptr; |
| slurmctld_lock_t job_write_lock = { |
| NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; |
| |
| /* send jobs in pending or running state */ |
| lock_slurmctld(job_write_lock); |
| itr = list_iterator_create(job_list); |
| while ((job_ptr = list_next(itr))) { |
| if (!job_ptr->assoc_id) { |
| slurmdb_association_rec_t assoc_rec; |
| memset(&assoc_rec, 0, |
| sizeof(slurmdb_association_rec_t)); |
| assoc_rec.acct = job_ptr->account; |
| if (job_ptr->part_ptr) |
| assoc_rec.partition = job_ptr->part_ptr->name; |
| assoc_rec.uid = job_ptr->user_id; |
| |
| if (assoc_mgr_fill_in_assoc( |
| acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| (slurmdb_association_rec_t **) |
| &job_ptr->assoc_ptr, false) && |
| (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) |
| && (!IS_JOB_FINISHED(job_ptr))) { |
| info("Holding job %u with " |
| "invalid association", |
| job_ptr->job_id); |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = FAIL_ACCOUNT; |
| continue; |
| } else |
| job_ptr->assoc_id = assoc_rec.id; |
| } |
| |
| /* we only want active jobs that are not yet accounted for */ |
| if (job_ptr->db_index || IS_JOB_FINISHED(job_ptr)) |
| continue; |
| |
| debug("first reg: starting job %u in accounting", |
| job_ptr->job_id); |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| if (IS_JOB_SUSPENDED(job_ptr)) |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| } |
| list_iterator_destroy(itr); |
| unlock_slurmctld(job_write_lock); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* Perform checkpoint operation on a job */ |
| extern int job_checkpoint(checkpoint_msg_t *ckpt_ptr, uid_t uid, |
| slurm_fd_t conn_fd, uint16_t protocol_version) |
| { |
| int rc = SLURM_SUCCESS; |
| struct job_record *job_ptr; |
| struct step_record *step_ptr; |
| checkpoint_resp_msg_t resp_data; |
| slurm_msg_t resp_msg; |
| |
| slurm_msg_t_init(&resp_msg); |
| resp_msg.protocol_version = protocol_version; |
| |
| /* find the job */ |
| job_ptr = find_job_record (ckpt_ptr->job_id); |
| if (job_ptr == NULL) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| if ((uid != job_ptr->user_id) && !validate_slurm_user(uid)) { |
| rc = ESLURM_ACCESS_DENIED ; |
| goto reply; |
| } |
| if (IS_JOB_PENDING(job_ptr)) { |
| rc = ESLURM_JOB_PENDING; |
| goto reply; |
| } else if (IS_JOB_SUSPENDED(job_ptr)) { |
| /* job can't get cycles for checkpoint |
| * if it is already suspended */ |
| rc = ESLURM_DISABLED; |
| goto reply; |
| } else if (!IS_JOB_RUNNING(job_ptr)) { |
| rc = ESLURM_ALREADY_DONE; |
| goto reply; |
| } |
| |
| memset((void *)&resp_data, 0, sizeof(checkpoint_resp_msg_t)); |
| |
| if (job_ptr->batch_flag) { /* operate on batch job */ |
| if ((ckpt_ptr->op == CHECK_CREATE) || |
| (ckpt_ptr->op == CHECK_REQUEUE) || |
| (ckpt_ptr->op == CHECK_VACATE)) { |
| if (job_ptr->details == NULL) { |
| rc = ESLURM_DISABLED; |
| goto reply; |
| } |
| if (ckpt_ptr->image_dir == NULL) { |
| if (job_ptr->details->ckpt_dir == NULL) { |
| rc = ESLURM_DISABLED; |
| goto reply; |
| } |
| ckpt_ptr->image_dir = xstrdup(job_ptr->details |
| ->ckpt_dir); |
| } |
| |
| rc = _checkpoint_job_record(job_ptr, |
| ckpt_ptr->image_dir); |
| if (rc != SLURM_SUCCESS) |
| goto reply; |
| } |
| /* append job id to ckpt image dir */ |
| xstrfmtcat(ckpt_ptr->image_dir, "/%u", job_ptr->job_id); |
| rc = checkpoint_op(ckpt_ptr->job_id, ckpt_ptr->step_id, NULL, |
| ckpt_ptr->op, ckpt_ptr->data, |
| ckpt_ptr->image_dir, &resp_data.event_time, |
| &resp_data.error_code, |
| &resp_data.error_msg); |
| info("checkpoint_op %u of %u.%u complete, rc=%d", |
| ckpt_ptr->op, ckpt_ptr->job_id, ckpt_ptr->step_id, rc); |
| last_job_update = time(NULL); |
| } else { /* operate on all of a job's steps */ |
| int update_rc = -2; |
| ListIterator step_iterator; |
| |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) |
| list_next (step_iterator))) { |
| char *image_dir = NULL; |
| if (step_ptr->state != JOB_RUNNING) |
| continue; |
| if (ckpt_ptr->image_dir) { |
| image_dir = xstrdup(ckpt_ptr->image_dir); |
| } else { |
| image_dir = xstrdup(step_ptr->ckpt_dir); |
| } |
| xstrfmtcat(image_dir, "/%u.%u", job_ptr->job_id, |
| step_ptr->step_id); |
| update_rc = checkpoint_op(ckpt_ptr->job_id, |
| step_ptr->step_id, |
| step_ptr, |
| ckpt_ptr->op, |
| ckpt_ptr->data, |
| image_dir, |
| &resp_data.event_time, |
| &resp_data.error_code, |
| &resp_data.error_msg); |
| info("checkpoint_op %u of %u.%u complete, rc=%d", |
| ckpt_ptr->op, ckpt_ptr->job_id, |
| step_ptr->step_id, rc); |
| rc = MAX(rc, update_rc); |
| xfree(image_dir); |
| } |
| if (update_rc != -2) /* some work done */ |
| last_job_update = time(NULL); |
| list_iterator_destroy (step_iterator); |
| } |
| |
| reply: |
| if (conn_fd < 0) /* periodic checkpoint */ |
| return rc; |
| |
| if ((rc == SLURM_SUCCESS) && |
| ((ckpt_ptr->op == CHECK_ABLE) || (ckpt_ptr->op == CHECK_ERROR))) { |
| resp_msg.msg_type = RESPONSE_CHECKPOINT; |
| resp_msg.data = &resp_data; |
| (void) slurm_send_node_msg(conn_fd, &resp_msg); |
| } else { |
| return_code_msg_t rc_msg; |
| rc_msg.return_code = rc; |
| resp_msg.msg_type = RESPONSE_SLURM_RC; |
| resp_msg.data = &rc_msg; |
| (void) slurm_send_node_msg(conn_fd, &resp_msg); |
| } |
| return rc; |
| } |
| |
| /* |
| * _checkpoint_job_record - save job to file for checkpoint |
| * |
| */ |
| static int _checkpoint_job_record (struct job_record *job_ptr, char *image_dir) |
| { |
| static int high_buffer_size = (1024*1024); |
| char *ckpt_file = NULL, *old_file = NULL, *new_file = NULL; |
| int ckpt_fd, error_code = SLURM_SUCCESS; |
| Buf buffer = init_buf(high_buffer_size); |
| |
| ckpt_file = xstrdup(slurmctld_conf.job_ckpt_dir); |
| xstrfmtcat(ckpt_file, "/%u.ckpt", job_ptr->job_id); |
| |
| debug("_checkpoint_job_record: checkpoint job record of %u to file %s", |
| job_ptr->job_id, ckpt_file); |
| |
| old_file = xstrdup(ckpt_file); |
| xstrcat(old_file, ".old"); |
| |
| new_file = xstrdup(ckpt_file); |
| xstrcat(new_file, ".new"); |
| |
| /* save version string */ |
| packstr(JOB_CKPT_VERSION, buffer); |
| pack16(SLURM_PROTOCOL_VERSION, buffer); |
| |
| /* save checkpoint image directory */ |
| packstr(image_dir, buffer); |
| |
| _pack_job_for_ckpt(job_ptr, buffer); |
| |
| ckpt_fd = creat(new_file, 0600); |
| if (ckpt_fd < 0) { |
| error("Can't ckpt job, create file %s error: %m", |
| new_file); |
| error_code = errno; |
| } else { |
| int pos = 0, nwrite = get_buf_offset(buffer), amount, rc; |
| char *data = (char *)get_buf_data(buffer); |
| while (nwrite > 0) { |
| amount = write(ckpt_fd, &data[pos], nwrite); |
| if ((amount < 0) && (errno != EINTR)) { |
| error("Error writing file %s, %m", new_file); |
| error_code = errno; |
| break; |
| } else if (amount >= 0) { |
| nwrite -= amount; |
| pos += amount; |
| } |
| } |
| |
| rc = fsync_and_close(ckpt_fd, "checkpoint"); |
| if (rc && !error_code) |
| error_code = rc; |
| } |
| if (error_code) |
| (void) unlink(new_file); |
| else { /* file shuffle */ |
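| /* Keep the previous checkpoint as ".old" and rotate the new file |
| * into place so an intact copy (old or new) survives a crash at |
| * any point in the shuffle. */ |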
| (void) unlink(old_file); |
| if (link(ckpt_file, old_file)) |
| debug4("unable to create link for %s -> %s: %m", |
| ckpt_file, old_file); |
| (void) unlink(ckpt_file); |
| if (link(new_file, ckpt_file)) |
| debug4("unable to create link for %s -> %s: %m", |
| new_file, ckpt_file); |
| (void) unlink(new_file); |
| } |
| |
| xfree(ckpt_file); |
| xfree(old_file); |
| xfree(new_file); |
| free_buf(buffer); |
| |
| return error_code; |
| } |
| |
| /* |
| * _pack_job_for_ckpt - save RUNNING job to buffer for checkpoint |
| * |
| * Just save enough information to restart it |
| * |
| * IN job_ptr - id of the job to be checkpointed |
| * IN buffer - buffer to save the job state |
| */ |
| static void _pack_job_for_ckpt (struct job_record *job_ptr, Buf buffer) |
| { |
| slurm_msg_t msg; |
| job_desc_msg_t *job_desc; |
| |
| /* save allocated nodes */ |
| packstr(job_ptr->nodes, buffer); |
| |
| /* save job req */ |
| job_desc = _copy_job_record_to_job_desc(job_ptr); |
| msg.msg_type = REQUEST_SUBMIT_BATCH_JOB; |
| msg.protocol_version = SLURM_PROTOCOL_VERSION; |
| msg.data = job_desc; |
| pack_msg(&msg, buffer); |
| |
| /* free the environment since all strings are stored in one |
| * xmalloced buffer */ |
| if (job_desc->environment) { |
| xfree(job_desc->environment[0]); |
| xfree(job_desc->environment); |
| job_desc->env_size = 0; |
| } |
| slurm_free_job_desc_msg(job_desc); |
| } |
| |
| /* |
| * _copy_job_record_to_job_desc - construct a job_desc_msg_t for a job. |
| * IN job_ptr - the job record |
| * RET the job_desc_msg_t, NULL on error |
| */ |
| static job_desc_msg_t * |
| _copy_job_record_to_job_desc(struct job_record *job_ptr) |
| { |
| job_desc_msg_t *job_desc; |
| struct job_details *details = job_ptr->details; |
| multi_core_data_t *mc_ptr = details->mc_ptr; |
| int i; |
| |
| /* construct a job_desc_msg_t from job */ |
| job_desc = xmalloc(sizeof(job_desc_msg_t)); |
| |
| job_desc->account = xstrdup(job_ptr->account); |
| job_desc->acctg_freq = xstrdup(details->acctg_freq); |
| job_desc->alloc_node = xstrdup(job_ptr->alloc_node); |
| /* Since the allocating salloc or srun is not expected to exist |
| * when this checkpointed job is restarted, do not save these: |
| * |
| * job_desc->alloc_resp_port = job_ptr->alloc_resp_port; |
| * job_desc->alloc_sid = job_ptr->alloc_sid; |
| */ |
| job_desc->argc = details->argc; |
| job_desc->argv = xmalloc(sizeof(char *) * job_desc->argc); |
| for (i = 0; i < job_desc->argc; i ++) |
| job_desc->argv[i] = xstrdup(details->argv[i]); |
| job_desc->begin_time = details->begin_time; |
| job_desc->ckpt_interval = job_ptr->ckpt_interval; |
| job_desc->ckpt_dir = xstrdup(details->ckpt_dir); |
| job_desc->comment = xstrdup(job_ptr->comment); |
| job_desc->contiguous = details->contiguous; |
| job_desc->core_spec = details->core_spec; |
| job_desc->cpu_bind = xstrdup(details->cpu_bind); |
| job_desc->cpu_bind_type = details->cpu_bind_type; |
| job_desc->dependency = xstrdup(details->dependency); |
| job_desc->end_time = 0; /* Unused today */ |
| job_desc->environment = get_job_env(job_ptr, |
| &job_desc->env_size); |
| job_desc->exc_nodes = xstrdup(details->exc_nodes); |
| job_desc->features = xstrdup(details->features); |
| job_desc->gres = xstrdup(job_ptr->gres); |
| job_desc->group_id = job_ptr->group_id; |
| job_desc->immediate = 0; /* nowhere to get this value */ |
| job_desc->job_id = job_ptr->job_id; |
| job_desc->kill_on_node_fail = job_ptr->kill_on_node_fail; |
| job_desc->licenses = xstrdup(job_ptr->licenses); |
| job_desc->mail_type = job_ptr->mail_type; |
| job_desc->mail_user = xstrdup(job_ptr->mail_user); |
| job_desc->mem_bind = xstrdup(details->mem_bind); |
| job_desc->mem_bind_type = details->mem_bind_type; |
| job_desc->name = xstrdup(job_ptr->name); |
| job_desc->network = xstrdup(job_ptr->network); |
| job_desc->nice = details->nice; |
| job_desc->num_tasks = details->num_tasks; |
| job_desc->open_mode = details->open_mode; |
| job_desc->other_port = job_ptr->other_port; |
| job_desc->overcommit = details->overcommit; |
| job_desc->partition = xstrdup(job_ptr->partition); |
| job_desc->plane_size = details->plane_size; |
| job_desc->priority = job_ptr->priority; |
| if (job_ptr->qos_ptr) { |
| slurmdb_qos_rec_t *qos_ptr = |
| (slurmdb_qos_rec_t *)job_ptr->qos_ptr; |
| job_desc->qos = xstrdup(qos_ptr->name); |
| } |
| job_desc->resp_host = xstrdup(job_ptr->resp_host); |
| job_desc->req_nodes = xstrdup(details->req_nodes); |
| job_desc->requeue = details->requeue; |
| job_desc->reservation = xstrdup(job_ptr->resv_name); |
| job_desc->script = get_job_script(job_ptr); |
| if (details->share_res == 1) |
| job_desc->shared = 1; |
| else if (details->whole_node) |
| job_desc->shared = 0; |
| else |
| job_desc->shared = (uint16_t) NO_VAL; |
| job_desc->spank_job_env_size = job_ptr->spank_job_env_size; |
| job_desc->spank_job_env = xmalloc(sizeof(char *) * |
| job_desc->spank_job_env_size); |
| for (i = 0; i < job_desc->spank_job_env_size; i ++) |
| job_desc->spank_job_env[i]= xstrdup(job_ptr->spank_job_env[i]); |
| job_desc->std_err = xstrdup(details->std_err); |
| job_desc->std_in = xstrdup(details->std_in); |
| job_desc->std_out = xstrdup(details->std_out); |
| job_desc->task_dist = details->task_dist; |
| job_desc->time_limit = job_ptr->time_limit; |
| job_desc->time_min = job_ptr->time_min; |
| job_desc->user_id = job_ptr->user_id; |
| job_desc->wait_all_nodes = job_ptr->wait_all_nodes; |
| job_desc->warn_flags = job_ptr->warn_flags; |
| job_desc->warn_signal = job_ptr->warn_signal; |
| job_desc->warn_time = job_ptr->warn_time; |
| job_desc->wckey = xstrdup(job_ptr->wckey); |
| job_desc->work_dir = xstrdup(details->work_dir); |
| job_desc->pn_min_cpus = details->pn_min_cpus; |
| job_desc->pn_min_memory = details->pn_min_memory; |
| job_desc->pn_min_tmp_disk = details->pn_min_tmp_disk; |
| job_desc->min_cpus = details->min_cpus; |
| job_desc->max_cpus = details->max_cpus; |
| job_desc->min_nodes = details->min_nodes; |
| job_desc->max_nodes = details->max_nodes; |
| if (job_desc->max_nodes == 0) /* set 0 in _job_create() */ |
| job_desc->max_nodes = NO_VAL; |
| job_desc->sockets_per_node = mc_ptr->sockets_per_node; |
| job_desc->cores_per_socket = mc_ptr->cores_per_socket; |
| job_desc->threads_per_core = mc_ptr->threads_per_core; |
| job_desc->cpus_per_task = details->cpus_per_task; |
| job_desc->ntasks_per_node = details->ntasks_per_node; |
| job_desc->ntasks_per_socket = mc_ptr->ntasks_per_socket; |
| job_desc->ntasks_per_core = mc_ptr->ntasks_per_core; |
| #if 0 |
| /* select_jobinfo is unused at job submit time; only its |
| * components are set. We recover those from the structure below. |
| * job_desc->select_jobinfo = select_g_select_jobinfo_copy(job_ptr-> |
| * select_jobinfo); */ |
| |
| /* The following fields are used only on BlueGene systems. |
| * Since BlueGene does not use the checkpoint/restart logic today, |
| * we do not save them. */ |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_GEOMETRY, |
| &job_desc->geometry); |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_CONN_TYPE, |
| &job_desc->conn_type); |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_REBOOT, |
| &job_desc->reboot); |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_ROTATE, |
| &job_desc->rotate); |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_BLRTS_IMAGE, |
| &job_desc->blrtsimage); |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_LINUX_IMAGE, |
| &job_desc->linuximage); |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_MLOADER_IMAGE, |
| &job_desc->mloaderimage); |
| select_g_select_jobinfo_get(job_ptr->select_jobinfo, |
| SELECT_JOBDATA_RAMDISK_IMAGE, |
| &job_desc->ramdiskimage); |
| #endif |
| |
| return job_desc; |
| } |
| |
| |
| /* |
| * job_restart - Restart a batch job from checkpointed state |
| * |
| * Restarting a job is similar to submitting a new job, except that |
| * the job requirements are loaded from the checkpoint file, and |
| * the job id is restored. |
| * |
| * IN ckpt_ptr - checkpoint request message |
| * IN uid - user id of the user issuing the RPC |
| * IN conn_fd - file descriptor on which to send reply |
| * IN protocol_version - slurm protocol version of client |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_restart(checkpoint_msg_t *ckpt_ptr, uid_t uid, |
| slurm_fd_t conn_fd, uint16_t protocol_version) |
| { |
| struct job_record *job_ptr; |
| char *image_dir, *ckpt_file, *data, *ver_str = NULL; |
| char *alloc_nodes = NULL; |
| int data_size = 0; |
| Buf buffer; |
| uint32_t tmp_uint32; |
| slurm_msg_t msg, resp_msg; |
| return_code_msg_t rc_msg; |
| job_desc_msg_t *job_desc = NULL; |
| int rc = SLURM_SUCCESS; |
| uint16_t ckpt_version = (uint16_t) NO_VAL; |
| |
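| /* Only the batch-level checkpoint (step_id == SLURM_BATCH_SCRIPT) can |
| * be restarted here, and the restart is refused while a job with the |
| * same id still exists and has not finished. |
| */ |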
| if (ckpt_ptr->step_id != SLURM_BATCH_SCRIPT) { |
| rc = ESLURM_NOT_SUPPORTED; |
| goto reply; |
| } |
| |
| if ((job_ptr = find_job_record(ckpt_ptr->job_id)) && |
| ! IS_JOB_FINISHED(job_ptr)) { |
| rc = ESLURM_DISABLED; |
| goto reply; |
| } |
| |
| ckpt_file = xstrdup(slurmctld_conf.job_ckpt_dir); |
| xstrfmtcat(ckpt_file, "/%u.ckpt", ckpt_ptr->job_id); |
| |
| data = _read_job_ckpt_file(ckpt_file, &data_size); |
| xfree(ckpt_file); |
| |
| if (data == NULL) { |
| rc = errno; |
| goto reply; |
| } |
| buffer = create_buf(data, data_size); |
| |
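| /* The checkpoint file is a packed buffer laid out as unpacked below: |
| * a version string, the 16-bit protocol version it was written with |
| * (when the string matches JOB_CKPT_VERSION), the checkpoint image |
| * directory, the allocated node list, and finally the job descriptor |
| * packed as a REQUEST_SUBMIT_BATCH_JOB message. |
| */ |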
| /* unpack version string */ |
| safe_unpackstr_xmalloc(&ver_str, &tmp_uint32, buffer); |
| debug3("Version string in job_ckpt header is %s", ver_str); |
| if (ver_str) { |
| if (!strcmp(ver_str, JOB_CKPT_VERSION)) |
| safe_unpack16(&ckpt_version, buffer); |
| else |
| ckpt_version = SLURM_2_6_PROTOCOL_VERSION; |
| } |
| |
| if (ckpt_version == (uint16_t)NO_VAL) { |
| error("***************************************************"); |
| error("Can not restart from job ckpt, incompatible version"); |
| error("***************************************************"); |
| rc = EINVAL; |
| goto unpack_error; |
| } |
| |
| /* unpack checkpoint image directory */ |
| safe_unpackstr_xmalloc(&image_dir, &tmp_uint32, buffer); |
| |
| /* unpack the allocated nodes */ |
| safe_unpackstr_xmalloc(&alloc_nodes, &tmp_uint32, buffer); |
| |
| /* unpack the job req */ |
| msg.msg_type = REQUEST_SUBMIT_BATCH_JOB; |
| msg.protocol_version = ckpt_version; |
| if (unpack_msg(&msg, buffer) != SLURM_SUCCESS) |
| goto unpack_error; |
| |
| job_desc = msg.data; |
| |
| /* sanity check */ |
| if (job_desc->job_id != ckpt_ptr->job_id) { |
| error("saved job id(%u) is different from required job id(%u)", |
| job_desc->job_id, ckpt_ptr->job_id); |
| rc = EINVAL; |
| goto unpack_error; |
| } |
| if (!validate_slurm_user(uid) && (job_desc->user_id != uid)) { |
| error("Security violation, user %u not allowed to restart " |
| "job %u of user %u", |
| uid, ckpt_ptr->job_id, job_desc->user_id); |
| rc = EPERM; |
| goto unpack_error; |
| } |
| |
| if (ckpt_ptr->data == 1) { /* restart on the previously allocated nodes */ |
| xfree(job_desc->req_nodes); |
| job_desc->req_nodes = alloc_nodes; |
| alloc_nodes = NULL; /* Nothing left to xfree */ |
| } |
| |
| /* set open mode to append */ |
| job_desc->open_mode = OPEN_MODE_APPEND; |
| |
| /* Set new job priority */ |
| job_desc->priority = NO_VAL; |
| |
| /* |
| * XXX: we set submit_uid to 0 in the following job_allocate() call |
| * This is for setting the job_id to the original one. |
| * But this will bypass some partition access permission checks. |
| * TODO: fix this. |
| */ |
| rc = job_allocate(job_desc, |
| 0, /* immediate */ |
| 0, /* will_run */ |
| NULL, /* resp */ |
| 0, /* allocate */ |
| 0, /* submit_uid. set to 0 to set job_id */ |
| &job_ptr, NULL); |
| |
| /* set restart directory */ |
| if (job_ptr) { |
| if (ckpt_ptr->image_dir) { |
| xfree (image_dir); |
| image_dir = xstrdup(ckpt_ptr->image_dir); |
| } |
| xstrfmtcat(image_dir, "/%u", ckpt_ptr->job_id); |
| |
| job_ptr->details->restart_dir = image_dir; |
| image_dir = NULL; /* Nothing left to xfree */ |
| |
| last_job_update = time(NULL); |
| } |
| |
| unpack_error: |
| free_buf(buffer); |
| xfree(ver_str); |
| xfree(image_dir); |
| xfree(alloc_nodes); |
| xfree(ckpt_file); |
| |
| reply: |
| slurm_msg_t_init(&resp_msg); |
| resp_msg.protocol_version = protocol_version; |
| rc_msg.return_code = rc; |
| resp_msg.msg_type = RESPONSE_SLURM_RC; |
| resp_msg.data = &rc_msg; |
| (void) slurm_send_node_msg(conn_fd, &resp_msg); |
| |
| return rc; |
| } |
| |
| static char * |
| _read_job_ckpt_file(char *ckpt_file, int *size_ptr) |
| { |
| int ckpt_fd, error_code = 0; |
| int data_allocated, data_read, data_size = 0; |
| char *data = NULL; |
| |
| ckpt_fd = open(ckpt_file, O_RDONLY); |
| if (ckpt_fd < 0) { |
| info("No job ckpt file (%s) to read", ckpt_file); |
| error_code = ENOENT; |
| } else { |
| data_allocated = BUF_SIZE; |
| data = xmalloc(data_allocated); |
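| /* Grow the buffer as we go: data_allocated always stays BUF_SIZE |
| * ahead of data_size, so each read() into &data[data_size] has at |
| * least BUF_SIZE bytes of room before the next xrealloc(). |
| */ |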
| while (1) { |
| data_read = read(ckpt_fd, &data[data_size], |
| BUF_SIZE); |
| if (data_read < 0) { |
| if (errno == EINTR) |
| continue; |
| else { |
| error("Read error on %s: %m", |
| ckpt_file); |
| error_code = errno; |
| break; |
| } |
| } else if (data_read == 0) /* eof */ |
| break; |
| data_size += data_read; |
| data_allocated += data_read; |
| xrealloc(data, data_allocated); |
| } |
| close(ckpt_fd); |
| } |
| |
| if (error_code) { |
| xfree(data); |
| return NULL; |
| } |
| *size_ptr = data_size; |
| return data; |
| } |
| |
| /* Build a bitmap of nodes completing this job */ |
| extern void build_cg_bitmap(struct job_record *job_ptr) |
| { |
| FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); |
| if (job_ptr->node_bitmap) { |
| job_ptr->node_bitmap_cg = bit_copy(job_ptr->node_bitmap); |
| if (bit_set_count(job_ptr->node_bitmap_cg) == 0) |
| job_ptr->job_state &= (~JOB_COMPLETING); |
| } else { |
| error("build_cg_bitmap: node_bitmap is NULL"); |
| job_ptr->node_bitmap_cg = bit_alloc(node_record_count); |
| job_ptr->job_state &= (~JOB_COMPLETING); |
| } |
| } |
| |
| /* job_hold_requeue() |
| * |
| * Requeue the job based upon its current state. |
| * If JOB_SPECIAL_EXIT then requeue and hold with JOB_SPECIAL_EXIT state. |
| * If JOB_REQUEUE_HOLD then requeue and hold. |
| * If JOB_REQUEUE then requeue and let it run again. |
| * The requeue can happen directly from job_requeue() or from |
| * job_epilog_complete() after the last component has finished. |
| */ |
| extern void job_hold_requeue(struct job_record *job_ptr) |
| { |
| uint32_t state; |
| uint32_t flags; |
| |
| xassert(job_ptr); |
| |
| state = job_ptr->job_state; |
| |
| if (! (state & JOB_SPECIAL_EXIT) |
| && ! (state & JOB_REQUEUE_HOLD) |
| && ! (state & JOB_REQUEUE)) |
| return; |
| |
| debug("%s: job %u state 0x%x", __func__, job_ptr->job_id, state); |
| |
| /* We have to set the state here in case |
| * we are not requeueing the job from |
| * job_requeue() but from job_epilog_complete(). |
| */ |
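| /* JOB_STATE_FLAGS keeps only the flag bits (JOB_SPECIAL_EXIT, |
| * JOB_REQUEUE_HOLD, JOB_REQUEUE, ...), so the base state becomes |
| * JOB_PENDING while the flags survive for the tests below. |
| */ |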
| flags = job_ptr->job_state & JOB_STATE_FLAGS; |
| job_ptr->job_state = JOB_PENDING | flags; |
| |
| /* Test if user wants to requeue the job |
| * in hold or with a special exit value. |
| */ |
| if (state & JOB_SPECIAL_EXIT) { |
| /* JOB_SPECIAL_EXIT means requeue the |
| * job, put it on hold and display |
| * it as JOB_SPECIAL_EXIT. |
| */ |
| job_ptr->job_state |= JOB_SPECIAL_EXIT; |
| job_ptr->state_reason = WAIT_HELD_USER; |
| job_ptr->priority = 0; |
| } |
| |
| if (state & JOB_REQUEUE_HOLD) { |
| /* The job will be requeued in status |
| * PENDING and held |
| */ |
| job_ptr->state_reason = WAIT_HELD_USER; |
| job_ptr->priority = 0; |
| } |
| |
| job_ptr->job_state &= ~JOB_REQUEUE_HOLD; |
| job_ptr->job_state &= ~JOB_REQUEUE; |
| |
| debug("%s: job %u state 0x%x reason %u priority %d", __func__, |
| job_ptr->job_id, job_ptr->job_state, |
| job_ptr->state_reason, job_ptr->priority); |
| } |
| |
| /* Reset a job's end_time based upon its start_time and time_limit. |
| * NOTE: Do not reset the end_time if already being preempted */ |
| extern void job_end_time_reset(struct job_record *job_ptr) |
| { |
| if (job_ptr->preempt_time) |
| return; |
| if (job_ptr->time_limit == INFINITE) { |
| job_ptr->end_time = job_ptr->start_time + |
| (365 * 24 * 60 * 60); /* secs in year */ |
| } else { |
| job_ptr->end_time = job_ptr->start_time + |
| (job_ptr->time_limit * 60); /* secs */ |
| } |
| } |
| |
| /* _copy_job_file() |
| * |
| * This function is invoked when the controller fails to link the |
| * job array job files; it falls back to copying the file contents |
| * instead. |
| */ |
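| /* A minimal usage sketch (caller and path names are hypothetical), |
| * assuming the caller first tries a hard link and only copies when |
| * that fails: |
| * |
| *   if (link(src_path, dst_path) < 0) { |
| *           if (_copy_job_file(src_path, dst_path) < 0) |
| *                   error("%s: unable to copy %s to %s: %m", |
| *                         __func__, src_path, dst_path); |
| *   } |
| */ |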
| static int |
| _copy_job_file(const char *src, const char *dst) |
| { |
| struct stat stat_buf; |
| int fsrc; |
| int fdst; |
| int cc; |
| char buf[BUFSIZ]; |
| |
| if (stat(src, &stat_buf) < 0) |
| return -1; |
| |
| fsrc = open(src, O_RDONLY); |
| if (fsrc < 0) |
| return -1; |
| |
| |
| fdst = creat(dst, stat_buf.st_mode); |
| if (fdst < 0) { |
| close(fsrc); |
| return -1; |
| } |
| |
| while (1) { |
| cc = read(fsrc, buf, BUFSIZ); |
| if (cc == 0) |
| break; |
| if (cc < 0) { |
| close(fsrc); |
| close(fdst); |
| return -1; |
| } |
| if (write(fdst, buf, cc) != cc) { |
| close(fsrc); |
| close(fdst); |
| return -1; |
| } |
| } |
| |
| close(fsrc); |
| close(fdst); |
| |
| return 0; |
| } |