blob: 3337b8349ff440e6a58ae3a8b276153ddf97ad18 [file] [edit]
/*****************************************************************************\
* job_mgr.c - manage the job information of slurm
* Note: there is a global job list (job_list), time stamp
* (last_job_update), and hash table (job_hash)
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Portions Copyright (C) 2010 SchedMD <http://www.schedmd.com>.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.schedmd.com/slurmdocs/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "slurm/slurm_errno.h"
#include "src/common/assoc_mgr.h"
#include "src/common/bitstring.h"
#include "src/common/forward.h"
#include "src/common/gres.h"
#include "src/common/hostlist.h"
#include "src/common/node_select.h"
#include "src/common/parse_time.h"
#include "src/common/slurm_accounting_storage.h"
#include "src/common/slurm_jobcomp.h"
#include "src/common/slurm_priority.h"
#include "src/common/slurm_protocol_pack.h"
#include "src/common/switch.h"
#include "src/common/timers.h"
#include "src/common/xassert.h"
#include "src/common/xstring.h"
#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/front_end.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/job_submit.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/preempt.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/srun_comm.h"
#include "src/slurmctld/state_save.h"
#include "src/slurmctld/trigger_mgr.h"
#define DETAILS_FLAG 0xdddd
#define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0
#define STEP_FLAG 0xbbbb
#define TOP_PRIORITY 0xffff0000 /* large, but leave headroom for higher */
#define JOB_HASH_INX(_job_id) (_job_id % hash_table_size)
/* Change JOB_STATE_VERSION value when changing the state save format */
#define JOB_STATE_VERSION "VER011"
#define JOB_2_3_STATE_VERSION "VER011" /* SLURM version 2.3 */
#define JOB_2_2_STATE_VERSION "VER010" /* SLURM version 2.2 */
#define JOB_2_1_STATE_VERSION "VER009" /* SLURM version 2.1 */
#define JOB_CKPT_VERSION "JOB_CKPT_002"
#define JOB_2_2_CKPT_VERSION "JOB_CKPT_002" /* SLURM version 2.2 */
#define JOB_2_1_CKPT_VERSION "JOB_CKPT_001" /* SLURM version 2.1 */
/* Global variables */
List job_list = NULL; /* job_record list */
time_t last_job_update; /* time of last update to job records */
/* Local variables */
static uint32_t highest_prio = 0;
static uint32_t lowest_prio = TOP_PRIORITY;
static int hash_table_size = 0;
static int job_count = 0; /* job's in the system */
static uint32_t job_id_sequence = 0; /* first job_id to assign new job */
static struct job_record **job_hash = NULL;
static bool wiki_sched = false;
static bool wiki2_sched = false;
static bool wiki_sched_test = false;
/* Local functions */
static void _add_job_hash(struct job_record *job_ptr);
static int _checkpoint_job_record (struct job_record *job_ptr,
char *image_dir);
static int _copy_job_desc_to_file(job_desc_msg_t * job_desc,
uint32_t job_id);
static int _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
struct job_record **job_ptr,
bitstr_t ** exc_bitmap,
bitstr_t ** req_bitmap);
static job_desc_msg_t * _copy_job_record_to_job_desc(
struct job_record *job_ptr);
static char *_copy_nodelist_no_dup(char *node_list);
static void _del_batch_list_rec(void *x);
static void _delete_job_desc_files(uint32_t job_id);
static slurmdb_qos_rec_t *_determine_and_validate_qos(
slurmdb_association_rec_t *assoc_ptr,
slurmdb_qos_rec_t *qos_rec, int *error_code);
static void _dump_job_details(struct job_details *detail_ptr,
Buf buffer);
static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer);
static int _find_batch_dir(void *x, void *key);
static void _get_batch_job_dir_ids(List batch_dirs);
static void _job_timed_out(struct job_record *job_ptr);
static int _job_create(job_desc_msg_t * job_specs, int allocate, int will_run,
struct job_record **job_rec_ptr, uid_t submit_uid);
static void _list_delete_job(void *job_entry);
static int _list_find_job_id(void *job_entry, void *key);
static int _list_find_job_old(void *job_entry, void *key);
static int _load_job_details(struct job_record *job_ptr, Buf buffer,
uint16_t protocol_version);
static int _load_job_state(Buf buffer, uint16_t protocol_version);
static uint32_t _max_switch_wait(uint32_t input_wait);
static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx,
time_t now, time_t node_boot_time);
static int _open_job_state_file(char **state_file);
static void _pack_job_for_ckpt (struct job_record *job_ptr, Buf buffer);
static void _pack_default_job_details(struct job_record *job_ptr,
Buf buffer,
uint16_t protocol_version);
static void _pack_pending_job_details(struct job_details *detail_ptr,
Buf buffer,
uint16_t protocol_version);
static int _purge_job_record(uint32_t job_id);
static void _purge_missing_jobs(int node_inx, time_t now);
static void _read_data_array_from_file(char *file_name, char ***data,
uint32_t * size,
struct job_record *job_ptr);
static void _read_data_from_file(char *file_name, char **data);
static char *_read_job_ckpt_file(char *ckpt_file, int *size_ptr);
static void _remove_defunct_batch_dirs(List batch_dirs);
static int _reset_detail_bitmaps(struct job_record *job_ptr);
static void _reset_step_bitmaps(struct job_record *job_ptr);
static int _resume_job_nodes(struct job_record *job_ptr, bool indf_susp);
static void _send_job_kill(struct job_record *job_ptr);
static void _set_job_id(struct job_record *job_ptr);
static void _set_job_prio(struct job_record *job_ptr);
static void _signal_batch_job(struct job_record *job_ptr, uint16_t signal);
static void _signal_job(struct job_record *job_ptr, int signal);
static void _suspend_job(struct job_record *job_ptr, uint16_t op);
static int _suspend_job_nodes(struct job_record *job_ptr, bool indf_susp);
static bool _top_priority(struct job_record *job_ptr);
static int _validate_job_create_req(job_desc_msg_t * job_desc);
static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
uid_t submit_uid, struct part_record *part_ptr);
static void _validate_job_files(List batch_dirs);
static int _write_data_to_file(char *file_name, char *data);
static int _write_data_array_to_file(char *file_name, char **data,
uint32_t size);
static void _xmit_new_end_time(struct job_record *job_ptr);
/*
 * create_job_record - create an empty job_record including job_details.
 *	load its values with defaults (zeros, nulls, and magic cookie)
 * IN/OUT error_code - set to zero if no error, errno otherwise
 * RET pointer to the record or NULL if error
 * global: job_list - global job list
 *	job_count - number of jobs in the system
 *	last_job_update - time of last job table update
 * NOTE: allocates memory that should be xfreed with _list_delete_job
 */
struct job_record *create_job_record(int *error_code)
{
	struct job_record  *job_ptr;
	struct job_details *detail_ptr;

	/* Enforce the configured MaxJobCount before allocating anything */
	if (job_count >= slurmctld_conf.max_job_cnt) {
		error("create_job_record: job_count exceeds limit");
		*error_code = EAGAIN;
		return NULL;
	}

	job_count++;
	*error_code = 0;
	last_job_update = time(NULL);

	job_ptr    = (struct job_record *) xmalloc(sizeof(struct job_record));
	detail_ptr = (struct job_details *)xmalloc(sizeof(struct job_details));

	job_ptr->magic = JOB_MAGIC;
	job_ptr->details = detail_ptr;
	job_ptr->prio_factors = xmalloc(sizeof(priority_factors_object_t));
	job_ptr->step_list = list_create(NULL);
	if (job_ptr->step_list == NULL)
		fatal("memory allocation failure");

	/* Set the magic cookie unconditionally.  The original embedded the
	 * assignment inside xassert(), so the value was never stored when
	 * assertions were compiled out (NDEBUG builds), corrupting every
	 * later "magic == DETAILS_MAGIC" validity check. */
	detail_ptr->magic = DETAILS_MAGIC;
	xassert (detail_ptr->magic == DETAILS_MAGIC);
	detail_ptr->submit_time = time(NULL);
	job_ptr->requid = -1; /* force to -1 for sacct to know this
			       * hasn't been set yet  */

	if (list_append(job_list, job_ptr) == 0)
		fatal("list_append memory allocation failure");

	return job_ptr;
}
/*
 * delete_job_details - delete a job's detail record and clear it's pointer
 *	this information can be deleted as soon as the job is allocated
 *	resources and running (could need to restart batch job)
 * IN job_entry - pointer to job_record to clear the record of
 * NOTE(review): presumably called with the job write lock held — confirm
 *	at call sites.  Frees every dynamically allocated member of
 *	job_entry->details and finally the details struct itself.
 */
void delete_job_details(struct job_record *job_entry)
{
	int i;

	if (job_entry->details == NULL)
		return;

	xassert (job_entry->details->magic == DETAILS_MAGIC);

	/* Finished jobs no longer need their spooled script/environment
	 * files under the state save directory */
	if (IS_JOB_FINISHED(job_entry))
		_delete_job_desc_files(job_entry->job_id);

	/* argv vector: free each element, then the vector itself */
	for (i=0; i<job_entry->details->argc; i++)
		xfree(job_entry->details->argv[i]);
	xfree(job_entry->details->argv);
	xfree(job_entry->details->ckpt_dir);
	xfree(job_entry->details->cpu_bind);
	if (job_entry->details->depend_list)
		list_destroy(job_entry->details->depend_list);
	xfree(job_entry->details->dependency);
	xfree(job_entry->details->orig_dependency);
	/* supplemental environment vector: elements first, then vector */
	for (i=0; i<job_entry->details->env_cnt; i++)
		xfree(job_entry->details->env_sup[i]);
	xfree(job_entry->details->env_sup);
	xfree(job_entry->details->std_err);
	FREE_NULL_BITMAP(job_entry->details->exc_node_bitmap);
	xfree(job_entry->details->exc_nodes);
	if (job_entry->details->feature_list)
		list_destroy(job_entry->details->feature_list);
	xfree(job_entry->details->features);
	xfree(job_entry->details->std_in);
	xfree(job_entry->details->mc_ptr);
	xfree(job_entry->details->mem_bind);
	xfree(job_entry->details->std_out);
	FREE_NULL_BITMAP(job_entry->details->req_node_bitmap);
	xfree(job_entry->details->req_node_layout);
	xfree(job_entry->details->req_nodes);
	xfree(job_entry->details->restart_dir);
	xfree(job_entry->details->work_dir);
	xfree(job_entry->details);	/* Must be last */
}
/* _delete_job_desc_files - delete job descriptor related files
 * Removes the spooled "environment" and "script" files under
 * <StateSaveLocation>/job.<job_id>/ and then the per-job directory
 * itself if it still exists. */
static void _delete_job_desc_files(uint32_t job_id)
{
	char *dir_name, job_dir[21], *file_name;
	struct stat sbuf;

	dir_name = slurm_get_state_save_location();
	/* "%u" matches the unsigned job_id (the original "%d" misprints
	 * IDs above INT_MAX); snprintf also bounds the write:
	 * "/job." + 10 digits + NUL fits in 21 bytes. */
	snprintf(job_dir, sizeof(job_dir), "/job.%u", job_id);
	xstrcat(dir_name, job_dir);

	file_name = xstrdup(dir_name);
	xstrcat(file_name, "/environment");
	(void) unlink(file_name);
	xfree(file_name);

	file_name = xstrdup(dir_name);
	xstrcat(file_name, "/script");
	(void) unlink(file_name);
	xfree(file_name);

	if (stat(dir_name, &sbuf) == 0)	/* remove job directory as needed */
		(void) rmdir(dir_name);
	xfree(dir_name);
}
/* Return the lesser of input_wait and the max_switch_wait configured in
 * SchedulerParameters (default 60 seconds).  The parsed value is cached
 * and only refreshed when the configuration changes. */
static uint32_t _max_switch_wait(uint32_t input_wait)
{
	static time_t sched_update = 0;
	static uint32_t max_wait = 60;	/* default max wait, in seconds */
	char *sched_params, *tmp_ptr;
	int i;

	if (sched_update != slurmctld_conf.last_update) {
		/* Record the config generation we are parsing.  The
		 * original never updated sched_update, so the cache was
		 * useless and SchedulerParameters was re-fetched and
		 * re-parsed on every single call. */
		sched_update = slurmctld_conf.last_update;
		sched_params = slurm_get_sched_params();
		if (sched_params &&
		    (tmp_ptr = strstr(sched_params, "max_switch_wait="))) {
			/* skip over the 16-char "max_switch_wait=" key */
			/*                        0123456789012345 */
			i = atoi(tmp_ptr + 16);
			if (i < 0) {
				error("ignoring SchedulerParameters: "
				      "max_switch_wait of %d", i);
			} else {
				max_wait = i;
			}
		}
		xfree(sched_params);	/* xfree(NULL) is a no-op */
	}

	if (max_wait > input_wait)
		return input_wait;
	return max_wait;
}
/* Determine the QOS for a job and validate the association's access to it.
 * IN assoc_ptr - the job's association (may be NULL unless associations
 *	are enforced)
 * IN/OUT qos_rec - QOS request; if neither name nor id is set, a default
 *	is chosen (association default, sole valid QOS, root default, or
 *	"normal") and the record is filled in from the QOS cache
 * OUT error_code - SLURM_SUCCESS or ESLURM_INVALID_QOS
 * RET pointer to the cached QOS record, or NULL on error */
static slurmdb_qos_rec_t *_determine_and_validate_qos(
	slurmdb_association_rec_t *assoc_ptr,
	slurmdb_qos_rec_t *qos_rec,
	int *error_code)
{
	slurmdb_qos_rec_t *qos_ptr = NULL;

	/* If enforcing associations make sure this is a valid qos
	   with the association.  If not just fill in the qos and
	   continue. */

	if (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)
		xassert(assoc_ptr);
	xassert(qos_rec);

	if (!qos_rec->name && !qos_rec->id) {
		/* No explicit QOS requested: pick a default.  Braces added
		 * around the nested chain; the unbraced original was a
		 * dangling-else hazard (behavior is unchanged). */
		if (assoc_ptr && assoc_ptr->usage->valid_qos) {
			if (assoc_ptr->def_qos_id)
				qos_rec->id = assoc_ptr->def_qos_id;
			else if (bit_set_count(assoc_ptr->usage->valid_qos)
				 == 1)
				qos_rec->id =
					bit_ffs(assoc_ptr->usage->valid_qos);
			else if (assoc_mgr_root_assoc
				 && assoc_mgr_root_assoc->def_qos_id)
				qos_rec->id = assoc_mgr_root_assoc->def_qos_id;
			else
				qos_rec->name = "normal";
		} else if (assoc_mgr_root_assoc
			   && assoc_mgr_root_assoc->def_qos_id) {
			qos_rec->id = assoc_mgr_root_assoc->def_qos_id;
		} else {
			qos_rec->name = "normal";
		}
	}

	if (assoc_mgr_fill_in_qos(acct_db_conn, qos_rec, accounting_enforce,
				  &qos_ptr)
	    != SLURM_SUCCESS) {
		error("Invalid qos (%s)", qos_rec->name);
		*error_code = ESLURM_INVALID_QOS;
		return NULL;
	}

	/* When enforcing QOS, the association must list this QOS as valid */
	if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS)
	    && assoc_ptr
	    && (!assoc_ptr->usage->valid_qos
		|| !bit_test(assoc_ptr->usage->valid_qos, qos_rec->id))) {
		error("This association %d(account='%s', "
		      "user='%s', partition='%s') does not have "
		      "access to qos %s",
		      assoc_ptr->id, assoc_ptr->acct, assoc_ptr->user,
		      assoc_ptr->partition, qos_rec->name);
		*error_code = ESLURM_INVALID_QOS;
		return NULL;
	}

	*error_code = SLURM_SUCCESS;
	return qos_ptr;
}
/*
 * dump_all_job_state - save the state of all jobs to file for checkpoint
 *	Changes here should be reflected in load_last_job_id() and
 *	load_all_job_state().
 * RET 0 or error code */
int dump_all_job_state(void)
{
	/* Save high-water mark to avoid buffer growth with copies */
	static int high_buffer_size = (1024 * 1024);
	int error_code = 0, log_fd;
	char *old_file, *new_file, *reg_file;
	struct stat stat_buf;
	/* Locks: Read config and job */
	slurmctld_lock_t job_read_lock =
		{ READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
	ListIterator job_iterator;
	struct job_record *job_ptr;
	Buf buffer = init_buf(high_buffer_size);
	time_t min_age = 0, now = time(NULL);
	DEF_TIMERS;

	START_TIMER;
	/* write header: version, time */
	packstr(JOB_STATE_VERSION, buffer);
	pack_time(now, buffer);
	if (slurmctld_conf.min_job_age > 0)
		min_age = now - slurmctld_conf.min_job_age;

	/*
	 * write header: job id
	 * This is needed so that the job id remains persistent even after
	 * slurmctld is restarted.
	 */
	pack32( job_id_sequence, buffer);

	debug3("Writing job id %u to header record of job_state file",
	       job_id_sequence);

	/* write individual job records */
	lock_slurmctld(job_read_lock);
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		xassert (job_ptr->magic == JOB_MAGIC);
		/* Skip finished jobs old enough to be purged soon anyway,
		 * unless they are still completing */
		if ((min_age > 0) && (job_ptr->end_time < min_age) &&
		    (! IS_JOB_COMPLETING(job_ptr)) && IS_JOB_FINISHED(job_ptr))
			continue;	/* job ready for purging, don't dump */

		_dump_job_state(job_ptr, buffer);
	}
	list_iterator_destroy(job_iterator);

	/* write the buffer to file */
	old_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(old_file, "/job_state.old");
	reg_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(reg_file, "/job_state");
	new_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(new_file, "/job_state.new");
	unlock_slurmctld(job_read_lock);

	/* Sanity check: warn if the state file's mtime moved backwards,
	 * which suggests clock problems or a different file system mount */
	if (stat(reg_file, &stat_buf) == 0) {
		static time_t last_mtime = (time_t) 0;
		int delta_t = difftime(stat_buf.st_mtime, last_mtime);
		if (delta_t < -10) {
			error("The modification time of %s moved backwards "
			      "by %d seconds",
			      reg_file, (0-delta_t));
			error("There could be a problem with your clock or "
			      "file system mounting");
			/* It could be safest to exit here. We likely mounted
			 * a different file system with the state save files */
		}
		last_mtime = time(NULL);
	}

	lock_state_files();
	log_fd = creat(new_file, 0600);
	if (log_fd < 0) {
		error("Can't save state, create file %s error %m",
		      new_file);
		error_code = errno;
	} else {
		int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
		char *data = (char *)get_buf_data(buffer);
		high_buffer_size = MAX(nwrite, high_buffer_size);
		/* Loop until the whole buffer is written; short writes
		 * advance pos, EINTR is retried */
		while (nwrite > 0) {
			amount = write(log_fd, &data[pos], nwrite);
			if ((amount < 0) && (errno != EINTR)) {
				error("Error writing file %s, %m", new_file);
				error_code = errno;
				break;
			}
			nwrite -= amount;
			pos    += amount;
		}
		rc = fsync_and_close(log_fd, "job");
		if (rc && !error_code)
			error_code = rc;
	}
	if (error_code)
		(void) unlink(new_file);
	else {			/* file shuffle: reg -> old, new -> reg */
		(void) unlink(old_file);
		if(link(reg_file, old_file))
			debug4("unable to create link for %s -> %s: %m",
			       reg_file, old_file);
		(void) unlink(reg_file);
		if(link(new_file, reg_file))
			debug4("unable to create link for %s -> %s: %m",
			       new_file, reg_file);
		(void) unlink(new_file);
	}
	xfree(old_file);
	xfree(reg_file);
	xfree(new_file);
	unlock_state_files();

	free_buf(buffer);
	END_TIMER2("dump_all_job_state");
	return error_code;
}
/* Open the job state save file, or backup if necessary.
 * state_file IN - the name of the state save file used
 * RET the file description to read from or error code
 */
static int _open_job_state_file(char **state_file)
{
	int state_fd;
	struct stat stat_buf;

	*state_file = slurm_get_state_save_location();
	xstrcat(*state_file, "/job_state");
	state_fd = open(*state_file, O_RDONLY);

	if (state_fd < 0) {
		error("Could not open job state file %s: %m", *state_file);
	} else {
		/* Validate the primary file before using it: it must be
		 * stat-able and large enough to hold a state header */
		if (fstat(state_fd, &stat_buf) < 0) {
			error("Could not stat job state file %s: %m",
			      *state_file);
			(void) close(state_fd);
		} else if (stat_buf.st_size < 10) {
			error("Job state file %s too small", *state_file);
			(void) close(state_fd);
		} else {
			return state_fd;	/* primary file is usable */
		}
	}

	/* Primary file missing or unusable: fall back to ".old" backup */
	error("NOTE: Trying backup state save file. Jobs may be lost!");
	xstrcat(*state_file, ".old");
	return open(*state_file, O_RDONLY);
}
/*
 * load_all_job_state - load the job state from file, recover from last
 *	checkpoint. Execute this after loading the configuration file data.
 *	Changes here should be reflected in load_last_job_id().
 * RET 0 or error code
 */
extern int load_all_job_state(void)
{
	int data_allocated, data_read = 0, error_code = SLURM_SUCCESS;
	uint32_t data_size = 0;
	int state_fd, job_cnt = 0;
	char *data = NULL, *state_file;
	Buf buffer;
	time_t buf_time;
	uint32_t saved_job_id;
	char *ver_str = NULL;
	uint32_t ver_str_len;
	uint16_t protocol_version = (uint16_t)NO_VAL;

	/* read the file */
	lock_state_files();
	state_fd = _open_job_state_file(&state_file);
	if (state_fd < 0) {
		info("No job state file (%s) to recover", state_file);
		error_code = ENOENT;
	} else {
		data_allocated = BUF_SIZE;
		data = xmalloc(data_allocated);
		/* Slurp the whole file into "data", growing the buffer
		 * BUF_SIZE at a time; reads interrupted by signals (EINTR)
		 * are retried */
		while (1) {
			data_read = read(state_fd, &data[data_size],
					 BUF_SIZE);
			if (data_read < 0) {
				if (errno == EINTR)
					continue;
				else {
					error("Read error on %s: %m",
					      state_file);
					break;
				}
			} else if (data_read == 0)	/* eof */
				break;
			data_size += data_read;
			data_allocated += data_read;
			xrealloc(data, data_allocated);
		}
		close(state_fd);
	}
	xfree(state_file);
	unlock_state_files();

	/* Never hand out job IDs below the configured FirstJobId */
	job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);

	if (error_code)
		return error_code;

	buffer = create_buf(data, data_size);

	/* Header: version string selects the unpack format used below */
	safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
	debug3("Version string in job_state header is %s", ver_str);
	if (ver_str) {
		if (!strcmp(ver_str, JOB_STATE_VERSION)) {
			protocol_version = SLURM_PROTOCOL_VERSION;
		} else if (!strcmp(ver_str, JOB_2_2_STATE_VERSION)) {
			protocol_version = SLURM_2_2_PROTOCOL_VERSION;
		} else if (!strcmp(ver_str, JOB_2_1_STATE_VERSION)) {
			protocol_version = SLURM_2_1_PROTOCOL_VERSION;
		}
	}

	if (protocol_version == (uint16_t)NO_VAL) {
		error("***********************************************");
		error("Can not recover job state, incompatible version");
		error("***********************************************");
		xfree(ver_str);
		free_buf(buffer);
		return EFAULT;
	}
	xfree(ver_str);

	safe_unpack_time(&buf_time, buffer);
	safe_unpack32( &saved_job_id, buffer);
	job_id_sequence = MAX(saved_job_id, job_id_sequence);
	debug3("Job id in job_state header is %u", saved_job_id);

	/* Unpack one job record per iteration until the buffer is drained */
	while (remaining_buf(buffer) > 0) {
		error_code = _load_job_state(buffer, protocol_version);
		if (error_code != SLURM_SUCCESS)
			goto unpack_error;
		job_cnt++;
	}
	debug3("Set job_id_sequence to %u", job_id_sequence);

	free_buf(buffer);
	info("Recovered information about %d jobs", job_cnt);
	return error_code;

unpack_error:
	/* The safe_unpack* macros jump here on any truncated or
	 * corrupt field */
	error("Incomplete job data checkpoint file");
	info("Recovered information about %d jobs", job_cnt);
	free_buf(buffer);
	return SLURM_FAILURE;
}
/*
 * load_last_job_id - load only the last job ID from state save file.
 *	Changes here should be reflected in load_all_job_state().
 * RET 0 or error code
 */
extern int load_last_job_id( void )
{
	int data_allocated, data_read = 0, error_code = SLURM_SUCCESS;
	uint32_t data_size = 0;
	int state_fd;
	char *data = NULL, *state_file;
	Buf buffer;
	time_t buf_time;
	char *ver_str = NULL;
	uint32_t ver_str_len;

	/* read the file */
	state_file = slurm_get_state_save_location();
	xstrcat(state_file, "/job_state");
	lock_state_files();
	state_fd = open(state_file, O_RDONLY);
	if (state_fd < 0) {
		debug("No job state file (%s) to recover", state_file);
		error_code = ENOENT;
	} else {
		data_allocated = BUF_SIZE;
		data = xmalloc(data_allocated);
		/* Slurp the whole file into "data", growing the buffer
		 * BUF_SIZE at a time; reads interrupted by signals (EINTR)
		 * are retried */
		while (1) {
			data_read = read(state_fd, &data[data_size],
					 BUF_SIZE);
			if (data_read < 0) {
				if (errno == EINTR)
					continue;
				else {
					error("Read error on %s: %m",
					      state_file);
					break;
				}
			} else if (data_read == 0)	/* eof */
				break;
			data_size += data_read;
			data_allocated += data_read;
			xrealloc(data, data_allocated);
		}
		close(state_fd);
	}
	xfree(state_file);
	unlock_state_files();

	if (error_code)
		return error_code;

	buffer = create_buf(data, data_size);

	/* Only the current format is accepted here (unlike
	 * load_all_job_state(), no older versions are handled) */
	safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
	debug3("Version string in job_state header is %s", ver_str);
	if ((!ver_str) || (strcmp(ver_str, JOB_STATE_VERSION) != 0)) {
		debug("*************************************************");
		debug("Can not recover last job ID, incompatible version");
		debug("*************************************************");
		xfree(ver_str);
		free_buf(buffer);
		return EFAULT;
	}
	xfree(ver_str);

	safe_unpack_time(&buf_time, buffer);
	safe_unpack32( &job_id_sequence, buffer);
	debug3("Job ID in job_state header is %u", job_id_sequence);

	/* Ignore the state for individual jobs stored here */

	free_buf(buffer);
	return error_code;

unpack_error:
	/* The safe_unpack* macros jump here on any truncated or
	 * corrupt field */
	debug("Invalid job data checkpoint file");
	free_buf(buffer);
	return SLURM_FAILURE;
}
/*
 * _dump_job_state - dump the state of a specific job, its details, and
 *	steps to a buffer
 * IN dump_job_ptr - pointer to job for which information is requested
 * IN/OUT buffer - location to store data, pointers automatically advanced
 * NOTE: the pack order below is a wire-format contract: it must exactly
 *	match the unpack order in _load_job_state()
 */
static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer)
{
	struct job_details *detail_ptr;
	ListIterator step_iterator;
	struct step_record *step_ptr;

	/* Dump basic job info */
	pack32(dump_job_ptr->assoc_id, buffer);
	pack32(dump_job_ptr->job_id, buffer);
	pack32(dump_job_ptr->user_id, buffer);
	pack32(dump_job_ptr->group_id, buffer);
	pack32(dump_job_ptr->time_limit, buffer);
	pack32(dump_job_ptr->time_min, buffer);
	pack32(dump_job_ptr->priority, buffer);
	pack32(dump_job_ptr->alloc_sid, buffer);
	pack32(dump_job_ptr->total_cpus, buffer);
	pack32(dump_job_ptr->total_nodes, buffer);
	pack32(dump_job_ptr->cpu_cnt, buffer);
	pack32(dump_job_ptr->exit_code, buffer);
	pack32(dump_job_ptr->derived_ec, buffer);
	pack32(dump_job_ptr->db_index, buffer);
	pack32(dump_job_ptr->resv_id, buffer);
	pack32(dump_job_ptr->next_step_id, buffer);
	pack32(dump_job_ptr->qos_id, buffer);
	pack32(dump_job_ptr->req_switch, buffer);
	pack32(dump_job_ptr->wait4switch, buffer);

	pack_time(dump_job_ptr->preempt_time, buffer);
	pack_time(dump_job_ptr->start_time, buffer);
	pack_time(dump_job_ptr->end_time, buffer);
	pack_time(dump_job_ptr->suspend_time, buffer);
	pack_time(dump_job_ptr->pre_sus_time, buffer);
	pack_time(dump_job_ptr->resize_time, buffer);
	pack_time(dump_job_ptr->tot_sus_time, buffer);

	pack16(dump_job_ptr->direct_set_prio, buffer);
	pack16(dump_job_ptr->job_state, buffer);
	pack16(dump_job_ptr->kill_on_node_fail, buffer);
	pack16(dump_job_ptr->batch_flag, buffer);
	pack16(dump_job_ptr->mail_type, buffer);
	pack16(dump_job_ptr->state_reason, buffer);
	pack16(dump_job_ptr->restart_cnt, buffer);
	pack16(dump_job_ptr->resv_flags, buffer);
	pack16(dump_job_ptr->wait_all_nodes, buffer);
	pack16(dump_job_ptr->warn_signal, buffer);
	pack16(dump_job_ptr->warn_time, buffer);
	pack16(dump_job_ptr->limit_set_max_cpus, buffer);
	pack16(dump_job_ptr->limit_set_max_nodes, buffer);
	pack16(dump_job_ptr->limit_set_min_cpus, buffer);
	pack16(dump_job_ptr->limit_set_min_nodes, buffer);
	pack16(dump_job_ptr->limit_set_time, buffer);

	packstr(dump_job_ptr->state_desc, buffer);
	packstr(dump_job_ptr->resp_host, buffer);

	pack16(dump_job_ptr->alloc_resp_port, buffer);
	pack16(dump_job_ptr->other_port, buffer);

	/* nodes_completing is only written for jobs in the COMPLETING
	 * state, so the unpack side is state-dependent as well */
	if (IS_JOB_COMPLETING(dump_job_ptr)) {
		if (dump_job_ptr->nodes_completing == NULL) {
			dump_job_ptr->nodes_completing =
				bitmap2node_name(dump_job_ptr->node_bitmap);
		}
		packstr(dump_job_ptr->nodes_completing, buffer);
	}
	packstr(dump_job_ptr->nodes, buffer);
	packstr(dump_job_ptr->partition, buffer);
	packstr(dump_job_ptr->name, buffer);
	packstr(dump_job_ptr->wckey, buffer);
	packstr(dump_job_ptr->alloc_node, buffer);
	packstr(dump_job_ptr->account, buffer);
	packstr(dump_job_ptr->comment, buffer);
	packstr(dump_job_ptr->gres, buffer);
	packstr(dump_job_ptr->network, buffer);
	packstr(dump_job_ptr->licenses, buffer);
	packstr(dump_job_ptr->mail_user, buffer);
	packstr(dump_job_ptr->resv_name, buffer);
	packstr(dump_job_ptr->batch_host, buffer);

	/* Plugin-owned state is packed via each plugin's own routine */
	select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
				     buffer, SLURM_PROTOCOL_VERSION);
	pack_job_resources(dump_job_ptr->job_resrcs, buffer,
			   SLURM_PROTOCOL_VERSION);

	pack16(dump_job_ptr->ckpt_interval, buffer);
	checkpoint_pack_jobinfo(dump_job_ptr->check_job, buffer,
				SLURM_PROTOCOL_VERSION);
	packstr_array(dump_job_ptr->spank_job_env,
		      dump_job_ptr->spank_job_env_size, buffer);
	(void) gres_plugin_job_state_pack(dump_job_ptr->gres_list, buffer,
					  dump_job_ptr->job_id, true,
					  SLURM_PROTOCOL_VERSION);

	/* Dump job details, if available; a flag word tells the unpack
	 * side whether a details record follows */
	detail_ptr = dump_job_ptr->details;
	if (detail_ptr) {
		xassert (detail_ptr->magic == DETAILS_MAGIC);
		pack16((uint16_t) DETAILS_FLAG, buffer);
		_dump_job_details(detail_ptr, buffer);
	} else
		pack16((uint16_t) 0, buffer);	/* no details flag */

	/* Dump job steps: one STEP_FLAG-prefixed record per step,
	 * terminated by a zero flag word */
	step_iterator = list_iterator_create(dump_job_ptr->step_list);
	while ((step_ptr = (struct step_record *)
		list_next(step_iterator))) {
		pack16((uint16_t) STEP_FLAG, buffer);
		dump_job_step_state(dump_job_ptr, step_ptr, buffer);
	}
	list_iterator_destroy(step_iterator);
	pack16((uint16_t) 0, buffer);	/* no step flag */
}
/* Unpack a job's state information from a buffer */
/*
 * _load_job_state - Unpack one job's state record (written by the matching
 *	dump routine) and rebuild or update the corresponding job_record in
 *	the global job list. The unpack order in each branch is the on-disk
 *	format for that protocol version and must not be altered.
 * IN buffer - buffer positioned at the start of one job record
 * IN protocol_version - SLURM protocol version that wrote the state file
 * RET SLURM_SUCCESS, or SLURM_FAILURE on an incomplete/invalid record
 *	(any partially-built job record is purged before returning)
 */
static int _load_job_state(Buf buffer, uint16_t protocol_version)
{
uint32_t job_id, user_id, group_id, time_limit, priority, alloc_sid;
uint32_t exit_code, assoc_id, db_index, name_len, time_min;
uint32_t next_step_id, total_cpus, total_nodes = 0, cpu_cnt;
uint32_t resv_id, spank_job_env_size = 0, qos_id, derived_ec = 0;
uint32_t req_switch = 0, wait4switch = 0;
time_t start_time, end_time, suspend_time, pre_sus_time, tot_sus_time;
time_t preempt_time = 0;
time_t resize_time = 0, now = time(NULL);
uint16_t job_state, details, batch_flag, step_flag;
uint16_t kill_on_node_fail, direct_set_prio;
uint16_t alloc_resp_port, other_port, mail_type, state_reason;
uint16_t restart_cnt, resv_flags, ckpt_interval;
uint16_t wait_all_nodes, warn_signal, warn_time;
uint16_t limit_set_max_cpus = 0, limit_set_max_nodes = 0,
limit_set_min_cpus = 0, limit_set_min_nodes = 0,
limit_set_time = 0;
/* All string pointers start NULL so the unpack_error path can xfree()
 * them unconditionally. */
char *nodes = NULL, *partition = NULL, *name = NULL, *resp_host = NULL;
char *account = NULL, *network = NULL, *mail_user = NULL;
char *comment = NULL, *nodes_completing = NULL, *alloc_node = NULL;
char *licenses = NULL, *state_desc = NULL, *wckey = NULL;
char *resv_name = NULL, *gres = NULL, *batch_host = NULL;
char **spank_job_env = (char **) NULL;
List gres_list = NULL, part_ptr_list = NULL;
struct job_record *job_ptr = NULL;
struct part_record *part_ptr;
int error_code, i, qos_error;
dynamic_plugin_data_t *select_jobinfo = NULL;
job_resources_t *job_resources = NULL;
check_jobinfo_t check_job = NULL;
slurmdb_association_rec_t assoc_rec;
slurmdb_qos_rec_t qos_rec;
bool job_finished = false;
/* ---- SLURM 2.3 state-file layout ---- */
if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
safe_unpack32(&assoc_id, buffer);
safe_unpack32(&job_id, buffer);
/* validity test as possible */
if (job_id == 0) {
verbose("Invalid job_id %u", job_id);
goto unpack_error;
}
/* Reuse an existing record (e.g. duplicate entry in the state
 * file) or create a fresh one and hash it. */
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
job_ptr = create_job_record(&error_code);
if (error_code) {
error("Create job entry failed for job_id %u",
job_id);
goto unpack_error;
}
job_ptr->job_id = job_id;
_add_job_hash(job_ptr);
}
safe_unpack32(&user_id, buffer);
safe_unpack32(&group_id, buffer);
safe_unpack32(&time_limit, buffer);
safe_unpack32(&time_min, buffer);
safe_unpack32(&priority, buffer);
safe_unpack32(&alloc_sid, buffer);
safe_unpack32(&total_cpus, buffer);
safe_unpack32(&total_nodes, buffer);
safe_unpack32(&cpu_cnt, buffer);
safe_unpack32(&exit_code, buffer);
safe_unpack32(&derived_ec, buffer);
safe_unpack32(&db_index, buffer);
safe_unpack32(&resv_id, buffer);
safe_unpack32(&next_step_id, buffer);
safe_unpack32(&qos_id, buffer);
safe_unpack32(&req_switch, buffer);
safe_unpack32(&wait4switch, buffer);
safe_unpack_time(&preempt_time, buffer);
safe_unpack_time(&start_time, buffer);
safe_unpack_time(&end_time, buffer);
safe_unpack_time(&suspend_time, buffer);
safe_unpack_time(&pre_sus_time, buffer);
safe_unpack_time(&resize_time, buffer);
safe_unpack_time(&tot_sus_time, buffer);
safe_unpack16(&direct_set_prio, buffer);
safe_unpack16(&job_state, buffer);
safe_unpack16(&kill_on_node_fail, buffer);
safe_unpack16(&batch_flag, buffer);
safe_unpack16(&mail_type, buffer);
safe_unpack16(&state_reason, buffer);
safe_unpack16(&restart_cnt, buffer);
safe_unpack16(&resv_flags, buffer);
safe_unpack16(&wait_all_nodes, buffer);
safe_unpack16(&warn_signal, buffer);
safe_unpack16(&warn_time, buffer);
safe_unpack16(&limit_set_max_cpus, buffer);
safe_unpack16(&limit_set_max_nodes, buffer);
safe_unpack16(&limit_set_min_cpus, buffer);
safe_unpack16(&limit_set_min_nodes, buffer);
safe_unpack16(&limit_set_time, buffer);
safe_unpackstr_xmalloc(&state_desc, &name_len, buffer);
safe_unpackstr_xmalloc(&resp_host, &name_len, buffer);
safe_unpack16(&alloc_resp_port, buffer);
safe_unpack16(&other_port, buffer);
/* nodes_completing is only written when the job was in the
 * COMPLETING state at dump time */
if (job_state & JOB_COMPLETING) {
safe_unpackstr_xmalloc(&nodes_completing,
&name_len, buffer);
}
safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
safe_unpackstr_xmalloc(&partition, &name_len, buffer);
if (partition == NULL) {
error("No partition for job %u", job_id);
goto unpack_error;
}
part_ptr = find_part_record (partition);
if (part_ptr == NULL) {
/* "partition" may be a comma-separated list; resolve
 * it to a list of part_records and use the first */
part_ptr_list = get_part_list(partition);
if (part_ptr_list)
part_ptr = list_peek(part_ptr_list);
}
if (part_ptr == NULL) {
verbose("Invalid partition (%s) for job_id %u",
partition, job_id);
/* not fatal error, partition could have been removed,
 * reset_job_bitmaps() will clean-up this job */
}
safe_unpackstr_xmalloc(&name, &name_len, buffer);
safe_unpackstr_xmalloc(&wckey, &name_len, buffer);
safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
safe_unpackstr_xmalloc(&account, &name_len, buffer);
safe_unpackstr_xmalloc(&comment, &name_len, buffer);
safe_unpackstr_xmalloc(&gres, &name_len, buffer);
safe_unpackstr_xmalloc(&network, &name_len, buffer);
safe_unpackstr_xmalloc(&licenses, &name_len, buffer);
safe_unpackstr_xmalloc(&mail_user, &name_len, buffer);
safe_unpackstr_xmalloc(&resv_name, &name_len, buffer);
safe_unpackstr_xmalloc(&batch_host, &name_len, buffer);
if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer,
protocol_version))
goto unpack_error;
if (unpack_job_resources(&job_resources, buffer,
protocol_version))
goto unpack_error;
safe_unpack16(&ckpt_interval, buffer);
if (checkpoint_alloc_jobinfo(&check_job) ||
checkpoint_unpack_jobinfo(check_job, buffer,
protocol_version))
goto unpack_error;
safe_unpackstr_array(&spank_job_env, &spank_job_env_size,
buffer);
if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id,
protocol_version) !=
SLURM_SUCCESS)
goto unpack_error;
gres_plugin_job_state_log(gres_list, job_id);
/* Optional details record follows, flagged by DETAILS_FLAG */
safe_unpack16(&details, buffer);
if ((details == DETAILS_FLAG) &&
(_load_job_details(job_ptr, buffer, protocol_version))) {
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_SYSTEM;
xfree(job_ptr->state_desc);
job_ptr->end_time = now;
goto unpack_error;
}
/* Step records follow, each preceded by STEP_FLAG; a zero
 * flag terminates the list */
safe_unpack16(&step_flag, buffer);
while (step_flag == STEP_FLAG) {
/* No need to put these into accounting if they
 * haven't been since all information will be
 * put in when the job is finished.
 */
if ((error_code = load_step_state(job_ptr, buffer,
protocol_version)))
goto unpack_error;
safe_unpack16(&step_flag, buffer);
}
/* ---- SLURM 2.2 state-file layout ---- */
} else if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION) {
safe_unpack32(&assoc_id, buffer);
safe_unpack32(&job_id, buffer);
/* validity test as possible */
if (job_id == 0) {
verbose("Invalid job_id %u", job_id);
goto unpack_error;
}
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
job_ptr = create_job_record(&error_code);
if (error_code) {
error("Create job entry failed for job_id %u",
job_id);
goto unpack_error;
}
job_ptr->job_id = job_id;
_add_job_hash(job_ptr);
}
safe_unpack32(&user_id, buffer);
safe_unpack32(&group_id, buffer);
safe_unpack32(&time_limit, buffer);
safe_unpack32(&time_min, buffer);
safe_unpack32(&priority, buffer);
safe_unpack32(&alloc_sid, buffer);
safe_unpack32(&total_cpus, buffer);
safe_unpack32(&total_nodes, buffer);
safe_unpack32(&cpu_cnt, buffer);
safe_unpack32(&exit_code, buffer);
safe_unpack32(&derived_ec, buffer);
safe_unpack32(&db_index, buffer);
/* assoc_id appears twice in the 2.2 format; the second
 * occurrence overwrites the first */
safe_unpack32(&assoc_id, buffer);
safe_unpack32(&resv_id, buffer);
safe_unpack32(&next_step_id, buffer);
safe_unpack32(&qos_id, buffer);
safe_unpack_time(&start_time, buffer);
safe_unpack_time(&end_time, buffer);
safe_unpack_time(&suspend_time, buffer);
safe_unpack_time(&pre_sus_time, buffer);
safe_unpack_time(&resize_time, buffer);
safe_unpack_time(&tot_sus_time, buffer);
safe_unpack16(&direct_set_prio, buffer);
safe_unpack16(&job_state, buffer);
safe_unpack16(&kill_on_node_fail, buffer);
safe_unpack16(&batch_flag, buffer);
safe_unpack16(&mail_type, buffer);
safe_unpack16(&state_reason, buffer);
safe_unpack16(&restart_cnt, buffer);
safe_unpack16(&resv_flags, buffer);
safe_unpack16(&wait_all_nodes, buffer);
safe_unpack16(&warn_signal, buffer);
safe_unpack16(&warn_time, buffer);
safe_unpack16(&limit_set_max_cpus, buffer);
safe_unpack16(&limit_set_max_nodes, buffer);
safe_unpack16(&limit_set_min_cpus, buffer);
safe_unpack16(&limit_set_min_nodes, buffer);
safe_unpack16(&limit_set_time, buffer);
safe_unpackstr_xmalloc(&state_desc, &name_len, buffer);
safe_unpackstr_xmalloc(&resp_host, &name_len, buffer);
safe_unpack16(&alloc_resp_port, buffer);
safe_unpack16(&other_port, buffer);
if (job_state & JOB_COMPLETING) {
safe_unpackstr_xmalloc(&nodes_completing,
&name_len, buffer);
}
safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
safe_unpackstr_xmalloc(&partition, &name_len, buffer);
if (partition == NULL) {
error("No partition for job %u", job_id);
goto unpack_error;
}
part_ptr = find_part_record (partition);
if (part_ptr == NULL) {
part_ptr_list = get_part_list(partition);
if (part_ptr_list)
part_ptr = list_peek(part_ptr_list);
}
if (part_ptr == NULL) {
verbose("Invalid partition (%s) for job_id %u",
partition, job_id);
/* not fatal error, partition could have been removed,
 * reset_job_bitmaps() will clean-up this job */
}
safe_unpackstr_xmalloc(&name, &name_len, buffer);
safe_unpackstr_xmalloc(&wckey, &name_len, buffer);
safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
safe_unpackstr_xmalloc(&account, &name_len, buffer);
safe_unpackstr_xmalloc(&comment, &name_len, buffer);
safe_unpackstr_xmalloc(&gres, &name_len, buffer);
safe_unpackstr_xmalloc(&network, &name_len, buffer);
safe_unpackstr_xmalloc(&licenses, &name_len, buffer);
safe_unpackstr_xmalloc(&mail_user, &name_len, buffer);
safe_unpackstr_xmalloc(&resv_name, &name_len, buffer);
/* Note: 2.2 has no batch_host field (added in 2.3) */
if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer,
protocol_version))
goto unpack_error;
if (unpack_job_resources(&job_resources, buffer,
protocol_version))
goto unpack_error;
safe_unpack16(&ckpt_interval, buffer);
if (checkpoint_alloc_jobinfo(&check_job) ||
checkpoint_unpack_jobinfo(check_job, buffer,
protocol_version))
goto unpack_error;
safe_unpackstr_array(&spank_job_env, &spank_job_env_size,
buffer);
if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id,
protocol_version) !=
SLURM_SUCCESS)
goto unpack_error;
gres_plugin_job_state_log(gres_list, job_id);
safe_unpack16(&details, buffer);
if ((details == DETAILS_FLAG) &&
(_load_job_details(job_ptr, buffer, protocol_version))) {
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_SYSTEM;
xfree(job_ptr->state_desc);
job_ptr->end_time = now;
goto unpack_error;
}
safe_unpack16(&step_flag, buffer);
while (step_flag == STEP_FLAG) {
/* No need to put these into accounting if they
 * haven't been since all information will be
 * put in when the job is finished.
 */
if ((error_code = load_step_state(job_ptr, buffer,
protocol_version)))
goto unpack_error;
safe_unpack16(&step_flag, buffer);
}
/* ---- SLURM 2.1 state-file layout ---- */
} else if (protocol_version >= SLURM_2_1_PROTOCOL_VERSION) {
uint16_t kill_on_step_done;
uint32_t min_cpus;
safe_unpack32(&assoc_id, buffer);
safe_unpack32(&job_id, buffer);
/* validity test as possible */
if (job_id == 0) {
verbose("Invalid job_id %u", job_id);
goto unpack_error;
}
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
job_ptr = create_job_record(&error_code);
if (error_code) {
error("Create job entry failed for job_id %u",
job_id);
goto unpack_error;
}
job_ptr->job_id = job_id;
_add_job_hash(job_ptr);
}
safe_unpack32(&user_id, buffer);
safe_unpack32(&group_id, buffer);
safe_unpack32(&time_limit, buffer);
/* time_min does not exist in 2.1 state; default to 0 */
time_min = 0;
safe_unpack32(&priority, buffer);
safe_unpack32(&alloc_sid, buffer);
safe_unpack32(&min_cpus, buffer);
safe_unpack32(&total_cpus, buffer);
safe_unpack32(&cpu_cnt, buffer);
safe_unpack32(&exit_code, buffer);
safe_unpack32(&db_index, buffer);
safe_unpack32(&assoc_id, buffer);
safe_unpack32(&resv_id, buffer);
safe_unpack32(&next_step_id, buffer);
safe_unpack_time(&start_time, buffer);
safe_unpack_time(&end_time, buffer);
safe_unpack_time(&suspend_time, buffer);
safe_unpack_time(&pre_sus_time, buffer);
safe_unpack_time(&tot_sus_time, buffer);
safe_unpack16(&direct_set_prio, buffer);
safe_unpack16(&job_state, buffer);
safe_unpack16(&kill_on_node_fail, buffer);
safe_unpack16(&kill_on_step_done, buffer);
safe_unpack16(&batch_flag, buffer);
safe_unpack16(&mail_type, buffer);
/* NOTE(review): qos_id is a uint32_t but only its first 16
 * bits are written through this cast; the remaining bytes are
 * indeterminate (and the result is endian-dependent) --
 * confirm whether the 2.1 path is still expected to work */
safe_unpack16((uint16_t *)&qos_id, buffer);
safe_unpack16(&state_reason, buffer);
safe_unpack16(&restart_cnt, buffer);
safe_unpack16(&resv_flags, buffer);
safe_unpack16(&warn_signal, buffer);
safe_unpack16(&warn_time, buffer);
safe_unpackstr_xmalloc(&state_desc, &name_len, buffer);
safe_unpackstr_xmalloc(&resp_host, &name_len, buffer);
safe_unpack16(&alloc_resp_port, buffer);
safe_unpack16(&other_port, buffer);
if (job_state & JOB_COMPLETING) {
safe_unpackstr_xmalloc(&nodes_completing,
&name_len, buffer);
}
safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
safe_unpackstr_xmalloc(&partition, &name_len, buffer);
if (partition == NULL) {
error("No partition for job %u", job_id);
goto unpack_error;
}
part_ptr = find_part_record (partition);
if (part_ptr == NULL) {
verbose("Invalid partition (%s) for job_id %u",
partition, job_id);
/* not fatal error, partition could have been removed,
 * reset_job_bitmaps() will clean-up this job */
}
safe_unpackstr_xmalloc(&name, &name_len, buffer);
safe_unpackstr_xmalloc(&wckey, &name_len, buffer);
safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
safe_unpackstr_xmalloc(&account, &name_len, buffer);
safe_unpackstr_xmalloc(&comment, &name_len, buffer);
safe_unpackstr_xmalloc(&network, &name_len, buffer);
safe_unpackstr_xmalloc(&licenses, &name_len, buffer);
safe_unpackstr_xmalloc(&mail_user, &name_len, buffer);
safe_unpackstr_xmalloc(&resv_name, &name_len, buffer);
if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer,
protocol_version))
goto unpack_error;
if (unpack_job_resources(&job_resources, buffer,
protocol_version))
goto unpack_error;
/* Below is needed in 2.2 but isn't set in 2.1 so we will
 * use an educated guess to make things work correctly */
if (job_resources != NULL) {
xfree(job_resources->nodes);
job_resources->nodes = xstrdup(nodes);
}
/**************************************/
safe_unpack16(&ckpt_interval, buffer);
if (checkpoint_alloc_jobinfo(&check_job) ||
checkpoint_unpack_jobinfo(check_job, buffer,
protocol_version))
goto unpack_error;
safe_unpackstr_array(&spank_job_env, &spank_job_env_size,
buffer);
safe_unpack16(&details, buffer);
if ((details == DETAILS_FLAG) &&
(_load_job_details(job_ptr, buffer, protocol_version))) {
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_SYSTEM;
xfree(job_ptr->state_desc);
job_ptr->end_time = now;
goto unpack_error;
}
safe_unpack16(&step_flag, buffer);
/* NOTE(review): job_ptr->details is dereferenced without a
 * NULL check; if the details flag above was absent this
 * relies on create_job_record() having allocated a details
 * record -- confirm */
job_ptr->details->min_cpus = min_cpus;
while (step_flag == STEP_FLAG) {
/* No need to put these into accounting if they
 * haven't been since all information will be
 * put in when the job is finished.
 */
if ((error_code = load_step_state(job_ptr, buffer,
protocol_version)))
goto unpack_error;
safe_unpack16(&step_flag, buffer);
}
/* wait_all_nodes does not exist in 2.1 state */
wait_all_nodes = DEFAULT_WAIT_ALL_NODES;
} else
goto unpack_error;
/* ---- version-independent validation ---- */
if (((job_state & JOB_STATE_BASE) >= JOB_END) ||
(batch_flag > 2)) {
error("Invalid data for job %u: "
"job_state=%u batch_flag=%u",
job_id, job_state, batch_flag);
goto unpack_error;
}
if (kill_on_node_fail > 1) {
error("Invalid data for job %u: kill_on_node_fail=%u",
job_id, kill_on_node_fail);
goto unpack_error;
}
/* Track the priority range of recovered jobs for the scheduler */
if (priority > 1) {
highest_prio = MAX(highest_prio, priority);
lowest_prio = MIN(lowest_prio, priority);
}
/* Keep the job-id generator ahead of every recovered id */
if (job_id_sequence <= job_id)
job_id_sequence = job_id + 1;
/* ---- transfer unpacked values into the job record; string
 * ownership moves to job_ptr (locals are NULLed, not freed) ---- */
xfree(job_ptr->account);
job_ptr->account = account;
xstrtolower(job_ptr->account);
account = NULL; /* reused, nothing left to free */
xfree(job_ptr->alloc_node);
job_ptr->alloc_node = alloc_node;
alloc_node = NULL; /* reused, nothing left to free */
job_ptr->alloc_resp_port = alloc_resp_port;
job_ptr->alloc_sid = alloc_sid;
job_ptr->assoc_id = assoc_id;
job_ptr->batch_flag = batch_flag;
xfree(job_ptr->batch_host);
job_ptr->batch_host = batch_host;
batch_host = NULL; /* reused, nothing left to free */
xfree(job_ptr->comment);
job_ptr->comment = comment;
comment = NULL; /* reused, nothing left to free */
xfree(job_ptr->gres);
job_ptr->gres = gres;
gres = NULL; /* reused, nothing left to free */
job_ptr->gres_list = gres_list;
job_ptr->direct_set_prio = direct_set_prio;
job_ptr->db_index = db_index;
job_ptr->derived_ec = derived_ec;
job_ptr->end_time = end_time;
job_ptr->exit_code = exit_code;
job_ptr->group_id = group_id;
job_ptr->job_state = job_state;
job_ptr->kill_on_node_fail = kill_on_node_fail;
xfree(job_ptr->licenses);
job_ptr->licenses = licenses;
licenses = NULL; /* reused, nothing left to free */
job_ptr->mail_type = mail_type;
xfree(job_ptr->mail_user);
job_ptr->mail_user = mail_user;
mail_user = NULL; /* reused, nothing left to free */
xfree(job_ptr->name); /* in case duplicate record */
job_ptr->name = name;
name = NULL; /* reused, nothing left to free */
xfree(job_ptr->wckey); /* in case duplicate record */
job_ptr->wckey = wckey;
xstrtolower(job_ptr->wckey);
wckey = NULL; /* reused, nothing left to free */
xfree(job_ptr->network);
job_ptr->network = network;
network = NULL; /* reused, nothing left to free */
job_ptr->next_step_id = next_step_id;
xfree(job_ptr->nodes); /* in case duplicate record */
job_ptr->nodes = nodes;
nodes = NULL; /* reused, nothing left to free */
if (nodes_completing) {
xfree(job_ptr->nodes_completing);
job_ptr->nodes_completing = nodes_completing;
nodes_completing = NULL; /* reused, nothing left to free */
}
job_ptr->other_port = other_port;
xfree(job_ptr->partition);
job_ptr->partition = partition;
partition = NULL; /* reused, nothing left to free */
job_ptr->part_ptr = part_ptr;
job_ptr->part_ptr_list = part_ptr_list;
job_ptr->pre_sus_time = pre_sus_time;
job_ptr->priority = priority;
job_ptr->qos_id = qos_id;
xfree(job_ptr->resp_host);
job_ptr->resp_host = resp_host;
resp_host = NULL; /* reused, nothing left to free */
job_ptr->resize_time = resize_time;
job_ptr->restart_cnt = restart_cnt;
job_ptr->resv_id = resv_id;
job_ptr->resv_name = resv_name;
resv_name = NULL; /* reused, nothing left to free */
job_ptr->resv_flags = resv_flags;
job_ptr->select_jobinfo = select_jobinfo;
job_ptr->job_resrcs = job_resources;
job_ptr->spank_job_env = spank_job_env;
job_ptr->spank_job_env_size = spank_job_env_size;
job_ptr->ckpt_interval = ckpt_interval;
job_ptr->check_job = check_job;
job_ptr->start_time = start_time;
job_ptr->state_reason = state_reason;
job_ptr->state_desc = state_desc;
state_desc = NULL; /* reused, nothing left to free */
job_ptr->suspend_time = suspend_time;
job_ptr->time_last_active = now;
job_ptr->time_limit = time_limit;
job_ptr->time_min = time_min;
job_ptr->total_cpus = total_cpus;
job_ptr->total_nodes = total_nodes;
job_ptr->cpu_cnt = cpu_cnt;
job_ptr->tot_sus_time = tot_sus_time;
job_ptr->preempt_time = preempt_time;
job_ptr->user_id = user_id;
job_ptr->wait_all_nodes = wait_all_nodes;
job_ptr->warn_signal = warn_signal;
job_ptr->warn_time = warn_time;
job_ptr->limit_set_max_cpus = limit_set_max_cpus;
job_ptr->limit_set_max_nodes = limit_set_max_nodes;
job_ptr->limit_set_min_cpus = limit_set_min_cpus;
job_ptr->limit_set_min_nodes = limit_set_min_nodes;
job_ptr->limit_set_time = limit_set_time;
job_ptr->req_switch = req_switch;
job_ptr->wait4switch = wait4switch;
/* This needs to always to initialized to "true". The select
plugin will deal with it every time it goes through the
logic if req_switch or wait4switch are set.
*/
job_ptr->best_switch = true;
/* ---- re-link the job to its accounting association and QOS ---- */
memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t));
/*
* For speed and accurracy we will first see if we once had an
* association record. If not look for it by
* account,partition, user_id.
*/
if(job_ptr->assoc_id)
assoc_rec.id = job_ptr->assoc_id;
else {
assoc_rec.acct = job_ptr->account;
assoc_rec.partition = job_ptr->partition;
assoc_rec.uid = job_ptr->user_id;
}
if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
accounting_enforce,
(slurmdb_association_rec_t **)
&job_ptr->assoc_ptr) &&
(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)
&& (!IS_JOB_FINISHED(job_ptr))) {
info("Cancelling job %u with invalid association",
job_id);
job_ptr->job_state = JOB_CANCELLED;
job_ptr->state_reason = FAIL_ACCOUNT;
xfree(job_ptr->state_desc);
if (IS_JOB_PENDING(job_ptr))
job_ptr->start_time = now;
job_ptr->end_time = now;
job_completion_logger(job_ptr, false);
job_finished = 1;
} else {
job_ptr->assoc_id = assoc_rec.id;
info("Recovered job %u %u", job_id, job_ptr->assoc_id);
/* make sure we have started this job in accounting */
if (!job_ptr->db_index) {
debug("starting job %u in accounting",
job_ptr->job_id);
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
if (IS_JOB_SUSPENDED(job_ptr)) {
jobacct_storage_g_job_suspend(acct_db_conn,
job_ptr);
}
}
/* make sure we have this job completed in the
* database */
if(IS_JOB_FINISHED(job_ptr)) {
jobacct_storage_g_job_complete(acct_db_conn, job_ptr);
job_finished = 1;
}
}
if (!job_finished && job_ptr->qos_id) {
memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t));
qos_rec.id = job_ptr->qos_id;
job_ptr->qos_ptr = _determine_and_validate_qos(
job_ptr->assoc_ptr, &qos_rec, &qos_error);
if (qos_error != SLURM_SUCCESS) {
info("Cancelling job %u with invalid qos", job_id);
job_ptr->job_state = JOB_CANCELLED;
job_ptr->state_reason = FAIL_QOS;
xfree(job_ptr->state_desc);
if (IS_JOB_PENDING(job_ptr))
job_ptr->start_time = now;
job_ptr->end_time = now;
job_completion_logger(job_ptr, false);
job_finished = 1;
}
job_ptr->qos_id = qos_rec.id;
}
build_node_details(job_ptr); /* set node_addr */
return SLURM_SUCCESS;
/* Release everything still owned by locals and purge the (partial)
 * job record; reached only before ownership moved to job_ptr */
unpack_error:
error("Incomplete job record");
xfree(alloc_node);
xfree(account);
xfree(batch_host);
xfree(comment);
xfree(gres);
xfree(resp_host);
xfree(licenses);
xfree(mail_user);
xfree(name);
xfree(nodes);
xfree(nodes_completing);
xfree(partition);
FREE_NULL_LIST(part_ptr_list);
xfree(resv_name);
for (i=0; i<spank_job_env_size; i++)
xfree(spank_job_env[i]);
xfree(spank_job_env);
xfree(state_desc);
xfree(wckey);
select_g_select_jobinfo_free(select_jobinfo);
checkpoint_free_jobinfo(check_job);
if (job_ptr) {
if (job_ptr->job_id == 0)
job_ptr->job_id = NO_VAL;
_purge_job_record(job_ptr->job_id);
}
return SLURM_FAILURE;
}
/*
* _dump_job_details - dump the state of a specific job details to
* a buffer
* IN detail_ptr - pointer to job details for which information is requested
* IN/OUT buffer - location to store data, pointers automatically advanced
*/
void _dump_job_details(struct job_details *detail_ptr, Buf buffer)
{
/* The pack order below is the serialization format and must stay in
 * lock-step with the newest-version branch of _load_job_details() */
pack32(detail_ptr->min_cpus, buffer);
pack32(detail_ptr->max_cpus, buffer);
pack32(detail_ptr->min_nodes, buffer);
pack32(detail_ptr->max_nodes, buffer);
pack32(detail_ptr->num_tasks, buffer);
pack16(detail_ptr->acctg_freq, buffer);
pack16(detail_ptr->contiguous, buffer);
pack16(detail_ptr->cpus_per_task, buffer);
pack16(detail_ptr->nice, buffer);
pack16(detail_ptr->ntasks_per_node, buffer);
pack16(detail_ptr->requeue, buffer);
pack16(detail_ptr->shared, buffer);
pack16(detail_ptr->task_dist, buffer);
packstr(detail_ptr->cpu_bind, buffer);
pack16(detail_ptr->cpu_bind_type, buffer);
packstr(detail_ptr->mem_bind, buffer);
pack16(detail_ptr->mem_bind_type, buffer);
pack16(detail_ptr->plane_size, buffer);
pack8(detail_ptr->open_mode, buffer);
pack8(detail_ptr->overcommit, buffer);
pack8(detail_ptr->prolog_running, buffer);
pack32(detail_ptr->pn_min_cpus, buffer);
pack32(detail_ptr->pn_min_memory, buffer);
pack32(detail_ptr->pn_min_tmp_disk, buffer);
pack_time(detail_ptr->begin_time, buffer);
pack_time(detail_ptr->submit_time, buffer);
packstr(detail_ptr->req_nodes, buffer);
packstr(detail_ptr->exc_nodes, buffer);
packstr(detail_ptr->features, buffer);
packstr(detail_ptr->dependency, buffer);
packstr(detail_ptr->orig_dependency, buffer);
packstr(detail_ptr->std_err, buffer);
packstr(detail_ptr->std_in, buffer);
packstr(detail_ptr->std_out, buffer);
packstr(detail_ptr->work_dir, buffer);
packstr(detail_ptr->ckpt_dir, buffer);
packstr(detail_ptr->restart_dir, buffer);
/* mc_ptr is always written in the current protocol's layout since
 * the state file as a whole is stamped with this version */
pack_multi_core_data(detail_ptr->mc_ptr, buffer,
SLURM_PROTOCOL_VERSION);
packstr_array(detail_ptr->argv, detail_ptr->argc, buffer);
packstr_array(detail_ptr->env_sup, detail_ptr->env_cnt, buffer);
}
/* _load_job_details - Unpack a job details information from buffer */
/*
 * _load_job_details - Unpack a job's details record from the buffer and
 *	install it into job_ptr->details. The unpack order in each branch is
 *	the on-disk format for that protocol version and must not change.
 * IN job_ptr - job whose details record is overwritten on success
 * IN buffer - buffer positioned at the start of the details record
 * IN protocol_version - version of SLURM that wrote the state file
 * RET SLURM_SUCCESS, or SLURM_FAILURE on a truncated/invalid record
 *	(all locally-unpacked storage is released on failure)
 */
static int _load_job_details(struct job_record *job_ptr, Buf buffer,
uint16_t protocol_version)
{
/* All pointers are NULL-initialized so the unpack_error path can
 * xfree() them unconditionally, even if unpacking fails before they
 * were assigned (cpu_bind/mem_bind were previously indeterminate) */
char *req_nodes = NULL, *exc_nodes = NULL, *features = NULL;
char *cpu_bind = NULL, *dependency = NULL, *orig_dependency = NULL;
char *mem_bind = NULL;
char *err = NULL, *in = NULL, *out = NULL, *work_dir = NULL;
char *ckpt_dir = NULL, *restart_dir = NULL;
char **argv = (char **) NULL, **env_sup = (char **) NULL;
uint32_t min_nodes, max_nodes;
uint32_t min_cpus = 1, max_cpus = NO_VAL;
uint32_t pn_min_cpus, pn_min_memory, pn_min_tmp_disk;
uint32_t num_tasks, name_len, argc = 0, env_cnt = 0;
uint16_t shared, contiguous, nice, ntasks_per_node;
uint16_t acctg_freq, cpus_per_task, requeue, task_dist;
uint16_t cpu_bind_type, mem_bind_type, plane_size;
uint8_t open_mode, overcommit, prolog_running;
time_t begin_time, submit_time;
int i;
multi_core_data_t *mc_ptr = NULL;

	/* unpack the job's details from the buffer */
	if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
		safe_unpack32(&min_cpus, buffer);
		safe_unpack32(&max_cpus, buffer);
		safe_unpack32(&min_nodes, buffer);
		safe_unpack32(&max_nodes, buffer);
		safe_unpack32(&num_tasks, buffer);
		safe_unpack16(&acctg_freq, buffer);
		safe_unpack16(&contiguous, buffer);
		safe_unpack16(&cpus_per_task, buffer);
		safe_unpack16(&nice, buffer);
		safe_unpack16(&ntasks_per_node, buffer);
		safe_unpack16(&requeue, buffer);
		safe_unpack16(&shared, buffer);
		safe_unpack16(&task_dist, buffer);
		safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer);
		safe_unpack16(&cpu_bind_type, buffer);
		safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer);
		safe_unpack16(&mem_bind_type, buffer);
		safe_unpack16(&plane_size, buffer);
		safe_unpack8(&open_mode, buffer);
		safe_unpack8(&overcommit, buffer);
		safe_unpack8(&prolog_running, buffer);
		safe_unpack32(&pn_min_cpus, buffer);
		safe_unpack32(&pn_min_memory, buffer);
		safe_unpack32(&pn_min_tmp_disk, buffer);
		safe_unpack_time(&begin_time, buffer);
		safe_unpack_time(&submit_time, buffer);
		safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer);
		safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer);
		safe_unpackstr_xmalloc(&features, &name_len, buffer);
		safe_unpackstr_xmalloc(&dependency, &name_len, buffer);
		safe_unpackstr_xmalloc(&orig_dependency, &name_len, buffer);
		safe_unpackstr_xmalloc(&err, &name_len, buffer);
		safe_unpackstr_xmalloc(&in, &name_len, buffer);
		safe_unpackstr_xmalloc(&out, &name_len, buffer);
		safe_unpackstr_xmalloc(&work_dir, &name_len, buffer);
		safe_unpackstr_xmalloc(&ckpt_dir, &name_len, buffer);
		safe_unpackstr_xmalloc(&restart_dir, &name_len, buffer);
		if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version))
			goto unpack_error;
		safe_unpackstr_array(&argv, &argc, buffer);
		safe_unpackstr_array(&env_sup, &env_cnt, buffer);
	} else if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION) {
		safe_unpack32(&min_cpus, buffer);
		safe_unpack32(&max_cpus, buffer);
		safe_unpack32(&min_nodes, buffer);
		safe_unpack32(&max_nodes, buffer);
		safe_unpack32(&num_tasks, buffer);
		safe_unpack16(&acctg_freq, buffer);
		safe_unpack16(&contiguous, buffer);
		safe_unpack16(&cpus_per_task, buffer);
		safe_unpack16(&nice, buffer);
		safe_unpack16(&ntasks_per_node, buffer);
		safe_unpack16(&requeue, buffer);
		safe_unpack16(&shared, buffer);
		safe_unpack16(&task_dist, buffer);
		safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer);
		safe_unpack16(&cpu_bind_type, buffer);
		safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer);
		safe_unpack16(&mem_bind_type, buffer);
		safe_unpack16(&plane_size, buffer);
		safe_unpack8(&open_mode, buffer);
		safe_unpack8(&overcommit, buffer);
		safe_unpack8(&prolog_running, buffer);
		safe_unpack32(&pn_min_cpus, buffer);
		safe_unpack32(&pn_min_memory, buffer);
		safe_unpack32(&pn_min_tmp_disk, buffer);
		safe_unpack_time(&begin_time, buffer);
		safe_unpack_time(&submit_time, buffer);
		safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer);
		safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer);
		safe_unpackstr_xmalloc(&features, &name_len, buffer);
		safe_unpackstr_xmalloc(&dependency, &name_len, buffer);
		/* orig_dependency is not stored before 2.3; synthesize it
		 * from the (possibly already-updated) dependency string */
		orig_dependency = xstrdup(dependency);
		safe_unpackstr_xmalloc(&err, &name_len, buffer);
		safe_unpackstr_xmalloc(&in, &name_len, buffer);
		safe_unpackstr_xmalloc(&out, &name_len, buffer);
		safe_unpackstr_xmalloc(&work_dir, &name_len, buffer);
		safe_unpackstr_xmalloc(&ckpt_dir, &name_len, buffer);
		safe_unpackstr_xmalloc(&restart_dir, &name_len, buffer);
		if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version))
			goto unpack_error;
		safe_unpackstr_array(&argv, &argc, buffer);
		safe_unpackstr_array(&env_sup, &env_cnt, buffer);
	} else if (protocol_version >= SLURM_2_1_PROTOCOL_VERSION) {
		/* 2.1 lacks min_cpus/max_cpus; the defaults from the
		 * declarations above (1 / NO_VAL) remain in effect */
		safe_unpack32(&min_nodes, buffer);
		safe_unpack32(&max_nodes, buffer);
		safe_unpack32(&num_tasks, buffer);
		safe_unpack16(&acctg_freq, buffer);
		safe_unpack16(&contiguous, buffer);
		safe_unpack16(&cpus_per_task, buffer);
		safe_unpack16(&nice, buffer);
		safe_unpack16(&ntasks_per_node, buffer);
		safe_unpack16(&requeue, buffer);
		safe_unpack16(&shared, buffer);
		safe_unpack16(&task_dist, buffer);
		safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer);
		safe_unpack16(&cpu_bind_type, buffer);
		safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer);
		safe_unpack16(&mem_bind_type, buffer);
		safe_unpack16(&plane_size, buffer);
		safe_unpack8(&open_mode, buffer);
		safe_unpack8(&overcommit, buffer);
		safe_unpack8(&prolog_running, buffer);
		safe_unpack32(&pn_min_cpus, buffer);
		safe_unpack32(&pn_min_memory, buffer);
		safe_unpack32(&pn_min_tmp_disk, buffer);
		safe_unpack_time(&begin_time, buffer);
		safe_unpack_time(&submit_time, buffer);
		safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer);
		safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer);
		safe_unpackstr_xmalloc(&features, &name_len, buffer);
		safe_unpackstr_xmalloc(&dependency, &name_len, buffer);
		orig_dependency = xstrdup(dependency);
		safe_unpackstr_xmalloc(&err, &name_len, buffer);
		safe_unpackstr_xmalloc(&in, &name_len, buffer);
		safe_unpackstr_xmalloc(&out, &name_len, buffer);
		safe_unpackstr_xmalloc(&work_dir, &name_len, buffer);
		safe_unpackstr_xmalloc(&ckpt_dir, &name_len, buffer);
		safe_unpackstr_xmalloc(&restart_dir, &name_len, buffer);
		if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version))
			goto unpack_error;
		safe_unpackstr_array(&argv, &argc, buffer);
		safe_unpackstr_array(&env_sup, &env_cnt, buffer);
	} else {
		/* Unsupported version: previously this fell through and the
		 * validity tests below read uninitialized locals (UB) */
		error("_load_job_details: protocol_version %hu not supported",
		      protocol_version);
		goto unpack_error;
	}

	/* validity test as possible */
	if (contiguous > 1) {
		error("Invalid data for job %u: contiguous=%u",
		      job_ptr->job_id, contiguous);
		goto unpack_error;
	}
	if ((requeue > 1) || (overcommit > 1)) {
		error("Invalid data for job %u: requeue=%u overcommit=%u",
		      job_ptr->job_id, requeue, overcommit);
		goto unpack_error;
	}
	if (prolog_running > 1) {
		error("Invalid data for job %u: prolog_running=%u",
		      job_ptr->job_id, prolog_running);
		goto unpack_error;
	}

	/* free any left-over detail data */
	for (i=0; i<job_ptr->details->argc; i++)
		xfree(job_ptr->details->argv[i]);
	xfree(job_ptr->details->argv);
	xfree(job_ptr->details->cpu_bind);
	xfree(job_ptr->details->dependency);
	xfree(job_ptr->details->orig_dependency);
	xfree(job_ptr->details->std_err);
	for (i=0; i<job_ptr->details->env_cnt; i++)
		xfree(job_ptr->details->env_sup[i]);
	xfree(job_ptr->details->env_sup);
	xfree(job_ptr->details->exc_nodes);
	xfree(job_ptr->details->features);
	xfree(job_ptr->details->std_in);
	xfree(job_ptr->details->mem_bind);
	xfree(job_ptr->details->std_out);
	xfree(job_ptr->details->req_nodes);
	xfree(job_ptr->details->work_dir);
	xfree(job_ptr->details->ckpt_dir);
	xfree(job_ptr->details->restart_dir);

	/* now put the details into the job record; string ownership
	 * transfers from the locals to job_ptr->details */
	job_ptr->details->acctg_freq = acctg_freq;
	job_ptr->details->argc = argc;
	job_ptr->details->argv = argv;
	job_ptr->details->begin_time = begin_time;
	job_ptr->details->contiguous = contiguous;
	job_ptr->details->cpu_bind = cpu_bind;
	job_ptr->details->cpu_bind_type = cpu_bind_type;
	job_ptr->details->cpus_per_task = cpus_per_task;
	job_ptr->details->dependency = dependency;
	job_ptr->details->orig_dependency = orig_dependency;
	job_ptr->details->env_cnt = env_cnt;
	job_ptr->details->env_sup = env_sup;
	job_ptr->details->std_err = err;
	job_ptr->details->exc_nodes = exc_nodes;
	job_ptr->details->features = features;
	job_ptr->details->std_in = in;
	job_ptr->details->pn_min_cpus = pn_min_cpus;
	job_ptr->details->pn_min_memory = pn_min_memory;
	job_ptr->details->pn_min_tmp_disk = pn_min_tmp_disk;
	job_ptr->details->max_cpus = max_cpus;
	job_ptr->details->max_nodes = max_nodes;
	job_ptr->details->mc_ptr = mc_ptr;
	job_ptr->details->mem_bind = mem_bind;
	job_ptr->details->mem_bind_type = mem_bind_type;
	job_ptr->details->min_cpus = min_cpus;
	job_ptr->details->min_nodes = min_nodes;
	job_ptr->details->nice = nice;
	job_ptr->details->ntasks_per_node = ntasks_per_node;
	job_ptr->details->num_tasks = num_tasks;
	job_ptr->details->open_mode = open_mode;
	job_ptr->details->std_out = out;
	job_ptr->details->overcommit = overcommit;
	job_ptr->details->plane_size = plane_size;
	job_ptr->details->prolog_running = prolog_running;
	job_ptr->details->req_nodes = req_nodes;
	job_ptr->details->requeue = requeue;
	job_ptr->details->shared = shared;
	job_ptr->details->submit_time = submit_time;
	job_ptr->details->task_dist = task_dist;
	job_ptr->details->work_dir = work_dir;
	job_ptr->details->ckpt_dir = ckpt_dir;
	job_ptr->details->restart_dir = restart_dir;

	return SLURM_SUCCESS;

unpack_error:

/*	for (i=0; i<argc; i++)
	xfree(argv[i]);  Don't trust this on unpack error */
	xfree(argv);
	xfree(cpu_bind);
	xfree(dependency);
	xfree(orig_dependency);
/*	for (i=0; i<env_cnt; i++)
	xfree(env_sup[i]);  Don't trust this on unpack error */
	xfree(env_sup);
	xfree(err);
	xfree(exc_nodes);
	xfree(features);
	xfree(in);
	xfree(mem_bind);
	xfree(out);
	xfree(req_nodes);
	xfree(work_dir);
	xfree(ckpt_dir);
	xfree(restart_dir);
	xfree(mc_ptr);	/* previously leaked if a later unpack failed */
	return SLURM_FAILURE;
}
/* _add_job_hash - insert a job record at the head of its hash bucket.
 * The job_id field must already be set before calling.
 * IN job_ptr - pointer to job record
 * Globals: job_hash table updated
 */
void _add_job_hash(struct job_record *job_ptr)
{
	int bucket = JOB_HASH_INX(job_ptr->job_id);

	/* Push onto the front of the bucket's singly-linked chain */
	job_ptr->job_next = job_hash[bucket];
	job_hash[bucket] = job_ptr;
}
/*
 * find_job_record - return a pointer to the job record with the given job_id
 * IN job_id - requested job's id
 * RET pointer to the job's record, NULL on error
 * global: job_list - global job list pointer
 *	job_hash - hash table into job records
 */
struct job_record *find_job_record(uint32_t job_id)
{
	struct job_record *rec;

	/* Walk only the chain for this job_id's hash bucket */
	for (rec = job_hash[JOB_HASH_INX(job_id)]; rec; rec = rec->job_next) {
		if (rec->job_id == job_id)
			return rec;
	}
	return NULL;
}
/* Rebuild a job's comma-separated partition name string based upon the
 * contents of its part_ptr_list.
 * IN/OUT job_ptr - job whose "partition" string is rebuilt in place
 * For an active (running/suspended) job, the partition actually in use is
 * listed first; for a pending job, part_ptr is reset to the first valid
 * partition in the list.
 * Side effect: updates last_job_update time stamp. */
static void _rebuild_part_name_list(struct job_record *job_ptr)
{
	bool job_active = false, job_pending = false;
	struct part_record *part_ptr;
	ListIterator part_iterator;

	xfree(job_ptr->partition);

	if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) {
		job_active = true;
		/* NOTE: partition was already freed above (the original code
		 * had a redundant second xfree here); start the new list with
		 * the partition the job is actually running in */
		job_ptr->partition = xstrdup(job_ptr->part_ptr->name);
	} else if (IS_JOB_PENDING(job_ptr))
		job_pending = true;

	part_iterator = list_iterator_create(job_ptr->part_ptr_list);
	if (part_iterator == NULL)
		fatal("list_iterator_create malloc failure");
	while ((part_ptr = (struct part_record *) list_next(part_iterator))) {
		if (job_pending) {
			/* Reset job's one partition to a valid one */
			job_ptr->part_ptr = part_ptr;
			job_pending = false;
		}
		if (job_active && (part_ptr == job_ptr->part_ptr))
			continue;	/* already added */
		if (job_ptr->partition)
			xstrcat(job_ptr->partition, ",");
		xstrcat(job_ptr->partition, part_ptr->name);
	}
	list_iterator_destroy(part_iterator);
	last_job_update = time(NULL);
}
/*
 * kill_job_by_part_name - Given a partition name, deallocate resource for
 *	its jobs and kill them. All jobs associated with this partition
 *	will have their partition pointer cleared.
 * IN part_name - name of a partition
 * RET number of jobs associated with this partition
 * Side effects: may update accounting records, job states and
 *	last_job_update time stamp
 */
extern int kill_job_by_part_name(char *part_name)
{
	ListIterator job_iterator, part_iterator;
	struct job_record *job_ptr;
	struct part_record *part_ptr, *part2_ptr;
	int job_count = 0;
	time_t now = time(NULL);

	part_ptr = find_part_record (part_name);
	if (part_ptr == NULL)	/* No such partition */
		return 0;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		bool pending = false, suspended = false;

		pending = IS_JOB_PENDING(job_ptr);
		if (job_ptr->part_ptr_list) {
			/* Remove partition if candidate for a job */
			bool rebuild_name_list = false;
			part_iterator = list_iterator_create(job_ptr->
							     part_ptr_list);
			if (part_iterator == NULL)
				fatal("list_iterator_create malloc failure");
			while ((part2_ptr = (struct part_record *)
					list_next(part_iterator))) {
				if (part2_ptr != part_ptr)
					continue;
				list_remove(part_iterator);
				rebuild_name_list = true;
			}
			list_iterator_destroy(part_iterator);
			if (rebuild_name_list) {
				if (list_count(job_ptr->part_ptr_list) > 0) {
					/* Other candidate partitions remain:
					 * rebuild the name string and point
					 * part_ptr at the first survivor */
					_rebuild_part_name_list(job_ptr);
					job_ptr->part_ptr =
						list_peek(job_ptr->
							  part_ptr_list);
				} else {
					/* No other partitions remain */
					FREE_NULL_LIST(job_ptr->part_ptr_list);
				}
			}
		}
		if (job_ptr->part_ptr != part_ptr)
			continue;

		if (IS_JOB_SUSPENDED(job_ptr)) {
			enum job_states suspend_job_state = job_ptr->job_state;
			/* we can't have it as suspended when we call the
			 * accounting stuff.
			 */
			job_ptr->job_state = JOB_CANCELLED;
			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
			job_ptr->job_state = suspend_job_state;
			suspended = true;
		}
		if (IS_JOB_RUNNING(job_ptr) || suspended) {
			job_count++;
			info("Killing job_id %u on defunct partition %s",
			     job_ptr->job_id, part_name);
			job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
			build_cg_bitmap(job_ptr);
			job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
			job_ptr->state_reason = FAIL_DOWN_PARTITION;
			xfree(job_ptr->state_desc);
			if (suspended) {
				/* Charge run time only up to suspension */
				job_ptr->end_time = job_ptr->suspend_time;
				job_ptr->tot_sus_time +=
					difftime(now, job_ptr->suspend_time);
			} else
				job_ptr->end_time = now;
			if (!pending)
				deallocate_nodes(job_ptr, false, suspended,
						 false);
			job_completion_logger(job_ptr, false);
		} else if (pending) {
			job_count++;
			info("Killing job_id %u on defunct partition %s",
			     job_ptr->job_id, part_name);
			job_ptr->job_state = JOB_CANCELLED;
			job_ptr->start_time = now;
			job_ptr->end_time = now;
			job_ptr->exit_code = 1;
			job_completion_logger(job_ptr, false);
		}
		/* Sever the job's ties to the vanished partition */
		job_ptr->part_ptr = NULL;
		FREE_NULL_LIST(job_ptr->part_ptr_list);
	}
	list_iterator_destroy(job_iterator);

	if (job_count)
		last_job_update = now;
	return job_count;
}
/*
 * kill_job_by_front_end_name - Given a front end node name, deallocate
 *	resource for its jobs and kill them.
 * IN node_name - name of a front end node
 * RET number of jobs associated with this front end node
 * NOTE: Patterned after kill_running_job_by_node_name()
 * NOTE: No-op (returns 0) unless built with HAVE_FRONT_END
 */
extern int kill_job_by_front_end_name(char *node_name)
{
#ifdef HAVE_FRONT_END
	ListIterator job_iterator;
	struct job_record *job_ptr;
	struct node_record *node_ptr;
	time_t now = time(NULL);
	int i, job_count = 0;

	if (node_name == NULL)
		fatal("kill_job_by_front_end_name: node_name is NULL");

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		bool suspended = false;

		/* Only running, suspended or completing jobs are affected */
		if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr) &&
		    !IS_JOB_COMPLETING(job_ptr))
			continue;
		if ((job_ptr->batch_host == NULL) ||
		    strcmp(job_ptr->batch_host, node_name))
			continue;	/* no match on node name */

		if (IS_JOB_SUSPENDED(job_ptr)) {
			enum job_states suspend_job_state = job_ptr->job_state;
			/* we can't have it as suspended when we call the
			 * accounting stuff.
			 */
			job_ptr->job_state = JOB_CANCELLED;
			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
			job_ptr->job_state = suspend_job_state;
			suspended = true;
		}
		if (IS_JOB_COMPLETING(job_ptr)) {
			job_count++;
			/* Clear every node the job is still completing on */
			while ((i = bit_ffs(job_ptr->node_bitmap_cg)) >= 0) {
				bit_clear(job_ptr->node_bitmap_cg, i);
				job_update_cpu_cnt(job_ptr, i);
				if (job_ptr->node_cnt)
					(job_ptr->node_cnt)--;
				else {
					error("node_cnt underflow on JobId=%u",
					      job_ptr->job_id);
				}
				if (job_ptr->node_cnt == 0) {
					/* Last node done: completion over */
					job_ptr->job_state &= (~JOB_COMPLETING);
					delete_step_records(job_ptr);
					slurm_sched_schedule();
				}
				node_ptr = &node_record_table_ptr[i];
				if (node_ptr->comp_job_cnt)
					(node_ptr->comp_job_cnt)--;
				else {
					error("Node %s comp_job_cnt underflow, "
					      "JobId=%u",
					      node_ptr->name, job_ptr->job_id);
				}
			}
		} else if (IS_JOB_RUNNING(job_ptr) || suspended) {
			job_count++;
			if (job_ptr->batch_flag && job_ptr->details &&
			    (job_ptr->details->requeue > 0)) {
				/* Requeue the batch job instead of killing */
				char requeue_msg[128];
				srun_node_fail(job_ptr->job_id, node_name);
				info("requeue job %u due to failure of node %s",
				     job_ptr->job_id, node_name);
				_set_job_prio(job_ptr);
				snprintf(requeue_msg, sizeof(requeue_msg),
					 "Job requeued due to failure "
					 "of node %s",
					 node_name);
				slurm_sched_requeue(job_ptr, requeue_msg);
				job_ptr->time_last_active = now;
				if (suspended) {
					/* Charge run time to suspension */
					job_ptr->end_time =
						job_ptr->suspend_time;
					job_ptr->tot_sus_time +=
						difftime(now,
							 job_ptr->
							 suspend_time);
				} else
					job_ptr->end_time = now;
				/* We want this job to look like it
				 * was terminated in the accounting logs.
				 * Set a new submit time so the restarted
				 * job looks like a new job. */
				job_ptr->job_state = JOB_NODE_FAIL;
				build_cg_bitmap(job_ptr);
				deallocate_nodes(job_ptr, false, suspended,
						 false);
				job_completion_logger(job_ptr, true);
				job_ptr->db_index = 0;
				job_ptr->job_state = JOB_PENDING;
				if (job_ptr->node_cnt)
					job_ptr->job_state |= JOB_COMPLETING;
				job_ptr->details->submit_time = now;

				/* restart from periodic checkpoint */
				if (job_ptr->ckpt_interval &&
				    job_ptr->ckpt_time &&
				    job_ptr->details->ckpt_dir) {
					xfree(job_ptr->details->restart_dir);
					job_ptr->details->restart_dir =
						xstrdup (job_ptr->details->
							 ckpt_dir);
					xstrfmtcat(job_ptr->details->
						   restart_dir,
						   "/%u", job_ptr->job_id);
				}
				job_ptr->restart_cnt++;
				/* Since the job completion logger
				 * removes the submit we need to add it
				 * again. */
				acct_policy_add_job_submit(job_ptr);
			} else {
				/* Not requeue-able: fail the job outright */
				info("Killing job_id %u on failed node %s",
				     job_ptr->job_id, node_name);
				srun_node_fail(job_ptr->job_id, node_name);
				job_ptr->job_state = JOB_NODE_FAIL |
						     JOB_COMPLETING;
				build_cg_bitmap(job_ptr);
				job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
				job_ptr->state_reason = FAIL_DOWN_NODE;
				xfree(job_ptr->state_desc);
				if (suspended) {
					job_ptr->end_time =
						job_ptr->suspend_time;
					job_ptr->tot_sus_time +=
						difftime(now,
							 job_ptr->suspend_time);
				} else
					job_ptr->end_time = now;
				deallocate_nodes(job_ptr, false, suspended,
						 false);
				job_completion_logger(job_ptr, false);
			}
		}
	}
	list_iterator_destroy(job_iterator);

	if (job_count)
		last_job_update = now;
	return job_count;
#else
	return 0;
#endif
}
/*
 * partition_in_use - determine whether a partition is in use by a RUNNING
 *	PENDING or SUSPENDED job (i.e. any job not yet finished)
 * IN part_name - name of a partition
 * RET true if the partition is in use, else false
 */
extern bool partition_in_use(char *part_name)
{
	ListIterator job_iterator;
	struct job_record *job_ptr;
	struct part_record *part_ptr;
	bool in_use = false;

	part_ptr = find_part_record (part_name);
	if (!part_ptr)		/* No such partition */
		return false;

	job_iterator = list_iterator_create(job_list);
	if (!job_iterator)
		fatal("list_iterator_create: malloc failure");
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		/* Any unfinished job bound to this partition counts */
		if ((job_ptr->part_ptr == part_ptr) &&
		    !IS_JOB_FINISHED(job_ptr)) {
			in_use = true;
			break;
		}
	}
	list_iterator_destroy(job_iterator);
	return in_use;
}
/*
 * allocated_session_in_use - check if an interactive session is already running
 * IN new_alloc - allocation (alloc_node:alloc_sid) to test for
 * Returns true if an interactive session of the same node:sid already is in use
 * by a RUNNING, PENDING, or SUSPENDED job. Provides its own locking.
 */
extern bool allocated_session_in_use(job_desc_msg_t *new_alloc)
{
	ListIterator job_iter;
	struct job_record *job_ptr = NULL;
	/* Locks: Read job */
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };

	/* Only script-less (interactive) requests that name an allocating
	 * node can match an existing session */
	if ((new_alloc->script != NULL) || (new_alloc->alloc_node == NULL))
		return false;

	lock_slurmctld(job_read_lock);
	job_iter = list_iterator_create(job_list);
	if (job_iter == NULL)
		fatal("list_iterator_create: malloc failure");
	while ((job_ptr = (struct job_record *)list_next(job_iter))) {
		/* Skip batch jobs and jobs already done */
		if (job_ptr->batch_flag || IS_JOB_FINISHED(job_ptr))
			continue;
		/* Match on session id and allocating node name */
		if ((job_ptr->alloc_sid == new_alloc->alloc_sid) &&
		    job_ptr->alloc_node &&
		    !strcmp(job_ptr->alloc_node, new_alloc->alloc_node))
			break;
	}
	list_iterator_destroy(job_iter);
	unlock_slurmctld(job_read_lock);

	/* job_ptr is non-NULL iff the loop broke on a match */
	return (job_ptr != NULL);
}
/*
 * kill_running_job_by_node_name - Given a node name, deallocate RUNNING
 *	or COMPLETING jobs from the node or kill them
 * IN node_name - name of a node
 * RET number of killed jobs
 * Side effects: may requeue batch jobs, update accounting records and
 *	last_job_update time stamp
 */
extern int kill_running_job_by_node_name(char *node_name)
{
	ListIterator job_iterator;
	struct job_record *job_ptr;
	struct node_record *node_ptr;
	int bit_position;
	int job_count = 0;
	time_t now = time(NULL);

	node_ptr = find_node_record(node_name);
	if (node_ptr == NULL)	/* No such node */
		return 0;
	/* Node's index into the node table == its bitmap position */
	bit_position = node_ptr - node_record_table_ptr;

	job_iterator = list_iterator_create(job_list);
	if (job_iterator == NULL)
		fatal("list_iterator_create: malloc failure");
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		bool suspended = false;
		if ((job_ptr->node_bitmap == NULL) ||
		    (!bit_test(job_ptr->node_bitmap, bit_position)))
			continue;	/* job not on this node */
		if (IS_JOB_SUSPENDED(job_ptr)) {
			enum job_states suspend_job_state = job_ptr->job_state;
			/* we can't have it as suspended when we call the
			 * accounting stuff.
			 */
			job_ptr->job_state = JOB_CANCELLED;
			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
			job_ptr->job_state = suspend_job_state;
			suspended = true;
		}

		if (IS_JOB_COMPLETING(job_ptr)) {
			/* Finish this node's part of job completion */
			if (!bit_test(job_ptr->node_bitmap_cg, bit_position))
				continue;
			job_count++;
			bit_clear(job_ptr->node_bitmap_cg, bit_position);
			job_update_cpu_cnt(job_ptr, bit_position);
			if (job_ptr->node_cnt)
				(job_ptr->node_cnt)--;
			else {
				error("node_cnt underflow on JobId=%u",
				      job_ptr->job_id);
			}
			if (job_ptr->node_cnt == 0) {
				/* Last node done: completion finished */
				job_ptr->job_state &= (~JOB_COMPLETING);
				delete_step_records(job_ptr);
				slurm_sched_schedule();
			}
			if (node_ptr->comp_job_cnt)
				(node_ptr->comp_job_cnt)--;
			else {
				error("Node %s comp_job_cnt underflow, "
				      "JobId=%u",
				      node_ptr->name, job_ptr->job_id);
			}
		} else if (IS_JOB_RUNNING(job_ptr) || suspended) {
			job_count++;
			if ((job_ptr->details) &&
			    (job_ptr->kill_on_node_fail == 0) &&
			    (job_ptr->node_cnt > 1)) {
				/* keep job running on remaining nodes */
				srun_node_fail(job_ptr->job_id, node_name);
				error("Removing failed node %s from job_id %u",
				      node_name, job_ptr->job_id);
				job_pre_resize_acctg(job_ptr);
				kill_step_on_node(job_ptr, node_ptr, true);
				excise_node_from_job(job_ptr, node_ptr);
				job_post_resize_acctg(job_ptr);
			} else if (job_ptr->batch_flag && job_ptr->details &&
				   (job_ptr->details->requeue > 0)) {
				/* Requeue the batch job instead of killing */
				char requeue_msg[128];
				srun_node_fail(job_ptr->job_id, node_name);
				info("requeue job %u due to failure of node %s",
				     job_ptr->job_id, node_name);
				snprintf(requeue_msg, sizeof(requeue_msg),
					 "Job requeued due to failure "
					 "of node %s",
					 node_name);
				slurm_sched_requeue(job_ptr, requeue_msg);
				job_ptr->time_last_active = now;
				if (suspended) {
					/* Charge run time to suspension */
					job_ptr->end_time =
						job_ptr->suspend_time;
					job_ptr->tot_sus_time +=
						difftime(now,
							 job_ptr->
							 suspend_time);
				} else
					job_ptr->end_time = now;

				/* We want this job to look like it
				 * was terminated in the accounting logs.
				 * Set a new submit time so the restarted
				 * job looks like a new job. */
				job_ptr->job_state = JOB_NODE_FAIL;
				build_cg_bitmap(job_ptr);
				deallocate_nodes(job_ptr, false, suspended,
						 false);
				job_completion_logger(job_ptr, true);
				job_ptr->db_index = 0;
				job_ptr->job_state = JOB_PENDING;
				if (job_ptr->node_cnt)
					job_ptr->job_state |= JOB_COMPLETING;
				job_ptr->details->submit_time = now;

				/* restart from periodic checkpoint */
				if (job_ptr->ckpt_interval &&
				    job_ptr->ckpt_time &&
				    job_ptr->details->ckpt_dir) {
					xfree(job_ptr->details->restart_dir);
					job_ptr->details->restart_dir =
						xstrdup (job_ptr->details->
							 ckpt_dir);
					xstrfmtcat(job_ptr->details->
						   restart_dir,
						   "/%u", job_ptr->job_id);
				}
				job_ptr->restart_cnt++;
				/* Since the job completion logger
				 * removes the submit we need to add it
				 * again. */
				acct_policy_add_job_submit(job_ptr);
			} else {
				/* Not requeue-able: fail the job outright */
				info("Killing job_id %u on failed node %s",
				     job_ptr->job_id, node_name);
				srun_node_fail(job_ptr->job_id, node_name);
				job_ptr->job_state = JOB_NODE_FAIL |
						     JOB_COMPLETING;
				build_cg_bitmap(job_ptr);
				job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
				job_ptr->state_reason = FAIL_DOWN_NODE;
				xfree(job_ptr->state_desc);
				if (suspended) {
					job_ptr->end_time =
						job_ptr->suspend_time;
					job_ptr->tot_sus_time +=
						difftime(now,
							 job_ptr->suspend_time);
				} else
					job_ptr->end_time = now;
				deallocate_nodes(job_ptr, false, suspended,
						 false);
				job_completion_logger(job_ptr, false);
			}
		}

	}
	list_iterator_destroy(job_iterator);
	if (job_count)
		last_job_update = now;

	return job_count;
}
/* Remove one node from a job's allocation
 * IN/OUT job_ptr - job to shrink; node bitmap, node name list, node_addr
 *	array and node counts are all updated in place
 * IN node_ptr - node being removed (made idle via make_node_idle())
 * NOTE: job_ptr->job_resrcs is deliberately left unchanged (see comment in
 *	the loop below) */
extern void excise_node_from_job(struct job_record *job_ptr,
				 struct node_record *node_ptr)
{
	int i, orig_pos = -1, new_pos = -1;
	bitstr_t *orig_bitmap;
	job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;

	xassert(job_resrcs_ptr);
	xassert(job_resrcs_ptr->cpus);
	xassert(job_resrcs_ptr->cpus_used);

	/* Snapshot the allocation before make_node_idle() clears the bit */
	orig_bitmap = bit_copy(job_ptr->node_bitmap);
	if (!orig_bitmap)
		fatal("bit_copy memory allocation failure");
	make_node_idle(node_ptr, job_ptr);	/* updates bitmap */
	xfree(job_ptr->nodes);
	job_ptr->nodes = bitmap2node_name(job_ptr->node_bitmap);
	/* Compact node_addr[]: orig_pos indexes the old allocation,
	 * new_pos the new one; entries shift down past the excised node */
	for (i=bit_ffs(orig_bitmap); i<node_record_count; i++) {
		if (!bit_test(orig_bitmap,i))
			continue;
		orig_pos++;
		if (!bit_test(job_ptr->node_bitmap, i))
			continue;
		new_pos++;
		if (orig_pos == new_pos)
			continue;
		memcpy(&job_ptr->node_addr[new_pos],
		       &job_ptr->node_addr[orig_pos], sizeof(slurm_addr_t));
		/* NOTE: The job's allocation in the job_ptr->job_resrcs
		 * data structure is unchanged even after a node allocated
		 * to the job goes DOWN. */
	}
	job_ptr->total_nodes = job_ptr->node_cnt = new_pos + 1;
	FREE_NULL_BITMAP(orig_bitmap);

	(void) select_g_job_resized(job_ptr, node_ptr);
}
/*
 * dump_job_desc - dump the incoming job submit request message
 * IN job_specs - job specification from RPC
 * NOTE: output goes to the log at debug3 level; numeric fields that were
 *	left at NO_VAL by the submitter are reported as -1
 */
void dump_job_desc(job_desc_msg_t * job_specs)
{
	long job_id, time_min;
	long pn_min_cpus, pn_min_memory, pn_min_tmp_disk, min_cpus;
	long time_limit, priority, contiguous, acctg_freq;
	long kill_on_node_fail, shared, immediate, wait_all_nodes;
	long cpus_per_task, requeue, num_tasks, overcommit;
	long ntasks_per_node, ntasks_per_socket, ntasks_per_core;
	char *mem_type, buf[100];

	if (job_specs == NULL)
		return;

	job_id = (job_specs->job_id != NO_VAL) ?
		(long) job_specs->job_id : -1L;
	debug3("JobDesc: user_id=%u job_id=%ld partition=%s name=%s",
	       job_specs->user_id, job_id,
	       job_specs->partition, job_specs->name);

	min_cpus = (job_specs->min_cpus != NO_VAL) ?
		(long) job_specs->min_cpus : -1L;
	pn_min_cpus = (job_specs->pn_min_cpus != (uint16_t) NO_VAL) ?
		(long) job_specs->pn_min_cpus : -1L;
	debug3(" cpus=%ld-%u pn_min_cpus=%ld", min_cpus,
	       job_specs->max_cpus, pn_min_cpus);

	debug3(" -N min-[max]: %u-[%u]:%u:%u:%u",
	       job_specs->min_nodes, job_specs->max_nodes,
	       job_specs->sockets_per_node, job_specs->cores_per_socket,
	       job_specs->threads_per_core);

	/* pn_min_memory may carry the MEM_PER_CPU flag bit to indicate a
	 * per-CPU rather than per-node memory limit */
	if (job_specs->pn_min_memory == NO_VAL) {
		pn_min_memory = -1L;
		mem_type = "job";
	} else if (job_specs->pn_min_memory & MEM_PER_CPU) {
		pn_min_memory = (long) (job_specs->pn_min_memory &
					(~MEM_PER_CPU));
		mem_type = "cpu";
	} else {
		pn_min_memory = (long) job_specs->pn_min_memory;
		mem_type = "job";
	}
	pn_min_tmp_disk = (job_specs->pn_min_tmp_disk != NO_VAL) ?
		(long) job_specs->pn_min_tmp_disk : -1L;
	debug3(" pn_min_memory_%s=%ld pn_min_tmp_disk=%ld",
	       mem_type, pn_min_memory, pn_min_tmp_disk);
	immediate = (job_specs->immediate == 0) ? 0L : 1L;
	debug3(" immediate=%ld features=%s reservation=%s",
	       immediate, job_specs->features, job_specs->reservation);

	debug3(" req_nodes=%s exc_nodes=%s gres=%s",
	       job_specs->req_nodes, job_specs->exc_nodes, job_specs->gres);

	time_limit = (job_specs->time_limit != NO_VAL) ?
		(long) job_specs->time_limit : -1L;
	time_min = (job_specs->time_min != NO_VAL) ?
		(long) job_specs->time_min : time_limit;
	priority = (job_specs->priority != NO_VAL) ?
		(long) job_specs->priority : -1L;
	contiguous = (job_specs->contiguous != (uint16_t) NO_VAL) ?
		(long) job_specs->contiguous : -1L;
	shared = (job_specs->shared != (uint16_t) NO_VAL) ?
		(long) job_specs->shared : -1L;
	debug3(" time_limit=%ld-%ld priority=%ld contiguous=%ld shared=%ld",
	       time_min, time_limit, priority, contiguous, shared);

	kill_on_node_fail = (job_specs->kill_on_node_fail !=
			     (uint16_t) NO_VAL) ?
		(long) job_specs->kill_on_node_fail : -1L;
	if (job_specs->script)	/* log has problem with string len & null */
		debug3(" kill_on_node_fail=%ld script=%.40s...",
		       kill_on_node_fail, job_specs->script);
	else
		debug3(" kill_on_node_fail=%ld script=%s",
		       kill_on_node_fail, job_specs->script);

	/* Show at most the first three argv / environment / spank entries */
	if (job_specs->argc == 1)
		debug3(" argv=\"%s\"",
		       job_specs->argv[0]);
	else if (job_specs->argc == 2)
		debug3(" argv=%s,%s",
		       job_specs->argv[0],
		       job_specs->argv[1]);
	else if (job_specs->argc > 2)
		debug3(" argv=%s,%s,%s,...",
		       job_specs->argv[0],
		       job_specs->argv[1],
		       job_specs->argv[2]);

	if (job_specs->env_size == 1)
		debug3(" environment=\"%s\"",
		       job_specs->environment[0]);
	else if (job_specs->env_size == 2)
		debug3(" environment=%s,%s",
		       job_specs->environment[0],
		       job_specs->environment[1]);
	else if (job_specs->env_size > 2)
		debug3(" environment=%s,%s,%s,...",
		       job_specs->environment[0],
		       job_specs->environment[1],
		       job_specs->environment[2]);

	if (job_specs->spank_job_env_size == 1)
		debug3(" spank_job_env=\"%s\"",
		       job_specs->spank_job_env[0]);
	else if (job_specs->spank_job_env_size == 2)
		debug3(" spank_job_env=%s,%s",
		       job_specs->spank_job_env[0],
		       job_specs->spank_job_env[1]);
	else if (job_specs->spank_job_env_size > 2)
		debug3(" spank_job_env=%s,%s,%s,...",
		       job_specs->spank_job_env[0],
		       job_specs->spank_job_env[1],
		       job_specs->spank_job_env[2]);

	debug3(" stdin=%s stdout=%s stderr=%s",
	       job_specs->std_in, job_specs->std_out, job_specs->std_err);

	debug3(" work_dir=%s alloc_node:sid=%s:%u",
	       job_specs->work_dir,
	       job_specs->alloc_node, job_specs->alloc_sid);

	debug3(" resp_host=%s alloc_resp_port=%u other_port=%u",
	       job_specs->resp_host,
	       job_specs->alloc_resp_port, job_specs->other_port);
	debug3(" dependency=%s account=%s qos=%s comment=%s",
	       job_specs->dependency, job_specs->account,
	       job_specs->qos, job_specs->comment);

	num_tasks = (job_specs->num_tasks != (uint16_t) NO_VAL) ?
		(long) job_specs->num_tasks : -1L;
	overcommit = (job_specs->overcommit != (uint8_t) NO_VAL) ?
		(long) job_specs->overcommit : -1L;
	acctg_freq = (job_specs->acctg_freq != (uint16_t) NO_VAL) ?
		(long) job_specs->acctg_freq : -1L;
	debug3(" mail_type=%u mail_user=%s nice=%d num_tasks=%ld "
	       "open_mode=%u overcommit=%ld acctg_freq=%ld",
	       job_specs->mail_type, job_specs->mail_user,
	       (int)job_specs->nice - NICE_OFFSET, num_tasks,
	       job_specs->open_mode, overcommit, acctg_freq);

	slurm_make_time_str(&job_specs->begin_time, buf, sizeof(buf));
	cpus_per_task = (job_specs->cpus_per_task != (uint16_t) NO_VAL) ?
		(long) job_specs->cpus_per_task : -1L;
	requeue = (job_specs->requeue != (uint16_t) NO_VAL) ?
		(long) job_specs->requeue : -1L;
	debug3(" network=%s begin=%s cpus_per_task=%ld requeue=%ld "
	       "licenses=%s",
	       job_specs->network, buf, cpus_per_task, requeue,
	       job_specs->licenses);

	slurm_make_time_str(&job_specs->end_time, buf, sizeof(buf));
	wait_all_nodes = (job_specs->wait_all_nodes != (uint16_t) NO_VAL) ?
		(long) job_specs->wait_all_nodes : -1L;
	debug3(" end_time=%s signal=%u@%u wait_all_nodes=%ld",
	       buf, job_specs->warn_signal, job_specs->warn_time,
	       wait_all_nodes);

	ntasks_per_node = (job_specs->ntasks_per_node != (uint16_t) NO_VAL) ?
		(long) job_specs->ntasks_per_node : -1L;
	ntasks_per_socket = (job_specs->ntasks_per_socket !=
			     (uint16_t) NO_VAL) ?
		(long) job_specs->ntasks_per_socket : -1L;
	ntasks_per_core = (job_specs->ntasks_per_core != (uint16_t) NO_VAL) ?
		(long) job_specs->ntasks_per_core : -1L;
	debug3(" ntasks_per_node=%ld ntasks_per_socket=%ld "
	       "ntasks_per_core=%ld",
	       ntasks_per_node, ntasks_per_socket, ntasks_per_core);

	debug3(" cpus_bind=%u:%s mem_bind=%u:%s plane_size:%u",
	       job_specs->cpu_bind_type, job_specs->cpu_bind,
	       job_specs->mem_bind_type, job_specs->mem_bind,
	       job_specs->plane_size);

	/* Plugin-specific (e.g. BlueGene) job info, if any */
	select_g_select_jobinfo_sprint(job_specs->select_jobinfo,
				       buf, sizeof(buf), SELECT_PRINT_MIXED);
	if (buf[0] != '\0')
		debug3(" %s", buf);
}
/*
 * init_job_conf - initialize the job configuration tables and values.
 *	this should be called after creating node information, but
 *	before creating any job entries. Pre-existing job entries are
 *	left unchanged.
 * NOTE: The job hash table size does not change after initial creation.
 * RET 0 if no error, otherwise an error code
 * global: last_job_update - time of last job table update
 *	job_list - pointer to global job list
 */
int init_job_conf(void)
{
	/* Create the global job list only once; re-invocation is a no-op
	 * apart from refreshing the update time stamp */
	if (!job_list) {
		job_count = 0;
		job_list = list_create(_list_delete_job);
		if (!job_list)
			fatal ("Memory allocation failure");
	}

	last_job_update = time(NULL);
	return SLURM_SUCCESS;
}
/*
 * rehash_jobs - Create or rebuild the job hash table.
 * NOTE: run lock_slurmctld before entry: Read config, write job
 */
extern void rehash_jobs(void)
{
	if (job_hash) {
		/* If the MaxJobCount grows by too much, the hash table will
		 * be ineffective without rebuilding. We don't presently bother
		 * to rebuild the hash table, but cut MaxJobCount back as
		 * needed. */
		if (hash_table_size < (slurmctld_conf.max_job_cnt / 2)) {
			error ("MaxJobCount reset too high, restart slurmctld");
			slurmctld_conf.max_job_cnt = hash_table_size;
		}
		return;
	}

	/* First call: size the table from the configured MaxJobCount */
	hash_table_size = slurmctld_conf.max_job_cnt;
	job_hash = (struct job_record **)
		   xmalloc(hash_table_size * sizeof(struct job_record *));
}
/*
 * job_allocate - create job_records for the supplied job specification and
 *	allocate nodes for it.
 * IN job_specs - job specifications
 * IN immediate - if set then either initiate the job immediately or fail
 * IN will_run - don't initiate the job if set, just test if it could run
 *	now or later
 * OUT resp - will run response (includes start location, time, etc.)
 * IN allocate - resource allocation request if set, not a full job
 * IN submit_uid -uid of user issuing the request
 * OUT job_pptr - set to pointer to job record
 * RET 0 or an error code. If the job would only be able to execute with
 *	some change in partition configuration then
 *	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
 * globals: job_list - pointer to global job list
 *	list_part - global list of partition info
 *	default_part_loc - pointer to default partition
 * NOTE: lock_slurmctld on entry: Read config Write job, Write node, Read part
 */
extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
			int will_run, will_run_response_msg_t **resp,
			int allocate, uid_t submit_uid,
			struct job_record **job_pptr)
{
	/* Cached SchedulerParameters "defer" flag; -1 == not yet read */
	static int defer_sched = -1;
	int error_code;
	bool no_alloc, top_prio, test_only, too_fragmented, independent;
	struct job_record *job_ptr;
	error_code = _job_create(job_specs, allocate, will_run,
				 &job_ptr, submit_uid);
	*job_pptr = job_ptr;
	time_t now = time(NULL);
	if (error_code) {
		if (job_ptr && (immediate || will_run)) {
			/* this should never really happen here */
			job_ptr->job_state = JOB_FAILED;
			job_ptr->exit_code = 1;
			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
			xfree(job_ptr->state_desc);
			job_ptr->start_time = job_ptr->end_time = now;
			job_completion_logger(job_ptr, false);
		}
		return error_code;
	}
	xassert(job_ptr);

	/* Is the job free of unsatisfied dependencies / begin time? */
	independent = job_independent(job_ptr, will_run);
	/* priority needs to be calculated after this since we set a
	 * begin time in job_independent and that lets us know if the
	 * job is eligible.
	 */
	if (job_ptr->priority == NO_VAL)
		_set_job_prio(job_ptr);

	if (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)
		independent = false;

	/* Avoid resource fragmentation if important */
	if ((submit_uid || (job_specs->req_nodes == NULL)) &&
	    independent && job_is_completing())
		too_fragmented = true;	/* Don't pick nodes for job now */
	/* FIXME: Ideally we only want to refuse the request if the
	 * required node list is insufficient to satisfy the job's
	 * processor or node count requirements, but the overhead is
	 * rather high to do that right here.  We let requests from
	 * user root proceed if a node list is specified, for
	 * meta-schedulers (e.g. LCRM). */
	else
		too_fragmented = false;

	/* Read the "defer" scheduling option once and cache it */
	if (defer_sched == -1) {
		char *sched_params = slurm_get_sched_params();
		if (sched_params && strstr(sched_params, "defer"))
			defer_sched = 1;
		else
			defer_sched = 0;
		xfree(sched_params);
	}
	if (defer_sched == 1)
		too_fragmented = true;

	if (independent && (!too_fragmented))
		top_prio = _top_priority(job_ptr);
	else
		top_prio = true;	/* don't bother testing,
					 * it is not runable anyway */
	if (immediate && (too_fragmented || (!top_prio) || (!independent))) {
		/* Immediate-or-fail request that cannot start now: fail it */
		job_ptr->job_state  = JOB_FAILED;
		job_ptr->exit_code  = 1;
		job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
		xfree(job_ptr->state_desc);
		job_ptr->start_time = job_ptr->end_time = now;
		job_completion_logger(job_ptr, false);
		if (!independent)
			return ESLURM_DEPENDENCY;
		else if (too_fragmented)
			return ESLURM_FRAGMENTATION;
		else
			return ESLURM_NOT_TOP_PRIORITY;
	}

	if (will_run && resp) {
		/* Report expected start data, then discard the test job */
		job_desc_msg_t job_desc_msg;
		int rc;
		memset(&job_desc_msg, 0, sizeof(job_desc_msg_t));
		job_desc_msg.job_id = job_ptr->job_id;
		rc = job_start_data(&job_desc_msg, resp);
		job_ptr->job_state  = JOB_FAILED;
		job_ptr->exit_code  = 1;
		job_ptr->start_time = job_ptr->end_time = now;
		_purge_job_record(job_ptr->job_id);
		return rc;
	}

	test_only = will_run || (allocate == 0);

	no_alloc = test_only || too_fragmented ||
		   (!top_prio) || (!independent);

	if (!no_alloc && !avail_front_end()) {
		debug("sched: job_allocate() returning, no front end nodes "
		      "are available");
		error_code = ESLURM_NODES_BUSY;
	} else
		error_code = select_nodes(job_ptr, no_alloc, NULL);

	if (!test_only) {
		last_job_update = now;
		slurm_sched_schedule();	/* work for external scheduler */
	}

	acct_policy_add_job_submit(job_ptr);

	if ((error_code == ESLURM_NODES_BUSY) ||
	    (error_code == ESLURM_JOB_HELD) ||
	    (error_code == ESLURM_NODE_NOT_AVAIL) ||
	    (error_code == ESLURM_QOS_THRES) ||
	    (error_code == ESLURM_ACCOUNTING_POLICY) ||
	    (error_code == ESLURM_RESERVATION_NOT_USABLE) ||
	    (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) {
		/* Not fatal error, but job can't be scheduled right now */
		if (immediate) {
			job_ptr->job_state  = JOB_FAILED;
			job_ptr->exit_code  = 1;
			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
			xfree(job_ptr->state_desc);
			job_ptr->start_time = job_ptr->end_time = now;
			job_completion_logger(job_ptr, false);
		} else {	/* job remains queued */
			if ((error_code == ESLURM_NODES_BUSY) ||
			    (error_code == ESLURM_ACCOUNTING_POLICY)) {
				error_code = SLURM_SUCCESS;
			}
		}
		return error_code;
	}

	if (error_code) {	/* fundamental flaw in job request */
		job_ptr->job_state  = JOB_FAILED;
		job_ptr->exit_code  = 1;
		job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
		xfree(job_ptr->state_desc);
		job_ptr->start_time = job_ptr->end_time = now;
		job_completion_logger(job_ptr, false);
		return error_code;
	}

	if (will_run) {		/* job would run, flag job destruction */
		job_ptr->job_state  = JOB_FAILED;
		job_ptr->exit_code  = 1;
		job_ptr->start_time = job_ptr->end_time = now;
		_purge_job_record(job_ptr->job_id);
	} else if (!with_slurmdbd && !job_ptr->db_index)
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);

	if (!will_run) {
		debug2("sched: JobId=%u allocated resources: NodeList=%s",
		       job_ptr->job_id, job_ptr->nodes);
		rebuild_job_part_list(job_ptr);
	}

	return SLURM_SUCCESS;
}
/*
 * job_fail - terminate a job due to initiation failure
 * IN job_id - id of the job to be killed
 * RET 0 on success, otherwise ESLURM error code
 * Side effects: deallocates the job's nodes and logs its completion;
 *	updates last_job_update time stamp
 */
extern int job_fail(uint32_t job_id)
{
	struct job_record *job_ptr;
	time_t now = time(NULL);
	bool suspended = false;

	job_ptr = find_job_record(job_id);
	if (job_ptr == NULL) {
		error("job_fail: invalid job id %u", job_id);
		return ESLURM_INVALID_JOB_ID;
	}

	if (IS_JOB_FINISHED(job_ptr))
		return ESLURM_ALREADY_DONE;
	if (IS_JOB_SUSPENDED(job_ptr)) {
		enum job_states suspend_job_state = job_ptr->job_state;
		/* we can't have it as suspended when we call the
		 * accounting stuff.
		 */
		job_ptr->job_state = JOB_CANCELLED;
		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
		job_ptr->job_state = suspend_job_state;
		suspended = true;
	}

	if (IS_JOB_RUNNING(job_ptr) || suspended) {
		/* No need to signal steps, deallocate kills them */
		job_ptr->time_last_active       = now;
		if (suspended) {
			/* Charge run time only up to the suspension */
			job_ptr->end_time       = job_ptr->suspend_time;
			job_ptr->tot_sus_time  +=
				difftime(now, job_ptr->suspend_time);
		} else
			job_ptr->end_time       = now;
		last_job_update                 = now;
		job_ptr->job_state = JOB_FAILED | JOB_COMPLETING;
		build_cg_bitmap(job_ptr);
		job_ptr->exit_code = 1;
		job_ptr->state_reason = FAIL_LAUNCH;
		xfree(job_ptr->state_desc);
		deallocate_nodes(job_ptr, false, suspended, false);
		job_completion_logger(job_ptr, false);
		return SLURM_SUCCESS;
	}
	/* All other states */
	verbose("job_fail: job %u can't be killed from state=%s",
		job_id, job_state_string(job_ptr->job_state));
	return ESLURM_TRANSITION_STATE_NO_UPDATE;
}
/*
* job_signal - signal the specified job
* IN job_id - id of the job to be signaled
* IN signal - signal to send, SIGKILL == cancel the job
* IN batch_flag - signal batch shell only if set
* IN uid - uid of requesting user
* IN preempt - true if job being preempted
* RET 0 on success, otherwise ESLURM error code
*/
extern int job_signal(uint32_t job_id, uint16_t signal, uint16_t batch_flag,
		      uid_t uid, bool preempt)
{
	struct job_record *job_ptr;
	time_t now = time(NULL);
	uint16_t job_term_state;

	/* Jobs submitted using Moab command should be cancelled using
	 * Moab command for accurate job records */
	if (!wiki_sched_test) {
		/* Determine the scheduler type once and cache the result
		 * in the wiki_sched/wiki2_sched globals */
		char *sched_type = slurm_get_sched_type();
		if (strcmp(sched_type, "sched/wiki") == 0)
			wiki_sched = true;
		if (strcmp(sched_type, "sched/wiki2") == 0) {
			wiki_sched = true;
			wiki2_sched = true;
		}
		xfree(sched_type);
		wiki_sched_test = true;
	}

	job_ptr = find_job_record(job_id);
	if (job_ptr == NULL) {
		info("job_signal: invalid job id %u", job_id);
		return ESLURM_INVALID_JOB_ID;
	}

	/* Authorized requesters: the job's owner, an operator/admin, or a
	 * coordinator of the job's account */
	if ((job_ptr->user_id != uid) && !validate_operator(uid) &&
	    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
					  job_ptr->account)) {
		error("Security violation, JOB_CANCEL RPC from uid %d",
		      uid);
		return ESLURM_ACCESS_DENIED;
	}

	/* Under sched/wiki2, jobs in RootOnly partitions are Moab-managed;
	 * only privileged users may cancel them from Slurm directly */
	if (!validate_slurm_user(uid) && (signal == SIGKILL) &&
	    job_ptr->part_ptr &&
	    (job_ptr->part_ptr->flags & PART_FLAG_ROOT_ONLY) && wiki2_sched) {
		info("Attempt to cancel Moab job using Slurm command from "
		     "uid %d", uid);
		return ESLURM_ACCESS_DENIED;
	}

	if (IS_JOB_FINISHED(job_ptr))
		return ESLURM_ALREADY_DONE;

	/* let node select plugin do any state-dependent signalling actions */
	select_g_job_signal(job_ptr, signal);

	/* save user ID of the one who requested the job be cancelled */
	if (signal == SIGKILL)
		job_ptr->requid = uid;

	/* A job both PENDING and COMPLETING has been requeued and its prior
	 * allocation is still being cleaned up */
	if (IS_JOB_PENDING(job_ptr) && IS_JOB_COMPLETING(job_ptr) &&
	    (signal == SIGKILL)) {
		if ((job_ptr->job_state & JOB_STATE_BASE) == JOB_PENDING) {
			/* Prevent job requeue, otherwise preserve state */
			job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING;
		}
		/* build_cg_bitmap() not needed, job already completing */
		verbose("job_signal of requeuing job %u successful", job_id);
		return SLURM_SUCCESS;
	}

	/* Cancelling a pending job needs no node deallocation */
	if (IS_JOB_PENDING(job_ptr) && (signal == SIGKILL)) {
		last_job_update = now;
		job_ptr->job_state = JOB_CANCELLED;
		job_ptr->start_time = now;
		job_ptr->end_time = now;
		srun_allocate_abort(job_ptr);
		job_completion_logger(job_ptr, false);
		verbose("job_signal of pending job %u successful", job_id);
		return SLURM_SUCCESS;
	}

	/* Terminal state recorded depends upon whether this kill is a
	 * preemption or an ordinary cancellation */
	if (preempt)
		job_term_state = JOB_PREEMPTED;
	else
		job_term_state = JOB_CANCELLED;

	if (IS_JOB_SUSPENDED(job_ptr) && (signal == SIGKILL)) {
		/* End the job at its suspend point; suspended time is
		 * charged to tot_sus_time rather than run time */
		last_job_update = now;
		job_ptr->end_time = job_ptr->suspend_time;
		job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time);
		job_ptr->job_state = job_term_state | JOB_COMPLETING;
		build_cg_bitmap(job_ptr);
		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
		deallocate_nodes(job_ptr, false, true, preempt);
		job_completion_logger(job_ptr, false);
		verbose("job_signal %u of suspended job %u successful",
			signal, job_id);
		return SLURM_SUCCESS;
	}

	if (IS_JOB_RUNNING(job_ptr)) {
		if (signal == SIGKILL) {
			/* No need to signal steps, deallocate kills them */
			job_ptr->time_last_active = now;
			job_ptr->end_time = now;
			last_job_update = now;
			job_ptr->job_state = job_term_state | JOB_COMPLETING;
			build_cg_bitmap(job_ptr);
			deallocate_nodes(job_ptr, false, false, preempt);
			job_completion_logger(job_ptr, false);
		} else if (batch_flag) {
			/* Deliver the signal only to the batch script */
			if (job_ptr->batch_flag)
				_signal_batch_job(job_ptr, signal);
			else
				return ESLURM_JOB_SCRIPT_MISSING;
		} else {
			/* Deliver the signal to all of the job's steps */
			_signal_job(job_ptr, signal);
		}
		verbose("job_signal %u of running job %u successful",
			signal, job_id);
		return SLURM_SUCCESS;
	}

	verbose("job_signal: job %u can't be sent signal %u from state=%s",
		job_id, signal, job_state_string(job_ptr->job_state));
	return ESLURM_TRANSITION_STATE_NO_UPDATE;
}
/*
 * _signal_batch_job - queue a request to signal the batch script of a job,
 *	directed at the job's batch host only
 * IN job_ptr - pointer to the job record (batch_host must be set)
 * IN signal - signal number to deliver
 */
static void
_signal_batch_job(struct job_record *job_ptr, uint16_t signal)
{
	bitoff_t i;
	kill_tasks_msg_t *kill_tasks_msg = NULL;
	agent_arg_t *agent_args = NULL;

	xassert(job_ptr);
	xassert(job_ptr->batch_host);
	i = bit_ffs(job_ptr->node_bitmap);
	if (i < 0) {
		error("_signal_batch_job JobId=%u lacks assigned nodes",
		      job_ptr->job_id);
		return;
	}

	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type = REQUEST_SIGNAL_TASKS;
	agent_args->retry = 1;
	/* slurm/477 be sure to update node_count */
	agent_args->node_count = 1;
	agent_args->hostlist = hostlist_create(job_ptr->batch_host);

	kill_tasks_msg = xmalloc(sizeof(kill_tasks_msg_t));
	kill_tasks_msg->job_id = job_ptr->job_id;
	kill_tasks_msg->job_step_id = NO_VAL;	/* batch script, not a step */
	kill_tasks_msg->signal = signal;
	agent_args->msg_args = kill_tasks_msg;
	agent_queue_request(agent_args);
	return;
}
/*
* job_complete - note the normal termination the specified job
* IN job_id - id of the job which completed
* IN uid - user id of user issuing the RPC
* IN requeue - job should be run again if possible
 * IN node_fail - true if job terminated due to node failure
* IN job_return_code - job's return code, if set then set state to FAILED
* RET - 0 on success, otherwise ESLURM error code
* global: job_list - pointer global job list
* last_job_update - time of last job table update
*/
extern int job_complete(uint32_t job_id, uid_t uid, bool requeue,
			bool node_fail, uint32_t job_return_code)
{
	struct job_record *job_ptr;
	time_t now = time(NULL);
	uint32_t job_comp_flag = 0;
	bool suspended = false;

	info("completing job %u", job_id);
	job_ptr = find_job_record(job_id);
	if (job_ptr == NULL) {
		info("job_complete: invalid JobId=%u", job_id);
		return ESLURM_INVALID_JOB_ID;
	}
	if (IS_JOB_FINISHED(job_ptr))
		return ESLURM_ALREADY_DONE;

	/* Only the job's owner or a privileged (SlurmUser/root) user may
	 * complete the job */
	if ((job_ptr->user_id != uid) && !validate_slurm_user(uid)) {
		error("Security violation, JOB_COMPLETE RPC for job %u "
		      "from uid %u",
		      job_ptr->job_id, (unsigned int) uid);
		return ESLURM_USER_ID_MISSING;
	}
	if (IS_JOB_COMPLETING(job_ptr))
		return SLURM_SUCCESS;	/* avoid replay */

	if (IS_JOB_RUNNING(job_ptr))
		job_comp_flag = JOB_COMPLETING;
	else if (IS_JOB_PENDING(job_ptr))
		job_ptr->start_time = now;

	if ((job_return_code == NO_VAL) &&
	    (IS_JOB_RUNNING(job_ptr) || IS_JOB_PENDING(job_ptr))) {
		info("Job %u cancelled from srun", job_ptr->job_id);
	}

	if (IS_JOB_SUSPENDED(job_ptr)) {
		enum job_states suspend_job_state = job_ptr->job_state;
		/* we can't have it as suspended when we call the
		 * accounting stuff.
		 */
		job_ptr->job_state = JOB_CANCELLED;
		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
		job_ptr->job_state = suspend_job_state;
		job_comp_flag = JOB_COMPLETING;
		suspended = true;
	}

	/* batch_flag > 1 indicates the job was already requeued once */
	if (requeue && (job_ptr->batch_flag > 1)) {
		/* Failed one requeue, just kill it */
		requeue = 0;
		if (job_return_code == 0)
			job_return_code = 1;
		info("Batch job launch failure, JobId=%u", job_ptr->job_id);
	}

	if (requeue && job_ptr->details && job_ptr->batch_flag) {
		/* We want this job to look like it
		 * was terminated in the accounting logs.
		 * Set a new submit time so the restarted
		 * job looks like a new job. */
		job_ptr->end_time = now;
		job_ptr->job_state = JOB_NODE_FAIL;
		job_completion_logger(job_ptr, true);
		job_ptr->db_index = 0;
		/* Since this could happen on a launch we need to make
		 * sure the submit isn't the same as the last submit so
		 * put now + 1 so we get different records in the
		 * database */
		job_ptr->details->submit_time = now + 1;

		job_ptr->batch_flag++;	/* only one retry */
		job_ptr->restart_cnt++;
		job_ptr->job_state = JOB_PENDING | job_comp_flag;
		/* Since the job completion logger removes the job submit
		 * information, we need to add it again. */
		acct_policy_add_job_submit(job_ptr);
		info("Requeue JobId=%u due to node failure", job_ptr->job_id);
	} else if (IS_JOB_PENDING(job_ptr) && job_ptr->details &&
		   job_ptr->batch_flag) {
		/* Possible failure mode with DOWN node and job requeue.
		 * The DOWN node might actually respond to the cancel and
		 * take us here.  Don't run job_completion_logger
		 * here since this is here to catch duplicate cancels
		 * from slow responding slurmds */
		return SLURM_SUCCESS;
	} else {
		/* Select the terminal job state from how the job ended */
		if (node_fail) {
			job_ptr->job_state = JOB_NODE_FAIL | job_comp_flag;
			job_ptr->requid = uid;
		} else if (job_return_code == NO_VAL) {
			job_ptr->job_state = JOB_CANCELLED | job_comp_flag;
			job_ptr->requid = uid;
		} else if (WIFEXITED(job_return_code) &&
			   WEXITSTATUS(job_return_code)) {
			/* job exited with a non-zero status */
			job_ptr->job_state = JOB_FAILED | job_comp_flag;
			job_ptr->exit_code = job_return_code;
			job_ptr->state_reason = FAIL_EXIT_CODE;
			xfree(job_ptr->state_desc);
		} else if (job_comp_flag &&		/* job was running */
			   (job_ptr->end_time < now)) {	/* over time limit */
			job_ptr->job_state = JOB_TIMEOUT | job_comp_flag;
			job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
			job_ptr->state_reason = FAIL_TIMEOUT;
			xfree(job_ptr->state_desc);
		} else {
			job_ptr->job_state = JOB_COMPLETE | job_comp_flag;
			job_ptr->exit_code = job_return_code;
		}

		if (suspended) {
			/* End the job at its suspend point; suspended time
			 * is charged to tot_sus_time */
			job_ptr->end_time = job_ptr->suspend_time;
			job_ptr->tot_sus_time +=
				difftime(now, job_ptr->suspend_time);
		} else
			job_ptr->end_time = now;
		job_completion_logger(job_ptr, false);
	}

	last_job_update = now;
	if (job_comp_flag) {	/* job was running */
		build_cg_bitmap(job_ptr);
		deallocate_nodes(job_ptr, false, suspended, false);
	}
	info("sched: job_complete for JobId=%u successful", job_id);
	return SLURM_SUCCESS;
}
/*
 * _alt_part_test - if the given partition cannot accept new job submissions,
 *	walk its chain of alternate partitions looking for one that can
 * IN part_ptr - partition originally requested
 * OUT part_ptr_new - usable alternate partition, or NULL if the original
 *	partition itself is usable
 * RET SLURM_SUCCESS or an ESLURM error code
 */
static int _alt_part_test(struct part_record *part_ptr,
			  struct part_record **part_ptr_new)
{
	struct part_record *alt_ptr = NULL;
	char *alt_name;

	*part_ptr_new = NULL;
	if (part_ptr->state_up & PARTITION_SUBMIT)
		return SLURM_SUCCESS;	/* original partition is usable */

	info("_alt_part_test: original partition is not available "
	     "(drain or inactive): %s", part_ptr->name);
	for (alt_name = part_ptr->alternate; alt_name;
	     alt_name = alt_ptr->alternate) {
		alt_ptr = find_part_record(alt_name);
		if (alt_ptr == NULL) {
			info("_alt_part_test: invalid alternate "
			     "partition name specified: %s", alt_name);
			return ESLURM_INVALID_PARTITION_NAME;
		}
		/* A cycle back to the original means no usable alternate */
		if (alt_ptr == part_ptr) {
			info("_alt_part_test: no valid alternate "
			     "partition is available");
			return ESLURM_PARTITION_NOT_AVAIL;
		}
		if (alt_ptr->state_up & PARTITION_SUBMIT)
			break;	/* found a usable alternate */
	}
	if (alt_name == NULL) {
		info("_alt_part_test: no valid alternate partition is "
		     "available");
		return ESLURM_PARTITION_NOT_AVAIL;
	}
	*part_ptr_new = alt_ptr;
	return SLURM_SUCCESS;
}
/* Test if this job can use this partition */
/* Test if this job can use this partition.  The checks are ordered; the
 * first failing check determines the returned error code. */
static int _part_access_check(struct part_record *part_ptr,
			      job_desc_msg_t * job_desc, bitstr_t *req_bitmap,
			      uid_t submit_uid)
{
	uint32_t part_node_limit;

	/* RootOnly partitions accept requests submitted by user root only */
	if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && (submit_uid != 0)) {
		info("_part_access_check: uid %u access to partition %s "
		     "denied, not root",
		     (unsigned int) submit_uid, part_ptr->name);
		return ESLURM_ACCESS_DENIED;
	}

	/* Conversely, user root may be barred from the partition */
	if ((part_ptr->flags & PART_FLAG_NO_ROOT) &&
	    (job_desc->user_id == 0)) {
		error("_part_access_check: Security violation, SUBMIT_JOB for "
		      "user root disabled");
		return ESLURM_USER_ID_MISSING;
	}

	/* The submitting user must satisfy the partition's group list */
	if (validate_group(part_ptr, job_desc->user_id) == 0) {
		info("_part_access_check: uid %u access to partition %s "
		     "denied, bad group",
		     (unsigned int) job_desc->user_id, part_ptr->name);
		return ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP;
	}

	/* The node the allocation request came from must be permitted */
	if (validate_alloc_node(part_ptr, job_desc->alloc_node) == 0) {
		info("_part_access_check: uid %u access to partition %s "
		     "denied, bad allocating node: %s",
		     (unsigned int) job_desc->user_id, part_ptr->name,
		     job_desc->alloc_node);
		return ESLURM_ACCESS_DENIED;
	}

	/* CPU request must fit within the partition when it is schedulable */
	if ((part_ptr->state_up & PARTITION_SCHED) &&
	    (job_desc->min_cpus != NO_VAL) &&
	    (job_desc->min_cpus > part_ptr->total_cpus)) {
		info("_part_access_check: Job requested too many cpus (%u) of "
		     "partition %s(%u)",
		     job_desc->min_cpus, part_ptr->name,
		     part_ptr->total_cpus);
		return ESLURM_TOO_MANY_REQUESTED_CPUS;
	}

	/* Node count request must fit within the partition's node count,
	 * after any select-plugin scaling is applied */
	part_node_limit = part_ptr->total_nodes;
	select_g_alter_node_cnt(SELECT_APPLY_NODE_MAX_OFFSET,
				&part_node_limit);
	if ((part_ptr->state_up & PARTITION_SCHED) &&
	    (job_desc->min_nodes != NO_VAL) &&
	    (job_desc->min_nodes > part_node_limit)) {
		info("_part_access_check: Job requested too many nodes (%u) "
		     "of partition %s(%u)",
		     job_desc->min_nodes, part_ptr->name, part_node_limit);
		return ESLURM_INVALID_NODE_COUNT;
	}

	/* Explicitly requested nodes must all lie within the partition */
	if (req_bitmap && !bit_super_set(req_bitmap, part_ptr->node_bitmap)) {
		info("_part_access_check: requested nodes %s not in "
		     "partition %s", job_desc->req_nodes, part_ptr->name);
		return ESLURM_REQUESTED_NODES_NOT_IN_PARTITION;
	}

	return SLURM_SUCCESS;
}
/*
 * _valid_job_part - identify the partition(s) a job may use, substitute
 *	alternate partitions as needed, verify access, and apply the
 *	partitions' node-count and time limits to the request
 * IN/OUT job_desc - job request; partition name and node/time limits may
 *	be updated here
 * IN submit_uid - uid submitting the request
 * IN req_bitmap - bitmap of explicitly requested nodes, or NULL
 * OUT part_pptr - primary partition pointer
 * OUT part_pptr_list - list of partition pointers when more than one
 *	partition was named, otherwise NULL (ownership passes to caller)
 * RET SLURM_SUCCESS or an ESLURM error code
 */
static int _valid_job_part(job_desc_msg_t * job_desc,
			   uid_t submit_uid, bitstr_t *req_bitmap,
			   struct part_record **part_pptr,
			   List *part_pptr_list)
{
	int rc = SLURM_SUCCESS;
	bool rebuild_name_list = false;
	struct part_record *part_ptr, *part_ptr_tmp, *part_ptr_new;
	List part_ptr_list = NULL;
	ListIterator iter;
	uint32_t min_nodes_orig = INFINITE, max_nodes_orig = 1;
	uint32_t max_time = 0;

	/* Identify partition(s) and set pointer(s) to their struct */
	if (job_desc->partition) {
		part_ptr = find_part_record(job_desc->partition);
		if (part_ptr == NULL) {
			/* Not a single name; may be a comma-separated list */
			part_ptr_list = get_part_list(job_desc->partition);
			if (part_ptr_list)
				part_ptr = list_peek(part_ptr_list);
		}
		if (part_ptr == NULL) {
			info("_valid_job_part: invalid partition specified: %s",
			     job_desc->partition);
			return ESLURM_INVALID_PARTITION_NAME;
		}
	} else {
		if (default_part_loc == NULL) {
			error("_valid_job_part: default partition not set");
			return ESLURM_DEFAULT_PARTITION_NOT_SET;
		}
		part_ptr = default_part_loc;
		job_desc->partition = xstrdup(part_ptr->name);
	}

	/* Change partition pointer(s) to alternates as needed */
	if (part_ptr_list) {
		int fail_rc = SLURM_SUCCESS;
		iter = list_iterator_create(part_ptr_list);
		while ((part_ptr_tmp = (struct part_record *)list_next(iter))) {
			rc = _alt_part_test(part_ptr_tmp, &part_ptr_new);
			if (rc == SLURM_SUCCESS) {
				if (part_ptr_new)
					part_ptr_tmp = part_ptr_new;
				rc = _part_access_check(part_ptr_tmp, job_desc,
							req_bitmap, submit_uid);
			}
			if (rc != SLURM_SUCCESS) {
				/* Drop the unusable partition from the list,
				 * remembering the error in case none remain */
				fail_rc = rc;
				list_remove(iter);
				rebuild_name_list = true;
				continue;
			}
			if (part_ptr_new) {
				/* Replace the list element with its usable
				 * alternate partition */
				list_insert(iter, part_ptr_new);
				list_remove(iter);
				rebuild_name_list = true;
			}
			/* Track the loosest limits across all partitions */
			min_nodes_orig = MIN(min_nodes_orig,
					     part_ptr_tmp->min_nodes_orig);
			max_nodes_orig = MAX(max_nodes_orig,
					     part_ptr_tmp->max_nodes_orig);
			max_time = MAX(max_time, part_ptr_tmp->max_time);
		}
		list_iterator_destroy(iter);
		if (list_is_empty(part_ptr_list)) {
			if (fail_rc != SLURM_SUCCESS)
				rc = fail_rc;
			else
				rc = ESLURM_PARTITION_NOT_AVAIL;
			goto fini;
		}
		rc = SLURM_SUCCESS;	/* At least some partition usable */
	} else {
		rc = _alt_part_test(part_ptr, &part_ptr_new);
		if (rc != SLURM_SUCCESS)
			goto fini;
		if (part_ptr_new) {
			part_ptr = part_ptr_new;
			xfree(job_desc->partition);
			job_desc->partition = xstrdup(part_ptr->name);
		}
		min_nodes_orig = part_ptr->min_nodes_orig;
		max_nodes_orig = part_ptr->max_nodes_orig;
		max_time = part_ptr->max_time;
		rc = _part_access_check(part_ptr, job_desc, req_bitmap,
					submit_uid);
		if (rc != SLURM_SUCCESS)
			goto fini;
	}

	/* The partition list changed; rebuild job_desc->partition as a
	 * comma-separated name string and make part_ptr the first entry */
	if (rebuild_name_list) {
		part_ptr = NULL;
		xfree(job_desc->partition);
		iter = list_iterator_create(part_ptr_list);
		while ((part_ptr_tmp = (struct part_record *)list_next(iter))) {
			if (job_desc->partition)
				xstrcat(job_desc->partition, ",");
			else
				part_ptr = part_ptr_tmp;
			xstrcat(job_desc->partition, part_ptr_tmp->name);
		}
		list_iterator_destroy(iter);
	}

	/* Validate job limits against partition limits */
	if (job_desc->min_nodes == NO_VAL) {
		job_desc->min_nodes = min_nodes_orig;
	} else if ((job_desc->min_nodes > max_nodes_orig) &&
		   slurmctld_conf.enforce_part_limits) {
		info("_valid_job_part: job's min nodes greater than "
		     "partition's max nodes (%u > %u)",
		     job_desc->min_nodes, max_nodes_orig);
		rc = ESLURM_INVALID_NODE_COUNT;
		goto fini;
	} else if ((job_desc->min_nodes < min_nodes_orig) &&
		   ((job_desc->max_nodes == NO_VAL) ||
		    (job_desc->max_nodes >= min_nodes_orig))) {
		/* Raise the minimum to the partition's floor when the job's
		 * max still permits it */
		job_desc->min_nodes = min_nodes_orig;
	}

	if (job_desc->max_nodes == NO_VAL) {
#ifdef HAVE_BG
		job_desc->max_nodes = min_nodes_orig;
#else
		;
#endif
	} else if (slurmctld_conf.enforce_part_limits &&
		   job_desc->max_nodes &&
		   (job_desc->max_nodes < min_nodes_orig)) {
		info("_valid_job_part: job's max nodes less than partition's "
		     "min nodes (%u < %u)",
		     job_desc->max_nodes, min_nodes_orig);
		rc = ESLURM_INVALID_NODE_COUNT;
		goto fini;
	}

	/* Apply the partition's default time limit if none was requested */
	if ((job_desc->time_limit == NO_VAL) &&
	    (part_ptr->default_time != NO_VAL))
		job_desc->time_limit = part_ptr->default_time;

	if ((job_desc->time_min != NO_VAL) &&
	    (job_desc->time_min > max_time)) {
		info("_valid_job_part: job's min time greater than "
		     "partition's (%u > %u)",
		     job_desc->time_min, max_time);
		rc = ESLURM_INVALID_TIME_LIMIT;
		goto fini;
	}
	if ((job_desc->time_limit != NO_VAL) &&
	    (job_desc->time_limit > max_time) &&
	    (job_desc->time_min == NO_VAL) &&
	    slurmctld_conf.enforce_part_limits) {
		info("_valid_job_part: job's time limit greater than "
		     "partition's (%u > %u)",
		     job_desc->time_limit, max_time);
		rc = ESLURM_INVALID_TIME_LIMIT;
		goto fini;
	}
	if ((job_desc->time_min != NO_VAL) &&
	    (job_desc->time_min > job_desc->time_limit)) {
		info("_valid_job_part: job's min_time greater time limit "
		     "(%u > %u)",
		     job_desc->time_min, job_desc->time_limit);
		rc = ESLURM_INVALID_TIME_LIMIT;
		goto fini;
	}

	/* Success: transfer list ownership to the caller */
	*part_pptr = part_ptr;
	*part_pptr_list = part_ptr_list;
	part_ptr_list = NULL;
fini:
	FREE_NULL_LIST(part_ptr_list);
	return rc;
}
/*
* job_limits_check - check the limits specified for the job.
* IN job_ptr - pointer to job table entry.
* RET WAIT_NO_REASON on success, fail status otherwise.
*/
extern int job_limits_check(struct job_record **job_pptr)
{
	struct job_details *detail_ptr;
	enum job_state_reason fail_reason;
	struct part_record *part_ptr = NULL;
	struct job_record *job_ptr = NULL;
	slurmdb_qos_rec_t *qos_ptr;
	slurmdb_association_rec_t *assoc_ptr;

	job_ptr = *job_pptr;
	detail_ptr = job_ptr->details;
	part_ptr = job_ptr->part_ptr;
	qos_ptr = job_ptr->qos_ptr;
	assoc_ptr = job_ptr->assoc_ptr;
	fail_reason = WAIT_NO_REASON;

	/* A QOS with the matching flag set is allowed to exceed the
	 * corresponding partition limit */
	if ((detail_ptr->min_nodes > part_ptr->max_nodes) &&
	    (!qos_ptr || !(qos_ptr->flags & QOS_FLAG_PART_MAX_NODE))) {
		info("Job %u requested too many nodes (%u) of "
		     "partition %s(MaxNodes %u)",
		     job_ptr->job_id, detail_ptr->min_nodes,
		     part_ptr->name, part_ptr->max_nodes);
		fail_reason = WAIT_PART_NODE_LIMIT;
	} else if ((detail_ptr->max_nodes != 0) &&  /* no max_nodes for job */
		   ((detail_ptr->max_nodes < part_ptr->min_nodes) &&
		    (!qos_ptr ||
		     !(qos_ptr->flags & QOS_FLAG_PART_MIN_NODE)))) {
		info("Job %u requested too few nodes (%u) of "
		     "partition %s(MinNodes %u)",
		     job_ptr->job_id, detail_ptr->max_nodes,
		     part_ptr->name, part_ptr->min_nodes);
		fail_reason = WAIT_PART_NODE_LIMIT;
	} else if (part_ptr->state_up == PARTITION_DOWN) {
		info("Job %u requested down partition %s",
		     job_ptr->job_id, part_ptr->name);
		fail_reason = WAIT_PART_DOWN;
	} else if (part_ptr->state_up == PARTITION_INACTIVE) {
		info("Job %u requested inactive partition %s",
		     job_ptr->job_id, part_ptr->name);
		fail_reason = WAIT_PART_INACTIVE;
	} else if ((job_ptr->time_limit != NO_VAL) &&
		   ((job_ptr->time_limit > part_ptr->max_time) &&
		    (!qos_ptr ||
		     !(qos_ptr->flags & QOS_FLAG_PART_TIME_LIMIT)))) {
		info("Job %u exceeds partition time limit", job_ptr->job_id);
		fail_reason = WAIT_PART_TIME_LIMIT;
	} else if (qos_ptr && assoc_ptr &&
		   (qos_ptr->flags & QOS_FLAG_ENFORCE_USAGE_THRES) &&
		   (!fuzzy_equal(qos_ptr->usage_thres, NO_VAL))) {
		/* Compare the job's fair-share factor against the QOS
		 * usage threshold, computing the factor on first use */
		if (!job_ptr->prio_factors)
			job_ptr->prio_factors =
				xmalloc(sizeof(priority_factors_object_t));
		if (!job_ptr->prio_factors->priority_fs) {
			if (fuzzy_equal(assoc_ptr->usage->usage_efctv, NO_VAL))
				priority_g_set_assoc_usage(assoc_ptr);
			job_ptr->prio_factors->priority_fs =
				priority_g_calc_fs_factor(
					assoc_ptr->usage->usage_efctv,
					(long double)assoc_ptr->usage->
					shares_norm);
		}
		if (job_ptr->prio_factors->priority_fs < qos_ptr->usage_thres) {
			info("Job %u exceeds usage threshold", job_ptr->job_id);
			fail_reason = WAIT_QOS_THRES;
		}
	}

	return (fail_reason);
}
/*
* _job_create - create a job table record for the supplied specifications.
* This performs only basic tests for request validity (access to
* partition, nodes count in partition, and sufficient processors in
* partition).
* IN job_specs - job specifications
* IN allocate - resource allocation request if set rather than job submit
* IN will_run - job is not to be created, test of validity only
* OUT job_pptr - pointer to the job (NULL on error)
* RET 0 on success, otherwise ESLURM error code. If the job would only be
* able to execute with some change in partition configuration then
* ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
*/
static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
		       struct job_record **job_pptr, uid_t submit_uid)
{
	int error_code = SLURM_SUCCESS, i, qos_error;
	enum job_state_reason fail_reason;
	struct part_record *part_ptr = NULL;
	List part_ptr_list = NULL;
	bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL;
	struct job_record *job_ptr = NULL;
	slurmdb_association_rec_t assoc_rec, *assoc_ptr;
	List license_list = NULL;
	bool valid;
	slurmdb_qos_rec_t qos_rec, *qos_ptr;
	uint32_t user_submit_priority;
	uint16_t limit_set_max_cpus = 0;
	uint16_t limit_set_max_nodes = 0;
	uint16_t limit_set_min_cpus = 0;
	uint16_t limit_set_min_nodes = 0;
	uint16_t limit_set_time = 0;
	static uint32_t node_scaling = 1;
	static uint32_t cpus_per_mp = 1;

#ifdef HAVE_BG
	uint16_t geo[SYSTEM_DIMENSIONS];
	uint16_t reboot;
	uint16_t rotate;
	uint16_t conn_type[SYSTEM_DIMENSIONS];
	static bool sub_mp_system = 0;

	/* Determine BlueGene scaling factors once */
	if (node_scaling == 1) {
		select_g_alter_node_cnt(SELECT_GET_NODE_SCALING,
					&node_scaling);
		select_g_alter_node_cnt(SELECT_GET_MP_CPU_CNT,
					&cpus_per_mp);
		if (node_scaling < 512)
			sub_mp_system = 1;
	}
#endif

	*job_pptr = (struct job_record *) NULL;

	/*
	 * Check user permission for negative 'nice' and non-0 priority values
	 * (both restricted to SlurmUser) before running the job_submit plugin.
	 */
	if ((submit_uid != 0) && (submit_uid != slurmctld_conf.slurm_user_id)) {
		if (job_desc->priority != 0)
			job_desc->priority = NO_VAL;
		if (job_desc->nice < NICE_OFFSET)
			job_desc->nice = NICE_OFFSET;
	}
	user_submit_priority = job_desc->priority;

	error_code = job_submit_plugin_submit(job_desc, (uint32_t) submit_uid);
	if (error_code != SLURM_SUCCESS)
		return error_code;

	/* ensure that selected nodes are in this partition */
	if (job_desc->req_nodes) {
		error_code = node_name2bitmap(job_desc->req_nodes, false,
					      &req_bitmap);
		if (error_code) {
			error_code = ESLURM_INVALID_NODE_NAME;
			goto cleanup_fail;
		}
		if (job_desc->contiguous)
			bit_fill_gaps(req_bitmap);
		/* Raise the node/cpu minimums to cover the required hosts */
		i = bit_set_count(req_bitmap);
		if (i > job_desc->min_nodes)
			job_desc->min_nodes = i * node_scaling;
		if (i > job_desc->min_cpus)
			job_desc->min_cpus = i * cpus_per_mp;
		if (job_desc->max_nodes &&
		    (job_desc->min_nodes > job_desc->max_nodes)) {
#if 0
			info("_job_create: max node count less than required "
			     "hostlist size for user %u", job_desc->user_id);
			job_desc->max_nodes = job_desc->min_nodes;
#else
			error_code = ESLURM_INVALID_NODE_COUNT;
			goto cleanup_fail;
#endif
		}
	}

	error_code = _valid_job_part(job_desc, submit_uid, req_bitmap,
				     &part_ptr, &part_ptr_list);
	if (error_code != SLURM_SUCCESS)
		goto cleanup_fail;

	/* Make sure anything that may be put in the database will be
	 * lower case */
	xstrtolower(job_desc->account);
	xstrtolower(job_desc->wckey);

	/* NOTE: was "error_code = error_code;" no-op here; removed */
	if ((error_code = _validate_job_desc(job_desc, allocate, submit_uid,
					     part_ptr)))
		goto cleanup_fail;

	/* Locate the job's association (user + account + partition) */
	memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t));
	assoc_rec.uid = job_desc->user_id;
	assoc_rec.partition = part_ptr->name;
	assoc_rec.acct = job_desc->account;
	if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
				    accounting_enforce, &assoc_ptr)) {
		info("_job_create: invalid account or partition for user %u, "
		     "account '%s', and partition '%s'",
		     job_desc->user_id, assoc_rec.acct, assoc_rec.partition);
		error_code = ESLURM_INVALID_ACCOUNT;
		goto cleanup_fail;
	} else if (association_based_accounting &&
		   !assoc_ptr &&
		   !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) {
		/* If not enforcing associations we want to look for the
		 * default account and use it to avoid getting trash in the
		 * accounting records. */
		assoc_rec.acct = NULL;
		assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
					accounting_enforce, &assoc_ptr);
		if (assoc_ptr) {
			info("_job_create: account '%s' has no association "
			     "for user %u using default account '%s'",
			     job_desc->account, job_desc->user_id,
			     assoc_rec.acct);
			xfree(job_desc->account);
		}
	}
	if (job_desc->account == NULL)
		job_desc->account = xstrdup(assoc_rec.acct);

	/* This must be done after we have the assoc_ptr set */
	memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t));
	qos_rec.name = job_desc->qos;
	if (wiki_sched && job_desc->comment &&
	    strstr(job_desc->comment, "QOS:")) {
		/* Moab communicates preemption roles via the comment field */
		if (strstr(job_desc->comment, "FLAGS:PREEMPTOR"))
			qos_rec.name = "expedite";
		else if (strstr(job_desc->comment, "FLAGS:PREEMPTEE"))
			qos_rec.name = "standby";
	}

	qos_ptr = _determine_and_validate_qos(assoc_ptr, &qos_rec, &qos_error);
	if (qos_error != SLURM_SUCCESS) {
		error_code = qos_error;
		goto cleanup_fail;
	}

	if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) &&
	    (!acct_policy_validate(job_desc, part_ptr,
				   assoc_ptr, qos_ptr,
				   &limit_set_max_cpus,
				   &limit_set_max_nodes,
				   &limit_set_time, 0))) {
		info("_job_create: exceeded association's node or time limit "
		     "for user %u", job_desc->user_id);
		error_code = ESLURM_ACCOUNTING_POLICY;
		goto cleanup_fail;
	}

#ifdef HAVE_BG
	/* This needs to be done after the association acct policy check since
	 * it looks at unaltered nodes for bluegene systems
	 */
	debug3("before alteration asking for nodes %u-%u cpus %u-%u",
	       job_desc->min_nodes, job_desc->max_nodes,
	       job_desc->min_cpus, job_desc->max_cpus);
	select_g_alter_node_cnt(SELECT_SET_NODE_CNT, job_desc);
	debug3("after alteration asking for nodes %u-%u cpus %u-%u",
	       job_desc->min_nodes, job_desc->max_nodes,
	       job_desc->min_cpus, job_desc->max_cpus);
#endif

	if (job_desc->exc_nodes) {
		error_code = node_name2bitmap(job_desc->exc_nodes, false,
					      &exc_bitmap);
		if (error_code) {
			error_code = ESLURM_INVALID_NODE_NAME;
			goto cleanup_fail;
		}
	}
	/* The required and excluded node sets must be disjoint */
	if (exc_bitmap && req_bitmap) {
		bitstr_t *tmp_bitmap = NULL;
		bitoff_t first_set;
		tmp_bitmap = bit_copy(exc_bitmap);
		if (tmp_bitmap == NULL)
			fatal("bit_copy malloc failure");
		bit_and(tmp_bitmap, req_bitmap);
		first_set = bit_ffs(tmp_bitmap);
		FREE_NULL_BITMAP(tmp_bitmap);
		if (first_set != -1) {
			info("Job's required and excluded node lists overlap");
			error_code = ESLURM_INVALID_NODE_NAME;
			goto cleanup_fail;
		}
	}

	if (job_desc->min_nodes == NO_VAL)
		job_desc->min_nodes = 1;

#ifdef HAVE_BG
	/* Validate/default the BlueGene geometry, reboot, rotate and
	 * connection type options */
	select_g_select_jobinfo_get(job_desc->select_jobinfo,
				    SELECT_JOBDATA_GEOMETRY, &geo);
	if (geo[0] == (uint16_t) NO_VAL) {
		for (i=0; i<SYSTEM_DIMENSIONS; i++)
			geo[i] = 0;
		select_g_select_jobinfo_set(job_desc->select_jobinfo,
					    SELECT_JOBDATA_GEOMETRY, &geo);
	} else if (geo[0] != 0) {
		uint32_t i, tot = 1;
		for (i=0; i<SYSTEM_DIMENSIONS; i++)
			tot *= geo[i];
		if (job_desc->min_nodes > tot) {
			info("MinNodes(%d) > GeometryNodes(%d)",
			     job_desc->min_nodes, tot);
			error_code = ESLURM_TOO_MANY_REQUESTED_CPUS;
			goto cleanup_fail;
		}
		job_desc->min_nodes = tot;
	}
	select_g_select_jobinfo_get(job_desc->select_jobinfo,
				    SELECT_JOBDATA_REBOOT, &reboot);
	if (reboot == (uint16_t) NO_VAL) {
		reboot = 0;	/* default is no reboot */
		select_g_select_jobinfo_set(job_desc->select_jobinfo,
					    SELECT_JOBDATA_REBOOT, &reboot);
	}
	select_g_select_jobinfo_get(job_desc->select_jobinfo,
				    SELECT_JOBDATA_ROTATE, &rotate);
	if (rotate == (uint16_t) NO_VAL) {
		rotate = 1;	/* default is to rotate */
		select_g_select_jobinfo_set(job_desc->select_jobinfo,
					    SELECT_JOBDATA_ROTATE, &rotate);
	}
	select_g_select_jobinfo_get(job_desc->select_jobinfo,
				    SELECT_JOBDATA_CONN_TYPE, &conn_type);

	if (conn_type[0] == (uint16_t) NO_VAL) {
		conn_type[0] = (uint16_t) SELECT_NAV;
		select_g_select_jobinfo_set(job_desc->select_jobinfo,
					    SELECT_JOBDATA_CONN_TYPE,
					    &conn_type);
	} else if (((conn_type[0] >= SELECT_SMALL)
		    && ((job_desc->min_cpus >= cpus_per_mp) && !sub_mp_system))
		   || (!sub_mp_system
		       && ((conn_type[0] == SELECT_TORUS)
			   || (conn_type[0] == SELECT_MESH))
		       && (job_desc->min_cpus < cpus_per_mp))) {
		/* check to make sure we have a valid conn_type with
		 * the cpu count */
		info("Job's cpu count at %u makes our conn_type "
		     "of '%s' invalid.",
		     job_desc->min_cpus, conn_type_string(conn_type[0]));
		error_code = ESLURM_INVALID_NODE_COUNT;
		goto cleanup_fail;
	} else if ((conn_type[0] == SELECT_TORUS)
		   || (conn_type[0] == SELECT_MESH)) {
		/* Propagate the connection type across all dimensions */
		int dim;
		for (dim=1; dim<SYSTEM_DIMENSIONS; dim++)
			conn_type[dim] = conn_type[0];
		select_g_select_jobinfo_set(job_desc->select_jobinfo,
					    SELECT_JOBDATA_CONN_TYPE,
					    &conn_type);
	}
#endif

	if (job_desc->max_nodes == NO_VAL)
		job_desc->max_nodes = 0;
	if (job_desc->max_nodes &&
	    (job_desc->max_nodes < job_desc->min_nodes)) {
		info("_job_create: Job's max_nodes(%u) < min_nodes(%u)",
		     job_desc->max_nodes, job_desc->min_nodes);
		error_code = ESLURM_INVALID_NODE_COUNT;
		goto cleanup_fail;
	}

	license_list = license_validate(job_desc->licenses, &valid);
	if (!valid) {
		info("Job's requested licenses are invalid: %s",
		     job_desc->licenses);
		error_code = ESLURM_INVALID_LICENSES;
		goto cleanup_fail;
	}

	/* No job record exists yet, so plain cleanup (no purge) suffices */
	if ((error_code = _validate_job_create_req(job_desc)))
		goto cleanup;

	if ((error_code = _copy_job_desc_to_job_record(job_desc,
						       job_pptr,
						       &req_bitmap,
						       &exc_bitmap))) {
		if (error_code == SLURM_ERROR)
			error_code = ESLURM_ERROR_ON_DESC_TO_RECORD_COPY;
		goto cleanup_fail;
	}

	job_ptr = *job_pptr;
	job_ptr->part_ptr = part_ptr;
	job_ptr->part_ptr_list = part_ptr_list;
	part_ptr_list = NULL;	/* ownership moved to job record */
	if ((error_code = checkpoint_alloc_jobinfo(&(job_ptr->check_job)))) {
		error("Failed to allocate checkpoint info for job");
		goto cleanup_fail;
	}

	job_ptr->limit_set_max_cpus = limit_set_max_cpus;
	job_ptr->limit_set_max_nodes = limit_set_max_nodes;
	job_ptr->limit_set_min_cpus = limit_set_min_cpus;
	job_ptr->limit_set_min_nodes = limit_set_min_nodes;
	job_ptr->limit_set_time = limit_set_time;

	job_ptr->assoc_id = assoc_rec.id;
	job_ptr->assoc_ptr = (void *) assoc_ptr;
	job_ptr->qos_ptr = (void *) qos_ptr;
	job_ptr->qos_id = qos_rec.id;

	/*
	 * Permission for altering priority was confirmed above. The job_submit
	 * plugin may have set the priority directly or put the job on hold. If
	 * the priority is not given, we will figure it out later after we see
	 * if the job is eligible or not. So we want NO_VAL if not set.
	 */
	job_ptr->priority = job_desc->priority;
	if (job_ptr->priority == 0) {
		if (user_submit_priority == 0)
			job_ptr->state_reason = WAIT_HELD_USER;
		else
			job_ptr->state_reason = WAIT_HELD;
	} else if (job_ptr->priority != NO_VAL) {
		job_ptr->direct_set_prio = true;
	}

	error_code = update_job_dependency(job_ptr, job_desc->dependency);
	if (error_code != SLURM_SUCCESS)
		goto cleanup_fail;
	job_ptr->details->orig_dependency = xstrdup(job_ptr->details->
						    dependency);

	if (build_feature_list(job_ptr)) {
		error_code = ESLURM_INVALID_FEATURE;
		goto cleanup_fail;
	}

	/* NOTE: If this job is being used to expand another job, this job's
	 * gres_list has already been filled in with a copy of gres_list job
	 * to be expanded by update_job_dependency() */
	if ((job_ptr->details->expanding_jobid == 0) &&
	    gres_plugin_job_state_validate(job_ptr->gres, &job_ptr->gres_list)){
		error_code = ESLURM_INVALID_GRES;
		goto cleanup_fail;
	}
	gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id);

	if ((error_code = validate_job_resv(job_ptr)))
		goto cleanup_fail;

	if (job_desc->script
	    && (!will_run)) {	/* don't bother with copy if just a test */
		if ((error_code = _copy_job_desc_to_file(job_desc,
							 job_ptr->job_id))) {
			error_code = ESLURM_WRITING_TO_FILE;
			goto cleanup_fail;
		}
		job_ptr->batch_flag = 1;
	} else
		job_ptr->batch_flag = 0;

	job_ptr->license_list = license_list;
	license_list = NULL;	/* ownership moved to job record */

	if (job_desc->req_switch != NO_VAL) {	/* Max # of switches */
		job_ptr->req_switch = job_desc->req_switch;
		if (job_desc->wait4switch != NO_VAL) {
			job_ptr->wait4switch =
				_max_switch_wait(job_desc->wait4switch);
		} else
			job_ptr->wait4switch = _max_switch_wait(INFINITE);
	}
	job_ptr->best_switch = true;

	/* Ensure that requested partition is valid right now,
	 * otherwise leave job queued and provide warning code */
	fail_reason = job_limits_check(&job_ptr);
	if (fail_reason != WAIT_NO_REASON) {
		if (fail_reason == WAIT_QOS_THRES)
			error_code = ESLURM_QOS_THRES;
		else
			error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
		job_ptr->priority = 1;	/* Move to end of queue */
		job_ptr->state_reason = fail_reason;
		xfree(job_ptr->state_desc);
	}

cleanup:
	FREE_NULL_LIST(license_list);
	/* Non-NULL only if never transferred to the job record (fixes a
	 * leak when _validate_job_create_req() failed) */
	FREE_NULL_LIST(part_ptr_list);
	FREE_NULL_BITMAP(req_bitmap);
	FREE_NULL_BITMAP(exc_bitmap);
	return error_code;

cleanup_fail:
	if (job_ptr) {
		job_ptr->job_state = JOB_FAILED;
		job_ptr->exit_code = 1;
		job_ptr->state_reason = FAIL_SYSTEM;
		xfree(job_ptr->state_desc);
		job_ptr->start_time = job_ptr->end_time = time(NULL);
		_purge_job_record(job_ptr->job_id);
	}
	FREE_NULL_LIST(license_list);
	FREE_NULL_LIST(part_ptr_list);
	FREE_NULL_BITMAP(req_bitmap);
	FREE_NULL_BITMAP(exc_bitmap);
	return error_code;
}
/* Verify that a string does not exceed a length limit.
 * IN test_str - string to check, may be NULL (treated as length 0)
 * IN str_name - field name to report in the log message
 * IN max_str_len - maximum permitted length
 * RET SLURM_SUCCESS or ESLURM_PATHNAME_TOO_LONG */
static int _test_strlen(char *test_str, char *str_name, int max_str_len)
{
	int len = test_str ? strlen(test_str) : 0;

	if (len <= max_str_len)
		return SLURM_SUCCESS;

	info("job_create_request: strlen(%s) too big (%d > %d)",
	     str_name, len, max_str_len);
	return ESLURM_PATHNAME_TOO_LONG;
}
/* Perform some size checks on strings we store to prevent
 * malicious user filling slurmctld's memory
 * IN job_desc - job request to validate
 * RET 0 or error code */
static int _validate_job_create_req(job_desc_msg_t * job_desc)
{
	/* Table of string fields and their maximum permitted lengths;
	 * checked in the same order as the fields were previously tested */
	struct {
		char *str;
		char *name;
		int limit;
	} check[] = {
		{ job_desc->account,      "account",      1024 },
		{ job_desc->alloc_node,   "alloc_node",   1024 },
		{ job_desc->blrtsimage,   "blrtsimage",   1024 },
		{ job_desc->ckpt_dir,     "ckpt_dir",     1024 },
		{ job_desc->comment,      "comment",      1024 },
		{ job_desc->cpu_bind,     "cpu_bind",     1024 },
		{ job_desc->dependency,   "dependency",   1024 },
		{ job_desc->exc_nodes,    "exc_nodes",    1024 * 64 },
		{ job_desc->features,     "features",     1024 },
		{ job_desc->gres,         "gres",         1024 },
		{ job_desc->licenses,     "licenses",     1024 },
		{ job_desc->linuximage,   "linuximage",   1024 },
		{ job_desc->mail_user,    "mail_user",    1024 },
		{ job_desc->mem_bind,     "mem_bind",     1024 },
		{ job_desc->mloaderimage, "mloaderimage", 1024 },
		{ job_desc->name,         "name",         1024 },
		{ job_desc->network,      "network",      1024 },
		{ job_desc->partition,    "partition",    1024 },
		{ job_desc->qos,          "qos",          1024 },
		{ job_desc->ramdiskimage, "ramdiskimage", 1024 },
		{ job_desc->req_nodes,    "req_nodes",    1024 * 64 },
		{ job_desc->reservation,  "reservation",  1024 },
		{ job_desc->script,       "script",       1024 * 1024 * 4 },
		{ job_desc->std_err,      "std_err",      1024 },
		{ job_desc->std_in,       "std_in",       1024 },
		{ job_desc->std_out,      "std_out",      1024 },
		{ job_desc->wckey,        "wckey",        1024 },
		{ job_desc->work_dir,     "work_dir",     1024 }
	};
	int i;

	for (i = 0; i < (int) (sizeof(check) / sizeof(check[0])); i++) {
		if (_test_strlen(check[i].str, check[i].name, check[i].limit))
			return ESLURM_PATHNAME_TOO_LONG;
	}
	return SLURM_SUCCESS;
}
/* _copy_job_desc_to_file - copy the job script and environment from the RPC
 *	structure into a file
 * IN job_desc - job request containing environment and script
 * IN job_id - ID used to name the job-specific state directory
 * RET 0 on success or ESLURM_WRITING_TO_FILE */
static int
_copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id)
{
	char job_dir[20];
	char *dir_name, *path;
	int rc = 0;
	DEF_TIMERS;

	START_TIMER;
	/* Build <StateSaveLocation>/job.<id> and create the directory */
	dir_name = slurm_get_state_save_location();
	sprintf(job_dir, "/job.%u", job_id);
	xstrcat(dir_name, job_dir);
	if (mkdir(dir_name, 0700)) {
		error("mkdir(%s) error %m", dir_name);
		xfree(dir_name);
		return ESLURM_WRITING_TO_FILE;
	}

	/* Write the environment first, then the script (only if the
	 * environment write succeeded) */
	path = xstrdup(dir_name);
	xstrcat(path, "/environment");
	rc = _write_data_array_to_file(path, job_desc->environment,
				       job_desc->env_size);
	xfree(path);

	if (rc == 0) {
		path = xstrdup(dir_name);
		xstrcat(path, "/script");
		rc = _write_data_to_file(path, job_desc->script);
		xfree(path);
	}

	xfree(dir_name);
	END_TIMER2("_copy_job_desc_to_file");
	return rc;
}
/*
 * Create file with specified name and write the supplied data array to it
 * IN file_name - file to create and write to
 * IN data - array of pointers to NUL-terminated strings (e.g. env), or NULL
 * IN size - number of elements in data
 * RET SLURM_SUCCESS or ESLURM_WRITING_TO_FILE
 * NOTE: File layout is a uint32_t record count followed by the strings
 *	back-to-back; must stay readable by _read_data_array_from_file()
 */
static int
_write_data_array_to_file(char *file_name, char **data, uint32_t size)
{
	int fd, pos, nwrite, amount;
	uint32_t i;

	fd = creat(file_name, 0600);
	if (fd < 0) {
		error("Error creating file %s, %m", file_name);
		return ESLURM_WRITING_TO_FILE;
	}

	amount = write(fd, &size, sizeof(uint32_t));
	/* Cast sizeof to int: with an unsigned right operand a -1 error
	 * return would be promoted to a huge value and evade detection */
	if (amount < (int) sizeof(uint32_t)) {
		error("Error writing file %s, %m", file_name);
		close(fd);
		return ESLURM_WRITING_TO_FILE;
	}

	if (data == NULL) {
		/* Must still close fd; previously leaked on this path */
		close(fd);
		return SLURM_SUCCESS;
	}

	for (i = 0; i < size; i++) {
		nwrite = strlen(data[i]) + 1;	/* include NUL terminator */
		pos = 0;
		while (nwrite > 0) {
			amount = write(fd, &data[i][pos], nwrite);
			if (amount < 0) {
				if (errno == EINTR)
					continue; /* retry; do not apply -1
						   * to pos/nwrite */
				error("Error writing file %s, %m",
				      file_name);
				close(fd);
				return ESLURM_WRITING_TO_FILE;
			}
			nwrite -= amount;
			pos += amount;
		}
	}

	close(fd);
	return SLURM_SUCCESS;
}
/*
 * Create file with specified name and write the supplied string to it
 * IN file_name - file to create and write to; unlinked if data is NULL
 * IN data - pointer to NUL-terminated string, or NULL
 * RET SLURM_SUCCESS or ESLURM_WRITING_TO_FILE
 */
static int _write_data_to_file(char *file_name, char *data)
{
	int fd, pos, nwrite, amount;

	if (data == NULL) {
		(void) unlink(file_name);
		return SLURM_SUCCESS;
	}

	/* 0700: the script file must be executable */
	fd = creat(file_name, 0700);
	if (fd < 0) {
		error("Error creating file %s, %m", file_name);
		return ESLURM_WRITING_TO_FILE;
	}

	nwrite = strlen(data) + 1;	/* include NUL terminator */
	pos = 0;
	while (nwrite > 0) {
		amount = write(fd, &data[pos], nwrite);
		if (amount < 0) {
			if (errno == EINTR)
				continue;	/* retry; previously the -1
						 * was applied to pos/nwrite,
						 * corrupting the offset */
			error("Error writing file %s, %m", file_name);
			close(fd);
			return ESLURM_WRITING_TO_FILE;
		}
		nwrite -= amount;
		pos += amount;
	}
	close(fd);
	return SLURM_SUCCESS;
}
/*
 * get_job_env - return the environment variables and their count for a
 *	given job
 * IN job_ptr - pointer to job for which data is required
 * OUT env_size - number of elements to read
 * RET point to array of string pointers containing environment variables,
 *	NULL on error; caller must xfree
 */
char **get_job_env(struct job_record *job_ptr, uint32_t * env_size)
{
	char job_dir[30], *file_name, **environment = NULL;

	file_name = slurm_get_state_save_location();
	/* job_id is a uint32_t: use "%u" ("%d" would misformat IDs
	 * above INT_MAX) */
	sprintf(job_dir, "/job.%u/environment", job_ptr->job_id);
	xstrcat(file_name, job_dir);
	_read_data_array_from_file(file_name, &environment, env_size, job_ptr);

	xfree(file_name);
	return environment;
}
/*
 * get_job_script - return the script for a given job
 * IN job_ptr - pointer to job for which data is required
 * RET point to string containing job script, NULL for non-batch jobs
 *	or on error; caller must xfree
 */
char *get_job_script(struct job_record *job_ptr)
{
	char *script = NULL;

	/* Only batch jobs have a script file on disk */
	if (job_ptr->batch_flag) {
		char *file_name = slurm_get_state_save_location();
		char job_dir[30];
		/* job_id is a uint32_t: use "%u" ("%d" would misformat
		 * IDs above INT_MAX) */
		sprintf(job_dir, "/job.%u/script", job_ptr->job_id);
		xstrcat(file_name, job_dir);
		_read_data_from_file(file_name, &script);
		xfree(file_name);
	}
	return script;
}
/*
 * Read a collection of strings from a file
 * IN file_name - file to read from
 * OUT data - pointer to array of pointers to strings (e.g. env),
 *	must be xfreed when no longer needed
 * OUT size - number of elements in data
 * IN job_ptr - job whose supplemental environment (env_sup) is merged in
 * NOTE: The output format of this must be identical with _xduparray2()
 * NOTE: The returned pointers all reference one xmalloc'd buffer; freeing
 *	the first element releases the lot
 */
static void
_read_data_array_from_file(char *file_name, char ***data, uint32_t * size,
			   struct job_record *job_ptr)
{
	int fd, pos, buf_size, amount, i, j;
	char *buffer, **array_ptr;
	uint32_t rec_cnt;

	xassert(file_name);
	xassert(data);
	xassert(size);
	*data = NULL;
	*size = 0;

	fd = open(file_name, 0);
	if (fd < 0) {
		error("Error opening file %s, %m", file_name);
		return;
	}

	amount = read(fd, &rec_cnt, sizeof(uint32_t));
	/* Cast sizeof to int: a -1 error return would otherwise be
	 * promoted to unsigned and evade this check */
	if (amount < (int) sizeof(uint32_t)) {
		if (amount != 0)	/* incomplete write */
			error("Error reading file %s, %m", file_name);
		else
			verbose("File %s has zero size", file_name);
		close(fd);
		return;
	}

	if (rec_cnt == 0) {
		*data = NULL;
		*size = 0;
		close(fd);	/* previously leaked the file descriptor */
		return;
	}

	/* Slurp the remainder of the file, growing the buffer as needed */
	pos = 0;
	buf_size = BUF_SIZE;
	buffer = xmalloc(buf_size);
	while (1) {
		amount = read(fd, &buffer[pos], BUF_SIZE);
		if (amount < 0) {
			error("Error reading file %s, %m", file_name);
			xfree(buffer);
			close(fd);
			return;
		}
		pos += amount;
		if (amount < BUF_SIZE)	/* end of file */
			break;
		buf_size += amount;
		xrealloc(buffer, buf_size);
	}
	close(fd);

	/* Allocate extra space for supplemental environment variables
	 * as set by Moab */
	if (job_ptr->details->env_cnt) {
		for (j = 0; j < job_ptr->details->env_cnt; j++)
			pos += (strlen(job_ptr->details->env_sup[j]) + 1);
		xrealloc(buffer, pos);
	}

	/* We have all the data, now let's compute the pointers */
	array_ptr = xmalloc(sizeof(char *) *
			    (rec_cnt + job_ptr->details->env_cnt));
	for (i = 0, pos = 0; i < rec_cnt; i++) {
		array_ptr[i] = &buffer[pos];
		pos += strlen(&buffer[pos]) + 1;
		if ((pos > buf_size) && ((i + 1) < rec_cnt)) {
			error("Bad environment file %s", file_name);
			rec_cnt = i;	/* truncate to the valid records */
			break;
		}
	}

	/* Add supplemental environment variables for Moab */
	if (job_ptr->details->env_cnt) {
		char *tmp_chr;
		int env_len, name_len;
		for (j = 0; j < job_ptr->details->env_cnt; j++) {
			tmp_chr = strchr(job_ptr->details->env_sup[j], '=');
			if (tmp_chr == NULL) {
				error("Invalid supplemental environment "
				      "variable: %s",
				      job_ptr->details->env_sup[j]);
				continue;
			}
			env_len  = strlen(job_ptr->details->env_sup[j]) + 1;
			/* name_len includes the '=' so the prefix match
			 * below cannot hit a longer variable name */
			name_len = tmp_chr - job_ptr->details->env_sup[j] + 1;
			/* search for duplicate */
			for (i = 0; i < rec_cnt; i++) {
				if (strncmp(array_ptr[i],
					    job_ptr->details->env_sup[j],
					    name_len)) {
					continue;
				}
				/* over-write duplicate */
				memcpy(&buffer[pos],
				       job_ptr->details->env_sup[j], env_len);
				array_ptr[i] = &buffer[pos];
				pos += env_len;
				break;
			}
			if (i >= rec_cnt) {	/* add env to array end */
				memcpy(&buffer[pos],
				       job_ptr->details->env_sup[j], env_len);
				array_ptr[rec_cnt++] = &buffer[pos];
				pos += env_len;
			}
		}
	}

	*size = rec_cnt;
	*data = array_ptr;
	return;
}
/*
 * Read a string from a file
 * IN file_name - file to read from
 * OUT data - pointer to string, set to NULL on error;
 *	must be xfreed when no longer needed
 * NOTE: The file is expected to contain a trailing NUL (written by
 *	_write_data_to_file), so the returned buffer is NUL-terminated
 */
void _read_data_from_file(char *file_name, char **data)
{
	int fd, pos, buf_size, amount;
	char *buffer;

	xassert(file_name);
	xassert(data);
	*data = NULL;

	fd = open(file_name, 0);
	if (fd < 0) {
		error("Error opening file %s, %m", file_name);
		return;
	}

	/* Read the whole file, growing the buffer by BUF_SIZE each time
	 * a read fills the available space */
	pos = 0;
	buf_size = BUF_SIZE;
	buffer = xmalloc(buf_size);
	while (1) {
		amount = read(fd, &buffer[pos], BUF_SIZE);
		if (amount < 0) {
			error("Error reading file %s, %m", file_name);
			xfree(buffer);
			close(fd);
			return;
		}
		/* A short read means end of file; pos is not advanced on
		 * the final iteration because it is not used afterwards */
		if (amount < BUF_SIZE)	/* end of file */
			break;
		pos += amount;
		buf_size += amount;
		xrealloc(buffer, buf_size);
	}

	*data = buffer;
	close(fd);
	return;
}
/* Given a job request, return a multi_core_data struct.
 * Returns NULL if no multi-core values were set in the job/step request.
 * Caller must xfree the returned structure. */
static multi_core_data_t *
_set_multi_core_data(job_desc_msg_t * job_desc)
{
	uint16_t unset = (uint16_t) NO_VAL;
	multi_core_data_t *mc;

	/* Nothing to record if the user specified no multi-core options */
	if ((job_desc->sockets_per_node  == unset) &&
	    (job_desc->cores_per_socket  == unset) &&
	    (job_desc->threads_per_core  == unset) &&
	    (job_desc->ntasks_per_socket == unset) &&
	    (job_desc->ntasks_per_core   == unset) &&
	    (job_desc->plane_size        == unset))
		return NULL;

	mc = xmalloc(sizeof(multi_core_data_t));
	mc->sockets_per_node = job_desc->sockets_per_node;
	mc->cores_per_socket = job_desc->cores_per_socket;
	mc->threads_per_core = job_desc->threads_per_core;

	if (job_desc->ntasks_per_socket != unset)
		mc->ntasks_per_socket = job_desc->ntasks_per_socket;
	else
		mc->ntasks_per_socket = (uint16_t) INFINITE;

	if (job_desc->ntasks_per_core != unset)
		mc->ntasks_per_core = job_desc->ntasks_per_core;
	else if (slurmctld_conf.select_type_param & CR_ONE_TASK_PER_CORE)
		mc->ntasks_per_core = 1;
	else
		mc->ntasks_per_core = (uint16_t) INFINITE;

	if (job_desc->plane_size != unset)
		mc->plane_size = job_desc->plane_size;
	else
		mc->plane_size = 0;

	return mc;
}
/* _copy_job_desc_to_job_record - copy the job descriptor from the RPC
 *	structure into the actual slurmctld job record
 * IN job_desc - RPC job request; several fields (wckey, spank_job_env,
 *	argv) may be modified/stolen to avoid copies
 * OUT job_rec_ptr - set to the newly created job record on success
 * IN/OUT req_bitmap - required node bitmap; ownership transferred to the
 *	job record (set to NULL here) when req_nodes was specified
 * IN/OUT exc_bitmap - excluded node bitmap; ownership transferred to the
 *	job record (set to NULL here) when exc_nodes was specified
 * RET SLURM_SUCCESS or an ESLURM error code */
static int
_copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
			     struct job_record **job_rec_ptr,
			     bitstr_t ** req_bitmap,
			     bitstr_t ** exc_bitmap)
{
	int error_code;
	struct job_details *detail_ptr;
	struct job_record *job_ptr;

	/* Resolve/validate the workload characterization key before
	 * creating any job record */
	if (slurm_get_track_wckey()) {
		if (!job_desc->wckey) {
			/* get the default wckey for this user since none was
			 * given */
			slurmdb_user_rec_t user_rec;
			memset(&user_rec, 0, sizeof(slurmdb_user_rec_t));
			user_rec.uid = job_desc->user_id;
			assoc_mgr_fill_in_user(acct_db_conn, &user_rec,
					       accounting_enforce, NULL);
			if (user_rec.default_wckey)
				job_desc->wckey = xstrdup_printf(
					"*%s", user_rec.default_wckey);
			else if (!(accounting_enforce &
				   ACCOUNTING_ENFORCE_WCKEYS))
				job_desc->wckey = xstrdup("*");
			else {
				error("Job didn't specify wckey and user "
				      "%d has no default.", job_desc->user_id);
				return ESLURM_INVALID_WCKEY;
			}
		} else if (job_desc->wckey) {
			slurmdb_wckey_rec_t wckey_rec, *wckey_ptr = NULL;
			memset(&wckey_rec, 0, sizeof(slurmdb_wckey_rec_t));
			wckey_rec.uid  = job_desc->user_id;
			wckey_rec.name = job_desc->wckey;
			if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
						    accounting_enforce,
						    &wckey_ptr)) {
				/* Unknown wckey is fatal only when
				 * enforcement is configured */
				if (accounting_enforce &
				    ACCOUNTING_ENFORCE_WCKEYS) {
					error("_copy_job_desc_to_job_record: "
					      "invalid wckey '%s' for user %u.",
					      wckey_rec.name,
					      job_desc->user_id);
					return ESLURM_INVALID_WCKEY;
				}
			}
		} else if (accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS) {
			/* This should never happen (the two branches above
			 * cover both wckey set and unset) */
			info("_copy_job_desc_to_job_record: no wckey was given "
			     "for job submit.");
			return ESLURM_INVALID_WCKEY;
		}
	}

	job_ptr = create_job_record(&error_code);
	if (error_code)
		return error_code;

	job_ptr->partition = xstrdup(job_desc->partition);

	if (job_desc->job_id != NO_VAL)		/* already confirmed unique */
		job_ptr->job_id = job_desc->job_id;
	else
		_set_job_id(job_ptr);

	if (job_desc->name)
		job_ptr->name = xstrdup(job_desc->name);
	if (job_desc->wckey)
		job_ptr->wckey = xstrdup(job_desc->wckey);

	/* Hash insertion must follow job_id assignment */
	_add_job_hash(job_ptr);

	job_ptr->user_id    = (uid_t) job_desc->user_id;
	job_ptr->group_id   = (gid_t) job_desc->group_id;
	job_ptr->job_state  = JOB_PENDING;
	job_ptr->time_limit = job_desc->time_limit;
	if (job_desc->time_min != NO_VAL)
		job_ptr->time_min = job_desc->time_min;
	job_ptr->alloc_sid  = job_desc->alloc_sid;
	job_ptr->alloc_node = xstrdup(job_desc->alloc_node);
	job_ptr->account    = xstrdup(job_desc->account);
	job_ptr->gres       = xstrdup(job_desc->gres);
	job_ptr->network    = xstrdup(job_desc->network);
	job_ptr->resv_name  = xstrdup(job_desc->reservation);
	job_ptr->comment    = xstrdup(job_desc->comment);

	/* Determine (once) whether a wiki scheduler is configured; the
	 * result is cached in the wiki*_sched globals */
	if (!wiki_sched_test) {
		char *sched_type = slurm_get_sched_type();
		if (strcmp(sched_type, "sched/wiki") == 0)
			wiki_sched  = true;
		if (strcmp(sched_type, "sched/wiki2") == 0) {
			wiki_sched  = true;
			wiki2_sched = true;
		}
		xfree(sched_type);
		wiki_sched_test = true;
	}

	if (job_desc->kill_on_node_fail != (uint16_t) NO_VAL)
		job_ptr->kill_on_node_fail = job_desc->kill_on_node_fail;

	job_ptr->resp_host = xstrdup(job_desc->resp_host);
	job_ptr->alloc_resp_port = job_desc->alloc_resp_port;
	job_ptr->other_port = job_desc->other_port;
	job_ptr->time_last_active = time(NULL);
	job_ptr->cr_enabled = 0;
	job_ptr->derived_ec = 0;

	job_ptr->licenses  = xstrdup(job_desc->licenses);
	job_ptr->mail_type = job_desc->mail_type;
	job_ptr->mail_user = xstrdup(job_desc->mail_user);

	job_ptr->ckpt_interval = job_desc->ckpt_interval;

	/* Steal (not copy) the spank environment from the request */
	job_ptr->spank_job_env = job_desc->spank_job_env;
	job_ptr->spank_job_env_size = job_desc->spank_job_env_size;
	job_desc->spank_job_env = (char **) NULL; /* nothing left to free */
	job_desc->spank_job_env_size = 0;         /* nothing left to free */

	if (job_desc->wait_all_nodes == (uint16_t) NO_VAL)
		job_ptr->wait_all_nodes = DEFAULT_WAIT_ALL_NODES;
	else
		job_ptr->wait_all_nodes = job_desc->wait_all_nodes;
	job_ptr->warn_signal = job_desc->warn_signal;
	job_ptr->warn_time = job_desc->warn_time;

	detail_ptr = job_ptr->details;
	detail_ptr->argc = job_desc->argc;
	/* Steal (not copy) the argument vector from the request */
	detail_ptr->argv = job_desc->argv;
	job_desc->argv   = (char **) NULL; /* nothing left to free */
	job_desc->argc   = 0;		   /* nothing left to free */
	detail_ptr->acctg_freq = job_desc->acctg_freq;
	detail_ptr->nice       = job_desc->nice;
	detail_ptr->open_mode  = job_desc->open_mode;
	detail_ptr->min_cpus   = job_desc->min_cpus;
	detail_ptr->max_cpus   = job_desc->max_cpus;
	detail_ptr->min_nodes  = job_desc->min_nodes;
	detail_ptr->max_nodes  = job_desc->max_nodes;
	if (job_desc->req_nodes) {
		detail_ptr->req_nodes =
			_copy_nodelist_no_dup(job_desc->req_nodes);
		/* Bitmap ownership moves to the job record */
		detail_ptr->req_node_bitmap = *req_bitmap;
		detail_ptr->req_node_layout = NULL; /* Layout specified at
						     * start time */
		*req_bitmap = NULL;	/* Reused nothing left to free */
	}
	if (job_desc->exc_nodes) {
		detail_ptr->exc_nodes =
			_copy_nodelist_no_dup(job_desc->exc_nodes);
		/* Bitmap ownership moves to the job record */
		detail_ptr->exc_node_bitmap = *exc_bitmap;
		*exc_bitmap = NULL;	/* Reused nothing left to free */
	}
	if (job_desc->features)
		detail_ptr->features = xstrdup(job_desc->features);
	detail_ptr->shared = job_desc->shared;
	if (job_desc->contiguous != (uint16_t) NO_VAL)
		detail_ptr->contiguous = job_desc->contiguous;
	if (job_desc->task_dist != (uint16_t) NO_VAL)
		detail_ptr->task_dist = job_desc->task_dist;
	if (job_desc->cpus_per_task != (uint16_t) NO_VAL)
		detail_ptr->cpus_per_task = MAX(job_desc->cpus_per_task, 1);
	else
		detail_ptr->cpus_per_task = 1;
	if (job_desc->pn_min_cpus != (uint16_t) NO_VAL)
		detail_ptr->pn_min_cpus = job_desc->pn_min_cpus;
	if (job_desc->overcommit != (uint8_t) NO_VAL)
		detail_ptr->overcommit = job_desc->overcommit;
	if (job_desc->ntasks_per_node != (uint16_t) NO_VAL) {
		detail_ptr->ntasks_per_node = job_desc->ntasks_per_node;
		/* Without overcommit, per-node CPU floor must cover all
		 * the tasks placed on a node */
		if (detail_ptr->overcommit == 0) {
			detail_ptr->pn_min_cpus =
				MAX(detail_ptr->pn_min_cpus,
				    (detail_ptr->cpus_per_task *
				     detail_ptr->ntasks_per_node));
		}
	} else {
		detail_ptr->pn_min_cpus = MAX(detail_ptr->pn_min_cpus,
					      detail_ptr->cpus_per_task);
	}
	if (job_desc->requeue != (uint16_t) NO_VAL)
		detail_ptr->requeue = MIN(job_desc->requeue, 1);
	else
		detail_ptr->requeue = slurmctld_conf.job_requeue;
	if (job_desc->pn_min_memory != NO_VAL)
		detail_ptr->pn_min_memory = job_desc->pn_min_memory;
	if (job_desc->pn_min_tmp_disk != NO_VAL)
		detail_ptr->pn_min_tmp_disk = job_desc->pn_min_tmp_disk;
	if (job_desc->num_tasks != NO_VAL)
		detail_ptr->num_tasks = job_desc->num_tasks;
	if (job_desc->std_err)
		detail_ptr->std_err = xstrdup(job_desc->std_err);
	if (job_desc->std_in)
		detail_ptr->std_in = xstrdup(job_desc->std_in);
	if (job_desc->std_out)
		detail_ptr->std_out = xstrdup(job_desc->std_out);
	if (job_desc->work_dir)
		detail_ptr->work_dir = xstrdup(job_desc->work_dir);
	/* Begin times in the past are ignored (job may start now) */
	if (job_desc->begin_time > time(NULL))
		detail_ptr->begin_time = job_desc->begin_time;
	job_ptr->select_jobinfo =
		select_g_select_jobinfo_copy(job_desc->select_jobinfo);

	if (job_desc->ckpt_dir)
		detail_ptr->ckpt_dir = xstrdup(job_desc->ckpt_dir);
	else
		detail_ptr->ckpt_dir = xstrdup(detail_ptr->work_dir);

	/* The priority needs to be set after this since we don't have
	 * an association rec yet
	 */
	detail_ptr->mc_ptr = _set_multi_core_data(job_desc);
	*job_rec_ptr = job_ptr;
	return SLURM_SUCCESS;
}
/*
 * _copy_nodelist_no_dup - Take a node_list string and convert it to an
 *	expression without duplicate names. For example, we want to convert
 *	a users request for nodes "lx1,lx2,lx1,lx3" to "lx[1-3]"
 * node_list IN - string describing a list of nodes
 * RET a compact node expression, must be xfreed by the user,
 *	or NULL if node_list cannot be parsed
 */
static char *_copy_nodelist_no_dup(char *node_list)
{
	hostlist_t hl;
	char *ranged;

	if ((hl = hostlist_create(node_list)) == NULL)
		return NULL;

	hostlist_uniq(hl);
	ranged = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);

	return ranged;
}
/* Validate a job's requested memory against the MaxMemPerCPU/MaxMemPerNode
 * limit (partition limit takes precedence over the cluster-wide one).
 * The MEM_PER_CPU flag bit in a memory value marks it as a per-CPU limit;
 * with the flag clear it is a per-node limit.
 * IN/OUT job_desc_msg - job request; cpus_per_task, pn_min_cpus and
 *	pn_min_memory may be increased/adjusted to satisfy the limit
 * IN part_ptr - partition the job requested, or NULL
 * RET true if the memory request is (or was made) acceptable */
static bool _valid_pn_min_mem(job_desc_msg_t * job_desc_msg,
			      struct part_record *part_ptr)
{
	uint32_t job_mem_limit = job_desc_msg->pn_min_memory;
	uint32_t sys_mem_limit;
	uint16_t cpus_per_node, ratio;

	if (part_ptr && part_ptr->max_mem_per_cpu)
		sys_mem_limit = part_ptr->max_mem_per_cpu;
	else
		sys_mem_limit = slurmctld_conf.max_mem_per_cpu;

	/* No limit configured (0 or a bare MEM_PER_CPU flag) */
	if ((sys_mem_limit == 0) || (sys_mem_limit == MEM_PER_CPU))
		return true;

	/* Case 1: both request and limit are per-CPU values */
	if ((job_mem_limit & MEM_PER_CPU) && (sys_mem_limit & MEM_PER_CPU)) {
		job_mem_limit &= (~MEM_PER_CPU);
		sys_mem_limit &= (~MEM_PER_CPU);
		if (job_mem_limit <= sys_mem_limit)
			return true;
		/* Over the per-CPU limit: if cpus_per_task is unset we can
		 * raise it and spread the memory across more CPUs */
		ratio = (job_mem_limit + sys_mem_limit - 1) / sys_mem_limit;
		if (job_desc_msg->cpus_per_task == (uint16_t) NO_VAL) {
			job_desc_msg->cpus_per_task = ratio;
			/* Divide the memory request by ratio, rounding up */
			job_desc_msg->pn_min_memory = job_mem_limit + ratio-1;
			job_desc_msg->pn_min_memory /= ratio;
			job_desc_msg->pn_min_memory |= MEM_PER_CPU;
			return true;
		}
		return false;
	}

	/* Case 2: both request and limit are per-node values */
	if (((job_mem_limit & MEM_PER_CPU) == 0) &&
	    ((sys_mem_limit & MEM_PER_CPU) == 0)) {
		if (job_mem_limit <= sys_mem_limit)
			return true;
		return false;
	}

	/* Case 3 (mixed): Our size is per CPU and limit per node or
	 * vice-versa. CPU count may vary by node, but we don't have a good
	 * way to identify specific nodes for the job at this
	 * point, so just pick the first node as a basis for enforcing
	 * MaxMemPerCPU and convert both numbers to per-node values. */
	if (slurmctld_conf.fast_schedule)
		cpus_per_node = node_record_table_ptr[0].config_ptr->cpus;
	else
		cpus_per_node = node_record_table_ptr[0].cpus;
	if (job_desc_msg->min_cpus != NO_VAL)
		cpus_per_node = MIN(cpus_per_node, job_desc_msg->min_cpus);

	if (job_mem_limit & MEM_PER_CPU) {
		/* Per-CPU request vs per-node limit */
		job_mem_limit &= (~MEM_PER_CPU);
		job_mem_limit *= cpus_per_node;
	} else {
		/* Per-node request vs per-CPU limit: may need to raise the
		 * job's minimum CPUs per node to stay within the limit */
		uint32_t min_cpus;
		sys_mem_limit &= (~MEM_PER_CPU);
		min_cpus = (job_mem_limit + sys_mem_limit - 1) / sys_mem_limit;

		if ((job_desc_msg->pn_min_cpus == (uint16_t) NO_VAL) ||
		    (job_desc_msg->pn_min_cpus < min_cpus)) {
			debug("Setting job's pn_min_cpus to %u due to memory "
			      "limit", min_cpus);
			job_desc_msg->pn_min_cpus = min_cpus;
			sys_mem_limit *= min_cpus;
		} else {
			sys_mem_limit *= cpus_per_node;
		}
	}

	if (job_mem_limit <= sys_mem_limit)
		return true;
	return false;
}
/*
 * job_time_limit - terminate jobs which have exceeded their time limit
 *	Also enforces QOS and association CPU-minute and wall-clock limits,
 *	the inactivity limit, reservation expiration, and clears the
 *	CONFIGURING flag once a job's nodes are all booted.
 * global: job_list - pointer global job list
 *	last_job_update - time of last job table update
 * NOTE: READ lock_slurmctld config before entry
 */
void job_time_limit(void)
{
	ListIterator job_iterator;
	struct job_record *job_ptr;
	time_t now = time(NULL);
	/* Threshold for the inactivity test: last_active older than this
	 * (InactiveLimit with slack for message timeouts) kills the job */
	time_t old = now - (slurmctld_conf.inactive_limit * 4 / 3) +
		slurmctld_conf.msg_timeout + 1;
	time_t over_run;
	int resv_status = 0;
	uint64_t job_cpu_usage_mins = 0;
	uint64_t usage_mins;
	uint32_t wall_mins;

	/* over_run is the latest end_time that is already deemed expired,
	 * allowing OverTimeLimit minutes of grace */
	if (slurmctld_conf.over_time_limit == (uint16_t) INFINITE)
		over_run = now - (365 * 24 * 60 * 60);	/* one year */
	else
		over_run = now - (slurmctld_conf.over_time_limit * 60);

	begin_job_resv_check();
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr =(struct job_record *) list_next(job_iterator))) {
		slurmdb_qos_rec_t *qos = NULL;
		slurmdb_association_rec_t *assoc =  NULL;
		assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK,
					   READ_LOCK, NO_LOCK, NO_LOCK };

		xassert (job_ptr->magic == JOB_MAGIC);

		/* Clear CONFIGURING once no allocated node remains in the
		 * powering-up or unavailable sets */
		if (IS_JOB_CONFIGURING(job_ptr)) {
			if (!IS_JOB_RUNNING(job_ptr) ||
			    ((bit_overlap(job_ptr->node_bitmap,
					  power_node_bitmap) == 0) &&
			     (bit_overlap(job_ptr->node_bitmap,
					  avail_node_bitmap) == 0))) {
				debug("Configuration for job %u is complete",
				      job_ptr->job_id);
				job_ptr->job_state &= (~JOB_CONFIGURING);
			}
		}

		/* This needs to be near the top of the loop, checks every
		 * running, suspended and pending job */
		resv_status = job_resv_check(job_ptr);

		if ((job_ptr->priority == 1) && (!IS_JOB_FINISHED(job_ptr))) {
			/* Rather than resetting job priorities whenever a
			 * DOWN, DRAINED or non-responsive node is returned to
			 * service, we pick them up here. There will be a small
			 * delay in resetting a job's priority, but the code is
			 * a lot cleaner this way. */
			_set_job_prio(job_ptr);
		}

		/* All remaining checks apply only to running jobs */
		if (!IS_JOB_RUNNING(job_ptr))
			continue;

		/* find out how many cpu minutes this job has been
		 * running for. */
		job_cpu_usage_mins = (uint64_t)
			((((now - job_ptr->start_time)
			   - job_ptr->tot_sus_time) / 60)
			 * job_ptr->total_cpus);

		/* Inactivity limit applies only to interactive (non-batch)
		 * jobs outside root-only partitions */
		if (slurmctld_conf.inactive_limit &&
		    (job_ptr->batch_flag == 0)    &&
		    (job_ptr->time_last_active <= old) &&
		    (job_ptr->part_ptr) &&
		    (!(job_ptr->part_ptr->flags & PART_FLAG_ROOT_ONLY))) {
			/* job inactive, kill it */
			info("Inactivity time limit reached for JobId=%u",
			     job_ptr->job_id);
			_job_timed_out(job_ptr);
			job_ptr->state_reason = FAIL_INACTIVE_LIMIT;
			xfree(job_ptr->state_desc);
			continue;
		}
		if (job_ptr->time_limit != INFINITE) {
			if (job_ptr->end_time <= over_run) {
				last_job_update = now;
				info("Time limit exhausted for JobId=%u",
				     job_ptr->job_id);
				_job_timed_out(job_ptr);
				job_ptr->state_reason = FAIL_TIMEOUT;
				xfree(job_ptr->state_desc);
				continue;
			} else if ((job_ptr->warn_time) &&
				   (job_ptr->warn_time + PERIODIC_TIMEOUT +
				    now >= job_ptr->end_time)) {
				/* Send the user-requested warning signal,
				 * then clear it so it is sent only once */
				debug("Warning signal %u to job %u ",
				      job_ptr->warn_signal, job_ptr->job_id);
				(void) job_signal(job_ptr->job_id,
						  job_ptr->warn_signal, 0, 0,
						  false);
				job_ptr->warn_signal = 0;
				job_ptr->warn_time = 0;
			}
		}

		if (resv_status != SLURM_SUCCESS) {
			last_job_update = now;
			info("Reservation ended for JobId=%u",
			     job_ptr->job_id);
			_job_timed_out(job_ptr);
			job_ptr->state_reason = FAIL_TIMEOUT;
			xfree(job_ptr->state_desc);
			continue;
		}

		/* check if any individual job steps have exceeded
		 * their time limit */
		if (job_ptr->step_list &&
		    (list_count(job_ptr->step_list) > 0))
			check_job_step_time_limit(job_ptr, now);

		assoc_mgr_lock(&locks);
		qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr;
		assoc =	(slurmdb_association_rec_t *)job_ptr->assoc_ptr;

		/* The idea here is for qos to trump what an association
		 * has set for a limit, so if an association set of
		 * wall 10 mins and the qos has 20 mins set and the
		 * job has been running for 11 minutes it continues
		 * until 20.
		 */
		if (qos) {
			usage_mins = (uint64_t)(qos->usage->usage_raw / 60.0);
			wall_mins = qos->usage->grp_used_wall / 60;

			if ((qos->grp_cpu_mins != (uint64_t)INFINITE)
			    && (usage_mins >= qos->grp_cpu_mins)) {
				last_job_update = now;
				info("Job %u timed out, "
				     "the job is at or exceeds QOS %s's "
				     "group max cpu minutes of %"PRIu64" "
				     "with %"PRIu64"",
				     job_ptr->job_id,
				     qos->name,
				     qos->grp_cpu_mins,
				     usage_mins);
				job_ptr->state_reason = FAIL_TIMEOUT;
				goto job_failed;
			}

			if ((qos->grp_wall != INFINITE)
			    && (wall_mins >= qos->grp_wall)) {
				last_job_update = now;
				info("Job %u timed out, "
				     "the job is at or exceeds QOS %s's "
				     "group wall limit of %u with %u",
				     job_ptr->job_id,
				     qos->name, qos->grp_wall,
				     wall_mins);
				job_ptr->state_reason = FAIL_TIMEOUT;
				goto job_failed;
			}

			if ((qos->max_cpu_mins_pj != (uint64_t)INFINITE)
			    && (job_cpu_usage_mins >= qos->max_cpu_mins_pj)) {
				last_job_update = now;
				info("Job %u timed out, "
				     "the job is at or exceeds QOS %s's "
				     "max cpu minutes of %"PRIu64" "
				     "with %"PRIu64"",
				     job_ptr->job_id,
				     qos->name,
				     qos->max_cpu_mins_pj,
				     job_cpu_usage_mins);
				job_ptr->state_reason = FAIL_TIMEOUT;
				goto job_failed;
			}
		}

		/* handle any association stuff here; walk up the
		 * association tree, but only apply a given limit type if
		 * the QOS left it at INFINITE (QOS trumps association) */
		while (assoc) {
			usage_mins = (uint64_t)(assoc->usage->usage_raw / 60.0);
			wall_mins = assoc->usage->grp_used_wall / 60;

			if ((qos && (qos->grp_cpu_mins == INFINITE))
			    && (assoc->grp_cpu_mins != (uint64_t)INFINITE)
			    && (usage_mins >= assoc->grp_cpu_mins)) {
				info("Job %u timed out, "
				     "assoc %u is at or exceeds "
				     "group max cpu minutes limit %"PRIu64" "
				     "with %"PRIu64" for account %s",
				     job_ptr->job_id, assoc->id,
				     assoc->grp_cpu_mins,
				     usage_mins,
				     assoc->acct);
				job_ptr->state_reason = FAIL_TIMEOUT;
				break;
			}

			if ((qos && (qos->grp_wall == INFINITE))
			    && (assoc->grp_wall != INFINITE)
			    && (wall_mins >= assoc->grp_wall)) {
				info("Job %u timed out, "
				     "assoc %u is at or exceeds "
				     "group wall limit %u "
				     "with %u for account %s",
				     job_ptr->job_id, assoc->id,
				     assoc->grp_wall,
				     wall_mins, assoc->acct);
				job_ptr->state_reason = FAIL_TIMEOUT;
				break;
			}

			if ((qos && (qos->max_cpu_mins_pj == INFINITE))
			    && (assoc->max_cpu_mins_pj != (uint64_t)INFINITE)
			    && (job_cpu_usage_mins >= assoc->max_cpu_mins_pj)) {
				info("Job %u timed out, "
				     "assoc %u is at or exceeds "
				     "max cpu minutes limit %"PRIu64" "
				     "with %"PRIu64" for account %s",
				     job_ptr->job_id, assoc->id,
				     assoc->max_cpu_mins_pj,
				     job_cpu_usage_mins,
				     assoc->acct);
				job_ptr->state_reason = FAIL_TIMEOUT;
				break;
			}

			assoc = assoc->usage->parent_assoc_ptr;
			/* these limits don't apply to the root assoc */
			if (assoc == assoc_mgr_root_assoc)
				break;
		}
	job_failed:
		assoc_mgr_unlock(&locks);

		if (job_ptr->state_reason == FAIL_TIMEOUT) {
			last_job_update = now;
			_job_timed_out(job_ptr);
			xfree(job_ptr->state_desc);
			continue;
		}

		/* Give srun command warning message about pending timeout */
		if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2))
			srun_timeout (job_ptr);
	}
	list_iterator_destroy(job_iterator);
	fini_job_resv_check();
}
/* Reduce a job's current CPU count by the CPUs allocated on one node
 * (e.g. when the node is removed from the allocation).
 * IN job_ptr - job record to update
 * IN node_inx - global index of the node being removed
 * RET SLURM_SUCCESS, or SLURM_ERROR on underflow or bad node index */
extern int job_update_cpu_cnt(struct job_record *job_ptr, int node_inx)
{
	int cnt, offset, rc = SLURM_SUCCESS;

	xassert(job_ptr);

#ifdef HAVE_BG
	/* This function doesn't apply to a bluegene system since the
	 * cpu count isn't set up on that system. */
	return SLURM_SUCCESS;
#endif
	/* Map the global node index to this job's CPU array index */
	if ((offset = job_resources_node_inx_to_cpu_inx(
		     job_ptr->job_resrcs, node_inx)) < 0) {
		error("job_update_cpu_cnt: problem getting offset of job %u",
		      job_ptr->job_id);
		job_ptr->cpu_cnt = 0;
		return SLURM_ERROR;
	}

	cnt = job_ptr->job_resrcs->cpus[offset];
	if (cnt > job_ptr->cpu_cnt) {
		/* Clamp to zero rather than wrapping the unsigned counter */
		error("job_update_cpu_cnt: cpu_cnt underflow on job_id %u",
		      job_ptr->job_id);
		job_ptr->cpu_cnt = 0;
		rc = SLURM_ERROR;
	} else
		job_ptr->cpu_cnt -= cnt;

	if (IS_JOB_RESIZING(job_ptr)) {
		/* A resizing job also reduces its lifetime CPU total */
		if (cnt > job_ptr->total_cpus) {
			error("job_update_cpu_cnt: total_cpus "
			      "underflow on job_id %u",
			      job_ptr->job_id);
			job_ptr->total_cpus = 0;
			rc = SLURM_ERROR;
		} else
			job_ptr->total_cpus -= cnt;
	}
	return rc;
}
/* Terminate a job that has exhausted its time limit.
 * Jobs with a details record are moved to TIMEOUT/COMPLETING and their
 * nodes deallocated; jobs without details are simply killed. */
static void _job_timed_out(struct job_record *job_ptr)
{
	time_t now;

	xassert(job_ptr);

	if (job_ptr->details == NULL) {
		/* No details record: just kill the job outright */
		job_signal(job_ptr->job_id, SIGKILL, 0, 0, false);
		return;
	}

	now = time(NULL);
	job_ptr->end_time = now;
	job_ptr->time_last_active = now;
	job_ptr->job_state = JOB_TIMEOUT | JOB_COMPLETING;
	build_cg_bitmap(job_ptr);
	job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
	deallocate_nodes(job_ptr, true, false, false);
	job_completion_logger(job_ptr, false);
}
/* _validate_job_desc - validate that a job descriptor for job submit or
 *	allocate has valid data, set values to defaults as required
 * IN/OUT job_desc_msg - pointer to job descriptor, modified as needed
 * IN allocate - if clear job to be queued, if set allocate for user now
 * IN submit_uid - who request originated
 * IN part_ptr - partition the job requested, used for default memory limit
 * RET SLURM_SUCCESS or an ESLURM error code */
static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
			      uid_t submit_uid, struct part_record *part_ptr)
{
	/* A job must give at least one size specification */
	if ((job_desc_msg->min_cpus  == NO_VAL) &&
	    (job_desc_msg->min_nodes == NO_VAL) &&
	    (job_desc_msg->req_nodes == NULL)) {
		info("Job specified no min_cpus, min_nodes or req_nodes");
		return ESLURM_JOB_MISSING_SIZE_SPECIFICATION;
	}
	/* Queued (batch) submissions require a script */
	if ((allocate == SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0) &&
	    (job_desc_msg->script == NULL)) {
		info("_validate_job_desc: job failed to specify Script");
		return ESLURM_JOB_SCRIPT_MISSING;
	}
	if (job_desc_msg->user_id == NO_VAL) {
		info("_validate_job_desc: job failed to specify User");
		return ESLURM_USER_ID_MISSING;
	}
	if ( job_desc_msg->group_id == NO_VAL ) {
		debug("_validate_job_desc: job failed to specify group");
		job_desc_msg->group_id = 0;	/* uses user default */
	}
	if (job_desc_msg->contiguous == (uint16_t) NO_VAL)
		job_desc_msg->contiguous = 0;

	if (job_desc_msg->task_dist == (uint16_t) NO_VAL) {
		/* not typically set by salloc or sbatch */
		job_desc_msg->task_dist = SLURM_DIST_CYCLIC;
	}
	if (job_desc_msg->plane_size == (uint16_t) NO_VAL)
		job_desc_msg->plane_size = 0;

	if (job_desc_msg->kill_on_node_fail == (uint16_t) NO_VAL)
		job_desc_msg->kill_on_node_fail = 1;

	/* An explicit job_id is restricted to root / SlurmUser and must
	 * not collide with an active job */
	if (job_desc_msg->job_id != NO_VAL) {
		struct job_record *dup_job_ptr;
		if ((submit_uid != 0) &&
		    (submit_uid != slurmctld_conf.slurm_user_id)) {
			info("attempt by uid %u to set job_id", submit_uid);
			return ESLURM_INVALID_JOB_ID;
		}
		if (job_desc_msg->job_id == 0) {
			info("attempt by uid %u to set zero job_id",
			     submit_uid);
			return ESLURM_INVALID_JOB_ID;
		}
		dup_job_ptr = find_job_record((uint32_t) job_desc_msg->job_id);
		if (dup_job_ptr &&
		    (!(IS_JOB_COMPLETED(dup_job_ptr)))) {
			info("attempt re-use active job_id %u",
			     job_desc_msg->job_id);
			return ESLURM_DUPLICATE_JOB_ID;
		}
		if (dup_job_ptr)	/* Purge the record for re-use */
			_purge_job_record(job_desc_msg->job_id);
	}

	if (job_desc_msg->nice == (uint16_t) NO_VAL)
		job_desc_msg->nice = NICE_OFFSET;

	if (job_desc_msg->pn_min_memory == NO_VAL) {
		/* Default memory limit is DefMemPerCPU (if set) or no limit */
		if (part_ptr && part_ptr->def_mem_per_cpu) {
			job_desc_msg->pn_min_memory =
				part_ptr->def_mem_per_cpu;
		} else {
			job_desc_msg->pn_min_memory =
				slurmctld_conf.def_mem_per_cpu;
		}
	} else if (!_valid_pn_min_mem(job_desc_msg, part_ptr))
		return ESLURM_INVALID_TASK_MEMORY;

	/* Fill in remaining default sizes */
	if (job_desc_msg->min_nodes == NO_VAL)
		job_desc_msg->min_nodes = 1;	/* default node count of 1 */
	if (job_desc_msg->min_cpus == NO_VAL)
		job_desc_msg->min_cpus = job_desc_msg->min_nodes;

	if ((job_desc_msg->pn_min_cpus == (uint16_t) NO_VAL) ||
	    (job_desc_msg->pn_min_cpus == 0))
		job_desc_msg->pn_min_cpus = 1;	/* default 1 cpu per node */
	if (job_desc_msg->pn_min_tmp_disk == NO_VAL)
		job_desc_msg->pn_min_tmp_disk = 0;/* default 0MB disk per node */

	return SLURM_SUCCESS;
}
/*
 * _list_delete_job - delete a job record and its corresponding job_details,
 *	see common/list.h for documentation
 * IN job_entry - pointer to job_record to delete
 * global: job_list - pointer to global job list
 *	job_count - count of job list entries
 *	job_hash - hash table into job records
 */
static void _list_delete_job(void *job_entry)
{
	struct job_record *job_ptr = (struct job_record *) job_entry;
	struct job_record **job_pptr;
	int i;

	xassert(job_entry);
	xassert (job_ptr->magic == JOB_MAGIC);
	job_ptr->magic = 0;	/* make sure we don't delete record twice */

	/* Remove the record from the hash table. Walk the chain testing
	 * *job_pptr (not job_pptr, which is never NULL) so a missing
	 * record reaches fatal() instead of dereferencing NULL. */
	job_pptr = &job_hash[JOB_HASH_INX(job_ptr->job_id)];
	while ((*job_pptr != NULL) &&
	       (*job_pptr != (struct job_record *) job_entry)) {
		job_pptr = &(*job_pptr)->job_next;
	}
	if (*job_pptr == NULL)
		fatal("job hash error");
	*job_pptr = job_ptr->job_next;

	/* Release all memory owned by the record */
	delete_job_details(job_ptr);
	xfree(job_ptr->account);
	xfree(job_ptr->alloc_node);
	xfree(job_ptr->batch_host);
	xfree(job_ptr->comment);
	xfree(job_ptr->gres);
	FREE_NULL_LIST(job_ptr->gres_list);
	xfree(job_ptr->licenses);
	if (job_ptr->license_list)
		list_destroy(job_ptr->license_list);
	xfree(job_ptr->mail_user);
	xfree(job_ptr->name);
	xfree(job_ptr->network);
	xfree(job_ptr->node_addr);
	FREE_NULL_BITMAP(job_ptr->node_bitmap);
	FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
	xfree(job_ptr->nodes);
	xfree(job_ptr->nodes_completing);
	xfree(job_ptr->partition);
	FREE_NULL_LIST(job_ptr->part_ptr_list);
	slurm_destroy_priority_factors_object(job_ptr->prio_factors);
	xfree(job_ptr->resp_host);
	xfree(job_ptr->resv_name);
	free_job_resources(&job_ptr->job_resrcs);
	select_g_select_jobinfo_free(job_ptr->select_jobinfo);
	for (i = 0; i < job_ptr->spank_job_env_size; i++)
		xfree(job_ptr->spank_job_env[i]);
	xfree(job_ptr->spank_job_env);
	xfree(job_ptr->state_desc);
	if (job_ptr->step_list) {
		delete_step_records(job_ptr);
		list_destroy(job_ptr->step_list);
	}
	xfree(job_ptr->wckey);
	job_count--;
	xfree(job_ptr);
}
/*
 * _list_find_job_id - list match function: does this job record carry
 *	the job_id given as the key?
 * IN job_entry - struct job_record * being examined
 * IN key - uint32_t * holding the job_id sought
 * RET 1 on match, 0 otherwise
 */
static int _list_find_job_id(void *job_entry, void *key)
{
	struct job_record *rec_ptr = (struct job_record *) job_entry;
	uint32_t target_id = *(uint32_t *) key;

	return (rec_ptr->job_id == target_id) ? 1 : 0;
}
/*
 * _list_find_job_old - list match function identifying job records old
 *	enough to purge; key is ignored
 * RET 1 if the job may be purged, 0 otherwise
 * global: job_list - the global job list
 */
static int _list_find_job_old(void *job_entry, void *key)
{
	struct job_record *job_rec = (struct job_record *) job_entry;
	time_t now = time(NULL);
	time_t kill_threshold, purge_threshold;

	if (IS_JOB_COMPLETING(job_rec)) {
		/* Re-issue kill RPCs for completions that appear stuck */
		kill_threshold = now - (slurmctld_conf.kill_wait +
					2 * slurm_get_msg_timeout());
		if (job_rec->time_last_active < kill_threshold) {
			job_rec->time_last_active = now;
			re_kill_job(job_rec);
		}
		return 0;	/* Job still completing */
	}

	if (slurmctld_conf.min_job_age == 0)
		return 0;	/* No job record purging */

	purge_threshold = now - slurmctld_conf.min_job_age;
	if (job_rec->end_time > purge_threshold)
		return 0;	/* Too new to purge */

	if (!IS_JOB_FINISHED(job_rec))
		return 0;	/* Job still active */

	/* If we don't have a db_index by now and we are running with
	   the slurmdbd lets put it on the list to be handled later
	   when it comes back up since we won't get another chance.
	*/
	if (with_slurmdbd && !job_rec->db_index)
		jobacct_storage_g_job_start(acct_db_conn, job_rec);

	return 1;	/* Purge the job */
}
/*
 * pack_all_jobs - dump all job information for all jobs in
 *	machine independent form (for network transmission)
 * OUT buffer_ptr - the pointer is set to the allocated buffer.
 * OUT buffer_size - set to size of the buffer in bytes
 * IN show_flags - job filtering options
 * IN uid - uid of user making request (for partition filtering)
 * IN protocol_version - RPC protocol version of the requester; selects
 *	the wire format used by pack_job()
 * global: job_list - global list of job records
 * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
 * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
 *	whenever the data format changes
 */
extern void pack_all_jobs(char **buffer_ptr, int *buffer_size,
			  uint16_t show_flags, uid_t uid,
			  uint16_t protocol_version)
{
	ListIterator job_iterator;
	struct job_record *job_ptr;
	uint32_t jobs_packed = 0, tmp_offset;
	Buf buffer;
	time_t min_age = 0, now = time(NULL);

	buffer_ptr[0] = NULL;
	*buffer_size = 0;

	buffer = init_buf(BUF_SIZE);

	/* write message body header : size and time */
	/* put in a place holder job record count of 0 for now */
	pack32(jobs_packed, buffer);
	pack_time(now, buffer);

	/* min_age > 0 means records older than this are about to be
	 * purged and should not be reported */
	if (slurmctld_conf.min_job_age > 0)
		min_age = now - slurmctld_conf.min_job_age;

	/* write individual job records */
	part_filter_set(uid);
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		xassert (job_ptr->magic == JOB_MAGIC);

		/* Skip jobs in hidden partitions unless SHOW_ALL or root */
		if (((show_flags & SHOW_ALL) == 0) && (uid != 0) &&
		    (job_ptr->part_ptr) &&
		    (job_ptr->part_ptr->flags & PART_FLAG_HIDDEN))
			continue;

		/* Honor PrivateData=jobs: only the owner, operators and
		 * the account coordinator may see the record */
		if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) &&
		    (job_ptr->user_id != uid) && !validate_operator(uid) &&
		    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
						  job_ptr->account))
			continue;

		if ((min_age > 0) && (job_ptr->end_time < min_age) &&
		    (! IS_JOB_COMPLETING(job_ptr)) && IS_JOB_FINISHED(job_ptr))
			continue;	/* job ready for purging, don't dump */

		pack_job(job_ptr, show_flags, buffer, protocol_version, uid);
		jobs_packed++;
	}
	part_filter_clear();
	list_iterator_destroy(job_iterator);

	/* put the real record count in the message body header */
	tmp_offset = get_buf_offset(buffer);
	set_buf_offset(buffer, 0);
	pack32(jobs_packed, buffer);
	set_buf_offset(buffer, tmp_offset);

	*buffer_size = get_buf_offset(buffer);
	buffer_ptr[0] = xfer_buf_data(buffer);
}
/*
 * pack_one_job - dump information for one jobs in
 *	machine independent form (for network transmission)
 * OUT buffer_ptr - the pointer is set to the allocated buffer.
 * OUT buffer_size - set to size of the buffer in bytes
 * IN job_id - ID of job that we want info for
 * IN show_flags - job filtering options
 * IN uid - uid of user making request (for partition filtering)
 * RET SLURM_SUCCESS, or ESLURM_INVALID_JOB_ID if the job does not
 *	exist or this uid may not see it
 * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
 * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
 *	whenever the data format changes
 */
extern int pack_one_job(char **buffer_ptr, int *buffer_size,
			uint32_t job_id, uint16_t show_flags, uid_t uid,
			uint16_t protocol_version)
{
	ListIterator iter;
	struct job_record *job_ptr;
	bool found = false;
	Buf buffer;

	buffer_ptr[0] = NULL;
	*buffer_size = 0;

	/* Locate the requested record; PrivateData=jobs hides jobs of
	 * other users from non-operators / non-coordinators */
	iter = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(iter))) {
		if (job_ptr->job_id != job_id)
			continue;
		if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) &&
		    (job_ptr->user_id != uid) && !validate_operator(uid) &&
		    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
						  job_ptr->account))
			break;	/* visible to this uid: no */
		found = true;
		break;
	}
	list_iterator_destroy(iter);

	if (!found)
		return ESLURM_INVALID_JOB_ID;

	/* header: record count (always one) plus timestamp */
	buffer = init_buf(BUF_SIZE);
	pack32((uint32_t) 1, buffer);
	pack_time(time(NULL), buffer);
	pack_job(job_ptr, show_flags, buffer, protocol_version, uid);

	*buffer_size = get_buf_offset(buffer);
	buffer_ptr[0] = xfer_buf_data(buffer);
	return SLURM_SUCCESS;
}
/*
 * pack_job - dump all configuration information about a specific job in
 *	machine independent form (for network transmission)
 * IN dump_job_ptr - pointer to job for which information is requested
 * IN show_flags - job filtering options
 * IN/OUT buffer - buffer in which data is placed, pointers automatically
 *	updated
 * IN protocol_version - requester's RPC version; each branch below emits
 *	a distinct wire layout, so field ORDER within a branch must never
 *	change without a matching unpack change
 * IN uid - user requesting the data
 * NOTE: change _unpack_job_info_members() in common/slurm_protocol_pack.c
 *	whenever the data format changes
 */
void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer,
	      uint16_t protocol_version, uid_t uid)
{
	struct job_details *detail_ptr;
	time_t begin_time = 0;
	char *nodelist = NULL;
	/* read lock on the QOS list only */
	assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK,
				   READ_LOCK, NO_LOCK, NO_LOCK };

	if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
		/* ---- 2.3 wire format ---- */
		pack32(dump_job_ptr->assoc_id, buffer);
		pack32(dump_job_ptr->job_id, buffer);
		pack32(dump_job_ptr->user_id, buffer);
		pack32(dump_job_ptr->group_id, buffer);

		pack16(dump_job_ptr->job_state, buffer);
		pack16(dump_job_ptr->batch_flag, buffer);
		pack16(dump_job_ptr->state_reason, buffer);
		pack16(dump_job_ptr->restart_cnt, buffer);
		pack16(show_flags, buffer);

		pack32(dump_job_ptr->alloc_sid, buffer);
		/* NO_VAL time limit means "partition maximum" when known */
		if ((dump_job_ptr->time_limit == NO_VAL)
		    && dump_job_ptr->part_ptr)
			pack32(dump_job_ptr->part_ptr->max_time, buffer);
		else
			pack32(dump_job_ptr->time_limit, buffer);
		pack32(dump_job_ptr->time_min, buffer);

		if (dump_job_ptr->details) {
			pack16(dump_job_ptr->details->nice, buffer);
			pack_time(dump_job_ptr->details->submit_time, buffer);
			/* Earliest possible begin time */
			begin_time = dump_job_ptr->details->begin_time;
		} else {
			pack16(0, buffer);
			pack_time((time_t) 0, buffer);
		}

		pack_time(begin_time, buffer);

		/* Actual or expected start time */
		if((dump_job_ptr->start_time) || (begin_time <= time(NULL)))
			pack_time(dump_job_ptr->start_time, buffer);
		else	/* earliest start time in the future */
			pack_time(begin_time, buffer);

		pack_time(dump_job_ptr->end_time, buffer);
		pack_time(dump_job_ptr->suspend_time, buffer);
		pack_time(dump_job_ptr->pre_sus_time, buffer);
		pack_time(dump_job_ptr->resize_time, buffer);
		pack_time(dump_job_ptr->preempt_time, buffer);
		pack32(dump_job_ptr->priority, buffer);

		/* Only send the allocated nodelist since we are only sending
		 * the number of cpus and nodes that are currently allocated. */
		if (!IS_JOB_COMPLETING(dump_job_ptr))
			packstr(dump_job_ptr->nodes, buffer);
		else {
			nodelist =
				bitmap2node_name(dump_job_ptr->node_bitmap_cg);
			packstr(nodelist, buffer);
			xfree(nodelist);
		}

		if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
			packstr(dump_job_ptr->part_ptr->name, buffer);
		else
			packstr(dump_job_ptr->partition, buffer);
		packstr(dump_job_ptr->account, buffer);
		packstr(dump_job_ptr->network, buffer);
		packstr(dump_job_ptr->comment, buffer);
		packstr(dump_job_ptr->gres, buffer);
		packstr(dump_job_ptr->batch_host, buffer);

		/* Batch script goes only to the job owner or a SLURM
		 * administrator, and only with SHOW_DETAIL */
		if (!IS_JOB_COMPLETED(dump_job_ptr) &&
		    (show_flags & SHOW_DETAIL) &&
		    ((dump_job_ptr->user_id == (uint32_t) uid) ||
		     validate_slurm_user(uid))) {
			char *batch_script = get_job_script(dump_job_ptr);
			packstr(batch_script, buffer);
			xfree(batch_script);
		} else {
			packnull(buffer);
		}

		assoc_mgr_lock(&locks);
		if (assoc_mgr_qos_list) {
			packstr(slurmdb_qos_str(assoc_mgr_qos_list,
						dump_job_ptr->qos_id), buffer);
		} else
			packnull(buffer);
		assoc_mgr_unlock(&locks);

		packstr(dump_job_ptr->licenses, buffer);
		packstr(dump_job_ptr->state_desc, buffer);
		packstr(dump_job_ptr->resv_name, buffer);

		pack32(dump_job_ptr->exit_code, buffer);
		pack32(dump_job_ptr->derived_ec, buffer);

		if (show_flags & SHOW_DETAIL) {
			pack_job_resources(dump_job_ptr->job_resrcs, buffer,
					   protocol_version);
		} else {
			/* a lone NO_VAL tells the unpacker there is no
			 * job_resources structure following */
			uint32_t empty = NO_VAL;
			pack32(empty, buffer);
		}

		packstr(dump_job_ptr->name, buffer);
		packstr(dump_job_ptr->wckey, buffer);
		pack32(dump_job_ptr->req_switch, buffer);
		pack32(dump_job_ptr->wait4switch, buffer);

		packstr(dump_job_ptr->alloc_node, buffer);
		if (!IS_JOB_COMPLETING(dump_job_ptr))
			pack_bit_fmt(dump_job_ptr->node_bitmap, buffer);
		else
			pack_bit_fmt(dump_job_ptr->node_bitmap_cg, buffer);

		select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
					     buffer, protocol_version);

		detail_ptr = dump_job_ptr->details;
		/* A few details are always dumped here */
		_pack_default_job_details(dump_job_ptr, buffer,
					  protocol_version);

		/* other job details are only dumped until the job starts
		 * running (at which time they become meaningless) */
		if (detail_ptr)
			_pack_pending_job_details(detail_ptr, buffer,
						  protocol_version);
		else
			_pack_pending_job_details(NULL, buffer,
						  protocol_version);
	} else if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION) {
		/* ---- 2.2 wire format: as 2.3 but without batch_host,
		 * batch script, req_switch and wait4switch ---- */
		pack32(dump_job_ptr->assoc_id, buffer);
		pack32(dump_job_ptr->job_id, buffer);
		pack32(dump_job_ptr->user_id, buffer);
		pack32(dump_job_ptr->group_id, buffer);

		pack16(dump_job_ptr->job_state, buffer);
		pack16(dump_job_ptr->batch_flag, buffer);
		pack16(dump_job_ptr->state_reason, buffer);
		pack16(dump_job_ptr->restart_cnt, buffer);
		pack16(show_flags, buffer);

		pack32(dump_job_ptr->alloc_sid, buffer);
		if ((dump_job_ptr->time_limit == NO_VAL)
		    && dump_job_ptr->part_ptr)
			pack32(dump_job_ptr->part_ptr->max_time, buffer);
		else
			pack32(dump_job_ptr->time_limit, buffer);
		pack32(dump_job_ptr->time_min, buffer);

		if (dump_job_ptr->details) {
			pack16(dump_job_ptr->details->nice, buffer);
			pack_time(dump_job_ptr->details->submit_time, buffer);
			/* Earliest possible begin time */
			begin_time = dump_job_ptr->details->begin_time;
		} else {
			pack16(0, buffer);
			pack_time((time_t) 0, buffer);
		}

		pack_time(begin_time, buffer);

		/* Actual or expected start time */
		if((dump_job_ptr->start_time) || (begin_time <= time(NULL)))
			pack_time(dump_job_ptr->start_time, buffer);
		else	/* earliest start time in the future */
			pack_time(begin_time, buffer);

		pack_time(dump_job_ptr->end_time, buffer);
		pack_time(dump_job_ptr->suspend_time, buffer);
		pack_time(dump_job_ptr->pre_sus_time, buffer);
		pack_time(dump_job_ptr->resize_time, buffer);
		pack32(dump_job_ptr->priority, buffer);

		/* Only send the allocated nodelist since we are only sending
		 * the number of cpus and nodes that are currently allocated. */
		if (!IS_JOB_COMPLETING(dump_job_ptr))
			packstr(dump_job_ptr->nodes, buffer);
		else {
			nodelist =
				bitmap2node_name(dump_job_ptr->node_bitmap_cg);
			packstr(nodelist, buffer);
			xfree(nodelist);
		}

		if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
			packstr(dump_job_ptr->part_ptr->name, buffer);
		else
			packstr(dump_job_ptr->partition, buffer);
		packstr(dump_job_ptr->account, buffer);
		packstr(dump_job_ptr->network, buffer);
		packstr(dump_job_ptr->comment, buffer);
		packstr(dump_job_ptr->gres, buffer);

		assoc_mgr_lock(&locks);
		if (assoc_mgr_qos_list) {
			packstr(slurmdb_qos_str(assoc_mgr_qos_list,
						dump_job_ptr->qos_id), buffer);
		} else
			packnull(buffer);
		assoc_mgr_unlock(&locks);

		packstr(dump_job_ptr->licenses, buffer);
		packstr(dump_job_ptr->state_desc, buffer);
		packstr(dump_job_ptr->resv_name, buffer);

		pack32(dump_job_ptr->exit_code, buffer);
		pack32(dump_job_ptr->derived_ec, buffer);

		if (show_flags & SHOW_DETAIL) {
			pack_job_resources(dump_job_ptr->job_resrcs, buffer,
					   protocol_version);
		} else {
			uint32_t empty = NO_VAL;
			pack32(empty, buffer);
		}

		packstr(dump_job_ptr->name, buffer);
		packstr(dump_job_ptr->wckey, buffer);
		packstr(dump_job_ptr->alloc_node, buffer);
		if (!IS_JOB_COMPLETING(dump_job_ptr))
			pack_bit_fmt(dump_job_ptr->node_bitmap, buffer);
		else
			pack_bit_fmt(dump_job_ptr->node_bitmap_cg, buffer);

		select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
					     buffer, protocol_version);

		detail_ptr = dump_job_ptr->details;
		/* A few details are always dumped here */
		_pack_default_job_details(dump_job_ptr, buffer,
					  protocol_version);

		/* other job details are only dumped until the job starts
		 * running (at which time they become meaningless) */
		if (detail_ptr)
			_pack_pending_job_details(detail_ptr, buffer,
						  protocol_version);
		else
			_pack_pending_job_details(NULL, buffer,
						  protocol_version);
	} else if (protocol_version >= SLURM_2_1_PROTOCOL_VERSION) {
		/* ---- 2.1 wire format: oldest supported layout; note the
		 * cpu count is chosen here rather than inside
		 * _pack_default_job_details() ---- */
		pack32(dump_job_ptr->assoc_id, buffer);
		pack32(dump_job_ptr->job_id, buffer);
		pack32(dump_job_ptr->user_id, buffer);
		pack32(dump_job_ptr->group_id, buffer);

		pack16(dump_job_ptr->job_state, buffer);
		pack16(dump_job_ptr->batch_flag, buffer);
		pack16(dump_job_ptr->state_reason, buffer);
		pack16(dump_job_ptr->restart_cnt, buffer);

		pack32(dump_job_ptr->alloc_sid, buffer);
		if ((dump_job_ptr->time_limit == NO_VAL)
		    && dump_job_ptr->part_ptr)
			pack32(dump_job_ptr->part_ptr->max_time, buffer);
		else
			pack32(dump_job_ptr->time_limit, buffer);

		if (dump_job_ptr->details) {
			pack16(dump_job_ptr->details->nice, buffer);
			pack_time(dump_job_ptr->details->submit_time, buffer);
			/* Earliest possible begin time */
			begin_time = dump_job_ptr->details->begin_time;
		} else {
			pack16(0, buffer);
			pack_time((time_t) 0, buffer);
		}

		pack_time(begin_time, buffer);

		/* Actual or expected start time */
		if((dump_job_ptr->start_time) || (begin_time <= time(NULL)))
			pack_time(dump_job_ptr->start_time, buffer);
		else	/* earliest start time in the future */
			pack_time(begin_time, buffer);

		pack_time(dump_job_ptr->end_time, buffer);
		pack_time(dump_job_ptr->suspend_time, buffer);
		pack_time(dump_job_ptr->pre_sus_time, buffer);
		pack32(dump_job_ptr->priority, buffer);

		/* Only send the allocated nodelist since we are only sending
		 * the number of cpus and nodes that are currently allocated. */
		if (!IS_JOB_COMPLETING(dump_job_ptr))
			packstr(dump_job_ptr->nodes, buffer);
		else {
			nodelist =
				bitmap2node_name(dump_job_ptr->node_bitmap_cg);
			packstr(nodelist, buffer);
			xfree(nodelist);
		}

		if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
			packstr(dump_job_ptr->part_ptr->name, buffer);
		else
			packstr(dump_job_ptr->partition, buffer);
		packstr(dump_job_ptr->account, buffer);
		packstr(dump_job_ptr->network, buffer);
		packstr(dump_job_ptr->comment, buffer);

		assoc_mgr_lock(&locks);
		if (assoc_mgr_qos_list)
			packstr(slurmdb_qos_str(assoc_mgr_qos_list,
						dump_job_ptr->qos_id),
				buffer);
		else
			packnull(buffer);
		assoc_mgr_unlock(&locks);

		packstr(dump_job_ptr->licenses, buffer);
		packstr(dump_job_ptr->state_desc, buffer);
		packstr(dump_job_ptr->resv_name, buffer);

		pack32(dump_job_ptr->exit_code, buffer);

		if (show_flags & SHOW_DETAIL) {
			pack_job_resources(dump_job_ptr->job_resrcs, buffer,
					   protocol_version);
		} else {
			uint32_t empty = NO_VAL;
			pack32(empty, buffer);
		}

		packstr(dump_job_ptr->name, buffer);
		packstr(dump_job_ptr->wckey, buffer);
		packstr(dump_job_ptr->alloc_node, buffer);
		if (!IS_JOB_COMPLETING(dump_job_ptr))
			pack_bit_fmt(dump_job_ptr->node_bitmap, buffer);
		else
			pack_bit_fmt(dump_job_ptr->node_bitmap_cg, buffer);

		detail_ptr = dump_job_ptr->details;
		/* cpu count: prefer the current allocation, fall back to
		 * the request when the job has not yet been allocated */
		if (IS_JOB_COMPLETING(dump_job_ptr) && dump_job_ptr->cpu_cnt)
			pack32(dump_job_ptr->cpu_cnt, buffer);
		else if (dump_job_ptr->total_cpus)
			pack32(dump_job_ptr->total_cpus, buffer);
		else if(detail_ptr)
			pack32(detail_ptr->min_cpus, buffer);
		else
			pack32(dump_job_ptr->cpu_cnt, buffer);

		select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
					     buffer, protocol_version);

		/* A few details are always dumped here */
		_pack_default_job_details(dump_job_ptr, buffer,
					  protocol_version);

		/* other job details are only dumped until the job starts
		 * running (at which time they become meaningless) */
		if (detail_ptr)
			_pack_pending_job_details(detail_ptr, buffer,
						  protocol_version);
		else
			_pack_pending_job_details(NULL, buffer,
						  protocol_version);
	}
}
/* pack default job details for "get_job_info" RPC
 * IN job_ptr - job whose details (possibly NULL) are dumped
 * IN/OUT buffer - destination buffer, offset advanced automatically
 * IN protocol_version - selects the 2.2+ or 2.1 wire layout
 * NOTE: a NULL details pointer is encoded as null strings / zero counts
 * so the field ORDER on the wire is identical either way */
static void _pack_default_job_details(struct job_record *job_ptr,
				      Buf buffer, uint16_t protocol_version)
{
	int i;
	struct job_details *detail_ptr = job_ptr->details;
	char *cmd_line = NULL;
	char *tmp = NULL;
	uint32_t len = 0;

	if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) {
		if (detail_ptr) {
			packstr(detail_ptr->features, buffer);
			packstr(detail_ptr->work_dir, buffer);
			packstr(detail_ptr->dependency, buffer);
			if (detail_ptr->argv) {
				/* Determine size needed for a string
				   containing all arguments: argument bytes
				   plus i chars ((i-1) separating spaces and
				   one NUL terminator) */
				for (i=0; detail_ptr->argv[i]; i++) {
					len += strlen(detail_ptr->argv[i]);
				}
				len += i;

				cmd_line = xmalloc(len*sizeof(char));
				tmp = cmd_line;
				/* join argv with single spaces */
				for (i=0; detail_ptr->argv[i]; i++) {
					if (i != 0) {
						*tmp = ' ';
						tmp++;
					}
					strcpy(tmp,detail_ptr->argv[i]);
					tmp += strlen(detail_ptr->argv[i]);
				}
				packstr(cmd_line, buffer);
				xfree(cmd_line);
			} else
				packnull(buffer);

			/* cpu counts: allocated if available, else request */
			if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) {
				pack32(job_ptr->cpu_cnt, buffer);
				pack32((uint32_t) 0, buffer);
			} else if (job_ptr->total_cpus) {
				pack32(job_ptr->total_cpus, buffer);
				pack32((uint32_t) 0, buffer);
			} else {
				pack32(detail_ptr->min_cpus, buffer);
				if(detail_ptr->max_cpus != NO_VAL)
					pack32(detail_ptr->max_cpus, buffer);
				else
					pack32((uint32_t) 0, buffer);
			}

			/* node counts, same precedence as cpus above */
			if (IS_JOB_COMPLETING(job_ptr) && job_ptr->node_cnt) {
				pack32(job_ptr->node_cnt, buffer);
				pack32((uint32_t) 0, buffer);
			} else if (job_ptr->total_nodes) {
				pack32(job_ptr->total_nodes, buffer);
				pack32((uint32_t) 0, buffer);
			} else {
				pack32(detail_ptr->min_nodes, buffer);
				pack32(detail_ptr->max_nodes, buffer);
			}

			pack16(detail_ptr->requeue, buffer);
		} else {
			/* job has no detail record: pack placeholders in
			 * the same field order */
			packnull(buffer);
			packnull(buffer);
			packnull(buffer);
			packnull(buffer);

			if (job_ptr->total_cpus)
				pack32(job_ptr->total_cpus, buffer);
			else
				pack32(job_ptr->cpu_cnt, buffer);
			pack32((uint32_t) 0, buffer);

			pack32(job_ptr->node_cnt, buffer);
			pack32((uint32_t) 0, buffer);
			pack16((uint16_t) 0, buffer);
		}
	} else if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) {
		/* 2.1 layout: no cpu counts here (the caller packs them) */
		if (detail_ptr) {
			packstr(detail_ptr->features, buffer);
			packstr(detail_ptr->work_dir, buffer);
			packstr(detail_ptr->dependency, buffer);
			if (detail_ptr->argv) {
				/* join argv via xstrcat (reallocates) */
				for (i=0; detail_ptr->argv[i]; i++) {
					if (cmd_line)
						xstrcat(cmd_line, " ");
					xstrcat(cmd_line, detail_ptr->argv[i]);
				}
				packstr(cmd_line, buffer);
				xfree(cmd_line);
			} else
				packnull(buffer);

			if (job_ptr->node_cnt) {
				pack32(job_ptr->node_cnt, buffer);
				pack32((uint32_t) 0, buffer);
			} else {
				pack32(detail_ptr->min_nodes, buffer);
				pack32(detail_ptr->max_nodes, buffer);
			}
			pack16(detail_ptr->requeue, buffer);
		} else {
			packnull(buffer);
			packnull(buffer);
			packnull(buffer);
			packnull(buffer);

			pack32(job_ptr->node_cnt, buffer);
			pack32((uint32_t) 0, buffer);
			pack16((uint16_t) 0, buffer);
		}
	}
}
/* pack pending job details for "get_job_info" RPC
 * IN detail_ptr - job details, or NULL for a job without them; a NULL
 *	record is encoded as zeros/null strings in the same field order
 * IN/OUT buffer - destination buffer, offset advanced automatically
 * IN protocol_version - wire layout selector (single layout since 2.1) */
static void _pack_pending_job_details(struct job_details *detail_ptr,
				      Buf buffer, uint16_t protocol_version)
{
	if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) {
		if (detail_ptr) {
			pack16(detail_ptr->shared, buffer);
			pack16(detail_ptr->contiguous, buffer);
			pack16(detail_ptr->cpus_per_task, buffer);
			pack16(detail_ptr->pn_min_cpus, buffer);

			pack32(detail_ptr->pn_min_memory, buffer);
			pack32(detail_ptr->pn_min_tmp_disk, buffer);

			packstr(detail_ptr->req_nodes, buffer);
			pack_bit_fmt(detail_ptr->req_node_bitmap, buffer);
			/* detail_ptr->req_node_layout is not packed */
			packstr(detail_ptr->exc_nodes, buffer);
			pack_bit_fmt(detail_ptr->exc_node_bitmap, buffer);

			pack_multi_core_data(detail_ptr->mc_ptr, buffer,
					     protocol_version);
		} else {
			/* placeholders preserving field order */
			pack16((uint16_t) 0, buffer);
			pack16((uint16_t) 0, buffer);
			pack16((uint16_t) 0, buffer);
			pack16((uint16_t) 0, buffer);

			pack32((uint32_t) 0, buffer);
			pack32((uint32_t) 0, buffer);

			packnull(buffer);
			packnull(buffer);
			packnull(buffer);
			packnull(buffer);

			pack_multi_core_data(NULL, buffer, protocol_version);
		}
	}
}
/*
 * purge_old_job - purge old job records.
 *	The jobs must have completed at least MIN_JOB_AGE minutes ago.
 *	Test job dependencies, handle after_ok, after_not_ok before
 *	purging any jobs.
 * NOTE: READ lock slurmctld config and WRITE lock jobs before entry
 */
void purge_old_job(void)
{
	ListIterator iter;
	struct job_record *job_ptr;
	time_t now = time(NULL);
	int purge_cnt;

	/* Pass 1: cancel pending jobs whose dependencies can never be
	 * satisfied (test_job_dependency() == 2) */
	iter = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(iter))) {
		if (!IS_JOB_PENDING(job_ptr))
			continue;
		if (test_job_dependency(job_ptr) != 2)
			continue;
		info("Job dependency can't be satisfied, cancelling "
		     "job %u", job_ptr->job_id);
		job_ptr->job_state = JOB_CANCELLED;
		xfree(job_ptr->state_desc);
		job_ptr->start_time = now;
		job_ptr->end_time = now;
		job_completion_logger(job_ptr, false);
		last_job_update = now;
		srun_allocate_abort(job_ptr);
	}
	list_iterator_destroy(iter);

	/* Pass 2: delete every record that _list_find_job_old() deems
	 * old enough to purge */
	purge_cnt = list_delete_all(job_list, &_list_find_job_old, "");
	if (purge_cnt) {
		debug2("purge_old_job: purged %d old job records", purge_cnt);
		/* last_job_update = now;  don't worry about state save */
	}
}
/*
 * _purge_job_record - remove every job record matching the given job id
 * IN job_id - job_id of job record to be purged
 * RET int - number of records deleted (0 or 1 in practice)
 * global: job_list - global job table
 */
static int _purge_job_record(uint32_t job_id)
{
	uint32_t purge_id = job_id;

	return list_delete_all(job_list, _list_find_job_id,
			       (void *) &purge_id);
}
/*
 * reset_job_bitmaps - reestablish bitmaps for existing jobs.
 *	this should be called after rebuilding node information,
 *	but before using any job entries.
 * global: last_job_update - time of last job table update
 *	job_list - pointer to global job list
 * Jobs whose partition or node information can no longer be resolved are
 * transitioned to JOB_NODE_FAIL.
 */
void reset_job_bitmaps(void)
{
	ListIterator job_iterator;
	struct job_record *job_ptr;
	struct part_record *part_ptr;
	List part_ptr_list = NULL;
	bool job_fail = false;
	time_t now = time(NULL);
	bool gang_flag = false;
	static uint32_t cr_flag = NO_VAL;

	xassert(job_list);

	/* Determine once whether the select plugin does consumable
	 * resource (socket/core) tracking */
	if (cr_flag == NO_VAL) {
		cr_flag = 0;  /* call is no-op for select/linear and bluegene */
		if (select_g_get_info_from_plugin(SELECT_CR_PLUGIN,
						  NULL, &cr_flag)) {
			cr_flag = NO_VAL;	/* error */
		}
	}
	if (slurm_get_preempt_mode() == PREEMPT_MODE_GANG)
		gang_flag = true;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		xassert (job_ptr->magic == JOB_MAGIC);
		job_fail = false;

		/* Re-resolve the partition name(s) to partition records */
		if (job_ptr->partition == NULL) {
			error("No partition for job_id %u", job_ptr->job_id);
			part_ptr = NULL;
			job_fail = true;
		} else {
			part_ptr = find_part_record(job_ptr->partition);
			if (part_ptr == NULL) {
				/* may be a comma-separated partition list */
				part_ptr_list = get_part_list(job_ptr->
							      partition);
				if (part_ptr_list)
					part_ptr = list_peek(part_ptr_list);
			}
			if (part_ptr == NULL) {
				error("Invalid partition (%s) for job %u",
				      job_ptr->partition, job_ptr->job_id);
				job_fail = true;
			}
		}
		job_ptr->part_ptr = part_ptr;
		FREE_NULL_LIST(job_ptr->part_ptr_list);
		if (part_ptr_list) {
			job_ptr->part_ptr_list = part_ptr_list;
			part_ptr_list = NULL;	/* clear for next job */
		}

		/* Rebuild the completing-nodes bitmap from its node list */
		FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
		if (job_ptr->nodes_completing &&
		    node_name2bitmap(job_ptr->nodes_completing,
				     false,  &job_ptr->node_bitmap_cg)) {
			error("Invalid nodes (%s) for job_id %u",
			      job_ptr->nodes_completing,
			      job_ptr->job_id);
			job_fail = true;
		}
		/* Rebuild the allocated-nodes bitmap from its node list */
		FREE_NULL_BITMAP(job_ptr->node_bitmap);
		if (job_ptr->nodes &&
		    node_name2bitmap(job_ptr->nodes, false,
				     &job_ptr->node_bitmap) && !job_fail) {
			error("Invalid nodes (%s) for job_id %u",
			      job_ptr->nodes, job_ptr->job_id);
			job_fail = true;
		}
		if (reset_node_bitmap(job_ptr->job_resrcs, job_ptr->job_id))
			job_fail = true;
		/* With consumable resources or gang scheduling, a changed
		 * socket/core layout invalidates the allocation */
		if (!job_fail && !IS_JOB_FINISHED(job_ptr) &&
		    job_ptr->job_resrcs && (cr_flag || gang_flag) &&
		    valid_job_resources(job_ptr->job_resrcs,
					node_record_table_ptr,
					slurmctld_conf.fast_schedule)) {
			error("Aborting JobID %u due to change in socket/core "
			      "configuration of allocated nodes",
			      job_ptr->job_id);
			job_fail = true;
		}
		_reset_step_bitmaps(job_ptr);

		build_node_details(job_ptr);	/* set node_addr */

		if (_reset_detail_bitmaps(job_ptr))
			job_fail = true;

		if (job_fail) {
			/* Fail the job in a manner appropriate to its
			 * current state */
			if (IS_JOB_PENDING(job_ptr)) {
				job_ptr->start_time =
					job_ptr->end_time = time(NULL);
				job_ptr->job_state = JOB_NODE_FAIL;
			} else if (IS_JOB_RUNNING(job_ptr)) {
				job_ptr->end_time = time(NULL);
				job_ptr->job_state = JOB_NODE_FAIL |
						     JOB_COMPLETING;
				build_cg_bitmap(job_ptr);
			} else if (IS_JOB_SUSPENDED(job_ptr)) {
				job_ptr->end_time = job_ptr->suspend_time;
				job_ptr->job_state = JOB_NODE_FAIL |
						     JOB_COMPLETING;
				build_cg_bitmap(job_ptr);
				job_ptr->tot_sus_time +=
					difftime(now, job_ptr->suspend_time);
				jobacct_storage_g_job_suspend(acct_db_conn,
							      job_ptr);
			}
			job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
			job_ptr->state_reason = FAIL_DOWN_NODE;
			xfree(job_ptr->state_desc);
			job_completion_logger(job_ptr, false);
		}
	}

	list_iterator_reset(job_iterator);
	/* This will reinitialize the select plugin database, which
	 * we can only do after ALL job's states and bitmaps are set
	 * (i.e. it needs to be in this second loop) */
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) {
			error("select_g_select_nodeinfo_set(%u): %m",
			      job_ptr->job_id);
		}
	}
	list_iterator_destroy(job_iterator);

	last_job_update = now;
}
static int _reset_detail_bitmaps(struct job_record *job_ptr)
{
if (job_ptr->details == NULL)
return SLURM_SUCCESS;
FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
xfree(job_ptr->details->req_node_layout); /* layout info is lost
* but should be re-generated
* at job start time */
if ((job_ptr->details->req_nodes) &&
(node_name2bitmap(job_ptr->details->req_nodes, false,
&job_ptr->details->req_node_bitmap))) {
error("Invalid req_nodes (%s) for job_id %u",
job_ptr->details->req_nodes, job_ptr->job_id);
return SLURM_ERROR;
}
FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
if ((job_ptr->details->exc_nodes) &&
(node_name2bitmap(job_ptr->details->exc_nodes, true,
&job_ptr->details->exc_node_bitmap))) {
error("Invalid exc_nodes (%s) for job_id %u",
job_ptr->details->exc_nodes, job_ptr->job_id);
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
/* Rebuild the node bitmap of every step of a job from the step's node
 * list; steps whose node list can no longer be resolved (or which have
 * no node list at all, batch step excepted) are deleted. */
static void _reset_step_bitmaps(struct job_record *job_ptr)
{
	ListIterator step_iterator;
	struct step_record *step_ptr;

	step_iterator = list_iterator_create (job_ptr->step_list);
	while ((step_ptr = (struct step_record *) list_next (step_iterator))) {
		FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
		if (step_ptr->step_layout &&
		    step_ptr->step_layout->node_list &&
		    (node_name2bitmap(step_ptr->step_layout->node_list, false,
				      &step_ptr->step_node_bitmap))) {
			error("Invalid step_node_list (%s) for step_id %u.%u",
			      step_ptr->step_layout->node_list,
			      job_ptr->job_id, step_ptr->step_id);
			delete_step_record (job_ptr, step_ptr->step_id);
			/* step_ptr now points at freed memory; the original
			 * code fell through and read step_ptr below (use
			 * after free, possible double delete) */
			continue;
		}
		if ((step_ptr->step_node_bitmap == NULL) &&
		    (step_ptr->batch_step == 0)) {
			error("Missing node_list for step_id %u.%u",
			      job_ptr->job_id, step_ptr->step_id);
			delete_step_record (job_ptr, step_ptr->step_id);
		}
	}
	list_iterator_destroy (step_iterator);
	return;
}
/* update first assigned job id as needed on reconfigure; the sequence is
 * never moved backwards, only raised to the configured FirstJobId
 * NOTE: READ lock_slurmctld config before entry */
void reset_first_job_id(void)
{
	if (job_id_sequence < slurmctld_conf.first_job_id)
		job_id_sequence = slurmctld_conf.first_job_id;
}
/*
 * get_next_job_id - return the job_id to be used by default for
 *	the next job; wraps back to FirstJobId at MaxJobId
 */
extern uint32_t get_next_job_id(void)
{
	uint32_t candidate_id;

	job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);
	candidate_id = job_id_sequence + 1;
	if (candidate_id >= slurmctld_conf.max_job_id)
		candidate_id = slurmctld_conf.first_job_id;
	return candidate_id;
}
/*
 * _set_job_id - set a default job_id, insure that it is unique
 * IN job_ptr - pointer to the job_record
 * Scans at most 1000 candidate ids (wrapping at MaxJobId) and calls
 * fatal() if none is free.
 */
static void _set_job_id(struct job_record *job_ptr)
{
	int i;
	uint32_t new_id;

	job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);
	xassert(job_ptr);
	xassert (job_ptr->magic == JOB_MAGIC);
	if ((job_ptr->partition == NULL)
	    || (strlen(job_ptr->partition) == 0))
		fatal("_set_job_id: partition not set");

	/* Insure no conflict in job id if we roll over 32 bits */
	for (i = 0; i < 1000; i++) {
		if (++job_id_sequence >= slurmctld_conf.max_job_id)
			job_id_sequence = slurmctld_conf.first_job_id;
		new_id = job_id_sequence;
		if (find_job_record(new_id) == NULL) {
			job_ptr->job_id = new_id;
			return;
		}
	}
	/* NOTE: original message lacked the space between sentences,
	 * printing "...values.FirstJobId=..." */
	fatal("We have exhausted our supply of valid job id values. "
	      "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id,
	      slurmctld_conf.max_job_id);
}
/*
 * _set_job_prio - set a default job priority
 * IN job_ptr - pointer to the job_record
 * NOTE: this is a simple prototype, we need to re-establish value on restart
 */
static void _set_job_prio(struct job_record *job_ptr)
{
	xassert(job_ptr);
	xassert (job_ptr->magic == JOB_MAGIC);

	if (IS_JOB_FINISHED(job_ptr))
		return;	/* priority is meaningless for a finished job */

	job_ptr->priority = slurm_sched_initial_priority(lowest_prio,
							 job_ptr);

	/* Track the lowest "normal" priority handed out; skip held jobs
	 * (priority <= 1), administratively set priorities, and jobs
	 * submitted with a non-default nice value */
	if ((job_ptr->priority > 1) &&
	    (!job_ptr->direct_set_prio) &&
	    (!(job_ptr->details &&
	       (job_ptr->details->nice != NICE_OFFSET))))
		lowest_prio = MIN(job_ptr->priority, lowest_prio);
}
/* After recovering job state, if using priority/basic then we increment the
 * priorities of all jobs to avoid decrementing the base down to zero */
extern void sync_job_priorities(void)
{
	ListIterator iter;
	struct job_record *job_ptr;
	uint32_t prio_boost = 0;

	if ((highest_prio != 0) && (highest_prio < TOP_PRIORITY))
		prio_boost = TOP_PRIORITY - highest_prio;
	/* Only for priority/basic, and only when the base has drifted far
	 * enough (>= 1000000) to be worth re-centering */
	if (strcmp(slurmctld_conf.priority_type, "priority/basic") ||
	    (prio_boost < 1000000))
		return;

	iter = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(iter))) {
		if (job_ptr->priority > 1)
			job_ptr->priority += prio_boost;
	}
	list_iterator_destroy(iter);

	lowest_prio += prio_boost;
}
/* After a node is returned to service, reset the priority of jobs
 * which may have been held due to that node being unavailable */
extern void reset_job_priority(void)
{
	ListIterator iter;
	struct job_record *job_ptr;
	int reset_cnt = 0;

	iter = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(iter))) {
		/* priority == 1 marks a system-held (not finished) job */
		if ((job_ptr->priority != 1) || IS_JOB_FINISHED(job_ptr))
			continue;
		_set_job_prio(job_ptr);
		reset_cnt++;
	}
	list_iterator_destroy(iter);

	if (reset_cnt)
		last_job_update = time(NULL);
}
/*
 * _top_priority - determine if any other job has a higher priority than the
 *	specified job
 * IN job_ptr - pointer to selected job
 * RET true if selected job has highest priority
 * Side effects: when the job is NOT top priority, may update
 *	job_ptr->state_reason (WAIT_HELD / WAIT_PRIORITY) and clear
 *	job_ptr->state_desc.
 * NOTE(review): iterates over the global job_list; presumably the caller
 *	holds the job write lock — confirm at call sites.
 */
static bool _top_priority(struct job_record *job_ptr)
{
	struct job_details *detail_ptr = job_ptr->details;
	bool top;

#ifdef HAVE_BG
	static uint16_t static_part = (uint16_t)NO_VAL;
	int rc = SLURM_SUCCESS;

	/* On BlueGene with static partitioning, we don't want to delay
	 * jobs based upon priority since jobs of different sizes can
	 * execute on different sets of nodes. While sched/backfill would
	 * eventually start the job if delayed here based upon priority,
	 * that could delay the initiation of a job by a few seconds. */
	if(static_part == (uint16_t)NO_VAL) {
		/* Since this never changes we can just set it once
		   and not look at it again. */
		rc = select_g_get_info_from_plugin(SELECT_STATIC_PART, job_ptr,
						   &static_part);
	}
	if ((rc == SLURM_SUCCESS) && (static_part == 1))
		return true;
#endif

	if (job_ptr->priority == 0)	/* user held */
		top = false;
	else {
		ListIterator job_iterator;
		struct job_record *job_ptr2;

		top = true;	/* assume top priority until found otherwise */
		job_iterator = list_iterator_create(job_list);
		while ((job_ptr2 = (struct job_record *)
			list_next(job_iterator))) {
			if (job_ptr2 == job_ptr)
				continue;	/* self, not a competitor */
			if (!IS_JOB_PENDING(job_ptr2))
				continue;	/* only pending jobs compete */
			if (IS_JOB_COMPLETING(job_ptr2)) {
				/* Job is hung in pending & completing state,
				 * indicative of job requeue */
				continue;
			}
			if (!job_independent(job_ptr2, 0))
				continue;	/* not yet runnable (e.g.
						 * dependency) — can't block */
			/* Jobs where exactly one of the pair uses a
			 * reservation draw from disjoint resources */
			if ((job_ptr2->resv_name && (!job_ptr->resv_name)) ||
			    ((!job_ptr2->resv_name) && job_ptr->resv_name))
				continue;	/* different reservation */
			if (job_ptr2->resv_name && job_ptr->resv_name &&
			    (!strcmp(job_ptr2->resv_name,
				     job_ptr->resv_name))) {
				/* same reservation */
				if (job_ptr2->priority <= job_ptr->priority)
					continue;
				top = false;
				break;
			}
			if (job_ptr2->part_ptr == job_ptr->part_ptr) {
				/* same partition */
				if (job_ptr2->priority <= job_ptr->priority)
					continue;
				top = false;
				break;
			}
			if (bit_overlap(job_ptr->part_ptr->node_bitmap,
					job_ptr2->part_ptr->node_bitmap) == 0)
				continue;	/* no node overlap in partitions */
			/* Overlapping partitions: compare partition
			 * priority first, then job priority as tiebreak */
			if ((job_ptr2->part_ptr->priority >
			     job_ptr ->part_ptr->priority) ||
			    ((job_ptr2->part_ptr->priority ==
			      job_ptr ->part_ptr->priority) &&
			     (job_ptr2->priority > job_ptr->priority))) {
				top = false;
				break;
			}
		}
		list_iterator_destroy(job_iterator);
	}

	if ((!top) && detail_ptr) {	/* not top prio */
		if (job_ptr->priority == 0) {	/* user/admin hold */
			/* Preserve an existing WAIT_HELD_USER reason */
			if ((job_ptr->state_reason != WAIT_HELD) &&
			    (job_ptr->state_reason != WAIT_HELD_USER)) {
				job_ptr->state_reason = WAIT_HELD;
				xfree(job_ptr->state_desc);
			}
		} else if (job_ptr->priority != 1) {	/* not system hold */
			job_ptr->state_reason = WAIT_PRIORITY;
			xfree(job_ptr->state_desc);
		}
	}
	return top;
}
/* Fold the licenses of a shrinking job into the job being expanded.
 * Ownership of the license string and list moves to expand_job_ptr;
 * the shrinking job is left with no license information. */
static void _merge_job_licenses(struct job_record *shrink_job_ptr,
				struct job_record *expand_job_ptr)
{
	xassert(shrink_job_ptr);
	xassert(expand_job_ptr);

	/* Shrinking job holds no licenses: nothing to merge */
	if (shrink_job_ptr->licenses == NULL)
		return;

	if (expand_job_ptr->licenses == NULL) {
		/* Expanding job had none: take over string and list
		 * wholesale, no re-parse needed */
		expand_job_ptr->licenses = shrink_job_ptr->licenses;
		shrink_job_ptr->licenses = NULL;
		FREE_NULL_LIST(expand_job_ptr->license_list);
		expand_job_ptr->license_list = shrink_job_ptr->license_list;
		shrink_job_ptr->license_list = NULL;
		return;
	}

	/* Both jobs hold licenses: concatenate the specification strings
	 * and rebuild the expanding job's license list from the result */
	xstrcat(expand_job_ptr->licenses, ",");
	xstrcat(expand_job_ptr->licenses, shrink_job_ptr->licenses);
	xfree(shrink_job_ptr->licenses);
	FREE_NULL_LIST(shrink_job_ptr->license_list);
	FREE_NULL_LIST(expand_job_ptr->license_list);
	license_job_merge(expand_job_ptr);
}
/*
* update_job - update a job's parameters per the supplied specifications
* IN job_specs - a job's specification
* IN uid - uid of user issuing RPC
* RET returns an error code from slurm_errno.h
* global: job_list - global list of job entries
* last_job_update - time of last job table update
*/
int update_job(job_desc_msg_t * job_specs, uid_t uid)
{
int error_code = SLURM_SUCCESS;
enum job_state_reason fail_reason;
bool authorized = false;
uint32_t save_min_nodes = 0, save_max_nodes = 0;
uint32_t save_min_cpus = 0, save_max_cpus = 0;
struct job_record *job_ptr;
struct job_details *detail_ptr;
struct part_record *tmp_part_ptr;
bitstr_t *exc_bitmap = NULL, *req_bitmap = NULL;
time_t now = time(NULL);
multi_core_data_t *mc_ptr = NULL;
bool update_accounting = false;
uint16_t limit_set_max_cpus = 0;
uint16_t limit_set_max_nodes = 0;
uint16_t limit_set_min_cpus = 0;
uint16_t limit_set_min_nodes = 0;
uint16_t limit_set_time = 0;
#ifdef HAVE_BG
uint16_t conn_type = (uint16_t) NO_VAL;
uint16_t reboot = (uint16_t) NO_VAL;
uint16_t rotate = (uint16_t) NO_VAL;
uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL};
char *image = NULL;
static uint32_t cpus_per_mp = 0;
static uint16_t cpus_per_node = 0;
if (!cpus_per_mp)
select_g_alter_node_cnt(SELECT_GET_MP_CPU_CNT, &cpus_per_mp);
if (!cpus_per_node)
select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT,
&cpus_per_node);
#endif
/* Make sure anything that may be put in the database will be
lower case */
xstrtolower(job_specs->account);
xstrtolower(job_specs->wckey);
job_ptr = find_job_record(job_specs->job_id);
if (job_ptr == NULL) {
error("update_job: job_id %u does not exist.",
job_specs->job_id);
return ESLURM_INVALID_JOB_ID;
}
error_code = job_submit_plugin_modify(job_specs, job_ptr,
(uint32_t) uid);
if (error_code != SLURM_SUCCESS)
return error_code;
authorized = validate_operator(uid) || assoc_mgr_is_user_acct_coord(
acct_db_conn, uid, job_ptr->account);
if ((job_ptr->user_id != uid) && !authorized) {
error("Security violation, JOB_UPDATE RPC from uid %d",
uid);
return ESLURM_USER_ID_MISSING;
}
if (!wiki_sched_test) {
char *sched_type = slurm_get_sched_type();
if (strcmp(sched_type, "sched/wiki") == 0)
wiki_sched = true;
if (strcmp(sched_type, "sched/wiki2") == 0) {
wiki_sched = true;
wiki2_sched = true;
}
xfree(sched_type);
wiki_sched_test = true;
}
detail_ptr = job_ptr->details;
if (detail_ptr)
mc_ptr = detail_ptr->mc_ptr;
last_job_update = now;
if (job_specs->account) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else {
int rc = update_job_account("update_job", job_ptr,
job_specs->account);
if (rc != SLURM_SUCCESS)
error_code = rc;
else
update_accounting = true;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->exc_nodes) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (job_specs->exc_nodes[0] == '\0') {
xfree(detail_ptr->exc_nodes);
FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
} else {
if (node_name2bitmap(job_specs->exc_nodes, false,
&exc_bitmap)) {
error("sched: Invalid node list for "
"job_update: %s",job_specs->exc_nodes);
FREE_NULL_BITMAP(exc_bitmap);
error_code = ESLURM_INVALID_NODE_NAME;
}
if (exc_bitmap) {
xfree(detail_ptr->exc_nodes);
detail_ptr->exc_nodes =
job_specs->exc_nodes;
FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
detail_ptr->exc_node_bitmap = exc_bitmap;
info("sched: update_job: setting exc_nodes to "
"%s for job_id %u", job_specs->exc_nodes,
job_specs->job_id);
job_specs->exc_nodes = NULL;
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
#ifndef HAVE_BG
if (job_specs->req_nodes &&
(IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
/* Use req_nodes to change the nodes associated with a running
* for lack of other field in the job request to use */
if ((job_specs->req_nodes[0] == '\0') ||
node_name2bitmap(job_specs->req_nodes,false, &req_bitmap) ||
!bit_super_set(req_bitmap, job_ptr->node_bitmap) ||
job_ptr->details->expanding_jobid) {
info("sched: Invalid node list (%s) for job %u update",
job_specs->req_nodes, job_specs->job_id);
error_code = ESLURM_INVALID_NODE_NAME;
goto fini;
} else if (req_bitmap) {
int i, i_first, i_last;
struct node_record *node_ptr;
info("sched: update_job: setting nodes to %s for "
"job_id %u",
job_specs->req_nodes, job_specs->job_id);
job_pre_resize_acctg(job_ptr);
i_first = bit_ffs(job_ptr->node_bitmap);
i_last = bit_fls(job_ptr->node_bitmap);
for (i=i_first; i<=i_last; i++) {
if (bit_test(req_bitmap, i) ||
!bit_test(job_ptr->node_bitmap, i))
continue;
node_ptr = node_record_table_ptr + i;
kill_step_on_node(job_ptr, node_ptr, false);
excise_node_from_job(job_ptr, node_ptr);
}
job_post_resize_acctg(job_ptr);
/* Since job_post_resize_acctg will restart
* things, don't do it again. */
update_accounting = false;
} else {
update_accounting = true;
}
FREE_NULL_BITMAP(req_bitmap);
xfree(job_specs->req_nodes);
}
#endif
if (job_specs->req_nodes) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (job_specs->req_nodes[0] == '\0') {
xfree(detail_ptr->req_nodes);
FREE_NULL_BITMAP(detail_ptr->req_node_bitmap);
xfree(detail_ptr->req_node_layout);
} else {
if (node_name2bitmap(job_specs->req_nodes, false,
&req_bitmap)) {
info("sched: Invalid node list for "
"job_update: %s", job_specs->req_nodes);
FREE_NULL_BITMAP(req_bitmap);
error_code = ESLURM_INVALID_NODE_NAME;
}
if (req_bitmap) {
xfree(detail_ptr->req_nodes);
detail_ptr->req_nodes =
job_specs->req_nodes;
FREE_NULL_BITMAP(detail_ptr->req_node_bitmap);
xfree(detail_ptr->req_node_layout);
detail_ptr->req_node_bitmap = req_bitmap;
info("sched: update_job: setting req_nodes to "
"%s for job_id %u", job_specs->req_nodes,
job_specs->job_id);
job_specs->req_nodes = NULL;
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->min_nodes == INFINITE) {
/* Used by scontrol just to get current configuration info */
job_specs->min_nodes = NO_VAL;
}
#if defined(HAVE_BG) || defined(HAVE_CRAY)
if ((job_specs->min_nodes != NO_VAL) &&
(IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
#else
if ((job_specs->min_nodes != NO_VAL) &&
(job_specs->min_nodes > job_ptr->node_cnt) &&
!select_g_job_expand_allow() &&
(IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
#endif
info("Change of size for job %u not supported",
job_specs->job_id);
error_code = ESLURM_NOT_SUPPORTED;
goto fini;
}
if (job_specs->req_switch != NO_VAL) {
job_ptr->req_switch = job_specs->req_switch;
info("Change of switches to %u job %u",
job_specs->req_switch, job_specs->job_id);
}
if (job_specs->wait4switch != NO_VAL) {
job_ptr->wait4switch = _max_switch_wait(job_specs->wait4switch);
info("Change of switch wait to %u secs job %u",
job_ptr->wait4switch, job_specs->job_id);
}
if (job_specs->partition) {
List part_ptr_list = NULL;
if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_DISABLED;
goto fini;
}
if (job_specs->min_nodes == NO_VAL)
job_specs->min_nodes = detail_ptr->min_nodes;
if ((job_specs->max_nodes == NO_VAL) &&
(detail_ptr->max_nodes != 0))
job_specs->max_nodes = detail_ptr->max_nodes;
if ((job_specs->time_min == NO_VAL) &&
(job_ptr->time_min != 0))
job_specs->time_min = job_ptr->time_min;
if (job_specs->time_limit == NO_VAL)
job_specs->time_limit = job_ptr->time_limit;
error_code = _valid_job_part(job_specs, uid,
job_ptr->details->req_node_bitmap,
&tmp_part_ptr, &part_ptr_list);
if (error_code != SLURM_SUCCESS)
;
else if ((tmp_part_ptr->state_up & PARTITION_SUBMIT) == 0)
error_code = ESLURM_PARTITION_NOT_AVAIL;
else {
slurmdb_association_rec_t assoc_rec;
memset(&assoc_rec, 0,
sizeof(slurmdb_association_rec_t));
assoc_rec.uid = job_ptr->user_id;
assoc_rec.partition = tmp_part_ptr->name;
assoc_rec.acct = job_ptr->account;
if (assoc_mgr_fill_in_assoc(
acct_db_conn, &assoc_rec,
accounting_enforce,
(slurmdb_association_rec_t **)
&job_ptr->assoc_ptr)) {
info("job_update: invalid account %s "
"for job %u",
job_specs->account, job_ptr->job_id);
error_code = ESLURM_INVALID_ACCOUNT;
/* Let update proceed. Note there is an invalid
* association ID for accounting purposes */
} else
job_ptr->assoc_id = assoc_rec.id;
xfree(job_ptr->partition);
job_ptr->partition = xstrdup(job_specs->partition);
job_ptr->part_ptr = tmp_part_ptr;
FREE_NULL_LIST(job_ptr->part_ptr_list);
job_ptr->part_ptr_list = part_ptr_list;
part_ptr_list = NULL; /* nothing to free */
info("update_job: setting partition to %s for "
"job_id %u", job_specs->partition,
job_specs->job_id);
update_accounting = true;
}
FREE_NULL_LIST(part_ptr_list); /* error clean-up */
if (error_code != SLURM_SUCCESS)
goto fini;
}
/* Always do this last just in case the assoc_ptr changed */
if (job_specs->comment && wiki_sched && !validate_slurm_user(uid)) {
/* User must use Moab command to change job comment */
error("Attempt to change comment for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
} else if (job_specs->comment) {
xfree(job_ptr->comment);
job_ptr->comment = job_specs->comment;
job_specs->comment = NULL; /* Nothing left to free */
info("update_job: setting comment to %s for job_id %u",
job_ptr->comment, job_specs->job_id);
if (wiki_sched && strstr(job_ptr->comment, "QOS:")) {
slurmdb_qos_rec_t qos_rec;
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else {
memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t));
if (strstr(job_ptr->comment,
"FLAGS:PREEMPTOR"))
qos_rec.name = "expedite";
else if (strstr(job_ptr->comment,
"FLAGS:PREEMPTEE"))
qos_rec.name = "standby";
job_ptr->qos_ptr = _determine_and_validate_qos(
job_ptr->assoc_ptr, &qos_rec,
&error_code);
if (error_code == SLURM_SUCCESS) {
job_ptr->qos_id = qos_rec.id;
update_accounting = true;
}
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->qos) {
slurmdb_qos_rec_t qos_rec;
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else {
info("update_job: setting qos to %s for job_id %u",
job_specs->qos, job_specs->job_id);
memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t));
qos_rec.name = job_specs->qos;
job_ptr->qos_ptr = _determine_and_validate_qos(
job_ptr->assoc_ptr, &qos_rec, &error_code);
if (error_code == SLURM_SUCCESS) {
job_ptr->qos_id = qos_rec.id;
update_accounting = true;
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (!authorized && (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) {
if (!acct_policy_validate(job_specs, job_ptr->part_ptr,
job_ptr->assoc_ptr, job_ptr->qos_ptr,
&limit_set_max_cpus,
&limit_set_max_nodes,
&limit_set_time, 1)) {
info("update_job: exceeded association's cpu, node or "
"time limit for user %u", job_specs->user_id);
error_code = ESLURM_ACCOUNTING_POLICY;
goto fini;
}
/* Perhaps the limit was removed, so we will remove it
since it was imposed previously.
*/
if (!limit_set_max_cpus && (job_ptr->limit_set_max_cpus == 1))
job_ptr->details->max_cpus = NO_VAL;
if (!limit_set_max_nodes && (job_ptr->limit_set_max_nodes == 1))
job_ptr->details->max_nodes = NO_VAL;
if (!limit_set_time && (job_ptr->limit_set_time == 1))
job_ptr->time_limit = NO_VAL;
if (job_ptr->limit_set_max_cpus != ADMIN_SET_LIMIT)
job_ptr->limit_set_max_cpus = limit_set_max_cpus;
if (job_ptr->limit_set_max_nodes != ADMIN_SET_LIMIT)
job_ptr->limit_set_max_nodes = limit_set_max_nodes;
if (job_ptr->limit_set_time != ADMIN_SET_LIMIT)
job_ptr->limit_set_time = limit_set_time;
} else if (authorized) {
limit_set_max_cpus = ADMIN_SET_LIMIT;
limit_set_max_nodes = ADMIN_SET_LIMIT;
limit_set_min_cpus = ADMIN_SET_LIMIT;
limit_set_min_nodes = ADMIN_SET_LIMIT;
limit_set_time = ADMIN_SET_LIMIT;
}
/* This needs to be done after the association acct policy check since
* it looks at unaltered nodes for bluegene systems
*/
debug3("update before alteration asking for nodes %u-%u cpus %u-%u",
job_specs->min_nodes, job_specs->max_nodes,
job_specs->min_cpus, job_specs->max_cpus);
select_g_alter_node_cnt(SELECT_SET_NODE_CNT, job_specs);
debug3("update after alteration asking for nodes %u-%u cpus %u-%u",
job_specs->min_nodes, job_specs->max_nodes,
job_specs->min_cpus, job_specs->max_cpus);
/* Reset min and max cpu counts as needed, insure consistency */
if (job_specs->min_cpus != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (job_specs->min_cpus < 1)
error_code = ESLURM_INVALID_CPU_COUNT;
else {
save_min_cpus = detail_ptr->min_cpus;
detail_ptr->min_cpus = job_specs->min_cpus;
}
}
if (job_specs->max_cpus != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else {
save_max_cpus = detail_ptr->max_cpus;
detail_ptr->max_cpus = job_specs->max_cpus;
}
}
if ((save_min_cpus || save_max_cpus) && detail_ptr->max_cpus &&
(detail_ptr->max_cpus < detail_ptr->min_cpus)) {
error_code = ESLURM_INVALID_CPU_COUNT;
if (save_min_cpus) {
detail_ptr->min_cpus = save_min_cpus;
save_min_cpus = 0;
}
if (save_max_cpus) {
detail_ptr->max_cpus = save_max_cpus;
save_max_cpus = 0;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (save_min_cpus && (detail_ptr->min_cpus != save_min_cpus)) {
#ifdef HAVE_BG
uint32_t node_cnt = detail_ptr->min_cpus;
if (cpus_per_node)
node_cnt /= cpus_per_node;
/* Ensure that accounting is set up correctly */
select_g_select_jobinfo_set(job_ptr->select_jobinfo,
SELECT_JOBDATA_NODE_CNT,
&node_cnt);
/* Reset geo since changing this makes any geo
* potentially invalid */
select_g_select_jobinfo_set(job_ptr->select_jobinfo,
SELECT_JOBDATA_GEOMETRY,
geometry);
#endif
info("update_job: setting min_cpus from "
"%u to %u for job_id %u",
save_min_cpus, detail_ptr->min_cpus, job_specs->job_id);
job_ptr->limit_set_min_cpus = limit_set_min_cpus;
update_accounting = true;
}
if (save_max_cpus && (detail_ptr->max_cpus != save_max_cpus)) {
info("update_job: setting max_cpus from "
"%u to %u for job_id %u",
save_max_cpus, detail_ptr->max_cpus, job_specs->job_id);
/* Always use the limit_set_* since if set by a
* super user it be set correctly */
job_ptr->limit_set_max_cpus = limit_set_max_cpus;
update_accounting = true;
}
if ((job_specs->pn_min_cpus != (uint16_t) NO_VAL) &&
(job_specs->pn_min_cpus != 0)) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (authorized
|| (detail_ptr->pn_min_cpus
> job_specs->pn_min_cpus)) {
detail_ptr->pn_min_cpus = job_specs->pn_min_cpus;
info("update_job: setting pn_min_cpus to %u for "
"job_id %u", job_specs->pn_min_cpus,
job_specs->job_id);
} else {
error("Attempt to increase pn_min_cpus for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->num_tasks != NO_VAL) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else if (job_specs->num_tasks < 1)
error_code = ESLURM_BAD_TASK_COUNT;
else {
#ifdef HAVE_BG
uint32_t node_cnt = job_specs->num_tasks;
if (cpus_per_node)
node_cnt /= cpus_per_node;
/* This is only set up so accounting is set up
correctly */
select_g_select_jobinfo_set(job_ptr->select_jobinfo,
SELECT_JOBDATA_NODE_CNT,
&node_cnt);
#endif
detail_ptr->num_tasks = job_specs->num_tasks;
info("update_job: setting num_tasks to %u for "
"job_id %u", job_specs->num_tasks,
job_specs->job_id);
if (detail_ptr->cpus_per_task) {
uint32_t new_cpus = detail_ptr->num_tasks
/ detail_ptr->cpus_per_task;
if ((new_cpus < detail_ptr->min_cpus) ||
(!detail_ptr->overcommit &&
(new_cpus > detail_ptr->min_cpus))) {
detail_ptr->min_cpus = new_cpus;
detail_ptr->max_cpus = new_cpus;
info("update_job: setting "
"min_cpus to %u for "
"job_id %u", detail_ptr->min_cpus,
job_specs->job_id);
/* Always use the limit_set_*
* since if set by a
* super user it be set correctly */
job_ptr->limit_set_min_cpus =
limit_set_min_cpus;
job_ptr->limit_set_max_cpus =
limit_set_max_cpus;
}
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
/* Reset min and max node counts as needed, insure consistency */
if (job_specs->min_nodes != NO_VAL) {
if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
; /* shrink running job, processed later */
else if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (job_specs->min_nodes < 1) {
info("update_job: min_nodes < 1 for job %u",
job_specs->job_id);
error_code = ESLURM_INVALID_NODE_COUNT;
} else {
/* Resize of pending job */
save_min_nodes = detail_ptr->min_nodes;
detail_ptr->min_nodes = job_specs->min_nodes;
}
}
if (job_specs->max_nodes != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else {
save_max_nodes = detail_ptr->max_nodes;
detail_ptr->max_nodes = job_specs->max_nodes;
}
}
if ((save_min_nodes || save_max_nodes) && detail_ptr->max_nodes &&
(detail_ptr->max_nodes < detail_ptr->min_nodes)) {
info("update_job: max_nodes < min_nodes (%u < %u) for job %u",
detail_ptr->max_nodes, detail_ptr->min_nodes,
job_specs->job_id);
error_code = ESLURM_INVALID_NODE_COUNT;
if (save_min_nodes) {
detail_ptr->min_nodes = save_min_nodes;
save_min_nodes = 0;
}
if (save_max_nodes) {
detail_ptr->max_nodes = save_max_nodes;
save_max_nodes = 0;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (save_min_nodes && (save_min_nodes!= detail_ptr->min_nodes)) {
info("update_job: setting min_nodes from "
"%u to %u for job_id %u",
save_min_nodes, detail_ptr->min_nodes, job_specs->job_id);
job_ptr->limit_set_min_nodes = limit_set_min_nodes;
update_accounting = true;
}
if (save_max_nodes && (save_max_nodes != detail_ptr->max_nodes)) {
info("update_job: setting max_nodes from "
"%u to %u for job_id %u",
save_max_nodes, detail_ptr->max_nodes, job_specs->job_id);
/* Always use the limit_set_* since if set by a
* super user it be set correctly */
job_ptr->limit_set_max_nodes = limit_set_max_nodes;
update_accounting = true;
}
if (job_specs->time_limit != NO_VAL) {
if (IS_JOB_FINISHED(job_ptr) || job_ptr->preempt_time)
error_code = ESLURM_DISABLED;
else if (job_ptr->time_limit == job_specs->time_limit) {
debug("sched: update_job: new time limit identical to "
"old time limit %u", job_specs->job_id);
} else if (authorized ||
(job_ptr->time_limit > job_specs->time_limit)) {
time_t old_time = job_ptr->time_limit;
if (old_time == INFINITE) /* one year in mins */
old_time = (365 * 24 * 60);
job_ptr->time_limit = job_specs->time_limit;
if (IS_JOB_RUNNING(job_ptr) ||
IS_JOB_SUSPENDED(job_ptr)) {
if (job_ptr->time_limit == INFINITE) {
/* Set end time in one year */
job_ptr->end_time = now +
(365 * 24 * 60 * 60);
} else {
/* Update end_time based upon change
* to preserve suspend time info */
job_ptr->end_time = job_ptr->end_time +
((job_ptr->time_limit -
old_time) * 60);
}
if (job_ptr->end_time < now)
job_ptr->end_time = now;
if (IS_JOB_RUNNING(job_ptr) &&
(list_is_empty(job_ptr->step_list) == 0)) {
_xmit_new_end_time(job_ptr);
}
}
info("sched: update_job: setting time_limit to %u for "
"job_id %u", job_specs->time_limit,
job_specs->job_id);
/* Always use the limit_set_* since if set by a
* super user it be set correctly */
job_ptr->limit_set_time = limit_set_time;
update_accounting = true;
} else if (IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr &&
(job_ptr->part_ptr->max_time >=
job_specs->time_limit)) {
job_ptr->time_limit = job_specs->time_limit;
info("sched: update_job: setting time_limit to %u for "
"job_id %u", job_specs->time_limit,
job_specs->job_id);
/* Always use the limit_set_* since if set by a
* super user it be set correctly */
job_ptr->limit_set_time = limit_set_time;
update_accounting = true;
} else {
info("sched: Attempt to increase time limit for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if ((job_specs->time_min != NO_VAL) && IS_JOB_PENDING(job_ptr)) {
if (job_specs->time_min > job_ptr->time_limit) {
info("update_job: attempt to set TimeMin > TimeLimit "
"(%u > %u)",
job_specs->time_min, job_ptr->time_limit);
error_code = ESLURM_INVALID_TIME_LIMIT;
} else if (job_ptr->time_min != job_specs->time_min) {
job_ptr->time_min = job_specs->time_min;
info("update_job: setting TimeMin to %u for job_id %u",
job_specs->time_min, job_specs->job_id);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->end_time) {
if (!IS_JOB_RUNNING(job_ptr)) {
/* We may want to use this for deadline scheduling
* at some point in the future. For now only reset
* the time limit of running jobs. */
error_code = ESLURM_DISABLED;
} else if (job_specs->end_time < now) {
error_code = ESLURM_INVALID_TIME_VALUE;
} else if (authorized ||
(job_ptr->end_time > job_specs->end_time)) {
int delta_t = job_specs->end_time - job_ptr->end_time;
job_ptr->end_time = job_specs->end_time;
job_ptr->time_limit += (delta_t+30)/60; /* Sec->min */
info("sched: update_job: setting time_limit to %u for "
"job_id %u", job_ptr->time_limit,
job_specs->job_id);
/* Always use the limit_set_* since if set by a
* super user it be set correctly */
job_ptr->limit_set_time = limit_set_time;
update_accounting = true;
} else {
info("sched: Attempt to extend end time for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->reservation) {
if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr)) {
error_code = ESLURM_DISABLED;
} else {
int rc;
char *save_resv_name = job_ptr->resv_name;
job_ptr->resv_name = job_specs->reservation;
job_specs->reservation = NULL; /* Nothing to free */
rc = validate_job_resv(job_ptr);
if (rc == SLURM_SUCCESS) {
info("sched: update_job: setting reservation "
"to %s for job_id %u", job_ptr->resv_name,
job_ptr->job_id);
xfree(save_resv_name);
update_accounting = true;
} else {
/* Restore reservation info */
job_specs->reservation = job_ptr->resv_name;
job_ptr->resv_name = save_resv_name;
error_code = rc;
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->requeue != (uint16_t) NO_VAL) {
detail_ptr->requeue = job_specs->requeue;
info("sched: update_job: setting requeue to %u for job_id %u",
job_specs->requeue, job_specs->job_id);
}
if (job_specs->priority != NO_VAL) {
/* If we are doing time slicing we could update the
priority of the job while running to give better
position (larger time slices) than competing jobs
*/
if (IS_JOB_FINISHED(job_ptr) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (job_ptr->priority == job_specs->priority) {
debug("update_job: setting priority to current value");
if ((job_ptr->priority == 0) &&
(job_ptr->user_id != uid) && authorized) {
/* Authorized user can change from user hold
* to admin hold or admin hold to user hold */
if (job_specs->alloc_sid == ALLOC_SID_USER_HOLD)
job_ptr->state_reason = WAIT_HELD_USER;
else
job_ptr->state_reason = WAIT_HELD;
}
} else if ((job_ptr->priority == 0) &&
(job_ptr->state_reason == WAIT_HELD_USER)) {
job_ptr->direct_set_prio = 0;
_set_job_prio(job_ptr);
info("sched: update_job: releasing user hold "
"for job_id %u", job_specs->job_id);
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
} else if (authorized ||
(job_ptr->priority > job_specs->priority)) {
if (job_specs->priority != 0)
job_ptr->details->nice = NICE_OFFSET;
if (job_specs->priority == INFINITE) {
job_ptr->direct_set_prio = 0;
_set_job_prio(job_ptr);
} else {
job_ptr->direct_set_prio = 1;
job_ptr->priority = job_specs->priority;
}
info("sched: update_job: setting priority to %u for "
"job_id %u", job_ptr->priority,
job_specs->job_id);
update_accounting = true;
if (job_ptr->priority == 0) {
if ((job_ptr->user_id == uid) ||
(job_specs->alloc_sid ==
ALLOC_SID_USER_HOLD)) {
job_ptr->state_reason = WAIT_HELD_USER;
} else
job_ptr->state_reason = WAIT_HELD;
xfree(job_ptr->state_desc);
} else if ((job_ptr->state_reason == WAIT_HELD) ||
(job_ptr->state_reason == WAIT_HELD_USER)) {
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
}
} else {
error("sched: Attempt to increase priority for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->nice != (uint16_t) NO_VAL) {
if (IS_JOB_FINISHED(job_ptr))
error_code = ESLURM_DISABLED;
else if (authorized || (job_specs->nice >= NICE_OFFSET)) {
int64_t new_prio = job_ptr->priority;
new_prio += job_ptr->details->nice;
new_prio -= job_specs->nice;
job_ptr->priority = MAX(new_prio, 2);
job_ptr->details->nice = job_specs->nice;
info("sched: update_job: setting priority to %u for "
"job_id %u", job_ptr->priority,
job_specs->job_id);
update_accounting = true;
} else {
error("sched: Attempt to increase nice for "
"job %u", job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->pn_min_memory != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (authorized) {
char *entity;
if (job_specs->pn_min_memory & MEM_PER_CPU)
entity = "cpu";
else
entity = "job";
detail_ptr->pn_min_memory = job_specs->pn_min_memory;
info("sched: update_job: setting min_memory_%s to %u "
"for job_id %u", entity,
(job_specs->pn_min_memory & (~MEM_PER_CPU)),
job_specs->job_id);
} else {
error("sched: Attempt to increase pn_min_memory for "
"job %u", job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->pn_min_tmp_disk != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (authorized
|| (detail_ptr->pn_min_tmp_disk
> job_specs->pn_min_tmp_disk)) {
detail_ptr->pn_min_tmp_disk =
job_specs->pn_min_tmp_disk;
info("sched: update_job: setting job_min_tmp_disk to "
"%u for job_id %u", job_specs->pn_min_tmp_disk,
job_specs->job_id);
} else {
error("sched: Attempt to increase pn_min_tmp_disk "
"for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->sockets_per_node != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
error_code = ESLURM_DISABLED;
goto fini;
} else {
mc_ptr->sockets_per_node = job_specs->sockets_per_node;
info("sched: update_job: setting sockets_per_node to "
"%u for job_id %u", job_specs->sockets_per_node,
job_specs->job_id);
}
}
if (job_specs->cores_per_socket != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
error_code = ESLURM_DISABLED;
goto fini;
} else {
mc_ptr->cores_per_socket = job_specs->cores_per_socket;
info("sched: update_job: setting cores_per_socket to "
"%u for job_id %u", job_specs->cores_per_socket,
job_specs->job_id);
}
}
if ((job_specs->threads_per_core != (uint16_t) NO_VAL)) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
error_code = ESLURM_DISABLED;
goto fini;
} else {
mc_ptr->threads_per_core = job_specs->threads_per_core;
info("sched: update_job: setting threads_per_core to "
"%u for job_id %u", job_specs->threads_per_core,
job_specs->job_id);
}
}
if (job_specs->shared != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (authorized
|| (detail_ptr->shared > job_specs->shared)) {
detail_ptr->shared = job_specs->shared;
info("sched: update_job: setting shared to %u for "
"job_id %u",
job_specs->shared, job_specs->job_id);
} else {
error("sched: Attempt to remove sharing for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->contiguous != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (authorized
|| (detail_ptr->contiguous > job_specs->contiguous)) {
detail_ptr->contiguous = job_specs->contiguous;
info("sched: update_job: setting contiguous to %u "
"for job_id %u", job_specs->contiguous,
job_specs->job_id);
} else {
error("sched: Attempt to add contiguous for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->features) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (job_specs->features[0] != '\0') {
char *old_features = detail_ptr->features;
List old_list = detail_ptr->feature_list;
detail_ptr->features = job_specs->features;
detail_ptr->feature_list = NULL;
if (build_feature_list(job_ptr)) {
info("sched: update_job: invalid features"
"(%s) for job_id %u",
job_specs->features, job_specs->job_id);
if (detail_ptr->feature_list)
list_destroy(detail_ptr->feature_list);
detail_ptr->features = old_features;
detail_ptr->feature_list = old_list;
error_code = ESLURM_INVALID_FEATURE;
} else {
info("sched: update_job: setting features to "
"%s for job_id %u",
job_specs->features, job_specs->job_id);
xfree(old_features);
if (old_list)
list_destroy(old_list);
job_specs->features = NULL;
}
} else {
info("sched: update_job: cleared features for job %u",
job_specs->job_id);
xfree(detail_ptr->features);
if (detail_ptr->feature_list) {
list_destroy(detail_ptr->feature_list);
detail_ptr->feature_list = NULL;
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->gres) {
List tmp_gres_list = NULL;
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) ||
(detail_ptr->expanding_jobid != 0)) {
error_code = ESLURM_DISABLED;
} else if (job_specs->gres[0] == '\0') {
info("sched: update_job: cleared gres for job %u",
job_specs->job_id);
xfree(job_ptr->gres);
FREE_NULL_LIST(job_ptr->gres_list);
} else if (gres_plugin_job_state_validate(job_specs->gres,
&tmp_gres_list)) {
info("sched: update_job: invalid gres %s for job %u",
job_specs->gres, job_specs->job_id);
error_code = ESLURM_INVALID_GRES;
FREE_NULL_LIST(tmp_gres_list);
} else {
info("sched: update_job: setting gres to "
"%s for job_id %u",
job_specs->gres, job_specs->job_id);
xfree(job_ptr->gres);
job_ptr->gres = job_specs->gres;
job_specs->gres = NULL;
FREE_NULL_LIST(job_ptr->gres_list);
job_ptr->gres_list = tmp_gres_list;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->name) {
if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_DISABLED;
goto fini;
} else {
job_ptr->name = job_specs->name;
job_specs->name = NULL;
info("sched: update_job: setting name to %s for "
"job_id %u", job_ptr->name, job_specs->job_id);
update_accounting = true;
}
}
if (job_specs->wckey) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else {
int rc = update_job_wckey("update_job",
job_ptr,
job_specs->wckey);
if (rc != SLURM_SUCCESS)
error_code = rc;
else
update_accounting = true;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if ((job_specs->min_nodes != NO_VAL) &&
(IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
/* Use req_nodes to change the nodes associated with a running
* for lack of other field in the job request to use */
if ((job_specs->min_nodes == 0) && (job_ptr->node_cnt > 0) &&
job_ptr->details && job_ptr->details->expanding_jobid) {
struct job_record *expand_job_ptr;
bitstr_t *orig_job_node_bitmap;
expand_job_ptr = find_job_record(job_ptr->details->
expanding_jobid);
if (expand_job_ptr == NULL) {
info("Invalid node count (%u) for job %u "
"update, job %u to expand not found",
job_specs->min_nodes, job_specs->job_id,
job_ptr->details->expanding_jobid);
error_code = ESLURM_INVALID_JOB_ID;
goto fini;
}
if (IS_JOB_SUSPENDED(job_ptr) ||
IS_JOB_SUSPENDED(expand_job_ptr)) {
info("Can not expand job %u from job %u, "
"job is suspended",
expand_job_ptr->job_id, job_ptr->job_id);
error_code = ESLURM_JOB_SUSPENDED;
goto fini;
}
if ((job_ptr->step_list != NULL) &&
(list_count(job_ptr->step_list) != 0)) {
info("Attempt to merge job %u with active "
"steps into job %u",
job_specs->job_id,
job_ptr->details->expanding_jobid);
error_code = ESLURMD_STEP_EXISTS;
goto fini;
}
info("sched: killing job %u and moving all resources "
"to job %u", job_specs->job_id,
expand_job_ptr->job_id);
job_pre_resize_acctg(job_ptr);
job_pre_resize_acctg(expand_job_ptr);
_send_job_kill(job_ptr);
xassert(job_ptr->job_resrcs);
xassert(job_ptr->job_resrcs->node_bitmap);
orig_job_node_bitmap = bit_copy(expand_job_ptr->
job_resrcs->
node_bitmap);
error_code = select_g_job_expand(job_ptr,
expand_job_ptr);
if (error_code == SLURM_SUCCESS) {
_merge_job_licenses(job_ptr, expand_job_ptr);
rebuild_step_bitmaps(expand_job_ptr,
orig_job_node_bitmap);
}
bit_free(orig_job_node_bitmap);
job_post_resize_acctg(job_ptr);
job_post_resize_acctg(expand_job_ptr);
/* Since job_post_resize_acctg will restart things,
* don't do it again. */
update_accounting = false;
if (error_code)
goto fini;
} else if ((job_specs->min_nodes == 0) ||
(job_specs->min_nodes > job_ptr->node_cnt) ||
job_ptr->details->expanding_jobid) {
info("sched: Invalid node count (%u) for job %u update",
job_specs->min_nodes, job_specs->job_id);
error_code = ESLURM_INVALID_NODE_COUNT;
goto fini;
} else if (job_specs->min_nodes == job_ptr->node_cnt) {
debug2("No change in node count update for job %u",
job_specs->job_id);
} else {
int i, i_first, i_last, total;
struct node_record *node_ptr;
info("sched: update_job: set node count to %u for "
"job_id %u",
job_specs->min_nodes, job_specs->job_id);
job_pre_resize_acctg(job_ptr);
i_first = bit_ffs(job_ptr->node_bitmap);
i_last = bit_fls(job_ptr->node_bitmap);
for (i=i_first, total=0; i<=i_last; i++) {
if (!bit_test(job_ptr->node_bitmap, i))
continue;
if (++total <= job_specs->min_nodes)
continue;
node_ptr = node_record_table_ptr + i;
kill_step_on_node(job_ptr, node_ptr, false);
excise_node_from_job(job_ptr, node_ptr);
}
job_post_resize_acctg(job_ptr);
info("sched: update_job: set nodes to %s for "
"job_id %u",
job_ptr->nodes, job_specs->job_id);
/* Since job_post_resize_acctg will restart
* things don't do it again. */
update_accounting = false;
}
}
if (job_specs->ntasks_per_node != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (authorized) {
detail_ptr->ntasks_per_node =
job_specs->ntasks_per_node;
info("sched: update_job: setting ntasks_per_node to %u"
" for job_id %u", job_specs->ntasks_per_node,
job_specs->job_id);
} else {
error("sched: Not super user: ignore ntasks_oper_node "
"change for job %u", job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->dependency) {
if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL))
error_code = ESLURM_DISABLED;
else {
int rc;
rc = update_job_dependency(job_ptr,
job_specs->dependency);
if (rc != SLURM_SUCCESS)
error_code = rc;
else {
job_ptr->details->orig_dependency =
xstrdup(job_ptr->details->dependency);
info("sched: update_job: setting dependency to "
"%s for job_id %u",
job_ptr->details->dependency,
job_ptr->job_id);
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_specs->begin_time) {
if (IS_JOB_PENDING(job_ptr) && detail_ptr) {
char time_str[32];
/* Make sure this time is current, it does no good for
* accounting to say this job could have started before
* now */
if (job_specs->begin_time < now)
job_specs->begin_time = now;
detail_ptr->begin_time = job_specs->begin_time;
update_accounting = true;
if ((job_ptr->priority == 1) &&
(detail_ptr->begin_time <= now))
_set_job_prio(job_ptr);
slurm_make_time_str(&detail_ptr->begin_time, time_str,
sizeof(time_str));
info("sched: update_job: setting begin to %s for "
"job_id %u",
time_str, job_ptr->job_id);
} else {
error_code = ESLURM_DISABLED;
goto fini;
}
}
if (job_specs->licenses) {
List license_list;
bool valid;
license_list = license_validate(job_specs->licenses, &valid);
if (!valid) {
info("sched: update_job: invalid licenses: %s",
job_specs->licenses);
error_code = ESLURM_INVALID_LICENSES;
} else if (IS_JOB_PENDING(job_ptr)) {
FREE_NULL_LIST(job_ptr->license_list);
job_ptr->license_list = license_list;
info("sched: update_job: changing licenses from '%s' "
"to '%s' for pending job %u",
job_ptr->licenses, job_specs->licenses,
job_ptr->job_id);
xfree(job_ptr->licenses);
job_ptr->licenses = job_specs->licenses;
job_specs->licenses = NULL; /* nothing to free */
} else if (IS_JOB_RUNNING(job_ptr) &&
(authorized || (license_list == NULL))) {
/* NOTE: This can result in oversubscription of
* licenses */
license_job_return(job_ptr);
FREE_NULL_LIST(job_ptr->license_list);
job_ptr->license_list = license_list;
info("sched: update_job: changing licenses from '%s' "
"to '%s' for running job %u",
job_ptr->licenses, job_specs->licenses,
job_ptr->job_id);
xfree(job_ptr->licenses);
job_ptr->licenses = job_specs->licenses;
job_specs->licenses = NULL; /* nothing to free */
license_job_get(job_ptr);
} else {
/* licenses are valid, but job state or user not
* allowed to make changes */
info("sched: update_job: could not change licenses "
"for job %u", job_ptr->job_id);
error_code = ESLURM_DISABLED;
FREE_NULL_LIST(license_list);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
fail_reason = job_limits_check(&job_ptr);
if (fail_reason != WAIT_NO_REASON) {
if (fail_reason == WAIT_QOS_THRES)
error_code = ESLURM_QOS_THRES;
else if (fail_reason == WAIT_PART_TIME_LIMIT ||
fail_reason == WAIT_PART_NODE_LIMIT)
error_code = SLURM_SUCCESS;
else
error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
job_ptr->priority = 1; /* Move to end of queue */
job_ptr->state_reason = fail_reason;
xfree(job_ptr->state_desc);
return error_code;
} else if ((job_ptr->state_reason != WAIT_HELD) &&
(job_ptr->state_reason != WAIT_HELD_USER)) {
job_ptr->state_reason = WAIT_NO_REASON;
}
#ifdef HAVE_BG
select_g_select_jobinfo_get(job_specs->select_jobinfo,
SELECT_JOBDATA_CONN_TYPE, &conn_type);
if (conn_type != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else {
if((conn_type >= SELECT_SMALL)
&& (detail_ptr->min_cpus >= cpus_per_mp)) {
info("update_job: could not change "
"conn_type to '%s' because cpu "
"count is %u for job %u making "
"the conn_type invalid.",
conn_type_string(conn_type),
detail_ptr->min_cpus,
job_ptr->job_id);
error_code = ESLURM_INVALID_NODE_COUNT;
} else if(((conn_type == SELECT_TORUS)
|| (conn_type == SELECT_MESH))
&& (detail_ptr->min_cpus < cpus_per_mp)) {
info("update_job: could not change "
"conn_type to '%s' because cpu "
"count is %u for job %u making "
"the conn_type invalid.",
conn_type_string(conn_type),
detail_ptr->min_cpus,
job_ptr->job_id);
error_code = ESLURM_INVALID_NODE_COUNT;
} else {
info("update_job: setting conn_type to '%s' "
"for jobid %u",
conn_type_string(conn_type),
job_ptr->job_id);
select_g_select_jobinfo_set(
job_ptr->select_jobinfo,
SELECT_JOBDATA_CONN_TYPE, &conn_type);
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
/* check to make sure we didn't mess up with the proc count */
select_g_select_jobinfo_get(job_ptr->select_jobinfo,
SELECT_JOBDATA_CONN_TYPE, &conn_type);
if(detail_ptr &&
(((conn_type >= SELECT_SMALL)
&& (detail_ptr->min_cpus >= cpus_per_mp))
|| (((conn_type == SELECT_TORUS)|| (conn_type == SELECT_MESH))
&& (detail_ptr->min_cpus < cpus_per_mp)))) {
info("update_job: With cpu count at %u our conn_type "
"of '%s' is invalid for job %u.",
detail_ptr->min_cpus,
conn_type_string(conn_type),
job_ptr->job_id);
error_code = ESLURM_INVALID_NODE_COUNT;
goto fini;
}
select_g_select_jobinfo_get(job_specs->select_jobinfo,
SELECT_JOBDATA_ROTATE, &rotate);
if (rotate != (uint16_t) NO_VAL) {
if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_DISABLED;
goto fini;
} else {
info("sched: update_job: setting rotate to %u for "
"jobid %u", rotate, job_ptr->job_id);
select_g_select_jobinfo_set(
job_ptr->select_jobinfo,
SELECT_JOBDATA_ROTATE, &rotate);
}
}
select_g_select_jobinfo_get(job_specs->select_jobinfo,
SELECT_JOBDATA_REBOOT, &reboot);
if (reboot != (uint16_t) NO_VAL) {
if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_DISABLED;
goto fini;
} else {
info("sched: update_job: setting reboot to %u for "
"jobid %u", reboot, job_ptr->job_id);
select_g_select_jobinfo_set(
job_ptr->select_jobinfo,
SELECT_JOBDATA_REBOOT, &reboot);
}
}
select_g_select_jobinfo_get(job_specs->select_jobinfo,
SELECT_JOBDATA_GEOMETRY, geometry);
if (geometry[0] != (uint16_t) NO_VAL) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else if (authorized) {
uint32_t i, tot = 1;
for (i=0; i<SYSTEM_DIMENSIONS; i++)
tot *= geometry[i];
info("sched: update_job: setting geometry to %ux%ux%u"
" min_nodes=%u for jobid %u",
geometry[0], geometry[1],
geometry[2], tot, job_ptr->job_id);
select_g_select_jobinfo_set(job_ptr->select_jobinfo,
SELECT_JOBDATA_GEOMETRY,
geometry);
detail_ptr->min_nodes = tot;
} else {
error("sched: Attempt to change geometry for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
select_g_select_jobinfo_get(job_specs->select_jobinfo,
SELECT_JOBDATA_BLRTS_IMAGE, &image);
if (image) {
if (!IS_JOB_PENDING(job_ptr)) {
xfree(image);
error_code = ESLURM_DISABLED;
goto fini;
} else {
info("sched: update_job: setting BlrtsImage to %s "
"for jobid %u", image, job_ptr->job_id);
select_g_select_jobinfo_set(
job_ptr->select_jobinfo,
SELECT_JOBDATA_BLRTS_IMAGE,
image);
}
xfree(image);
}
select_g_select_jobinfo_get(job_specs->select_jobinfo,
SELECT_JOBDATA_LINUX_IMAGE, &image);
if (image) {
if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_DISABLED;
xfree(image);
goto fini;
} else {
info("sched: update_job: setting LinuxImage to %s "
"for jobid %u", image, job_ptr->job_id);
select_g_select_jobinfo_set(
job_ptr->select_jobinfo,
SELECT_JOBDATA_LINUX_IMAGE, image);
}
xfree(image);
}
select_g_select_jobinfo_get(job_specs->select_jobinfo,
SELECT_JOBDATA_MLOADER_IMAGE, &image);
if (image) {
if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_DISABLED;
xfree(image);
goto fini;
} else {
info("sched: update_job: setting MloaderImage to %s "
"for jobid %u", image, job_ptr->job_id);
select_g_select_jobinfo_set(
job_ptr->select_jobinfo,
SELECT_JOBDATA_MLOADER_IMAGE,
image);
}
xfree(image);
}
select_g_select_jobinfo_get(job_specs->select_jobinfo,
SELECT_JOBDATA_RAMDISK_IMAGE, &image);
if (image) {
if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_DISABLED;
xfree(image);
goto fini;
} else {
info("sched: update_job: setting RamdiskImage to %s "
"for jobid %u", image, job_ptr->job_id);
select_g_select_jobinfo_set(
job_ptr->select_jobinfo,
SELECT_JOBDATA_RAMDISK_IMAGE,
image);
}
xfree(image);
}
#endif
fini:
if (update_accounting) {
info("updating accounting");
if (job_ptr->details && job_ptr->details->begin_time) {
/* Update job record in accounting to reflect
* changes */
jobacct_storage_g_job_start(acct_db_conn,
job_ptr);
}
}
/* If job update is successful and priority is calculated (not only
* based upon job submit order), recalculate the job priority, since
* many factors of an update may affect priority considerations.
* If job has a hold then do nothing */
if ((error_code == SLURM_SUCCESS) && (job_ptr->priority > 1) &&
strcmp(slurmctld_conf.priority_type, "priority/basic"))
_set_job_prio(job_ptr);
return error_code;
}
/*
 * _send_job_kill - queue a REQUEST_TERMINATE_JOB message for every node in
 *	a job's allocation (or its front end node).
 * IN job_ptr - pointer to the job to be killed; details must be set
 */
static void _send_job_kill(struct job_record *job_ptr)
{
	kill_job_msg_t *kill_job = NULL;
	agent_arg_t *agent_args = NULL;
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr;
#else
	int i;
	struct node_record *node_ptr;
#endif

	xassert(job_ptr);
	xassert(job_ptr->details);

	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type = REQUEST_TERMINATE_JOB;
	agent_args->retry = 0;	/* re_kill_job() resends as needed */
	agent_args->hostlist = hostlist_create("");
	if (agent_args->hostlist == NULL)
		fatal("hostlist_create: malloc failure");

	kill_job = xmalloc(sizeof(kill_job_msg_t));
	last_node_update = time(NULL);
	kill_job->job_id     = job_ptr->job_id;
	kill_job->step_id    = NO_VAL;
	kill_job->job_state  = job_ptr->job_state;
	kill_job->job_uid    = job_ptr->user_id;
	kill_job->nodes      = xstrdup(job_ptr->nodes);
	kill_job->time       = time(NULL);
	kill_job->start_time = job_ptr->start_time;
	kill_job->select_jobinfo = select_g_select_jobinfo_copy(
			job_ptr->select_jobinfo);
	kill_job->spank_job_env = xduparray(job_ptr->spank_job_env_size,
					    job_ptr->spank_job_env);
	kill_job->spank_job_env_size = job_ptr->spank_job_env_size;

#ifdef HAVE_FRONT_END
	if (job_ptr->batch_host &&
	    (front_end_ptr = job_ptr->front_end_ptr)) {
		hostlist_push(agent_args->hostlist, job_ptr->batch_host);
		agent_args->node_count++;
	}
#else
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		if (!bit_test(job_ptr->node_bitmap, i))
			continue;
		hostlist_push(agent_args->hostlist, node_ptr->name);
		agent_args->node_count++;
	}
#endif
	if (agent_args->node_count == 0) {
		if (job_ptr->details->expanding_jobid == 0) {
			error("Job %u allocated no nodes to be killed on",
			      job_ptr->job_id);
		}
		/* Free the complete message. The previous code freed only
		 * kill_job->nodes and the struct itself, leaking the
		 * select_jobinfo copy and the duplicated spank environment
		 * built above. */
		slurm_free_kill_job_msg(kill_job);
		hostlist_destroy(agent_args->hostlist);
		xfree(agent_args);
		return;
	}

	agent_args->msg_args = kill_job;
	agent_queue_request(agent_args);	/* takes ownership of args */
	return;
}
/* Record accounting information for a job immediately before changing size */
extern void job_pre_resize_acctg(struct job_record *job_ptr)
{
	/* if we don't have a db_index go a start this one up since if
	   running with the slurmDBD the job may not have started yet.
	*/
	if(!job_ptr->db_index)
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);

	/* Mark the job as resizing and log a "completion" at the old size;
	 * job_post_resize_acctg() later restarts it at the new size. */
	job_ptr->job_state |= JOB_RESIZING;
	job_ptr->resize_time = time(NULL);
	/* NOTE: job_completion_logger() calls
	 *	 acct_policy_remove_job_submit() */
	job_completion_logger(job_ptr, false);

	/* This doesn't happen in job_completion_logger, but gets
	 * added back in with job_post_resize_acctg so remove it here. */
	acct_policy_job_fini(job_ptr);

	/* NOTE: The RESIZING FLAG needed to be cleared with
	 *	 job_post_resize_acctg */
}
/* Record accounting information for a job immediately after changing size */
extern void job_post_resize_acctg(struct job_record *job_ptr)
{
	/* NOTE: The RESIZING FLAG needed to be set with
	 *	 job_pre_resize_acctg; the assert is here to make sure we
	 *	 code it that way. */
	xassert(IS_JOB_RESIZING(job_ptr));

	/* Re-register the resized job with accounting: re-add the submit
	 * and begin records removed by job_pre_resize_acctg(), then start
	 * a fresh database record at the new size. */
	acct_policy_add_job_submit(job_ptr);
	acct_policy_job_begin(job_ptr);
	jobacct_storage_g_job_start(acct_db_conn, job_ptr);

	/* Resize complete: clear the transient RESIZING state flag */
	job_ptr->job_state &= (~JOB_RESIZING);
}
/*
 * validate_jobs_on_node - validate that any jobs that should be on the node
 *	are actually running, if not clean up the job records and/or node
 *	records, call this function after validate_node_specs() sets the node
 *	state properly
 * IN reg_msg - node registration message
 */
extern void
validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
{
	int i, node_inx, jobs_on_node;
	struct node_record *node_ptr;
	struct job_record *job_ptr;
	struct step_record *step_ptr;
	time_t now = time(NULL);

	node_ptr = find_node_record(reg_msg->node_name);
	if (node_ptr == NULL) {
		error("slurmd registered on unknown node %s",
		      reg_msg->node_name);
		return;
	}

	/* A smaller up_time than previously recorded implies a reboot */
	if (node_ptr->up_time > reg_msg->up_time) {
		verbose("Node %s rebooted %u secs ago",
			reg_msg->node_name, reg_msg->up_time);
	}

	if (reg_msg->up_time <= now) {
		node_ptr->up_time = reg_msg->up_time;
		node_ptr->boot_time = now - reg_msg->up_time;
		node_ptr->slurmd_start_time = reg_msg->slurmd_start_time;
	} else {
		/* Node claims to have been up longer than "now": reject */
		error("Node up_time is invalid: %u>%u", reg_msg->up_time,
		      (uint32_t) now);
	}

	node_inx = node_ptr - node_record_table_ptr;

	/* Check that jobs running are really supposed to be there */
	for (i = 0; i < reg_msg->job_count; i++) {
		if ( (reg_msg->job_id[i] >= MIN_NOALLOC_JOBID) &&
		     (reg_msg->job_id[i] <= MAX_NOALLOC_JOBID) ) {
			/* NoAllocate jobs have no allocation to validate */
			info("NoAllocate job %u.%u reported on node %s",
			     reg_msg->job_id[i], reg_msg->step_id[i],
			     reg_msg->node_name);
			continue;
		}

		job_ptr = find_job_record(reg_msg->job_id[i]);
		if (job_ptr == NULL) {
			/* Job unknown to slurmctld: have the node kill it */
			error("Orphan job %u.%u reported on node %s",
			      reg_msg->job_id[i], reg_msg->step_id[i],
			      reg_msg->node_name);
			abort_job_on_node(reg_msg->job_id[i],
					  job_ptr, node_ptr->name);
		}

		else if (IS_JOB_RUNNING(job_ptr) ||
			 IS_JOB_SUSPENDED(job_ptr)) {
			if (bit_test(job_ptr->node_bitmap, node_inx)) {
				/* Expected case: job really is allocated
				 * to this node; refresh activity timers */
				debug3("Registered job %u.%u on node %s ",
				       reg_msg->job_id[i],
				       reg_msg->step_id[i],
				       reg_msg->node_name);
				if ((job_ptr->batch_flag) &&
				    (node_inx == bit_ffs(
						job_ptr->node_bitmap))) {
					/* NOTE: Used for purging defunct
					 * batch jobs */
					job_ptr->time_last_active = now;
				}
				step_ptr = find_step_record(job_ptr,
							    reg_msg->
							    step_id[i]);
				if (step_ptr)
					step_ptr->time_last_active = now;
			} else {
				/* Typically indicates a job requeue and
				 * restart on another nodes. A node from the
				 * original allocation just responded here. */
				error("Registered job %u.%u on wrong node %s ",
				      reg_msg->job_id[i],
				      reg_msg->step_id[i],
				      reg_msg->node_name);
				abort_job_on_node(reg_msg->job_id[i], job_ptr,
						  node_ptr->name);
			}
		}

		else if (IS_JOB_COMPLETING(job_ptr)) {
			/* Re-send kill request as needed,
			 * not necessarily an error */
			kill_job_on_node(reg_msg->job_id[i], job_ptr,
					 node_ptr);
		}

		else if (IS_JOB_PENDING(job_ptr)) {
			/* Typically indicates a job requeue and the hung
			 * slurmd that went DOWN is now responding */
			error("Registered PENDING job %u.%u on node %s ",
			      reg_msg->job_id[i], reg_msg->step_id[i],
			      reg_msg->node_name);
			abort_job_on_node(reg_msg->job_id[i],
					  job_ptr, node_ptr->name);
		}

		else {	/* else job is supposed to be done */
			error("Registered job %u.%u in state %s on node %s ",
			      reg_msg->job_id[i], reg_msg->step_id[i],
			      job_state_string(job_ptr->job_state),
			      reg_msg->node_name);
			kill_job_on_node(reg_msg->job_id[i], job_ptr,
					 node_ptr);
		}
	}

	jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
	if (jobs_on_node)
		_purge_missing_jobs(node_inx, now);

	if (jobs_on_node != reg_msg->job_count) {
		/* slurmd will not know of a job unless the job has
		 * steps active at registration time, so this is not
		 * an error condition, slurmd is also reporting steps
		 * rather than jobs */
		debug3("resetting job_count on node %s from %d to %d",
		       reg_msg->node_name, reg_msg->job_count, jobs_on_node);
		reg_msg->job_count = jobs_on_node;
	}

	return;
}
/* Purge any batch job that should have its script running on node
 * node_inx, but is not. Allow BatchStartTimeout + ResumeTimeout seconds
 * for startup.
 *
 * Purge all job steps that were started before the node was last booted.
 *
 * Also notify srun if any job steps should be active on this node
 * but are not found. */
static void _purge_missing_jobs(int node_inx, time_t now)
{
	ListIterator job_iterator;
	struct job_record *job_ptr;
	struct node_record *node_ptr = node_record_table_ptr + node_inx;
	uint16_t batch_start_timeout = slurm_get_batch_start_timeout();
	uint16_t msg_timeout = slurm_get_msg_timeout();
	uint16_t resume_timeout = slurm_get_resume_timeout();
	uint32_t suspend_time = slurm_get_suspend_time();
	time_t batch_startup_time, node_boot_time = (time_t) 0, startup_time;

	if (node_ptr->boot_time > (msg_timeout + 5)) {
		/* allow for message timeout and other delays */
		node_boot_time = node_ptr->boot_time - (msg_timeout + 5);
	}
	/* Earliest start time for which a batch script must already have
	 * checked in (BatchStartTimeout plus message delays ago) */
	batch_startup_time = now - batch_start_timeout;
	batch_startup_time -= msg_timeout;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		bool job_active = IS_JOB_RUNNING(job_ptr) ||
				  IS_JOB_SUSPENDED(job_ptr);
		/* Only consider active jobs allocated to this node */
		if ((!job_active) ||
		    (!bit_test(job_ptr->node_bitmap, node_inx)))
			continue;
		if ((job_ptr->batch_flag != 0) &&
		    (suspend_time != 0) /* power mgmt on */ &&
		    (job_ptr->start_time < node_boot_time)) {
			/* Node appears to have been powered up for this
			 * job: allow extra ResumeTimeout seconds */
			startup_time = batch_startup_time - resume_timeout;
		} else
			startup_time = batch_startup_time;

		if ((job_ptr->batch_flag != 0)		   &&
		    (job_ptr->time_last_active < startup_time) &&
		    (job_ptr->start_time       < startup_time) &&
		    (node_inx == bit_ffs(job_ptr->node_bitmap))) {
			/* Batch script should run on the job's first node
			 * ("node 0" of the allocation) but never registered */
			info("Batch JobId=%u missing from node 0, killing it",
			     job_ptr->job_id);
			job_complete(job_ptr->job_id, 0, false, true, NO_VAL);
		} else {
			_notify_srun_missing_step(job_ptr, node_inx,
						  now, node_boot_time);
		}
	}
	list_iterator_destroy(job_iterator);
}
/* For each step of job_ptr expected on node node_inx but absent from the
 * node's registration, notify srun (if reachable) or kill steps that were
 * started before the node last booted. */
static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx,
				      time_t now, time_t node_boot_time)
{
	ListIterator step_iterator;
	struct step_record *step_ptr;
	char *node_name = node_record_table_ptr[node_inx].name;

	xassert(job_ptr);
	step_iterator = list_iterator_create (job_ptr->step_list);
	while ((step_ptr = (struct step_record *) list_next (step_iterator))) {
		if (!bit_test(step_ptr->step_node_bitmap, node_inx))
			continue;
		if (step_ptr->time_last_active >= now) {
			/* Back up timer in case more than one node
			 * registration happens at this same time.
			 * We don't want this node's registration
			 * to count toward a different node's
			 * registration message. */
			step_ptr->time_last_active = now - 1;
		} else if (step_ptr->host && step_ptr->port) {
			/* srun may be able to verify step exists on
			 * this node using I/O sockets and kill the
			 * job as needed */
			srun_step_missing(step_ptr, node_name);
		} else if ((step_ptr->start_time < node_boot_time) &&
			   (step_ptr->no_kill == 0)) {
			/* There is a risk that the job step's tasks completed
			 * on this node before its reboot, but that should be
			 * very rare and there is no srun to work with (POE) */
			info("Node %s rebooted, killing missing step %u.%u",
			     node_name, job_ptr->job_id, step_ptr->step_id);
			signal_step_tasks_on_node(node_name, step_ptr, SIGKILL,
						  REQUEST_TERMINATE_TASKS);
		}
	}
	list_iterator_destroy (step_iterator);
}
/*
 * abort_job_on_node - Queue a request to kill the given job on the given
 *	node. The request is deliberately deferred through the agent rather
 *	than sent immediately: if slurmctld restarts without saved state,
 *	registering slurmd daemons may report many unrecognized jobs and an
 *	immediate send per job would spawn a flood of pthreads. One agent
 *	request is queued per node as it registers.
 * IN job_id - id of the job to be killed
 * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned)
 * IN node_name - name of the node on which the job resides
 */
extern void
abort_job_on_node(uint32_t job_id, struct job_record *job_ptr, char *node_name)
{
	kill_job_msg_t *abort_msg;
	agent_arg_t *abort_agent;

	abort_msg = xmalloc(sizeof(kill_job_msg_t));
	abort_msg->job_id  = job_id;
	abort_msg->step_id = NO_VAL;
	abort_msg->time    = time(NULL);
	abort_msg->nodes   = xstrdup(node_name);
	if (job_ptr != NULL) {
		/* Known job: carry its start time, select plugin data and
		 * spank environment along with the abort request */
		abort_msg->start_time = job_ptr->start_time;
		abort_msg->select_jobinfo =
			select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
		abort_msg->spank_job_env =
			xduparray(job_ptr->spank_job_env_size,
				  job_ptr->spank_job_env);
		abort_msg->spank_job_env_size = job_ptr->spank_job_env_size;
	}
	/* else: start_time remains zero (xmalloc zero-fills) */

	abort_agent = xmalloc(sizeof(agent_arg_t));
	abort_agent->node_count = 1;
	abort_agent->retry      = 0;
	abort_agent->hostlist   = hostlist_create(node_name);
#ifdef HAVE_FRONT_END
	debug("Aborting job %u on front end node %s", job_id, node_name);
#else
	debug("Aborting job %u on node %s", job_id, node_name);
#endif
	abort_agent->msg_type = REQUEST_ABORT_JOB;
	abort_agent->msg_args = abort_msg;
	agent_queue_request(abort_agent);
}
/*
 * kill_job_on_node - Kill the specific job_id on a specific node.
 * IN job_id - id of the job to be killed
 * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned)
 * IN node_ptr - pointer to the node on which the job resides
 */
extern void
kill_job_on_node(uint32_t job_id, struct job_record *job_ptr,
		 struct node_record *node_ptr)
{
	agent_arg_t *agent_info;
	kill_job_msg_t *kill_req;

	kill_req = xmalloc(sizeof(kill_job_msg_t));
	kill_req->job_id  = job_id;
	kill_req->step_id = NO_VAL;
	kill_req->time    = time(NULL);
	kill_req->nodes   = xstrdup(node_ptr->name);
	if (job_ptr) {	/* NULL if unknown */
		/* NOTE: Every job_ptr dereference must stay inside this
		 * guard. The previous code read start_time and the spank
		 * environment unconditionally, dereferencing NULL for
		 * orphaned jobs. */
		kill_req->start_time = job_ptr->start_time;
		kill_req->select_jobinfo =
			select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
		kill_req->job_state = job_ptr->job_state;
		kill_req->spank_job_env =
			xduparray(job_ptr->spank_job_env_size,
				  job_ptr->spank_job_env);
		kill_req->spank_job_env_size = job_ptr->spank_job_env_size;
	}
	/* else: start_time stays zero (xmalloc zero-fills) */

	agent_info = xmalloc(sizeof(agent_arg_t));
	agent_info->node_count = 1;
	agent_info->retry      = 0;
#ifdef HAVE_FRONT_END
	if (job_ptr && job_ptr->batch_host) {
		agent_info->hostlist = hostlist_create(job_ptr->batch_host);
		debug("Killing job %u on front end node %s", job_id,
		      job_ptr->batch_host);
	} else {
		/* Orphaned job or no batch host recorded: address the node
		 * which reported the job (previously an unguarded deref) */
		agent_info->hostlist = hostlist_create(node_ptr->name);
		debug("Killing job %u on node %s", job_id, node_ptr->name);
	}
#else
	agent_info->hostlist = hostlist_create(node_ptr->name);
	debug("Killing job %u on node %s", job_id, node_ptr->name);
#endif
	agent_info->msg_type = REQUEST_TERMINATE_JOB;
	agent_info->msg_args = kill_req;
	agent_queue_request(agent_info);
}
/*
 * job_alloc_info - get details about an existing job allocation
 * IN uid - user issuing the request
 * IN job_id - ID of job for which info is requested
 * OUT job_pptr - set to pointer to job record
 * RET SLURM_SUCCESS, or an ESLURM_* code if the job is missing, hidden
 *	from this user, still pending, or already finished
 */
extern int
job_alloc_info(uint32_t uid, uint32_t job_id, struct job_record **job_pptr)
{
	struct job_record *found_job = find_job_record(job_id);

	if (!found_job)
		return ESLURM_INVALID_JOB_ID;
	/* With private job data, only the owner, operators and the
	 * account coordinator may see the allocation */
	if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) &&
	    (found_job->user_id != uid) && !validate_operator(uid) &&
	    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
					  found_job->account))
		return ESLURM_ACCESS_DENIED;
	if (IS_JOB_PENDING(found_job))
		return ESLURM_JOB_PENDING;
	if (IS_JOB_FINISHED(found_job))
		return ESLURM_ALREADY_DONE;

	*job_pptr = found_job;
	return SLURM_SUCCESS;
}
/*
 * Synchronize the batch job in the system with their files.
 * All pending batch jobs must have script and environment files
 * No other jobs should have such files
 * NOTE: READ lock_slurmctld config before entry
 * RET SLURM_SUCCESS
 */
int sync_job_files(void)
{
	List batch_dirs;

	/* Collect the job ids of all batch directories on disk... */
	batch_dirs = list_create(_del_batch_list_rec);
	_get_batch_job_dir_ids(batch_dirs);
	/* ...fail pending jobs whose scripts vanished and drop ids that
	 * are still in use from the list... */
	_validate_job_files(batch_dirs);
	/* ...then purge the directories no job references */
	_remove_defunct_batch_dirs(batch_dirs);
	list_destroy(batch_dirs);
	return SLURM_SUCCESS;
}
/* Append to the batch_dirs list the job_id's associated with
 * every batch job directory in existence
 * NOTE: READ lock_slurmctld config before entry
 */
static void _get_batch_job_dir_ids(List batch_dirs)
{
	DIR *f_dir;
	struct dirent *dir_ent;
	long long_job_id;
	uint32_t *job_id_ptr;
	char *endptr;

	xassert(slurmctld_conf.state_save_location);
	f_dir = opendir(slurmctld_conf.state_save_location);
	if (!f_dir) {
		error("opendir(%s): %m",
		      slurmctld_conf.state_save_location);
		return;
	}

	while ((dir_ent = readdir(f_dir))) {
		/* Only the "job." prefix (4 chars) is significant. The old
		 * code compared against the literal "job.#", which was
		 * misleading since strncmp() never reached the '#'. */
		if (strncmp("job.", dir_ent->d_name, 4))
			continue;
		long_job_id = strtol(&dir_ent->d_name[4], &endptr, 10);
		/* Skip names with no digits, trailing garbage, or a
		 * non-positive value. The old (long_job_id == 0) test let
		 * negative values through, which then wrapped into bogus
		 * uint32_t job ids. */
		if ((long_job_id <= 0) || (endptr[0] != '\0'))
			continue;
		debug3("found batch directory for job_id %ld", long_job_id);
		job_id_ptr = xmalloc(sizeof(uint32_t));
		*job_id_ptr = long_job_id;
		list_append (batch_dirs, job_id_ptr);
	}
	closedir(f_dir);
}
/* All pending batch jobs must have a batch_dir entry,
 * otherwise we flag it as FAILED and don't schedule
 * If the batch_dir entry exists for a PENDING or RUNNING batch job,
 * remove it the list (of directories to be deleted) */
static void _validate_job_files(List batch_dirs)
{
	ListIterator job_iterator;
	struct job_record *job_ptr;
	int del_cnt;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		if (!job_ptr->batch_flag)
			continue;	/* not a batch job, no files */
		if (IS_JOB_FINISHED(job_ptr))
			continue;	/* its files may be purged */
		/* Want to keep this job's files */
		del_cnt = list_delete_all(batch_dirs, _find_batch_dir,
					  &(job_ptr->job_id));
		if ((del_cnt == 0) && IS_JOB_PENDING(job_ptr)) {
			/* Pending batch job whose script is gone from disk
			 * can never run: fail it now */
			error("Script for job %u lost, state set to FAILED",
			      job_ptr->job_id);
			job_ptr->job_state = JOB_FAILED;
			job_ptr->exit_code = 1;
			job_ptr->state_reason = FAIL_SYSTEM;
			xfree(job_ptr->state_desc);
			job_ptr->start_time = job_ptr->end_time = time(NULL);
			job_completion_logger(job_ptr, false);
		}
	}
	list_iterator_destroy(job_iterator);
}
/* List matching function, see common/list.h: return non-zero when the two
 * uint32_t job ids pointed to by x and key compare equal */
static int _find_batch_dir(void *x, void *key)
{
	uint32_t *id_a = x;
	uint32_t *id_b = key;

	return (*id_a == *id_b) ? 1 : 0;
}
/* List entry deletion function, see common/list.h */
static void _del_batch_list_rec(void *x)
{
	/* Entries are bare xmalloc'ed uint32_t job ids; just release them */
	xfree(x);
}
/* Remove all batch_dir entries in the list
 * NOTE: READ lock_slurmctld config before entry */
static void _remove_defunct_batch_dirs(List batch_dirs)
{
	ListIterator dir_iter;
	uint32_t *defunct_id;

	/* Every id left in the list belongs to no live job; purge the
	 * corresponding on-disk job description files */
	dir_iter = list_iterator_create(batch_dirs);
	while ((defunct_id = list_next(dir_iter))) {
		info("Purging files for defunct batch job %u",
		     *defunct_id);
		_delete_job_desc_files(*defunct_id);
	}
	list_iterator_destroy(dir_iter);
}
/*
 * _xmit_new_end_time
 *	Tell all slurmd's associated with a job of its new end time
 * IN job_ptr - pointer to terminating job
 * globals: node_record_count - number of nodes in the system
 *	node_record_table_ptr - pointer to global node table
 */
static void
_xmit_new_end_time(struct job_record *job_ptr)
{
#ifndef HAVE_FRONT_END
	int i;
#endif
	job_time_msg_t *job_time_msg_ptr;
	agent_arg_t *agent_args;

	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type = REQUEST_UPDATE_JOB_TIME;
	agent_args->retry = 1;
	agent_args->hostlist = hostlist_create("");
	if (agent_args->hostlist == NULL)
		fatal("hostlist_create: malloc failure");
	job_time_msg_ptr = xmalloc(sizeof(job_time_msg_t));
	job_time_msg_ptr->job_id = job_ptr->job_id;
	job_time_msg_ptr->expiration_time = job_ptr->end_time;
#ifdef HAVE_FRONT_END
	/* Front end systems only talk to the single front end node */
	xassert(job_ptr->batch_host);
	hostlist_push(agent_args->hostlist, job_ptr->batch_host);
	agent_args->node_count = 1;
#else
	/* Address every node in the job's allocation */
	for (i = 0; i < node_record_count; i++) {
		if (bit_test(job_ptr->node_bitmap, i) == 0)
			continue;
		hostlist_push(agent_args->hostlist,
			      node_record_table_ptr[i].name);
		agent_args->node_count++;
	}
#endif
	agent_args->msg_args = job_time_msg_ptr;
	agent_queue_request(agent_args);
	return;
}
/*
 * job_epilog_complete - Note the completion of the epilog script for a
 *	given job
 * IN job_id - id of the job for which the epilog was executed
 * IN node_name - name of the node on which the epilog was executed
 * IN return_code - return code from epilog script
 * RET true if job is COMPLETED, otherwise false
 *
 * Side effects: marks the node idle (or DOWN on epilog error),
 * clears front-end completing state, and requeues a batch job whose
 * launch credential was revoked (job PENDING with batch_flag set).
 */
extern bool job_epilog_complete(uint32_t job_id, char *node_name,
				uint32_t return_code)
{
#ifdef HAVE_FRONT_END
	int i;
#endif
	struct job_record *job_ptr = find_job_record(job_id);
	struct node_record *node_ptr;

	/* Unknown job id: nothing to clean up, report "completed" */
	if (job_ptr == NULL)
		return true;

	/* There is a potential race condition this handles.
	 * If slurmctld cold-starts while slurmd keeps running,
	 * slurmd could notify slurmctld of a job epilog completion
	 * before getting synced up with slurmctld state. If
	 * a new job arrives and the job_id is reused, we
	 * could try to note the termination of a job that
	 * hasn't really started. Very rare obviously. */
	if ((IS_JOB_PENDING(job_ptr) && (!IS_JOB_COMPLETING(job_ptr))) ||
	    (job_ptr->node_bitmap == NULL)) {
#ifndef HAVE_FRONT_END
		uint16_t base_state = NODE_STATE_UNKNOWN;
		node_ptr = find_node_record(node_name);
		if (node_ptr)
			base_state = node_ptr->node_state & NODE_STATE_BASE;
		if (base_state == NODE_STATE_DOWN) {
			/* Expected while a DOWN node resyncs */
			debug("Epilog complete response for job %u from DOWN "
			      "node %s", job_id, node_name);
		} else {
			error("Epilog complete response for non-running job "
			      "%u, slurmctld and slurmd out of sync", job_id);
		}
#endif
		return false;
	}

#ifdef HAVE_FRONT_END
	xassert(job_ptr->batch_host);
	/* If there is a bad epilog error don't down the frontend
	   node. If needed (not on a bluegene) the nodes in use by
	   the job will be downed below.
	*/
	if (return_code)
		error("Epilog error for job %u on %s",
		      job_ptr->job_id, job_ptr->batch_host);

	/* Track how many completing jobs remain on the front end node */
	if (job_ptr->front_end_ptr && IS_JOB_COMPLETING(job_ptr)) {
		front_end_record_t *front_end_ptr = job_ptr->front_end_ptr;
		if (front_end_ptr->job_cnt_comp)
			front_end_ptr->job_cnt_comp--;
		else {
			/* Fixed duplicated word ("for for") in message */
			error("job_cnt_comp underflow for job %u on "
			      "front end %s",
			      job_ptr->job_id, front_end_ptr->name);
		}
		if (front_end_ptr->job_cnt_comp == 0)
			front_end_ptr->node_state &= (~NODE_STATE_COMPLETING);
	}

	if ((job_ptr->total_nodes == 0) && IS_JOB_COMPLETING(job_ptr)) {
		/* Job resources moved into another job and
		 * tasks already killed */
		front_end_record_t *front_end_ptr = job_ptr->front_end_ptr;
		if (front_end_ptr)
			front_end_ptr->node_state &= (~NODE_STATE_COMPLETING);
	} else {
		/* Update the state of every node the job used */
		for (i = 0; i < node_record_count; i++) {
			if (!bit_test(job_ptr->node_bitmap, i))
				continue;
			node_ptr = &node_record_table_ptr[i];
			if (return_code)
				set_node_down_ptr(node_ptr, "Epilog error");
			else
				make_node_idle(node_ptr, job_ptr);
		}
	}
#else
	if (return_code) {
		error("Epilog error on %s, setting DOWN", node_name);
		set_node_down(node_name, "Epilog error");
	} else {
		node_ptr = find_node_record(node_name);
		if (node_ptr)
			make_node_idle(node_ptr, job_ptr);
	}
#endif

	step_epilog_complete(job_ptr, node_name);
	/* nodes_completing is out of date, rebuild when next saved */
	xfree(job_ptr->nodes_completing);
	if (!IS_JOB_COMPLETING(job_ptr)) {	/* COMPLETED */
		if (IS_JOB_PENDING(job_ptr) && (job_ptr->batch_flag)) {
			info("requeue batch job %u", job_ptr->job_id);
			/* Clear everything so this appears to be a new job
			 * and then restart it in accounting. */
			job_ptr->start_time = job_ptr->end_time = 0;
			job_ptr->total_cpus = 0;
			/* Current code (<= 2.1) has it so we start the new
			 * job with the next step id. This could be used
			 * when restarting to figure out which step the
			 * previous run of this job stopped on. */
			//job_ptr->next_step_id = 0;

			job_ptr->node_cnt = 0;
#ifdef HAVE_BG
			select_g_select_jobinfo_set(
				job_ptr->select_jobinfo,
				SELECT_JOBDATA_BLOCK_ID,
				"unassigned");
#endif
			xfree(job_ptr->nodes);
			xfree(job_ptr->nodes_completing);
			FREE_NULL_BITMAP(job_ptr->node_bitmap);
			if (job_ptr->details) {
				/* the time stamp on the new batch launch
				 * credential must be larger than the time
				 * stamp on the revoke request. Also the
				 * I/O must be all cleared out and the
				 * named socket purged, so delay for at
				 * least ten seconds. */
				job_ptr->details->begin_time = time(NULL) + 10;
				if (!with_slurmdbd)
					jobacct_storage_g_job_start(
						acct_db_conn, job_ptr);
			}
		}
		return true;
	} else
		return false;
}
/* job_fini - free all memory associated with job records */
void job_fini (void)
{
	/* Destroying the list runs the per-record destructor on every
	 * job; the hash table only holds pointers into that list, so
	 * it is simply freed afterwards. */
	if (job_list) {
		list_destroy(job_list);
		job_list = NULL;
	}
	xfree(job_hash);
}
/* log the completion of the specified job
 * IN job_ptr - the completed (or requeued) job
 * IN requeue - true if the job is being requeued rather than ending;
 *	selects the REQUEUE vs END mail notification
 * Side effects: releases the job's submit-limit count, notifies
 * srun/salloc, sends completion mail, writes the jobcomp record, and
 * closes out the job in the accounting storage. */
extern void job_completion_logger(struct job_record *job_ptr, bool requeue)
{
	int base_state;
	xassert(job_ptr);
	/* Job is leaving the queue (or restarting): release its count
	 * against the per-association/QOS submit limits */
	acct_policy_remove_job_submit(job_ptr);
	if (!IS_JOB_RESIZING(job_ptr)) {
		/* Remove configuring state just to make sure it isn't there
		 * since it will throw off displays of the job. */
		job_ptr->job_state &= (~JOB_CONFIGURING);
		/* make sure all parts of the job are notified */
		srun_job_complete(job_ptr);
		/* mail out notifications of completion */
		base_state = job_ptr->job_state & JOB_STATE_BASE;
		if ((base_state == JOB_COMPLETE) ||
		    (base_state == JOB_CANCELLED)) {
			if (requeue && (job_ptr->mail_type & MAIL_JOB_REQUEUE))
				mail_job_info(job_ptr, MAIL_JOB_REQUEUE);
			if (!requeue && (job_ptr->mail_type & MAIL_JOB_END))
				mail_job_info(job_ptr, MAIL_JOB_END);
		} else {	/* JOB_FAILED, JOB_TIMEOUT, etc. */
			if (job_ptr->mail_type & MAIL_JOB_FAIL)
				mail_job_info(job_ptr, MAIL_JOB_FAIL);
			else if (job_ptr->mail_type & MAIL_JOB_END)
				mail_job_info(job_ptr, MAIL_JOB_END);
		}
	}
	g_slurm_jobcomp_write(job_ptr);
	/* When starting the resized job everything is taken care of
	   there, so don't call it here. */
	if (IS_JOB_RESIZING(job_ptr))
		return;
	if(!job_ptr->assoc_id) {
		slurmdb_association_rec_t assoc_rec;
		/* In case accounting enabled after starting the job */
		memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t));
		assoc_rec.acct = job_ptr->account;
		assoc_rec.partition = job_ptr->partition;
		assoc_rec.uid = job_ptr->user_id;
		if(!(assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
					     accounting_enforce,
					     (slurmdb_association_rec_t **)
					     &job_ptr->assoc_ptr))) {
			job_ptr->assoc_id = assoc_rec.id;
			/* we have to call job start again because the
			 * associd does not get updated in job complete */
			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
		}
	}
	/* If the job was never recorded as started (no db_index and not
	 * using slurmdbd), record the start before the completion */
	if(!with_slurmdbd && !job_ptr->db_index)
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
	jobacct_storage_g_job_complete(acct_db_conn, job_ptr);
}
/*
 * job_independent - determine if this job has a dependent job pending
 *	or if the job's scheduled begin time is in the future
 * IN job_ptr - pointer to job being tested
 * IN will_run - unused here; retained for interface compatibility
 * RET - true if job no longer must be deferred for another job
 *
 * Side effects: may cancel the job outright when its dependency can
 * never be satisfied, and updates state_reason/begin_time/priority as
 * the job becomes eligible.
 */
extern bool job_independent(struct job_record *job_ptr, int will_run)
{
	struct job_details *detail_ptr = job_ptr->details;
	time_t now = time(NULL);
	int depend_rc;
	bool independent = false;

	/* Test dependencies first so we can cancel jobs before dependent
	 * job records get purged (e.g. afterok, afternotok) */
	depend_rc = test_job_dependency(job_ptr);
	if (depend_rc == 1) {
		/* Still waiting; don't clobber an admin/user hold reason */
		if ((job_ptr->state_reason != WAIT_HELD) &&
		    (job_ptr->state_reason != WAIT_HELD_USER)) {
			job_ptr->state_reason = WAIT_DEPENDENCY;
			xfree(job_ptr->state_desc);
		}
		return false;
	} else if (depend_rc == 2) {
		/* Dependency can never be satisfied: cancel the job.
		 * (Removed a redundant second srun_allocate_abort() call
		 * and a shadowing local "now" declaration.) */
		info("Job dependency can't be satisfied, cancelling job %u",
		     job_ptr->job_id);
		job_ptr->job_state = JOB_CANCELLED;
		xfree(job_ptr->state_desc);
		job_ptr->start_time = now;
		job_ptr->end_time = now;
		srun_allocate_abort(job_ptr);
		job_completion_logger(job_ptr, false);
		return false;
	}

	if (detail_ptr && (detail_ptr->begin_time > now)) {
		job_ptr->state_reason = WAIT_TIME;
		xfree(job_ptr->state_desc);
		return false;	/* not yet time */
	}

	if (job_test_resv_now(job_ptr) != SLURM_SUCCESS) {
		job_ptr->state_reason = WAIT_RESERVATION;
		xfree(job_ptr->state_desc);
		return false;	/* not yet time */
	}

	/* Job is eligible to start now */
	if (job_ptr->state_reason == WAIT_DEPENDENCY) {
		job_ptr->state_reason = WAIT_NO_REASON;
		independent = true;
		xfree(job_ptr->state_desc);
	}
	if ((detail_ptr && (detail_ptr->begin_time == 0) &&
	    (job_ptr->priority != 0))) {
		/* First time eligible: stamp begin_time and, if a
		 * dependency was just cleared, recompute the priority */
		detail_ptr->begin_time = now;
		if (independent)
			_set_job_prio(job_ptr);
	} else if (job_ptr->state_reason == WAIT_TIME) {
		job_ptr->state_reason = WAIT_NO_REASON;
		xfree(job_ptr->state_desc);
	}
	return true;
}
/*
 * determine if job is ready to execute per the node select plugin
 * IN job_id - job to test
 * OUT ready - 1 if job is ready to execute 0 otherwise
 * RET SLURM error code
 */
extern int job_node_ready(uint32_t job_id, int *ready)
{
	struct job_record *job_ptr;
	int flags;

	xassert(ready);
	*ready = 0;

	job_ptr = find_job_record(job_id);
	if (!job_ptr)
		return ESLURM_INVALID_JOB_ID;

	/* Always call select_g_job_ready() so that select/bluegene can
	 * test and update block state information. */
	flags = select_g_job_ready(job_ptr);
	if (flags == READY_JOB_FATAL)
		return ESLURM_INVALID_PARTITION_NAME;
	if (flags == READY_JOB_ERROR)
		return EAGAIN;
	if (flags)
		flags = READY_NODE_STATE;

	if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
		flags |= READY_JOB_STATE;

	*ready = flags;
	return SLURM_SUCCESS;
}
/* Send specified signal to all steps associated with a job */
static void _signal_job(struct job_record *job_ptr, int signal)
{
#ifndef HAVE_FRONT_END
int i;
#endif
agent_arg_t *agent_args = NULL;
signal_job_msg_t *signal_job_msg = NULL;
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_SIGNAL_JOB;
agent_args->retry = 1;
agent_args->hostlist = hostlist_create("");
if (agent_args->hostlist == NULL)
fatal("hostlist_create: malloc failure");
signal_job_msg = xmalloc(sizeof(kill_tasks_msg_t));
signal_job_msg->job_id = job_ptr->job_id;
signal_job_msg->signal = signal;
#ifdef HAVE_FRONT_END
xassert(job_ptr->batch_host);
hostlist_push(agent_args->hostlist, job_ptr->batch_host);
agent_args->node_count = 1;
#else
for (i = 0; i < node_record_count; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
hostlist_push(agent_args->hostlist,
node_record_table_ptr[i].name);
agent_args->node_count++;
}
#endif
if (agent_args->node_count == 0) {
xfree(signal_job_msg);
xfree(agent_args);
return;
}
agent_args->msg_args = signal_job_msg;
agent_queue_request(agent_args);
return;
}
/* Send suspend request to slumrd of all nodes associated with a job */
static void _suspend_job(struct job_record *job_ptr, uint16_t op)
{
#ifndef HAVE_FRONT_END
int i;
#endif
agent_arg_t *agent_args;
suspend_msg_t *sus_ptr;
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_SUSPEND;
agent_args->retry = 0; /* don't resend, gang scheduler
* sched/wiki or sched/wiki2 can
* quickly induce huge backlog
* of agent.c RPCs */
agent_args->hostlist = hostlist_create("");
if (agent_args->hostlist == NULL)
fatal("hostlist_create: malloc failure");
sus_ptr = xmalloc(sizeof(suspend_msg_t));
sus_ptr->job_id = job_ptr->job_id;
sus_ptr->op = op;
#ifdef HAVE_FRONT_END
xassert(job_ptr->batch_host);
hostlist_push(agent_args->hostlist, job_ptr->batch_host);
agent_args->node_count = 1;
#else
for (i = 0; i < node_record_count; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
hostlist_push(agent_args->hostlist,
node_record_table_ptr[i].name);
agent_args->node_count++;
}
#endif
if (agent_args->node_count == 0) {
xfree(sus_ptr);
xfree(agent_args);
return;
}
agent_args->msg_args = sus_ptr;
agent_queue_request(agent_args);
return;
}
/*
 * Specified job is being suspended, release allocated nodes
 * job_ptr IN - job to be suspended
 * indf_susp IN - set if job is being suspended indefinitely by user
 *	or admin, otherwise suspended for gang scheduling
 * RET SLURM_SUCCESS or error code from select_g_job_suspend()
 * Side effects: adjusts per-node job counters, node state, and the
 * idle/share node bitmaps; updates last_job_update/last_node_update.
 */
static int _suspend_job_nodes(struct job_record *job_ptr, bool indf_susp)
{
	int i, rc = SLURM_SUCCESS;
	struct node_record *node_ptr = node_record_table_ptr;
	uint16_t node_flags;
	time_t now = time(NULL);
	/* Let the select plugin veto/record the suspend first */
	if ((rc = select_g_job_suspend(job_ptr, indf_susp)) != SLURM_SUCCESS)
		return rc;
	for (i=0; i<node_record_count; i++, node_ptr++) {
		if (bit_test(job_ptr->node_bitmap, i) == 0)
			continue;
		/* This node now hosts one more suspended job and one
		 * fewer running job */
		node_ptr->sus_job_cnt++;
		if (node_ptr->run_job_cnt)
			(node_ptr->run_job_cnt)--;
		else {
			error("Node %s run_job_cnt underflow",
			      node_ptr->name);
		}
		if (job_ptr->details && (job_ptr->details->shared == 0)) {
			/* Exclusive job: node may become shareable again */
			if (node_ptr->no_share_job_cnt)
				(node_ptr->no_share_job_cnt)--;
			else {
				error("Node %s no_share_job_cnt "
				      "underflow", node_ptr->name);
			}
			if (node_ptr->no_share_job_cnt == 0)
				bit_set(share_node_bitmap, i);
		}
		/* Preserve flag bits, rebuild the base state below */
		node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
		if ((node_ptr->run_job_cnt == 0) &&
		    (node_ptr->comp_job_cnt == 0)) {
			bit_set(idle_node_bitmap, i);
		}
		if (IS_NODE_DOWN(node_ptr)) {
			debug3("_suspend_job_nodes: Node %s left DOWN",
			       node_ptr->name);
		} else if (node_ptr->run_job_cnt) {
			node_ptr->node_state = NODE_STATE_ALLOCATED |
					       node_flags;
		} else {
			node_ptr->node_state = NODE_STATE_IDLE | node_flags;
			node_ptr->last_idle = now;
		}
	}
	last_job_update = last_node_update = now;
	return rc;
}
/*
* Specified job is being resumed, re-allocate the nodes
* job_ptr IN - job to be resumed
* indf_susp IN - set if job is being resumed from indefinite suspend by user
* or admin, otherwise resume from gang scheduling
*/
/* Re-allocate the nodes of a job being resumed from suspension.
 * job_ptr IN - job to be resumed
 * indf_susp IN - set if resuming from an indefinite (user/admin)
 *	suspend, otherwise resuming from gang scheduling
 * RET SLURM_SUCCESS, or SLURM_ERROR if any allocated node is DOWN,
 *	or the error code from select_g_job_resume() */
static int _resume_job_nodes(struct job_record *job_ptr, bool indf_susp)
{
	int i, rc = SLURM_SUCCESS;
	struct node_record *node_ptr = node_record_table_ptr;
	uint16_t node_flags;
	if ((rc = select_g_job_resume(job_ptr, indf_susp)) != SLURM_SUCCESS)
		return rc;
	/* First pass: refuse to resume onto any DOWN node.
	 * NOTE(review): returns without undoing select_g_job_resume()
	 * above — confirm this is intended. */
	for (i=0; i<node_record_count; i++, node_ptr++) {
		if (bit_test(job_ptr->node_bitmap, i) == 0)
			continue;
		if (IS_NODE_DOWN(node_ptr))
			return SLURM_ERROR;
	}
	/* Second pass: restore running-job counters and node states */
	node_ptr = node_record_table_ptr;
	for (i=0; i<node_record_count; i++, node_ptr++) {
		if (bit_test(job_ptr->node_bitmap, i) == 0)
			continue;
		if (node_ptr->sus_job_cnt)
			(node_ptr->sus_job_cnt)--;
		else {
			error("Node %s sus_job_cnt underflow",
			      node_ptr->name);
		}
		node_ptr->run_job_cnt++;
		if (job_ptr->details &&
		    (job_ptr->details->shared == 0)) {
			/* Exclusive job: node no longer shareable */
			node_ptr->no_share_job_cnt++;
			if (node_ptr->no_share_job_cnt)
				bit_clear(share_node_bitmap, i);
		}
		bit_clear(idle_node_bitmap, i);
		/* Preserve flag bits while forcing ALLOCATED base state */
		node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
		node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
	}
	last_job_update = last_node_update = time(NULL);
	return rc;
}
/*
* job_suspend - perform some suspend/resume operation
* IN sus_ptr - suspend/resume request message
* IN uid - user id of the user issuing the RPC
* IN conn_fd - file descriptor on which to send reply,
* -1 if none
* indf_susp IN - set if job is being suspended indefinitely by user or admin
* and we should clear its priority, otherwise suspended
* temporarily for gang scheduling
* IN protocol_version - slurm protocol version of client
* RET 0 on success, otherwise ESLURM error code
*/
/* Perform a job suspend or resume operation (see header comment above).
 * Validates configuration support, job existence, requester authority
 * (root/SlurmUser only), and job state before delegating to the
 * _suspend_job_nodes/_resume_job_nodes helpers and notifying slurmds. */
extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
		       slurm_fd_t conn_fd, bool indf_susp,
		       uint16_t protocol_version)
{
	int rc = SLURM_SUCCESS;
	time_t now = time(NULL);
	struct job_record *job_ptr = NULL;
	slurm_msg_t resp_msg;
	return_code_msg_t rc_msg;
	/* test if this system configuration
	 * supports job suspend/resume */
	if (strcasecmp(slurmctld_conf.switch_type,
		       "switch/federation") == 0) {
		/* Work is needed to support the
		 * release and reuse of switch
		 * windows associated with a job */
		rc = ESLURM_NOT_SUPPORTED;
	}
#ifdef HAVE_BG
	rc = ESLURM_NOT_SUPPORTED;
#endif
	if (rc)
		goto reply;
	/* find the job */
	job_ptr = find_job_record (sus_ptr->job_id);
	if (job_ptr == NULL) {
		rc = ESLURM_INVALID_JOB_ID;
		goto reply;
	}
	/* validate the request: only root or the slurmctld user may
	 * suspend/resume */
	if ((uid != 0) && (uid != getuid())) {
		rc = ESLURM_ACCESS_DENIED;
		goto reply;
	}
	if (IS_JOB_PENDING(job_ptr)) {
		rc = ESLURM_JOB_PENDING;
		goto reply;
	}
	if (IS_JOB_FINISHED(job_ptr)) {
		rc = ESLURM_ALREADY_DONE;
		goto reply;
	}
	/* Notify salloc/srun of suspend/resume */
	srun_job_suspend(job_ptr, sus_ptr->op);
	/* perform the operation */
	if (sus_ptr->op == SUSPEND_JOB) {
		if (!IS_JOB_RUNNING(job_ptr)) {
			rc = ESLURM_DISABLED;
			goto reply;
		}
		rc = _suspend_job_nodes(job_ptr, indf_susp);
		if (rc != SLURM_SUCCESS)
			goto reply;
		_suspend_job(job_ptr, sus_ptr->op);
		job_ptr->job_state = JOB_SUSPENDED;
		/* Indefinite suspend also holds the job's priority */
		if (indf_susp)
			job_ptr->priority = 0;
		/* Accumulate run time prior to this suspension */
		if (job_ptr->suspend_time) {
			job_ptr->pre_sus_time +=
				difftime(now,
					 job_ptr->suspend_time);
		} else {
			job_ptr->pre_sus_time +=
				difftime(now,
					 job_ptr->start_time);
		}
		suspend_job_step(job_ptr);
	} else if (sus_ptr->op == RESUME_JOB) {
		if (!IS_JOB_SUSPENDED(job_ptr)) {
			rc = ESLURM_DISABLED;
			goto reply;
		}
		rc = _resume_job_nodes(job_ptr, indf_susp);
		if (rc != SLURM_SUCCESS)
			goto reply;
		_suspend_job(job_ptr, sus_ptr->op);
		job_ptr->job_state = JOB_RUNNING;
		job_ptr->tot_sus_time +=
			difftime(now, job_ptr->suspend_time);
		/* Lazily detect the wiki schedulers (they manage job
		 * end times themselves) */
		if (!wiki_sched_test) {
			char *sched_type = slurm_get_sched_type();
			if (strcmp(sched_type, "sched/wiki") == 0)
				wiki_sched  = true;
			if (strcmp(sched_type, "sched/wiki2") == 0) {
				wiki_sched  = true;
				wiki2_sched = true;
			}
			xfree(sched_type);
			wiki_sched_test = true;
		}
		if ((job_ptr->time_limit != INFINITE) && (!wiki2_sched)) {
			/* Extend end_time by the accumulated suspend time */
			debug3("Job %u resumed, updating end_time",
			       job_ptr->job_id);
			job_ptr->end_time = now +
				(job_ptr->time_limit * 60)
				- job_ptr->pre_sus_time;
		}
		resume_job_step(job_ptr);
	}
	/* suspend_time records when the current state was entered; it
	 * is (re)set for both suspend and resume operations */
	job_ptr->time_last_active = now;
	job_ptr->suspend_time = now;
	jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
reply:
	if (conn_fd >= 0) {
		slurm_msg_t_init(&resp_msg);
		resp_msg.protocol_version = protocol_version;
		resp_msg.msg_type  = RESPONSE_SLURM_RC;
		rc_msg.return_code = rc;
		resp_msg.data      = &rc_msg;
		slurm_send_node_msg(conn_fd, &resp_msg);
	}
	return rc;
}
/*
* job_requeue - Requeue a running or pending batch job
* IN uid - user id of user issuing the RPC
* IN job_id - id of the job to be requeued
* IN conn_fd - file descriptor on which to send reply
* IN protocol_version - slurm protocol version of client
* IN preempt - true if job being preempted
* RET 0 on success, otherwise ESLURM error code
*/
/* Requeue a running or pending batch job (see header comment above).
 * The job is logged as CANCELLED in accounting, its nodes are
 * deallocated, and it is returned to PENDING with a fresh submit
 * time so the restarted run appears as a new job. */
extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd_t conn_fd,
			uint16_t protocol_version, bool preempt)
{
	int rc = SLURM_SUCCESS;
	struct job_record *job_ptr = NULL;
	bool suspended = false;
	slurm_msg_t resp_msg;
	return_code_msg_t rc_msg;
	time_t now = time(NULL);
	/* find the job */
	job_ptr = find_job_record (job_id);
	if (job_ptr == NULL) {
		rc = ESLURM_INVALID_JOB_ID;
		goto reply;
	}
	/* validate the request: job owner, operator, or the account's
	 * coordinator */
	if ((uid != job_ptr->user_id) && !validate_operator(uid) &&
	    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
					  job_ptr->account)) {
		rc = ESLURM_ACCESS_DENIED;
		goto reply;
	}
	if (IS_JOB_FINISHED(job_ptr)) {
		rc = ESLURM_ALREADY_DONE;
		goto reply;
	}
	/* Job must permit requeue (--requeue / JobRequeue config) */
	if ((job_ptr->details == NULL) || (job_ptr->details->requeue == 0)) {
		rc = ESLURM_DISABLED;
		goto reply;
	}
	if (IS_JOB_COMPLETING(job_ptr)) {
		if (IS_JOB_PENDING(job_ptr))
			goto reply;	/* already requeued */
		rc = ESLURM_TRANSITION_STATE_NO_UPDATE;
		goto reply;
	}
	/* nothing else to do if pending */
	if (IS_JOB_PENDING(job_ptr))
		goto reply;
	if (job_ptr->batch_flag == 0) {
		debug("Job-requeue can only be done for batch jobs");
		rc = ESLURM_BATCH_ONLY;
		goto reply;
	}
	if (!IS_JOB_SUSPENDED(job_ptr) && !IS_JOB_RUNNING(job_ptr)) {
		error("job_requeue job %u state is bad %s", job_id,
		      job_state_string(job_ptr->job_state));
		rc = EINVAL;
		goto reply;
	}
	slurm_sched_requeue(job_ptr, "Job requeued by user/admin");
	last_job_update = now;
	if (IS_JOB_SUSPENDED(job_ptr)) {
		enum job_states suspend_job_state = job_ptr->job_state;
		/* we can't have it as suspended when we call the
		 * accounting stuff.
		 */
		job_ptr->job_state = JOB_CANCELLED;
		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
		job_ptr->job_state = suspend_job_state;
		suspended = true;
	}
	job_ptr->time_last_active  = now;
	/* A suspended job "ended" when it was suspended */
	if (suspended)
		job_ptr->end_time = job_ptr->suspend_time;
	else
		job_ptr->end_time = now;
	/* We want this job to look like it was cancelled in the
	 * accounting logs. Set a new submit time so the restarted
	 * job looks like a new job. */
	job_ptr->job_state  = JOB_CANCELLED;
	build_cg_bitmap(job_ptr);
	deallocate_nodes(job_ptr, false, suspended, preempt);
	xfree(job_ptr->details->req_node_layout);
	job_completion_logger(job_ptr, true);
	/* Reset accounting/state fields so the rerun looks new */
	job_ptr->db_index = 0;
	job_ptr->job_state = JOB_PENDING;
	if (job_ptr->node_cnt)
		job_ptr->job_state |= JOB_COMPLETING;
	job_ptr->details->submit_time = now;
	job_ptr->pre_sus_time = (time_t) 0;
	job_ptr->suspend_time = (time_t) 0;
	job_ptr->tot_sus_time = (time_t) 0;
	job_ptr->restart_cnt++;
	/* Since the job completion logger removes the submit we need
	 * to add it again. */
	acct_policy_add_job_submit(job_ptr);
reply:
	if (conn_fd >= 0) {
		slurm_msg_t_init(&resp_msg);
		resp_msg.protocol_version = protocol_version;
		resp_msg.msg_type  = RESPONSE_SLURM_RC;
		rc_msg.return_code = rc;
		resp_msg.data      = &rc_msg;
		slurm_send_node_msg(conn_fd, &resp_msg);
	}
	return rc;
}
/*
 * job_end_time - Process JOB_END_TIME
 * IN time_req_msg - job end time request
 * OUT timeout_msg - job timeout response to be sent
 * RET SLURM_SUCCESS or an error code
 */
extern int job_end_time(job_alloc_info_msg_t *time_req_msg,
			srun_timeout_msg_t *timeout_msg)
{
	struct job_record *job_rec;

	xassert(timeout_msg);

	job_rec = find_job_record(time_req_msg->job_id);
	if (job_rec == NULL)
		return ESLURM_INVALID_JOB_ID;

	timeout_msg->job_id  = time_req_msg->job_id;
	timeout_msg->step_id = NO_VAL;
	timeout_msg->timeout = job_rec->end_time;
	return SLURM_SUCCESS;
}
/* Rebuild the nodes_completing field for every COMPLETING job from
 * its node bitmap.
 * Job write lock must be set before calling. */
extern void update_job_nodes_completing(void)
{
	ListIterator iter;
	struct job_record *job_rec;

	if (!job_list)
		return;

	iter = list_iterator_create(job_list);
	while ((job_rec = (struct job_record *) list_next(iter))) {
		if (IS_JOB_COMPLETING(job_rec) &&
		    (job_rec->node_bitmap != NULL)) {
			xfree(job_rec->nodes_completing);
			job_rec->nodes_completing =
				bitmap2node_name(job_rec->node_bitmap);
		}
	}
	list_iterator_destroy(iter);
}
/*
 * job_cancel_by_assoc_id - Cancel all pending and running jobs with a given
 *	association ID. This happens when an association is deleted (e.g. when
 *	a user is removed from the association database).
 * RET count of cancelled jobs
 */
extern int job_cancel_by_assoc_id(uint32_t assoc_id)
{
	int cancel_cnt = 0;
	ListIterator iter;
	struct job_record *job_rec;
	/* Write lock on jobs */
	slurmctld_lock_t job_write_lock =
		{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };

	if (!job_list)
		return cancel_cnt;

	lock_slurmctld(job_write_lock);
	iter = list_iterator_create(job_list);
	while ((job_rec = (struct job_record *) list_next(iter))) {
		if (job_rec->assoc_id != assoc_id)
			continue;
		/* move up to the parent that should still exist */
		if (job_rec->assoc_ptr) {
			/* Force a start so the association doesn't
			   get lost. Since there could be some delay
			   in the start of the job when running with
			   the slurmdbd.
			*/
			if (!job_rec->db_index) {
				jobacct_storage_g_job_start(acct_db_conn,
							    job_rec);
			}
			job_rec->assoc_ptr =
				((slurmdb_association_rec_t *)
				 job_rec->assoc_ptr)->usage->parent_assoc_ptr;
			if (job_rec->assoc_ptr)
				job_rec->assoc_id =
					((slurmdb_association_rec_t *)
					 job_rec->assoc_ptr)->id;
		}
		if (IS_JOB_FINISHED(job_rec))
			continue;

		info("Association deleted, cancelling job %u",
		     job_rec->job_id);
		/* make sure the assoc_mgr_lock isn't locked before this. */
		job_signal(job_rec->job_id, SIGKILL, 0, 0, false);
		job_rec->state_reason = FAIL_ACCOUNT;
		xfree(job_rec->state_desc);
		cancel_cnt++;
	}
	list_iterator_destroy(iter);
	unlock_slurmctld(job_write_lock);
	return cancel_cnt;
}
/*
 * job_cancel_by_qos_id - Cancel all pending and running jobs with a given
 *	QOS ID. This happens when a QOS is deleted (e.g. when
 *	a QOS is removed from the association database).
 * RET count of cancelled jobs
 */
extern int job_cancel_by_qos_id(uint32_t qos_id)
{
	int cancel_cnt = 0;
	ListIterator iter;
	struct job_record *job_rec;
	/* Write lock on jobs */
	slurmctld_lock_t job_write_lock =
		{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };

	if (!job_list)
		return cancel_cnt;

	lock_slurmctld(job_write_lock);
	iter = list_iterator_create(job_list);
	while ((job_rec = (struct job_record *) list_next(iter))) {
		if (job_rec->qos_id != qos_id)
			continue;
		/* move up to the parent that should still exist */
		if (job_rec->qos_ptr) {
			/* Force a start so the association doesn't
			   get lost. Since there could be some delay
			   in the start of the job when running with
			   the slurmdbd.
			*/
			if (!job_rec->db_index) {
				jobacct_storage_g_job_start(acct_db_conn,
							    job_rec);
			}
			job_rec->qos_ptr = NULL;
		}
		if (IS_JOB_FINISHED(job_rec))
			continue;

		info("QOS deleted, cancelling job %u",
		     job_rec->job_id);
		/* make sure the assoc_mgr_lock isn't locked before this. */
		job_signal(job_rec->job_id, SIGKILL, 0, 0, false);
		job_rec->state_reason = FAIL_QOS;
		xfree(job_rec->state_desc);
		cancel_cnt++;
	}
	list_iterator_destroy(iter);
	unlock_slurmctld(job_write_lock);
	return cancel_cnt;
}
/*
 * Modify the account associated with a pending job
 * IN module - where this is called from
 * IN job_ptr - pointer to job which should be modified
 * IN new_account - desired account name
 * RET SLURM_SUCCESS or error code
 * Side effects: updates job_ptr->account, assoc_id, assoc_ptr and
 * last_job_update on success.
 */
extern int update_job_account(char *module, struct job_record *job_ptr,
			      char *new_account)
{
	slurmdb_association_rec_t assoc_rec;
	/* Only a pending job with intact details may change account */
	if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) {
		info("%s: attempt to modify account for non-pending "
		     "job_id %u", module, job_ptr->job_id);
		return ESLURM_DISABLED;
	}
	memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t));
	assoc_rec.uid       = job_ptr->user_id;
	assoc_rec.partition = job_ptr->partition;
	assoc_rec.acct      = new_account;
	if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
				    accounting_enforce,
				    (slurmdb_association_rec_t **)
				    &job_ptr->assoc_ptr)) {
		info("%s: invalid account %s for job_id %u",
		     module, new_account, job_ptr->job_id);
		return ESLURM_INVALID_ACCOUNT;
	} else if(association_based_accounting &&
		  !job_ptr->assoc_ptr &&
		  !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) {
		/* if not enforcing associations we want to look for
		 * the default account and use it to avoid getting
		 * trash in the accounting records.
		 */
		assoc_rec.acct = NULL;
		assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
					accounting_enforce,
					(slurmdb_association_rec_t **)
					&job_ptr->assoc_ptr);
		if(!job_ptr->assoc_ptr) {
			debug("%s: we didn't have an association for account "
			      "'%s' and user '%u', and we can't seem to find "
			      "a default one either. Keeping new account "
			      "'%s'. This will produce trash in accounting. "
			      "If this is not what you desire please put "
			      "AccountStorageEnforce=associations "
			      "in your slurm.conf "
			      "file.", module, new_account,
			      job_ptr->user_id, new_account);
			assoc_rec.acct = new_account;
		}
	}
	/* Install the (possibly defaulted) account name on the job */
	xfree(job_ptr->account);
	if (assoc_rec.acct && assoc_rec.acct[0] != '\0') {
		job_ptr->account = xstrdup(assoc_rec.acct);
		info("%s: setting account to %s for job_id %u",
		     module, assoc_rec.acct, job_ptr->job_id);
	} else {
		info("%s: cleared account for job_id %u",
		     module, job_ptr->job_id);
	}
	job_ptr->assoc_id = assoc_rec.id;
	last_job_update = time(NULL);
	return SLURM_SUCCESS;
}
/*
 * Modify the wckey associated with a pending job
 * IN module - where this is called from
 * IN job_ptr - pointer to job which should be modified
 * IN new_wckey - desired wckey name
 * RET SLURM_SUCCESS or error code
 * Side effects: updates job_ptr->wckey and last_job_update on success.
 */
extern int update_job_wckey(char *module, struct job_record *job_ptr,
			    char *new_wckey)
{
	slurmdb_wckey_rec_t wckey_rec, *wckey_ptr;
	/* Only a pending job with intact details may change wckey */
	if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) {
		info("%s: attempt to modify account for non-pending "
		     "job_id %u", module, job_ptr->job_id);
		return ESLURM_DISABLED;
	}
	memset(&wckey_rec, 0, sizeof(slurmdb_wckey_rec_t));
	wckey_rec.uid  = job_ptr->user_id;
	wckey_rec.name = new_wckey;
	if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
				    accounting_enforce, &wckey_ptr)) {
		info("%s: invalid wckey %s for job_id %u",
		     module, new_wckey, job_ptr->job_id);
		return ESLURM_INVALID_WCKEY;
	} else if(association_based_accounting
		  && !wckey_ptr
		  && !(accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS)) {
		/* if not enforcing associations we want to look for
		   the default account and use it to avoid getting
		   trash in the accounting records.
		*/
		wckey_rec.name = NULL;
		assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
					accounting_enforce, &wckey_ptr);
		if(!wckey_ptr) {
			debug("%s: we didn't have a wckey record for wckey "
			      "'%s' and user '%u', and we can't seem to find "
			      "a default one either. Setting it anyway. "
			      "This will produce trash in accounting. "
			      "If this is not what you desire please put "
			      "AccountStorageEnforce=wckeys in your slurm.conf "
			      "file.", module, new_wckey,
			      job_ptr->user_id);
			wckey_rec.name = new_wckey;
		}
	}
	/* Install the (possibly defaulted) wckey name on the job */
	xfree(job_ptr->wckey);
	if (wckey_rec.name && wckey_rec.name[0] != '\0') {
		job_ptr->wckey = xstrdup(wckey_rec.name);
		info("%s: setting wckey to %s for job_id %u",
		     module, wckey_rec.name, job_ptr->job_id);
	} else {
		info("%s: cleared wckey for job_id %u",
		     module, job_ptr->job_id);
	}
	last_job_update = time(NULL);
	return SLURM_SUCCESS;
}
/* Register all active jobs with the accounting storage, typically on
 * (re)connection to the database. Jobs without a valid association are
 * cancelled when associations are enforced; jobs already recorded
 * (db_index set) or finished are skipped.
 * RET SLURM_SUCCESS */
extern int send_jobs_to_accounting(void)
{
	ListIterator itr = NULL;
	struct job_record *job_ptr;
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
	time_t now = time(NULL);

	/* send jobs in pending or running state */
	lock_slurmctld(job_write_lock);
	itr = list_iterator_create(job_list);
	while ((job_ptr = list_next(itr))) {
		/* Resolve the association if the job doesn't have one
		 * yet (e.g. accounting enabled after submission) */
		if(!job_ptr->assoc_id) {
			slurmdb_association_rec_t assoc_rec;
			memset(&assoc_rec, 0,
			       sizeof(slurmdb_association_rec_t));
			assoc_rec.uid       = job_ptr->user_id;
			assoc_rec.partition = job_ptr->partition;
			assoc_rec.acct      = job_ptr->account;
			if(assoc_mgr_fill_in_assoc(
				   acct_db_conn, &assoc_rec,
				   accounting_enforce,
				   (slurmdb_association_rec_t **)
				   &job_ptr->assoc_ptr) &&
			   (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)
			   && (!IS_JOB_FINISHED(job_ptr))) {
				/* No valid association and enforcement
				 * is on: cancel the job */
				info("Cancelling job %u with "
				     "invalid association",
				     job_ptr->job_id);
				job_ptr->job_state = JOB_CANCELLED;
				job_ptr->state_reason = FAIL_ACCOUNT;
				if (IS_JOB_PENDING(job_ptr))
					job_ptr->start_time = now;
				job_ptr->end_time = now;
				job_completion_logger(job_ptr, false);
				continue;
			} else
				job_ptr->assoc_id = assoc_rec.id;
		}
		/* we only want active, un accounted for jobs */
		if(job_ptr->db_index || IS_JOB_FINISHED(job_ptr))
			continue;
		debug("first reg: starting job %u in accounting",
		      job_ptr->job_id);
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
		/* Record current suspension state as well */
		if (IS_JOB_SUSPENDED(job_ptr))
			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
	}
	list_iterator_destroy(itr);
	unlock_slurmctld(job_write_lock);
	return SLURM_SUCCESS;
}
/*
 * job_checkpoint - perform a checkpoint operation on a job
 * IN ckpt_ptr - checkpoint request message (op, job/step id, image dir);
 *	NOTE: ckpt_ptr->image_dir may be allocated/extended here -- the
 *	caller owns and frees the request message
 * IN uid - user issuing the RPC; must be the job owner or a slurm admin
 * IN conn_fd - descriptor for the reply; negative for periodic checkpoints
 *	(no reply is sent)
 * IN protocol_version - slurm protocol version of the client
 * RET SLURM_SUCCESS or an ESLURM error code
 */
extern int job_checkpoint(checkpoint_msg_t *ckpt_ptr, uid_t uid,
			  slurm_fd_t conn_fd, uint16_t protocol_version)
{
	int rc = SLURM_SUCCESS;
	struct job_record *job_ptr;
	struct step_record *step_ptr;
	checkpoint_resp_msg_t resp_data;
	slurm_msg_t resp_msg;

	slurm_msg_t_init(&resp_msg);
	resp_msg.protocol_version = protocol_version;

	/* find the job */
	job_ptr = find_job_record (ckpt_ptr->job_id);
	if (job_ptr == NULL) {
		rc = ESLURM_INVALID_JOB_ID;
		goto reply;
	}
	if ((uid != job_ptr->user_id) && !validate_slurm_user(uid)) {
		rc = ESLURM_ACCESS_DENIED ;
		goto reply;
	}
	/* Only RUNNING jobs can be checkpointed. */
	if (IS_JOB_PENDING(job_ptr)) {
		rc = ESLURM_JOB_PENDING;
		goto reply;
	} else if (IS_JOB_SUSPENDED(job_ptr)) {
		/* job can't get cycles for checkpoint
		 * if it is already suspended */
		rc = ESLURM_DISABLED;
		goto reply;
	} else if (!IS_JOB_RUNNING(job_ptr)) {
		rc = ESLURM_ALREADY_DONE;
		goto reply;
	}

	memset((void *)&resp_data, 0, sizeof(checkpoint_resp_msg_t));

	if (job_ptr->batch_flag) { /* operate on batch job */
		if ((ckpt_ptr->op == CHECK_CREATE) ||
		    (ckpt_ptr->op == CHECK_REQUEUE) ||
		    (ckpt_ptr->op == CHECK_VACATE)) {
			if (job_ptr->details == NULL) {
				rc = ESLURM_DISABLED;
				goto reply;
			}
			if (ckpt_ptr->image_dir == NULL) {
				if (job_ptr->details->ckpt_dir == NULL) {
					rc = ESLURM_DISABLED;
					goto reply;
				}
				ckpt_ptr->image_dir = xstrdup(job_ptr->details
							      ->ckpt_dir);
			}
			/* Save the job record itself so the job can later
			 * be restarted with job_restart(). */
			rc = _checkpoint_job_record(job_ptr,
						    ckpt_ptr->image_dir);
			if (rc != SLURM_SUCCESS)
				goto reply;
		}
		/* append job id to ckpt image dir */
		xstrfmtcat(ckpt_ptr->image_dir, "/%u", job_ptr->job_id);
		rc = checkpoint_op(ckpt_ptr->job_id, ckpt_ptr->step_id, NULL,
				   ckpt_ptr->op, ckpt_ptr->data,
				   ckpt_ptr->image_dir, &resp_data.event_time,
				   &resp_data.error_code,
				   &resp_data.error_msg);
		info("checkpoint_op %u of %u.%u complete, rc=%d",
		     ckpt_ptr->op, ckpt_ptr->job_id, ckpt_ptr->step_id, rc);
		last_job_update = time(NULL);
	} else {		/* operate on all of a job's steps */
		int update_rc = -2;
		ListIterator step_iterator;

		step_iterator = list_iterator_create (job_ptr->step_list);
		while ((step_ptr = (struct step_record *)
				list_next (step_iterator))) {
			char *image_dir = NULL;
			if (ckpt_ptr->image_dir) {
				image_dir = xstrdup(ckpt_ptr->image_dir);
			} else {
				image_dir = xstrdup(step_ptr->ckpt_dir);
			}
			xstrfmtcat(image_dir, "/%u.%u", job_ptr->job_id,
				   step_ptr->step_id);
			update_rc = checkpoint_op(ckpt_ptr->job_id,
						  step_ptr->step_id,
						  step_ptr,
						  ckpt_ptr->op,
						  ckpt_ptr->data,
						  image_dir,
						  &resp_data.event_time,
						  &resp_data.error_code,
						  &resp_data.error_msg);
			/* Log the result of THIS step's operation
			 * (update_rc), not the stale accumulated rc from
			 * the previous iteration. */
			info("checkpoint_op %u of %u.%u complete, rc=%d",
			     ckpt_ptr->op, ckpt_ptr->job_id,
			     step_ptr->step_id, update_rc);
			/* Accumulate the worst (largest) return code. */
			rc = MAX(rc, update_rc);
			xfree(image_dir);
		}
		if (update_rc != -2)	/* some work done */
			last_job_update = time(NULL);
		list_iterator_destroy (step_iterator);
	}

reply:
	if (conn_fd < 0)	/* periodic checkpoint */
		return rc;

	if ((rc == SLURM_SUCCESS) &&
	    ((ckpt_ptr->op == CHECK_ABLE) || (ckpt_ptr->op == CHECK_ERROR))) {
		resp_msg.msg_type = RESPONSE_CHECKPOINT;
		resp_msg.data = &resp_data;
		(void) slurm_send_node_msg(conn_fd, &resp_msg);
	} else {
		return_code_msg_t rc_msg;
		rc_msg.return_code = rc;
		resp_msg.msg_type  = RESPONSE_SLURM_RC;
		resp_msg.data      = &rc_msg;
		(void) slurm_send_node_msg(conn_fd, &resp_msg);
	}
	return rc;
}
/*
 * _checkpoint_job_record - save job to file for checkpoint
 *
 * The job's state file is "<JobCkptDir>/<job_id>.ckpt". A new copy is
 * first written to "<file>.new" and shuffled into place only on success,
 * so a write failure cannot corrupt an existing checkpoint file.
 *
 * IN job_ptr   - the job to checkpoint
 * IN image_dir - checkpoint image directory, recorded in the file so
 *	job_restart() can locate the images later
 * RET SLURM_SUCCESS, or an errno/rc value on failure
 */
static int _checkpoint_job_record (struct job_record *job_ptr, char *image_dir)
{
	static int high_buffer_size = (1024*1024);
	char *ckpt_file = NULL, *old_file = NULL, *new_file = NULL;
	int ckpt_fd, error_code = SLURM_SUCCESS;
	Buf buffer = init_buf(high_buffer_size);

	ckpt_file = xstrdup(slurmctld_conf.job_ckpt_dir);
	xstrfmtcat(ckpt_file, "/%u.ckpt", job_ptr->job_id);

	debug("_checkpoint_job_record: checkpoint job record of %u to file %s",
	      job_ptr->job_id, ckpt_file);

	old_file = xstrdup(ckpt_file);
	xstrcat(old_file, ".old");
	new_file = xstrdup(ckpt_file);
	xstrcat(new_file, ".new");

	/* save version string */
	packstr(JOB_CKPT_VERSION, buffer);

	/* save checkpoint image directory */
	packstr(image_dir, buffer);

	_pack_job_for_ckpt(job_ptr, buffer);

	ckpt_fd = creat(new_file, 0600);
	if (ckpt_fd < 0) {
		error("Can't ckpt job, create file %s error: %m",
		      new_file);
		error_code = errno;
	} else {
		int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
		char *data = (char *)get_buf_data(buffer);

		/* write() may make partial progress or be interrupted;
		 * loop until the whole buffer is out or a hard error hits */
		while (nwrite > 0) {
			amount = write(ckpt_fd, &data[pos], nwrite);
			if ((amount < 0) && (errno != EINTR)) {
				error("Error writing file %s, %m", new_file);
				error_code = errno;
				break;
			} else if (amount >= 0) {
				nwrite -= amount;
				pos += amount;
			}
		}
		rc = fsync_and_close(ckpt_fd, "checkpoint");
		if (rc && !error_code)
			error_code = rc;
	}

	if (error_code)
		(void) unlink(new_file);
	else {			/* file shuffle */
		/* Rotate: <file> -> <file>.old, then <file>.new -> <file>.
		 * A missing link target (e.g. no prior checkpoint) is only
		 * a debug event, not an error. */
		(void) unlink(old_file);
		if(link(ckpt_file, old_file))
			debug4("unable to create link for %s -> %s: %m",
			       ckpt_file, old_file);
		(void) unlink(ckpt_file);
		if(link(new_file, ckpt_file))
			debug4("unable to create link for %s -> %s: %m",
			       new_file, ckpt_file);
		(void) unlink(new_file);
	}

	xfree(ckpt_file);
	xfree(old_file);
	xfree(new_file);
	free_buf(buffer);

	return error_code;
}
/*
 * _pack_job_for_ckpt - serialize a RUNNING job into a buffer for checkpoint
 *
 * Only the information needed to resubmit/restart the job is saved:
 * the allocated node list followed by a packed REQUEST_SUBMIT_BATCH_JOB
 * message built from the job record.
 *
 * IN job_ptr - the job to be checkpointed
 * IN buffer  - destination buffer for the job state
 */
static void _pack_job_for_ckpt (struct job_record *job_ptr, Buf buffer)
{
	slurm_msg_t req_msg;
	job_desc_msg_t *desc;

	/* save allocated nodes */
	packstr(job_ptr->nodes, buffer);

	/* save job req */
	desc = _copy_job_record_to_job_desc(job_ptr);
	req_msg.msg_type = REQUEST_SUBMIT_BATCH_JOB;
	req_msg.protocol_version = SLURM_PROTOCOL_VERSION;
	req_msg.data = desc;
	pack_msg(&req_msg, buffer);

	/* free the environment since all strings are stored in one
	 * xmalloced buffer */
	if (desc->environment) {
		xfree(desc->environment[0]);
		xfree(desc->environment);
		desc->env_size = 0;
	}
	slurm_free_job_desc_msg(desc);
}
/*
 * _copy_job_record_to_job_desc - construct a job_desc_msg_t for a job.
 * IN job_ptr - the job record
 * RET the job_desc_msg_t, NULL on error
 * NOTE: caller must free the result (see slurm_free_job_desc_msg and the
 *	environment handling in _pack_job_for_ckpt)
 * NOTE(review): job_ptr->details and details->mc_ptr are dereferenced
 *	without NULL checks -- callers must guarantee both are set
 */
static job_desc_msg_t *
_copy_job_record_to_job_desc(struct job_record *job_ptr)
{
	job_desc_msg_t *job_desc;
	struct job_details *details = job_ptr->details;
	multi_core_data_t *mc_ptr = details->mc_ptr;
	int i;

	/* construct a job_desc_msg_t from job */
	job_desc = xmalloc(sizeof(job_desc_msg_t));
	job_desc->account           = xstrdup(job_ptr->account);
	job_desc->acctg_freq        = details->acctg_freq;
	job_desc->alloc_node        = xstrdup(job_ptr->alloc_node);
	/* Since the allocating salloc or srun is not expected to exist
	 * when this checkpointed job is restarted, do not save these:
	 *
	 * job_desc->alloc_resp_port = job_ptr->alloc_resp_port;
	 * job_desc->alloc_sid       = job_ptr->alloc_sid;
	 */
	job_desc->argc              = details->argc;
	job_desc->argv              = xmalloc(sizeof(char *) * job_desc->argc);
	for (i = 0; i < job_desc->argc; i ++)
		job_desc->argv[i]   = xstrdup(details->argv[i]);
	job_desc->begin_time        = details->begin_time;
	job_desc->ckpt_interval     = job_ptr->ckpt_interval;
	job_desc->ckpt_dir          = xstrdup(details->ckpt_dir);
	job_desc->comment           = xstrdup(job_ptr->comment);
	job_desc->contiguous        = details->contiguous;
	job_desc->cpu_bind          = xstrdup(details->cpu_bind);
	job_desc->cpu_bind_type     = details->cpu_bind_type;
	job_desc->dependency        = xstrdup(details->dependency);
	job_desc->end_time          = 0; /* Unused today */
	job_desc->environment       = get_job_env(job_ptr,
						  &job_desc->env_size);
	job_desc->exc_nodes         = xstrdup(details->exc_nodes);
	job_desc->features          = xstrdup(details->features);
	job_desc->gres              = xstrdup(job_ptr->gres);
	job_desc->group_id          = job_ptr->group_id;
	job_desc->immediate         = 0; /* nowhere to get this value */
	job_desc->job_id            = job_ptr->job_id;
	job_desc->kill_on_node_fail = job_ptr->kill_on_node_fail;
	job_desc->licenses          = xstrdup(job_ptr->licenses);
	job_desc->mail_type         = job_ptr->mail_type;
	job_desc->mail_user         = xstrdup(job_ptr->mail_user);
	job_desc->mem_bind          = xstrdup(details->mem_bind);
	job_desc->mem_bind_type     = details->mem_bind_type;
	job_desc->name              = xstrdup(job_ptr->name);
	job_desc->network           = xstrdup(job_ptr->network);
	job_desc->nice              = details->nice;
	job_desc->num_tasks         = details->num_tasks;
	job_desc->open_mode         = details->open_mode;
	job_desc->other_port        = job_ptr->other_port;
	job_desc->overcommit        = details->overcommit;
	job_desc->partition         = xstrdup(job_ptr->partition);
	job_desc->plane_size        = details->plane_size;
	job_desc->priority          = job_ptr->priority;
	if (job_ptr->qos_ptr) {
		slurmdb_qos_rec_t *qos_ptr =
			(slurmdb_qos_rec_t *)job_ptr->qos_ptr;
		job_desc->qos       = xstrdup(qos_ptr->name);
	}
	job_desc->resp_host         = xstrdup(job_ptr->resp_host);
	job_desc->req_nodes         = xstrdup(details->req_nodes);
	job_desc->requeue           = details->requeue;
	job_desc->reservation       = xstrdup(job_ptr->resv_name);
	job_desc->script            = get_job_script(job_ptr);
	job_desc->shared            = details->shared;
	job_desc->spank_job_env_size = job_ptr->spank_job_env_size;
	job_desc->spank_job_env     = xmalloc(sizeof(char *) *
					      job_desc->spank_job_env_size);
	for (i = 0; i < job_desc->spank_job_env_size; i ++)
		job_desc->spank_job_env[i] = xstrdup(job_ptr->spank_job_env[i]);
	job_desc->std_err           = xstrdup(details->std_err);
	job_desc->std_in            = xstrdup(details->std_in);
	job_desc->std_out           = xstrdup(details->std_out);
	job_desc->task_dist         = details->task_dist;
	job_desc->time_limit        = job_ptr->time_limit;
	job_desc->time_min          = job_ptr->time_min;
	job_desc->user_id           = job_ptr->user_id;
	job_desc->wait_all_nodes    = job_ptr->wait_all_nodes;
	job_desc->warn_signal       = job_ptr->warn_signal;
	job_desc->warn_time         = job_ptr->warn_time;
	job_desc->wckey             = xstrdup(job_ptr->wckey);
	job_desc->work_dir          = xstrdup(details->work_dir);
	job_desc->pn_min_cpus       = details->pn_min_cpus;
	job_desc->pn_min_memory     = details->pn_min_memory;
	job_desc->pn_min_tmp_disk   = details->pn_min_tmp_disk;
	job_desc->min_cpus          = details->min_cpus;
	job_desc->max_cpus          = details->max_cpus;
	job_desc->min_nodes         = details->min_nodes;
	job_desc->max_nodes         = details->max_nodes;
	job_desc->sockets_per_node  = mc_ptr->sockets_per_node;
	job_desc->cores_per_socket  = mc_ptr->cores_per_socket;
	job_desc->threads_per_core  = mc_ptr->threads_per_core;
	job_desc->cpus_per_task     = details->cpus_per_task;
	job_desc->ntasks_per_node   = details->ntasks_per_node;
	job_desc->ntasks_per_socket = mc_ptr->ntasks_per_socket;
	job_desc->ntasks_per_core   = mc_ptr->ntasks_per_core;

#if 0
	/* select_jobinfo is unused at job submit time, only it's
	 * components are set. We recover those from the structure below.
	 * job_desc->select_jobinfo = select_g_select_jobinfo_copy(job_ptr->
	   select_jobinfo); */

	/* The following fields are used only on BlueGene systems.
	 * Since BlueGene does not use the checkpoint/restart logic today,
	 * we do not use them. */
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_GEOMETRY,
				    &job_desc->geometry);
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_CONN_TYPE,
				    &job_desc->conn_type);
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_REBOOT,
				    &job_desc->reboot);
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_ROTATE,
				    &job_desc->rotate);
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_BLRTS_IMAGE,
				    &job_desc->blrtsimage);
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_LINUX_IMAGE,
				    &job_desc->linuximage);
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_MLOADER_IMAGE,
				    &job_desc->mloaderimage);
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_RAMDISK_IMAGE,
				    &job_desc->ramdiskimage);
#endif

	return job_desc;
}
/*
 * job_restart - Restart a batch job from checkpointed state
 *
 * Restarting a job is similar to submitting a new job, except that
 * the job requirements are loaded from the checkpoint file, and
 * the job id is restored.
 *
 * IN ckpt_ptr - checkpoint request message
 * IN uid - user id of the user issuing the RPC
 * IN conn_fd - file descriptor on which to send reply
 * IN protocol_version - slurm protocol version of client
 * RET 0 on success, otherwise ESLURM error code
 */
extern int job_restart(checkpoint_msg_t *ckpt_ptr, uid_t uid, slurm_fd_t conn_fd,
		       uint16_t protocol_version)
{
	struct job_record *job_ptr;
	/* All of these must start NULL: safe_unpackstr_xmalloc() and the
	 * version check can "goto unpack_error" before image_dir (or the
	 * others) is ever assigned, and the cleanup code xfree()s them.
	 * Freeing an indeterminate pointer is undefined behavior. */
	char *image_dir = NULL, *ckpt_file = NULL, *data = NULL;
	char *ver_str = NULL, *alloc_nodes = NULL;
	int data_size = 0;
	Buf buffer;
	uint32_t tmp_uint32;
	slurm_msg_t msg, resp_msg;
	return_code_msg_t rc_msg;
	job_desc_msg_t *job_desc = NULL;
	int rc = SLURM_SUCCESS;
	uint16_t ckpt_version = (uint16_t) NO_VAL;

	/* Only whole batch jobs can be restarted, not individual steps. */
	if (ckpt_ptr->step_id != SLURM_BATCH_SCRIPT) {
		rc = ESLURM_NOT_SUPPORTED;
		goto reply;
	}

	/* Refuse if a job with this id still exists and is not finished. */
	if ((job_ptr = find_job_record(ckpt_ptr->job_id)) &&
	    ! IS_JOB_FINISHED(job_ptr)) {
		rc = ESLURM_DISABLED;
		goto reply;
	}

	ckpt_file = xstrdup(slurmctld_conf.job_ckpt_dir);
	xstrfmtcat(ckpt_file, "/%u.ckpt", ckpt_ptr->job_id);

	data = _read_job_ckpt_file(ckpt_file, &data_size);
	xfree(ckpt_file);

	if (data == NULL) {
		/* NOTE(review): relies on errno still holding the value
		 * set by open()/read() inside _read_job_ckpt_file() */
		rc = errno;
		goto reply;
	}
	buffer = create_buf(data, data_size);

	/* unpack version string */
	safe_unpackstr_xmalloc(&ver_str, &tmp_uint32, buffer);
	debug3("Version string in job_ckpt header is %s", ver_str);
	if (ver_str) {
		if (!strcmp(ver_str, JOB_CKPT_VERSION)) {
			ckpt_version = SLURM_PROTOCOL_VERSION;
		} else if (!strcmp(ver_str, JOB_2_1_CKPT_VERSION)) {
			ckpt_version = SLURM_2_1_PROTOCOL_VERSION;
		}
	}

	if (ckpt_version == (uint16_t)NO_VAL) {
		error("***************************************************");
		error("Can not restart from job ckpt, incompatible version");
		error("***************************************************");
		rc = EINVAL;
		goto unpack_error;
	}

	/* unpack checkpoint image directory */
	safe_unpackstr_xmalloc(&image_dir, &tmp_uint32, buffer);

	/* unpack the allocated nodes */
	safe_unpackstr_xmalloc(&alloc_nodes, &tmp_uint32, buffer);

	/* unpack the job req */
	msg.msg_type = REQUEST_SUBMIT_BATCH_JOB;
	msg.protocol_version = ckpt_version;

	if (unpack_msg(&msg, buffer) != SLURM_SUCCESS)
		goto unpack_error;

	job_desc = msg.data;

	/* sanity check */
	if (job_desc->job_id != ckpt_ptr->job_id) {
		error("saved job id(%u) is different from required job id(%u)",
		      job_desc->job_id, ckpt_ptr->job_id);
		rc = EINVAL;
		goto unpack_error;
	}
	if (!validate_slurm_user(uid) && (job_desc->user_id != uid)) {
		error("Security violation, user %u not allowed to restart "
		      "job %u of user %u",
		      uid, ckpt_ptr->job_id, job_desc->user_id);
		rc = EPERM;
		goto unpack_error;
	}

	if (ckpt_ptr->data == 1) { /* stick to nodes */
		xfree(job_desc->req_nodes);
		job_desc->req_nodes = alloc_nodes;
		alloc_nodes = NULL;	/* Nothing left to xfree */
	}

	/* set open mode to append */
	job_desc->open_mode = OPEN_MODE_APPEND;

	/* Set new job priority */
	job_desc->priority = NO_VAL;

	/*
	 * XXX: we set submit_uid to 0 in the following job_allocate() call
	 * This is for setting the job_id to the original one.
	 * But this will bypass some partition access permission checks.
	 * TODO: fix this.
	 */
	rc = job_allocate(job_desc,
			  0,		/* immediate */
			  0,		/* will_run */
			  NULL,		/* resp */
			  0,		/* allocate */
			  0,		/* submit_uid. set to 0 to set job_id */
			  &job_ptr);

	/* set restart directory */
	if (job_ptr) {
		if (ckpt_ptr->image_dir) {
			xfree (image_dir);
			image_dir = xstrdup(ckpt_ptr->image_dir);
		}
		xstrfmtcat(image_dir, "/%u", ckpt_ptr->job_id);

		job_ptr->details->restart_dir = image_dir;
		image_dir = NULL;	/* Nothing left to xfree */

		last_job_update = time(NULL);
	}

	/* NOTE(review): the success path deliberately falls through into
	 * this cleanup label; rc must not be overwritten here. An unpack
	 * failure jumping here may still leave rc == SLURM_SUCCESS --
	 * verify whether callers depend on that before changing it. */
unpack_error:
	free_buf(buffer);
	xfree(ver_str);
	xfree(image_dir);
	xfree(alloc_nodes);
	xfree(ckpt_file);

reply:
	slurm_msg_t_init(&resp_msg);
	resp_msg.protocol_version = protocol_version;
	rc_msg.return_code = rc;
	resp_msg.msg_type  = RESPONSE_SLURM_RC;
	resp_msg.data      = &rc_msg;
	(void) slurm_send_node_msg(conn_fd, &resp_msg);

	return rc;
}
/*
 * _read_job_ckpt_file - slurp an entire job checkpoint file into an
 *	xmalloc'd buffer, growing it as needed.
 * IN ckpt_file - path of the checkpoint file to read
 * OUT size_ptr - number of bytes read (set only on success)
 * RET the data buffer (caller must xfree), or NULL on any error
 */
static char *
_read_job_ckpt_file(char *ckpt_file, int *size_ptr)
{
	int fd, err = 0;
	int buf_len, nread, total = 0;
	char *buf = NULL;

	fd = open(ckpt_file, O_RDONLY);
	if (fd < 0) {
		info("No job ckpt file (%s) to read", ckpt_file);
		err = ENOENT;
	} else {
		buf_len = BUF_SIZE;
		buf = xmalloc(buf_len);
		while (1) {
			nread = read(fd, &buf[total], BUF_SIZE);
			if (nread < 0) {
				if (errno == EINTR)
					continue;
				error("Read error on %s: %m",
				      ckpt_file);
				err = errno;
				break;
			} else if (nread == 0)	/* eof */
				break;
			/* Grow the buffer by the amount consumed so at
			 * least BUF_SIZE bytes always remain available. */
			total   += nread;
			buf_len += nread;
			xrealloc(buf, buf_len);
		}
		close(fd);
	}

	if (err) {
		xfree(buf);
		return NULL;
	}
	*size_ptr = total;
	return buf;
}
/* Build a bitmap of nodes completing this job */
extern void build_cg_bitmap(struct job_record *job_ptr)
{
	FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);

	if (job_ptr->node_bitmap == NULL) {
		/* No allocation recorded; fall back to an empty bitmap and
		 * clear the COMPLETING flag. */
		error("build_cg_bitmap: node_bitmap is NULL");
		job_ptr->node_bitmap_cg = bit_alloc(node_record_count);
		job_ptr->job_state &= (~JOB_COMPLETING);
	} else {
		job_ptr->node_bitmap_cg = bit_copy(job_ptr->node_bitmap);
		/* Nothing left completing: drop the COMPLETING flag. */
		if (bit_set_count(job_ptr->node_bitmap_cg) == 0)
			job_ptr->job_state &= (~JOB_COMPLETING);
	}

	if (job_ptr->node_bitmap_cg == NULL)
		fatal("bit_copy: memory allocation failure");
}