blob: 9d2c0ab81aba4833a5165653a48cca7ec77f2f97 [file] [log] [blame]
/*****************************************************************************\
* jobcomp_common.c - common functions for jobcomp plugins
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "src/common/assoc_mgr.h"
#include "src/common/data.h"
#include "src/common/fd.h"
#include "src/common/id_util.h"
#include "src/common/parse_time.h"
#include "src/interfaces/jobcomp.h"
#include "src/plugins/jobcomp/common/jobcomp_common.h"
#include "src/slurmctld/slurmctld.h"
/*
 * Event mask that jobcomp_common_parse_enabled_events() always enables,
 * before any optional events are added: job finish only.
 */
#define JOBCOMP_CONF_DEFAULT_EVENTS JOBCOMP_CONF_JOB_FINISH
/*
 * Set to true by jobcomp_common_conf_init() when slurm_conf.job_comp_params
 * contains "send_script". When true, jobcomp_common_job_record_to_data()
 * adds the job script to the record under the "script" key.
 */
static bool send_script = false;
/*
 * Decide whether a formatted time string is worth storing in a record.
 *
 * IN: date_str - string to check (may be NULL)
 * OUT: false if date_str is NULL, empty, or compares equal (via
 *	xstrcasecmp()) to "unknown" or "none"; true otherwise
 */
static bool _valid_date_format(char *date_str)
{
	/* Nothing to store for a missing or empty string. */
	if (!date_str || (date_str[0] == '\0'))
		return false;

	/* Placeholder text is not a date. */
	if (xstrcasecmp(date_str, "unknown") == 0)
		return false;

	if (xstrcasecmp(date_str, "none") == 0)
		return false;

	return true;
}
/*
 * Load the settings shared by the jobcomp plugins from
 * slurm_conf.job_comp_params.
 *
 * send_script is assigned on every call (not only raised to true), so calling
 * this again after "send_script" has been removed from the parameters turns
 * the option back off instead of leaving a stale value behind.
 */
extern void jobcomp_common_conf_init(void)
{
	if (xstrcasestr(slurm_conf.job_comp_params, "send_script"))
		send_script = true;
	else
		send_script = false;
}
/*
 * Counterpart of jobcomp_common_conf_init().
 *
 * Intentionally empty: there is currently nothing to release.
 */
extern void jobcomp_common_conf_fini(void)
{
	return;
}
/*
 * Return the string associated with a jobcomp event value.
 * The result is a string literal; the caller must NOT free it.
 *
 * IN: event - event value to name
 * OUT: "job finish" or "job start" for the matching event, "invalid" for
 *	JOBCOMP_EVENT_INVALID and any other value
 */
extern char *jobcomp_common_get_event_name(uint32_t event)
{
	/* Covers JOBCOMP_EVENT_INVALID and every unrecognized value. */
	char *name = "invalid";

	if (event == JOBCOMP_EVENT_JOB_FINISH)
		name = "job finish";
	else if (event == JOBCOMP_EVENT_JOB_START)
		name = "job start";

	return name;
}
extern data_t *jobcomp_common_job_record_to_data(job_record_t *job_ptr,
uint32_t event)
{
char start_str[32], end_str[32], time_str[32];
char *usr_str = NULL, *grp_str = NULL, *state_string = NULL;
char *exit_code_str = NULL, *derived_ec_str = NULL, *partition = NULL;
buf_t *script = NULL;
enum job_states job_state;
int i, tmp_int, tmp_int2;
time_t elapsed_time = 0;
uint32_t time_limit;
data_t *record = NULL;
bool event_job_finish = (event & JOBCOMP_EVENT_JOB_FINISH);
usr_str = user_from_job(job_ptr);
grp_str = group_from_job(job_ptr);
partition = job_ptr->part_ptr ? job_ptr->part_ptr->name :
job_ptr->partition;
if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr)
time_limit = job_ptr->part_ptr->max_time;
else
time_limit = job_ptr->time_limit;
if (!event_job_finish) {
parse_time_make_str_utc(&job_ptr->start_time, start_str,
sizeof(start_str));
} else if (job_ptr->job_state & JOB_RESIZING) {
time_t now = time(NULL);
state_string = job_state_string(job_ptr->job_state);
if (job_ptr->resize_time) {
parse_time_make_str_utc(&job_ptr->resize_time,
start_str, sizeof(start_str));
} else {
parse_time_make_str_utc(&job_ptr->start_time, start_str,
sizeof(start_str));
}
parse_time_make_str_utc(&now, end_str, sizeof(end_str));
} else {
/* Job state will typically have JOB_COMPLETING or JOB_RESIZING
* flag set when called. We remove the flags to get the eventual
* completion state: JOB_FAILED, JOB_TIMEOUT, etc. */
job_state = job_ptr->job_state & JOB_STATE_BASE;
state_string = job_state_string(job_state);
if (job_ptr->resize_time) {
parse_time_make_str_utc(&job_ptr->resize_time,
start_str, sizeof(start_str));
} else if (job_ptr->start_time > job_ptr->end_time) {
/* Job cancelled while pending and
* expected start time is in the future. */
snprintf(start_str, sizeof(start_str), "Unknown");
} else {
parse_time_make_str_utc(&job_ptr->start_time, start_str,
sizeof(start_str));
}
parse_time_make_str_utc(&job_ptr->end_time, end_str,
sizeof(end_str));
}
if (event_job_finish) {
if (job_ptr->end_time && job_ptr->start_time &&
job_ptr->start_time < job_ptr->end_time)
elapsed_time = job_ptr->end_time - job_ptr->start_time;
else
elapsed_time = 0;
tmp_int = tmp_int2 = 0;
if (job_ptr->derived_ec == NO_VAL)
;
else if (WIFSIGNALED(job_ptr->derived_ec))
tmp_int2 = WTERMSIG(job_ptr->derived_ec);
else if (WIFEXITED(job_ptr->derived_ec))
tmp_int = WEXITSTATUS(job_ptr->derived_ec);
xstrfmtcat(derived_ec_str, "%d:%d", tmp_int, tmp_int2);
tmp_int = tmp_int2 = 0;
if (job_ptr->exit_code == NO_VAL)
;
else if (WIFSIGNALED(job_ptr->exit_code))
tmp_int2 = WTERMSIG(job_ptr->exit_code);
else if (WIFEXITED(job_ptr->exit_code))
tmp_int = WEXITSTATUS(job_ptr->exit_code);
xstrfmtcat(exit_code_str, "%d:%d", tmp_int, tmp_int2);
}
record = data_set_dict(data_new());
data_set_int(data_key_set(record, "jobid"), job_ptr->job_id);
data_set_string(data_key_set(record, "container"), job_ptr->container);
data_set_string(data_key_set(record, "username"), usr_str);
data_set_int(data_key_set(record, "user_id"), job_ptr->user_id);
data_set_string(data_key_set(record, "groupname"), grp_str);
data_set_int(data_key_set(record, "group_id"), job_ptr->group_id);
if (_valid_date_format(start_str))
data_set_string(data_key_set(record, "@start"), start_str);
if (event_job_finish && _valid_date_format(end_str))
data_set_string(data_key_set(record, "@end"), end_str);
if (event_job_finish)
data_set_int(data_key_set(record, "elapsed"), elapsed_time);
data_set_string(data_key_set(record, "partition"), partition);
data_set_string(data_key_set(record, "alloc_node"),
job_ptr->alloc_node);
data_set_string(data_key_set(record, "nodes"), job_ptr->nodes);
data_set_int(data_key_set(record, "total_cpus"), job_ptr->total_cpus);
data_set_int(data_key_set(record, "total_nodes"), job_ptr->total_nodes);
if (event_job_finish) {
data_set_string_own(data_key_set(record, "derived_ec"),
derived_ec_str);
derived_ec_str = NULL;
data_set_string_own(data_key_set(record, "exit_code"),
exit_code_str);
exit_code_str = NULL;
}
data_set_string(data_key_set(record, "state"), state_string);
if (event_job_finish) {
data_set_string(data_key_set(record, "failed_node"),
job_ptr->failed_node);
data_set_float(data_key_set(record, "cpu_hours"),
((elapsed_time * job_ptr->total_cpus) /
3600.0f));
}
if (job_ptr->array_task_id != NO_VAL) {
data_set_int(data_key_set(record, "array_job_id"),
job_ptr->array_job_id);
data_set_int(data_key_set(record, "array_task_id"),
job_ptr->array_task_id);
}
if (job_ptr->het_job_id != NO_VAL) {
if (event_job_finish) {
/* Continue supporting the old terms. */
data_set_int(data_key_set(record, "pack_job_id"),
job_ptr->het_job_id);
data_set_int(data_key_set(record, "pack_job_offset"),
job_ptr->het_job_offset);
}
data_set_int(data_key_set(record, "het_job_id"),
job_ptr->het_job_id);
data_set_int(data_key_set(record, "het_job_offset"),
job_ptr->het_job_offset);
}
if ((job_ptr->priority != NO_VAL) && (job_ptr->priority != INFINITE))
data_set_int(data_key_set(record, "priority"),
job_ptr->priority);
if (job_ptr->details && job_ptr->details->submit_time) {
parse_time_make_str_utc(&job_ptr->details->submit_time,
time_str, sizeof(time_str));
if (_valid_date_format(time_str))
data_set_string(data_key_set(record, "@submit"),
time_str);
}
if (job_ptr->details && job_ptr->details->begin_time) {
parse_time_make_str_utc(&job_ptr->details->begin_time, time_str,
sizeof(time_str));
if (_valid_date_format(time_str))
data_set_string(data_key_set(record, "@eligible"),
time_str);
if (job_ptr->start_time) {
int64_t queue_wait = (int64_t)difftime(
job_ptr->start_time,
job_ptr->details->begin_time);
if (queue_wait >= 0)
data_set_int(data_key_set(record,
"@queue_wait"),
queue_wait);
}
}
if (job_ptr->details && job_ptr->details->work_dir)
data_set_string(data_key_set(record, "work_dir"),
job_ptr->details->work_dir);
if (job_ptr->details && job_ptr->details->std_err)
data_set_string(data_key_set(record, "std_err"),
job_ptr->details->std_err);
if (job_ptr->details && job_ptr->details->std_in)
data_set_string(data_key_set(record, "std_in"),
job_ptr->details->std_in);
if (job_ptr->details && job_ptr->details->std_out)
data_set_string(data_key_set(record, "std_out"),
job_ptr->details->std_out);
if (job_ptr->assoc_ptr && job_ptr->assoc_ptr->cluster)
data_set_string(data_key_set(record, "cluster"),
job_ptr->assoc_ptr->cluster);
if (job_ptr->qos_ptr && job_ptr->qos_ptr->name)
data_set_string(data_key_set(record, "qos"),
job_ptr->qos_ptr->name);
if (job_ptr->details && (job_ptr->details->num_tasks != NO_VAL))
data_set_int(data_key_set(record, "ntasks"),
job_ptr->details->num_tasks);
if (job_ptr->details && (job_ptr->details->ntasks_per_node != NO_VAL16))
data_set_int(data_key_set(record, "ntasks_per_node"),
job_ptr->details->ntasks_per_node);
if (job_ptr->details && (job_ptr->details->ntasks_per_tres != NO_VAL16))
data_set_int(data_key_set(record, "ntasks_per_tres"),
job_ptr->details->ntasks_per_tres);
if (job_ptr->details && (job_ptr->details->cpus_per_task != NO_VAL16))
data_set_int(data_key_set(record, "cpus_per_task"),
job_ptr->details->cpus_per_task);
if (job_ptr->details && job_ptr->details->orig_dependency)
data_set_string(data_key_set(record, "orig_dependency"),
job_ptr->details->orig_dependency);
if (job_ptr->details && job_ptr->details->exc_nodes)
data_set_string(data_key_set(record, "excluded_nodes"),
job_ptr->details->exc_nodes);
if (job_ptr->details && job_ptr->details->features)
data_set_string(data_key_set(record, "features"),
job_ptr->details->features);
if (time_limit != INFINITE)
data_set_int(data_key_set(record, "time_limit"),
(time_limit * 60));
if (job_ptr->name)
data_set_string(data_key_set(record, "job_name"),
job_ptr->name);
if (job_ptr->resv_name)
data_set_string(data_key_set(record, "reservation_name"),
job_ptr->resv_name);
if (job_ptr->wckey)
data_set_string(data_key_set(record, "wc_key"), job_ptr->wckey);
if (job_ptr->tres_req_str)
data_set_string(data_key_set(record, "tres_req_raw"),
job_ptr->tres_req_str);
if (job_ptr->tres_fmt_req_str)
data_set_string(data_key_set(record, "tres_req"),
job_ptr->tres_fmt_req_str);
if (job_ptr->tres_alloc_str)
data_set_string(data_key_set(record, "tres_alloc_raw"),
job_ptr->tres_alloc_str);
if (job_ptr->tres_fmt_alloc_str)
data_set_string(data_key_set(record, "tres_alloc"),
job_ptr->tres_fmt_alloc_str);
if (job_ptr->account)
data_set_string(data_key_set(record, "account"),
job_ptr->account);
if (send_script && (script = get_job_script(job_ptr))) {
data_set_string(data_key_set(record, "script"),
get_buf_data(script));
FREE_NULL_BUFFER(script);
}
if (job_ptr->assoc_ptr) {
assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK,
NO_LOCK, NO_LOCK, NO_LOCK };
slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
char *parent_accounts = NULL;
char **acc_aux = NULL;
int nparents = 0;
assoc_mgr_lock(&locks);
/* Start at the first parent and go up. When studying
* this code it was slightly faster to do 2 loops on
* the association linked list and only 1 xmalloc but
* we opted for cleaner looking code and going with a
* realloc. */
while (assoc_ptr) {
if (assoc_ptr->acct) {
acc_aux = xrealloc(acc_aux,
sizeof(char *) *
(nparents + 1));
acc_aux[nparents++] = assoc_ptr->acct;
}
assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
}
for (i = nparents - 1; i >= 0; i--)
xstrfmtcat(parent_accounts, "/%s", acc_aux[i]);
xfree(acc_aux);
data_set_string(data_key_set(record, "parent_accounts"),
parent_accounts);
xfree(parent_accounts);
assoc_mgr_unlock(&locks);
}
xfree(usr_str);
xfree(grp_str);
return record;
}
/*
 * Work out which jobcomp events are enabled.
 *
 * JOBCOMP_CONF_DEFAULT_EVENTS is always enabled. JOBCOMP_CONF_JOB_START is
 * added when slurm_conf.job_comp_params contains "enable_job_start".
 *
 * OUT: bitmask of enabled JOBCOMP_CONF_* events
 */
extern uint32_t jobcomp_common_parse_enabled_events(void)
{
	uint32_t events = JOBCOMP_CONF_DEFAULT_EVENTS;

	if (xstrcasestr(slurm_conf.job_comp_params, "enable_job_start"))
		events |= JOBCOMP_CONF_JOB_START;

	return events;
}