/*****************************************************************************\
 *  job_step_info.c - get/print the job step state information of Slurm
*****************************************************************************
* Copyright (C) 2002-2006 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>, et al.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include "slurm/slurm.h"
#include "src/common/cpu_frequency.h"
#include "src/common/forward.h"
#include "src/common/parse_time.h"
#include "src/interfaces/select.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
/* Data structures for pthreads used to gather step information from multiple
* clusters in parallel */
typedef struct load_step_req_struct {
slurmdb_cluster_rec_t *cluster;
bool local_cluster;
slurm_msg_t *req_msg;
list_t *resp_msg_list;
} load_step_req_struct_t;
typedef struct load_step_resp_struct {
bool local_cluster;
job_step_info_response_msg_t *new_msg;
} load_step_resp_struct_t;
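
/* Return the number of unique nodes in a Slurm hostlist expression,
 * e.g. "tux[1-3]" counts as 3 nodes */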
static int _nodes_in_list(char *node_list)
{
hostset_t *host_set = hostset_create(node_list);
int count = hostset_count(host_set);
hostset_destroy(host_set);
return count;
}
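
/* list_sort() callback: order job_step_pids_t records by node name */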
static int _sort_pids_by_name(void *x, void *y)
{
int diff = 0;
job_step_pids_t *rec_a = *(job_step_pids_t **)x;
job_step_pids_t *rec_b = *(job_step_pids_t **)y;
if (!rec_a->node_name || !rec_b->node_name)
return 0;
diff = xstrcmp(rec_a->node_name, rec_b->node_name);
if (diff > 0)
return 1;
else if (diff < 0)
return -1;
return 0;
}
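
/* list_sort() callback: order job_step_stat_t records by node name */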
static int _sort_stats_by_name(void *x, void *y)
{
job_step_stat_t *rec_a = *(job_step_stat_t **)x;
job_step_stat_t *rec_b = *(job_step_stat_t **)y;
if (!rec_a->step_pids || !rec_b->step_pids)
return 0;
	return _sort_pids_by_name((void *)&rec_a->step_pids,
				  (void *)&rec_b->step_pids);
}
/*
 * slurm_print_job_step_info_msg - output information about all Slurm
 *	job steps based upon the message loaded by slurm_get_job_steps
 * IN out - file to write to
 * IN job_step_info_msg_ptr - job step information message pointer
 * IN one_liner - print as a single line if true
 */
void
slurm_print_job_step_info_msg(FILE *out,
			      job_step_info_response_msg_t *job_step_info_msg_ptr,
			      int one_liner)
{
	int i;
	job_step_info_t *job_step_ptr = job_step_info_msg_ptr->job_steps;
	char time_str[256];

	slurm_make_time_str((time_t *)&job_step_info_msg_ptr->last_update,
			    time_str, sizeof(time_str));
	fprintf(out, "Job step data as of %s, record count %d\n",
		time_str, job_step_info_msg_ptr->job_step_count);

	for (i = 0; i < job_step_info_msg_ptr->job_step_count; i++)
		slurm_print_job_step_info(out, &job_step_ptr[i], one_liner);
}
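
/*
 * Example: load and print all job step records (a minimal sketch;
 * assumes the caller links against libslurm and includes slurm/slurm.h):
 *
 *	job_step_info_response_msg_t *step_info = NULL;
 *
 *	if (slurm_get_job_steps((time_t) 0, NO_VAL, NO_VAL, &step_info,
 *				SHOW_ALL) != SLURM_SUCCESS) {
 *		slurm_perror("slurm_get_job_steps");
 *		return SLURM_ERROR;
 *	}
 *	slurm_print_job_step_info_msg(stdout, step_info, 0);
 *	slurm_free_job_step_info_response_msg(step_info);
 */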
/*
 * slurm_print_job_step_info - output information about a specific Slurm
 *	job step based upon the message loaded by slurm_get_job_steps
 * IN out - file to write to
 * IN job_step_ptr - an individual job step information record pointer
 * IN one_liner - print as a single line if true
 */
void
slurm_print_job_step_info(FILE *out, job_step_info_t *job_step_ptr,
			  int one_liner)
{
	char *print_this = slurm_sprint_job_step_info(job_step_ptr, one_liner);

	fprintf(out, "%s", print_this);
	xfree(print_this);
}
/*
 * slurm_sprint_job_step_info - output information about a specific Slurm
 *	job step based upon the message loaded by slurm_get_job_steps
 * IN job_step_ptr - an individual job step information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *	NULL is returned on failure.
 */
char *
slurm_sprint_job_step_info(job_step_info_t *job_step_ptr, int one_liner)
{
char tmp_node_cnt[40];
char time_str[256];
char limit_str[32];
char tmp_line[128];
char *out = NULL;
char *line_end = (one_liner) ? " " : "\n ";
char *sorted_nodelist = NULL;
uint16_t flags = STEP_ID_FLAG_NONE;
/****** Line 1 ******/
slurm_make_time_str ((time_t *)&job_step_ptr->start_time, time_str,
sizeof(time_str));
if (job_step_ptr->time_limit == INFINITE)
snprintf(limit_str, sizeof(limit_str), "UNLIMITED");
else
secs2time_str ((time_t)job_step_ptr->time_limit * 60,
limit_str, sizeof(limit_str));
if (job_step_ptr->array_job_id) {
xstrfmtcat(out, "StepId=%u_%u.",
job_step_ptr->array_job_id,
job_step_ptr->array_task_id);
flags = STEP_ID_FLAG_NO_PREFIX | STEP_ID_FLAG_NO_JOB;
}
log_build_step_id_str(&job_step_ptr->step_id,
tmp_line, sizeof(tmp_line),
flags);
xstrfmtcat(out, "%s ", tmp_line);
xstrfmtcat(out, "UserId=%u StartTime=%s TimeLimit=%s",
job_step_ptr->user_id, time_str, limit_str);
/****** Line 2 ******/
xstrcat(out, line_end);
sorted_nodelist = slurm_sort_node_list_str(job_step_ptr->nodes);
xstrfmtcat(out, "State=%s Partition=%s NodeList=%s",
job_state_string(job_step_ptr->state),
job_step_ptr->partition, sorted_nodelist);
xfree(sorted_nodelist);
/****** Line 3 ******/
convert_num_unit((float)_nodes_in_list(job_step_ptr->nodes),
tmp_node_cnt, sizeof(tmp_node_cnt), UNIT_NONE,
NO_VAL, CONVERT_NUM_UNIT_EXACT);
xstrcat(out, line_end);
xstrfmtcat(out, "Nodes=%s CPUs=%u Tasks=%u Name=%s Network=%s",
tmp_node_cnt, job_step_ptr->num_cpus,
job_step_ptr->num_tasks, job_step_ptr->name,
job_step_ptr->network);
/****** Line 4 ******/
xstrcat(out, line_end);
xstrfmtcat(out, "TRES=%s", job_step_ptr->tres_fmt_alloc_str);
/****** Line 5 ******/
xstrcat(out, line_end);
xstrfmtcat(out, "ResvPorts=%s", job_step_ptr->resv_ports);
/****** Line 6 ******/
xstrcat(out, line_end);
if (cpu_freq_debug(NULL, NULL, tmp_line, sizeof(tmp_line),
job_step_ptr->cpu_freq_gov,
job_step_ptr->cpu_freq_min,
job_step_ptr->cpu_freq_max, NO_VAL) != 0) {
xstrcat(out, tmp_line);
} else {
xstrcat(out, "CPUFreqReq=Default");
}
if (job_step_ptr->task_dist) {
char *name =
slurm_step_layout_type_name(job_step_ptr->task_dist);
xstrfmtcat(out, " Dist=%s", name);
xfree(name);
}
/****** Line 7 ******/
xstrcat(out, line_end);
xstrfmtcat(out, "SrunHost:Pid=%s:%u",
job_step_ptr->srun_host, job_step_ptr->srun_pid);
/****** Line (optional) ******/
if (job_step_ptr->cpus_per_tres) {
xstrcat(out, line_end);
xstrfmtcat(out, "CpusPerTres=%s", job_step_ptr->cpus_per_tres);
}
/****** Line (optional) ******/
if (job_step_ptr->mem_per_tres) {
xstrcat(out, line_end);
xstrfmtcat(out, "MemPerTres=%s", job_step_ptr->mem_per_tres);
}
/****** Line (optional) ******/
if (job_step_ptr->tres_bind) {
xstrcat(out, line_end);
xstrfmtcat(out, "TresBind=%s", job_step_ptr->tres_bind);
}
/****** Line (optional) ******/
if (job_step_ptr->tres_freq) {
xstrcat(out, line_end);
xstrfmtcat(out, "TresFreq=%s", job_step_ptr->tres_freq);
}
/****** Line (optional) ******/
if (job_step_ptr->tres_per_step) {
xstrcat(out, line_end);
xstrfmtcat(out, "TresPerStep=%s", job_step_ptr->tres_per_step);
}
/****** Line (optional) ******/
if (job_step_ptr->tres_per_node) {
xstrcat(out, line_end);
xstrfmtcat(out, "TresPerNode=%s", job_step_ptr->tres_per_node);
}
/****** Line (optional) ******/
if (job_step_ptr->tres_per_socket) {
xstrcat(out, line_end);
xstrfmtcat(out, "TresPerSocket=%s",
job_step_ptr->tres_per_socket);
}
/****** Line (optional) ******/
if (job_step_ptr->tres_per_task) {
xstrcat(out, line_end);
xstrfmtcat(out, "TresPerTask=%s", job_step_ptr->tres_per_task);
}
/****** Line (optional) ******/
if (job_step_ptr->container || job_step_ptr->container_id) {
xstrcat(out, line_end);
xstrfmtcat(out, "Container=%s ContainerID=%s",
job_step_ptr->container, job_step_ptr->container_id);
}
/****** Line (optional) ******/
if (job_step_ptr->std_in) {
char *tmp_path;
xstrcat(out, line_end);
tmp_path = slurm_expand_step_stdio_fields(job_step_ptr->std_in,
job_step_ptr);
xstrfmtcat(out, "StdIn=%s", tmp_path);
xfree(tmp_path);
}
/****** Line (optional) ******/
if (job_step_ptr->std_err) {
char *tmp_path;
xstrcat(out, line_end);
tmp_path = slurm_expand_step_stdio_fields(job_step_ptr->std_err,
job_step_ptr);
xstrfmtcat(out, "StdErr=%s", tmp_path);
xfree(tmp_path);
}
/****** Line (optional) ******/
if (job_step_ptr->std_out) {
char *tmp_path;
xstrcat(out, line_end);
tmp_path = slurm_expand_step_stdio_fields(job_step_ptr->std_out,
job_step_ptr);
xstrfmtcat(out, "StdOut=%s", tmp_path);
xfree(tmp_path);
}
/****** END OF JOB RECORD ******/
if (one_liner)
xstrcat(out, "\n");
else
xstrcat(out, "\n\n");
return out;
}
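
/*
 * Example: format one step record as a string (a minimal sketch; step_ptr
 * stands in for a job_step_info_t pointer). The returned buffer must be
 * released with xfree(), as slurm_print_job_step_info() does above:
 *
 *	char *step_str = slurm_sprint_job_step_info(step_ptr, 0);
 *
 *	if (step_str) {
 *		fprintf(stderr, "%s", step_str);
 *		xfree(step_str);
 *	}
 */

/*
 * _get_stepmgr_steps - list_for_each() callback: query a stepmgr node
 * directly for its step records and append them to the controller's
 * response message passed in through arg.
 */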
static int _get_stepmgr_steps(void *x, void *arg)
{
stepmgr_job_info_t *sji = x;
slurm_msg_t resp_msg;
slurm_msg_t_init(&resp_msg);
job_step_info_response_msg_t *ctld_resp = arg;
slurm_msg_t req_msg;
slurm_msg_t_init(&req_msg);
slurm_msg_set_r_uid(&req_msg, slurm_conf.slurmd_user_id);
	if (slurm_conf_get_addr(sji->stepmgr, &req_msg.address,
				req_msg.flags)) {
/*
* The node isn't in the conf, see if the
* controller has an address for it.
*/
slurm_node_alias_addrs_t *alias_addrs = NULL;
if (!slurm_get_node_alias_addrs(sji->stepmgr, &alias_addrs)) {
add_remote_nodes_to_conf_tbls(alias_addrs->node_list,
alias_addrs->node_addrs);
slurm_free_node_alias_addrs(alias_addrs);
slurm_conf_get_addr(sji->stepmgr, &req_msg.address,
req_msg.flags);
}
}
job_step_info_request_msg_t req_data = {0};
req_data.step_id.job_id = sji->job_id;
req_data.step_id.step_id = NO_VAL;
req_data.step_id.step_het_comp = NO_VAL;
req_msg.msg_type = REQUEST_JOB_STEP_INFO;
req_msg.data = &req_data;
if (slurm_send_recv_node_msg(&req_msg, &resp_msg, 0))
return SLURM_ERROR;
if (resp_msg.msg_type == RESPONSE_JOB_STEP_INFO) {
uint32_t new_rec_cnt;
job_step_info_response_msg_t *stepmgr_resp;
stepmgr_resp = (job_step_info_response_msg_t *) resp_msg.data;
/* Merge the step records into a single response message */
new_rec_cnt = ctld_resp->job_step_count +
stepmgr_resp->job_step_count;
if (stepmgr_resp->job_step_count) {
ctld_resp->job_steps =
xrealloc(ctld_resp->job_steps,
sizeof(job_step_info_t) *
new_rec_cnt);
(void) memcpy(ctld_resp->job_steps +
ctld_resp->job_step_count,
stepmgr_resp->job_steps,
sizeof(job_step_info_t) *
stepmgr_resp->job_step_count);
ctld_resp->job_step_count = new_rec_cnt;
xfree(stepmgr_resp->job_steps);
xfree(stepmgr_resp);
}
}
return SLURM_SUCCESS;
}
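
/*
 * _load_cluster_steps - issue REQUEST_JOB_STEP_INFO to one cluster's
 * controller, pull in any stepmgr-managed step records, and return the
 * decoded response through resp.
 */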
static int
_load_cluster_steps(slurm_msg_t *req_msg, job_step_info_response_msg_t **resp,
slurmdb_cluster_rec_t *cluster)
{
slurm_msg_t resp_msg;
int rc = SLURM_SUCCESS;
slurm_msg_t_init(&resp_msg);
*resp = NULL;
if (slurm_send_recv_controller_msg(req_msg, &resp_msg, cluster) < 0)
return SLURM_ERROR;
switch (resp_msg.msg_type) {
case RESPONSE_JOB_STEP_INFO:
*resp = (job_step_info_response_msg_t *) resp_msg.data;
resp_msg.data = NULL;
if ((*resp)->stepmgr_jobs) {
list_for_each((*resp)->stepmgr_jobs, _get_stepmgr_steps,
*resp);
}
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
slurm_free_return_code_msg(resp_msg.data);
break;
default:
rc = SLURM_UNEXPECTED_MSG_ERROR;
break;
}
if (rc)
slurm_seterrno_ret(rc);
return rc;
}
/* Thread to read step information from a single cluster */
static void *_load_step_thread(void *args)
{
load_step_req_struct_t *load_args = (load_step_req_struct_t *)args;
slurmdb_cluster_rec_t *cluster = load_args->cluster;
job_step_info_response_msg_t *new_msg = NULL;
int rc;
if ((rc = _load_cluster_steps(load_args->req_msg, &new_msg, cluster)) ||
!new_msg) {
verbose("Error reading step information from cluster %s: %s",
cluster->name, slurm_strerror(rc));
} else {
load_step_resp_struct_t *step_resp;
step_resp = xmalloc(sizeof(load_step_resp_struct_t));
step_resp->local_cluster = load_args->local_cluster;
step_resp->new_msg = new_msg;
list_append(load_args->resp_msg_list, step_resp);
}
xfree(args);
return NULL;
}
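
/*
 * _load_fed_steps - gather step information from every cluster in a
 * federation, one thread per cluster, then merge the per-cluster
 * responses into the single message returned through resp.
 */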
static int
_load_fed_steps(slurm_msg_t *req_msg, job_step_info_response_msg_t **resp,
uint16_t show_flags, char *cluster_name,
slurmdb_federation_rec_t *fed)
{
int i;
load_step_resp_struct_t *step_resp;
job_step_info_response_msg_t *orig_msg = NULL, *new_msg = NULL;
uint32_t new_rec_cnt;
slurmdb_cluster_rec_t *cluster;
list_itr_t *iter;
int pthread_count = 0;
	pthread_t *load_thread = NULL;
load_step_req_struct_t *load_args;
list_t *resp_msg_list;
*resp = NULL;
/* Spawn one pthread per cluster to collect step information */
resp_msg_list = list_create(NULL);
load_thread = xmalloc(sizeof(pthread_t) *
list_count(fed->cluster_list));
iter = list_iterator_create(fed->cluster_list);
while ((cluster = (slurmdb_cluster_rec_t *) list_next(iter))) {
bool local_cluster = false;
if ((cluster->control_host == NULL) ||
(cluster->control_host[0] == '\0'))
continue; /* Cluster down */
if (!xstrcmp(cluster->name, cluster_name))
local_cluster = true;
if ((show_flags & SHOW_LOCAL) && !local_cluster)
continue;
load_args = xmalloc(sizeof(load_step_req_struct_t));
load_args->cluster = cluster;
load_args->local_cluster = local_cluster;
load_args->req_msg = req_msg;
load_args->resp_msg_list = resp_msg_list;
slurm_thread_create(&load_thread[pthread_count],
_load_step_thread, load_args);
pthread_count++;
}
list_iterator_destroy(iter);
/* Wait for all pthreads to complete */
for (i = 0; i < pthread_count; i++)
slurm_thread_join(load_thread[i]);
xfree(load_thread);
/* Merge the responses into a single response message */
iter = list_iterator_create(resp_msg_list);
while ((step_resp = (load_step_resp_struct_t *) list_next(iter))) {
new_msg = step_resp->new_msg;
if (!orig_msg) {
orig_msg = new_msg;
*resp = orig_msg;
} else {
/* Merge the step records into a single response message */
orig_msg->last_update = MIN(orig_msg->last_update,
new_msg->last_update);
new_rec_cnt = orig_msg->job_step_count +
new_msg->job_step_count;
if (new_msg->job_step_count) {
orig_msg->job_steps =
xrealloc(orig_msg->job_steps,
sizeof(job_step_info_t) *
new_rec_cnt);
(void) memcpy(orig_msg->job_steps +
orig_msg->job_step_count,
new_msg->job_steps,
sizeof(job_step_info_t) *
new_msg->job_step_count);
orig_msg->job_step_count = new_rec_cnt;
}
xfree(new_msg->job_steps);
xfree(new_msg);
}
xfree(step_resp);
}
list_iterator_destroy(iter);
FREE_NULL_LIST(resp_msg_list);
if (!orig_msg)
slurm_seterrno_ret(ESLURM_INVALID_JOB_ID);
return SLURM_SUCCESS;
}
/*
 * slurm_get_job_steps - issue RPC to get specific Slurm job step
 *	configuration information if changed since update_time.
 *	a job_id value of NO_VAL implies all jobs, a step_id value of
 *	NO_VAL implies all steps
 * IN update_time - time of current configuration data
 * IN job_id - get information for specific job id, NO_VAL for all jobs
 * IN step_id - get information for specific job step id, NO_VAL for all
 *	job steps
 * OUT resp - place to store the job step information response pointer
 * IN show_flags - job step filtering options
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the response using slurm_free_job_step_info_response_msg
 */
int
slurm_get_job_steps(time_t update_time, uint32_t job_id, uint32_t step_id,
		    job_step_info_response_msg_t **resp, uint16_t show_flags)
{
int rc;
slurm_msg_t req_msg;
job_step_info_request_msg_t req;
slurmdb_federation_rec_t *fed;
void *ptr = NULL;
slurm_step_id_t tmp_step_id = {
.job_id = job_id,
.step_het_comp = NO_VAL,
.step_id = step_id,
};
if ((show_flags & SHOW_LOCAL) == 0) {
if (slurm_load_federation(&ptr) ||
!cluster_in_federation(ptr, slurm_conf.cluster_name)) {
/* Not in federation */
show_flags |= SHOW_LOCAL;
} else {
/* In federation. Need full info from all clusters */
update_time = (time_t) 0;
}
}
slurm_msg_t_init(&req_msg);
memset(&req, 0, sizeof(req));
req.last_update = update_time;
memcpy(&req.step_id, &tmp_step_id, sizeof(req.step_id));
req.show_flags = show_flags;
req_msg.msg_type = REQUEST_JOB_STEP_INFO;
req_msg.data = &req;
/* With -M option, working_cluster_rec is set and we only get
* information for that cluster */
if (working_cluster_rec || !ptr || (show_flags & SHOW_LOCAL)) {
rc = _load_cluster_steps(&req_msg, resp, working_cluster_rec);
} else {
fed = (slurmdb_federation_rec_t *) ptr;
rc = _load_fed_steps(&req_msg, resp, show_flags,
slurm_conf.cluster_name, fed);
}
if (ptr)
slurm_destroy_federation_rec(ptr);
return rc;
}
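
/*
 * Example: load the steps of a single job (a minimal sketch; the job id
 * is a placeholder):
 *
 *	job_step_info_response_msg_t *steps = NULL;
 *
 *	if (!slurm_get_job_steps((time_t) 0, 1234, NO_VAL, &steps,
 *				 SHOW_LOCAL)) {
 *		for (uint32_t i = 0; i < steps->job_step_count; i++)
 *			slurm_print_job_step_info(stdout,
 *						  &steps->job_steps[i], 0);
 *		slurm_free_job_step_info_response_msg(steps);
 *	}
 */

/*
 * slurm_find_step_ids_by_container_id - look up the step id(s) matching
 * a container id via the controller and transfer any matches into the
 * caller-supplied steps list.
 * RET SLURM_SUCCESS on success, otherwise a Slurm error code
 */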
extern int slurm_find_step_ids_by_container_id(uint16_t show_flags, uid_t uid,
const char *container_id,
list_t *steps)
{
int rc;
slurm_msg_t req_msg, resp_msg;
container_id_request_msg_t req = { 0 };
container_id_response_msg_t *resp;
slurm_msg_t_init(&req_msg);
slurm_msg_t_init(&resp_msg);
req.uid = uid;
req.container_id = xstrdup(container_id);
req.show_flags = show_flags;
req_msg.msg_type = REQUEST_STEP_BY_CONTAINER_ID;
req_msg.data = &req;
	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec)) {
		xfree(req.container_id);
		return errno;
	}
	xfree(req.container_id);
switch (resp_msg.msg_type) {
case RESPONSE_STEP_BY_CONTAINER_ID:
if ((resp = resp_msg.data) && resp->steps)
list_transfer(steps, resp->steps);
rc = SLURM_SUCCESS;
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
break;
default:
rc = SLURM_UNEXPECTED_MSG_ERROR;
break;
}
slurm_free_msg_data(resp_msg.msg_type, resp_msg.data);
return rc;
}
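
/*
 * slurm_job_step_layout_get - get the node/task layout of a job step,
 * following a reroute to the step's stepmgr node when the controller
 * requests it.
 * RET layout to be freed with slurm_job_step_layout_free(), or NULL
 *	with errno set on failure
 */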
extern slurm_step_layout_t *slurm_job_step_layout_get(slurm_step_id_t *step_id)
{
slurm_step_id_t data;
slurm_msg_t req, resp;
int errnum;
char *stepmgr_nodename = NULL;
slurm_msg_t_init(&req);
slurm_msg_t_init(&resp);
req.msg_type = REQUEST_STEP_LAYOUT;
req.data = &data;
memcpy(&data, step_id, sizeof(data));
trystepmgr:
if (stepmgr_nodename) {
slurm_msg_set_r_uid(&req, slurm_conf.slurmd_user_id);
if (slurm_conf_get_addr(stepmgr_nodename, &req.address,
req.flags)) {
/*
* The node isn't in the conf, see if the
* controller has an address for it.
*/
			slurm_node_alias_addrs_t *alias_addrs = NULL;

			if (!slurm_get_node_alias_addrs(stepmgr_nodename,
							&alias_addrs)) {
				add_remote_nodes_to_conf_tbls(
					alias_addrs->node_list,
					alias_addrs->node_addrs);
				slurm_free_node_alias_addrs(alias_addrs);
			}
			slurm_conf_get_addr(stepmgr_nodename, &req.address,
					    req.flags);
}
xfree(stepmgr_nodename);
if (slurm_send_recv_node_msg(&req, &resp, 0))
return NULL;
} else if (slurm_send_recv_controller_msg(&req, &resp,
						  working_cluster_rec) < 0)
return NULL;
switch (resp.msg_type) {
case RESPONSE_SLURM_REROUTE_MSG:
{
reroute_msg_t *rr_msg = resp.data;
stepmgr_nodename = rr_msg->stepmgr;
rr_msg->stepmgr = NULL;
if (stepmgr_nodename) {
goto trystepmgr;
} else {
errno = SLURM_ERROR;
return NULL;
}
break;
}
case RESPONSE_STEP_LAYOUT:
return (slurm_step_layout_t *)resp.data;
case RESPONSE_SLURM_RC:
errnum = ((return_code_msg_t *)resp.data)->return_code;
slurm_free_return_code_msg(resp.data);
errno = errnum;
return NULL;
default:
errno = SLURM_UNEXPECTED_MSG_ERROR;
return NULL;
}
}
/*
 * slurm_job_step_stat - report the status of a currently running job step
 *
 * IN step_id
 * IN node_list - optional; if NULL, all nodes in the step are queried
 * IN use_protocol_ver - protocol version to use
 * OUT resp
 * RET SLURM_SUCCESS on success, SLURM_ERROR else
 */
extern int slurm_job_step_stat(slurm_step_id_t *step_id,
char *node_list,
uint16_t use_protocol_ver,
job_step_stat_response_msg_t **resp)
{
slurm_msg_t req_msg;
list_itr_t *itr;
slurm_step_id_t req;
list_t *ret_list = NULL;
ret_data_info_t *ret_data_info = NULL;
int rc = SLURM_SUCCESS;
slurm_step_layout_t *step_layout = NULL;
job_step_stat_response_msg_t *resp_out;
	bool created = false;
xassert(resp);
if (!node_list) {
if (!(step_layout = slurm_job_step_layout_get(step_id))) {
rc = errno;
error("%s: problem getting step_layout for %ps: %s",
__func__, step_id, slurm_strerror(rc));
return rc;
}
node_list = step_layout->node_list;
use_protocol_ver = MIN(SLURM_PROTOCOL_VERSION,
step_layout->start_protocol_ver);
fwd_set_alias_addrs(step_layout->alias_addrs);
}
if (!*resp) {
resp_out = xmalloc(sizeof(job_step_stat_response_msg_t));
*resp = resp_out;
		created = true;
} else
resp_out = *resp;
	debug("%s: getting status information of job %ps on nodes %s",
	      __func__, step_id, node_list);
slurm_msg_t_init(&req_msg);
slurm_msg_set_r_uid(&req_msg, SLURM_AUTH_UID_ANY);
memcpy(&req, step_id, sizeof(req));
memcpy(&resp_out->step_id, step_id, sizeof(resp_out->step_id));
req_msg.protocol_version = MIN(SLURM_PROTOCOL_VERSION,
use_protocol_ver);
req_msg.msg_type = REQUEST_JOB_STEP_STAT;
req_msg.data = &req;
if (!(ret_list = slurm_send_recv_msgs(node_list, &req_msg, 0))) {
		error("%s: got an error, no response list returned", __func__);
rc = SLURM_ERROR;
if (created) {
slurm_job_step_stat_response_msg_free(resp_out);
*resp = NULL;
}
goto cleanup;
}
itr = list_iterator_create(ret_list);
while ((ret_data_info = list_next(itr))) {
switch (ret_data_info->type) {
case RESPONSE_JOB_STEP_STAT:
if (!resp_out->stats_list)
resp_out->stats_list = list_create(
slurm_free_job_step_stat);
list_push(resp_out->stats_list,
ret_data_info->data);
ret_data_info->data = NULL;
break;
case RESPONSE_SLURM_RC:
rc = slurm_get_return_code(ret_data_info->type,
ret_data_info->data);
if (rc == ESLURM_INVALID_JOB_ID) {
debug("%s: job step %ps has already completed",
__func__, step_id);
} else {
error("%s: there was an error with the request to %s rc = %s",
__func__, ret_data_info->node_name,
slurm_strerror(rc));
}
break;
default:
rc = slurm_get_return_code(ret_data_info->type,
ret_data_info->data);
			error("%s: unknown response type from %s: %d rc = %s",
			      __func__, ret_data_info->node_name,
			      ret_data_info->type, slurm_strerror(rc));
break;
}
}
list_iterator_destroy(itr);
FREE_NULL_LIST(ret_list);
if (resp_out->stats_list)
list_sort(resp_out->stats_list, (ListCmpF)_sort_stats_by_name);
cleanup:
slurm_step_layout_destroy(step_layout);
return rc;
}
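
/*
 * Example: collect live status from every node of a step (a minimal
 * sketch; the job and step ids are placeholders):
 *
 *	slurm_step_id_t step_id = {
 *		.job_id = 1234,
 *		.step_id = 0,
 *		.step_het_comp = NO_VAL,
 *	};
 *	job_step_stat_response_msg_t *stat_resp = NULL;
 *
 *	if (!slurm_job_step_stat(&step_id, NULL, SLURM_PROTOCOL_VERSION,
 *				 &stat_resp)) {
 *		if (stat_resp->stats_list)
 *			printf("%d node responses\n",
 *			       list_count(stat_resp->stats_list));
 *		slurm_job_step_stat_response_msg_free(stat_resp);
 *	}
 */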
/*
 * slurm_job_step_get_pids - get the complete list of pids for a given
 *	job step
 *
 * IN step_id
 * IN node_list - optional; if NULL, all nodes in the step are queried
 * OUT resp
 * RET SLURM_SUCCESS on success, SLURM_ERROR else
 */
extern int slurm_job_step_get_pids(slurm_step_id_t *step_id,
char *node_list,
job_step_pids_response_msg_t **resp)
{
int rc = SLURM_SUCCESS;
slurm_msg_t req_msg;
slurm_step_id_t req;
list_itr_t *itr;
list_t *ret_list = NULL;
ret_data_info_t *ret_data_info = NULL;
slurm_step_layout_t *step_layout = NULL;
job_step_pids_response_msg_t *resp_out;
	bool created = false;
xassert(resp);
if (!node_list) {
if (!(step_layout = slurm_job_step_layout_get(step_id))) {
rc = errno;
			error("%s: problem getting step_layout for %ps: %s",
			      __func__, step_id, slurm_strerror(rc));
return rc;
}
node_list = step_layout->node_list;
fwd_set_alias_addrs(step_layout->alias_addrs);
}
if (!*resp) {
resp_out = xmalloc(sizeof(job_step_pids_response_msg_t));
*resp = resp_out;
		created = true;
} else
resp_out = *resp;
debug("%s: getting pid information of job %ps on nodes %s",
__func__, step_id, node_list);
slurm_msg_t_init(&req_msg);
slurm_msg_set_r_uid(&req_msg, SLURM_AUTH_UID_ANY);
memcpy(&req, step_id, sizeof(req));
memcpy(&resp_out->step_id, step_id, sizeof(resp_out->step_id));
req_msg.msg_type = REQUEST_JOB_STEP_PIDS;
req_msg.data = &req;
if (!(ret_list = slurm_send_recv_msgs(node_list, &req_msg, 0))) {
		error("%s: got an error, no response list returned", __func__);
rc = SLURM_ERROR;
if (created) {
slurm_job_step_pids_response_msg_free(resp_out);
*resp = NULL;
}
goto cleanup;
}
itr = list_iterator_create(ret_list);
	while ((ret_data_info = list_next(itr))) {
switch (ret_data_info->type) {
case RESPONSE_JOB_STEP_PIDS:
if (!resp_out->pid_list)
resp_out->pid_list = list_create(
slurm_free_job_step_pids);
list_push(resp_out->pid_list,
ret_data_info->data);
ret_data_info->data = NULL;
break;
case RESPONSE_SLURM_RC:
rc = slurm_get_return_code(ret_data_info->type,
ret_data_info->data);
			error("%s: there was an error with the pid list request rc = %s",
			      __func__, slurm_strerror(rc));
break;
default:
rc = slurm_get_return_code(ret_data_info->type,
ret_data_info->data);
			error("%s: unknown response type %d rc = %s",
			      __func__, ret_data_info->type,
			      slurm_strerror(rc));
break;
}
}
list_iterator_destroy(itr);
FREE_NULL_LIST(ret_list);
if (resp_out->pid_list)
list_sort(resp_out->pid_list, (ListCmpF)_sort_pids_by_name);
cleanup:
slurm_step_layout_destroy(step_layout);
return rc;
}
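
/*
 * Example: print every pid of a step (a minimal sketch; the job and step
 * ids are placeholders):
 *
 *	slurm_step_id_t step_id = {
 *		.job_id = 1234,
 *		.step_id = 0,
 *		.step_het_comp = NO_VAL,
 *	};
 *	job_step_pids_response_msg_t *pids_resp = NULL;
 *	job_step_pids_t *pids;
 *	list_itr_t *itr;
 *
 *	if (slurm_job_step_get_pids(&step_id, NULL, &pids_resp))
 *		return SLURM_ERROR;
 *	if (pids_resp->pid_list) {
 *		itr = list_iterator_create(pids_resp->pid_list);
 *		while ((pids = list_next(itr)))
 *			for (uint32_t i = 0; i < pids->pid_cnt; i++)
 *				printf("%s: %u\n", pids->node_name,
 *				       pids->pid[i]);
 *		list_iterator_destroy(itr);
 *	}
 *	slurm_job_step_pids_response_msg_free(pids_resp);
 */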
extern void slurm_job_step_layout_free(slurm_step_layout_t *layout)
{
slurm_step_layout_destroy(layout);
}
extern void slurm_job_step_pids_free(job_step_pids_t *object)
{
slurm_free_job_step_pids(object);
}
extern void slurm_job_step_pids_response_msg_free(void *object)
{
job_step_pids_response_msg_t *step_pids_msg =
(job_step_pids_response_msg_t *) object;
if (step_pids_msg) {
FREE_NULL_LIST(step_pids_msg->pid_list);
xfree(step_pids_msg);
}
}
extern void slurm_job_step_stat_free(job_step_stat_t *object)
{
slurm_free_job_step_stat(object);
}
extern void slurm_job_step_stat_response_msg_free(void *object)
{
job_step_stat_response_msg_t *step_stat_msg =
(job_step_stat_response_msg_t *) object;
if (step_stat_msg) {
FREE_NULL_LIST(step_stat_msg->stats_list);
xfree(step_stat_msg);
}
}