blob: f22843220dee47e338d5992fa63c6682ee517cf1 [file] [log] [blame]
/*****************************************************************************\
* job_step_info.c - get/print the job step state information of slurm
* $Id$
*****************************************************************************
* Copyright (C) 2002-2006 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>, et. al.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include "slurm/slurm.h"
#include "src/common/node_select.h"
#include "src/common/parse_time.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
/* Count the hosts named by a Slurm node list expression (e.g. "tux[0-15]") */
static int _nodes_in_list(char *node_list)
{
	hostset_t hosts = hostset_create(node_list);
	int node_cnt = hostset_count(hosts);

	hostset_destroy(hosts);
	return node_cnt;
}
static int _sort_pids_by_name(void *x, void *y)
{
int diff = 0;
job_step_pids_t *rec_a = *(job_step_pids_t **)x;
job_step_pids_t *rec_b = *(job_step_pids_t **)y;
if (!rec_a->node_name || !rec_b->node_name)
return 0;
diff = strcmp(rec_a->node_name, rec_b->node_name);
if (diff > 0)
return 1;
else if (diff < 0)
return -1;
return 0;
}
static int _sort_stats_by_name(void *x, void *y)
{
job_step_stat_t *rec_a = *(job_step_stat_t **)x;
job_step_stat_t *rec_b = *(job_step_stat_t **)y;
if (!rec_a->step_pids || !rec_b->step_pids)
return 0;
return _sort_pids_by_name((void *)&rec_a->step_pids, (void *)&rec_b->step_pids);
}
/*
 * slurm_print_job_step_info_msg - output information about all Slurm
 *	job steps based upon message as loaded using slurm_get_job_steps
 * IN out - file to write to
 * IN job_step_info_msg_ptr - job step information message pointer
 * IN one_liner - print as a single line if true
 */
void
slurm_print_job_step_info_msg ( FILE* out,
		job_step_info_response_msg_t * job_step_info_msg_ptr,
		int one_liner )
{
	char update_str[32];
	int step_inx;
	job_step_info_t *steps = job_step_info_msg_ptr->job_steps;

	slurm_make_time_str((time_t *)&job_step_info_msg_ptr->last_update,
			    update_str, sizeof(update_str));
	fprintf(out, "Job step data as of %s, record count %d\n",
		update_str, job_step_info_msg_ptr->job_step_count);

	for (step_inx = 0; step_inx < job_step_info_msg_ptr->job_step_count;
	     step_inx++)
		slurm_print_job_step_info(out, steps + step_inx, one_liner);
}
/*
 * slurm_print_job_step_info - output information about a specific Slurm
 *	job step based upon message as loaded using slurm_get_job_steps
 * IN out - file to write to
 * IN job_ptr - an individual job step information record pointer
 * IN one_liner - print as a single line if true
 */
void
slurm_print_job_step_info ( FILE* out, job_step_info_t * job_step_ptr,
			    int one_liner )
{
	char *print_this = slurm_sprint_job_step_info(job_step_ptr, one_liner);

	/* slurm_sprint_job_step_info() returns NULL on failure; passing
	 * NULL through "%s" to fprintf() is undefined behavior, so only
	 * print (and free) a non-NULL result. */
	if (print_this) {
		fprintf(out, "%s", print_this);
		xfree(print_this);
	}
}
/*
 * slurm_sprint_job_step_info - output information about a specific Slurm
 *	job step based upon message as loaded using slurm_get_job_steps
 * IN job_ptr - an individual job step information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *	NULL is returned on failure.
 */
char *
slurm_sprint_job_step_info ( job_step_info_t * job_step_ptr,
			     int one_liner )
{
	char tmp_node_cnt[40];
	char time_str[32];
	char limit_str[32];
	char *out = NULL;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	/*
	 * Build each field with xstrfmtcat() directly into the growing
	 * output string rather than staging it in a fixed 128-byte buffer:
	 * partition names, node lists, step names, networks and checkpoint
	 * directories can exceed any fixed size and would otherwise be
	 * silently truncated.
	 */

	/****** Line 1 ******/
	slurm_make_time_str((time_t *)&job_step_ptr->start_time, time_str,
			    sizeof(time_str));
	if (job_step_ptr->time_limit == INFINITE)
		snprintf(limit_str, sizeof(limit_str), "UNLIMITED");
	else
		secs2time_str((time_t)job_step_ptr->time_limit * 60,
			      limit_str, sizeof(limit_str));
	if (job_step_ptr->array_job_id) {
		if (job_step_ptr->step_id == INFINITE) {	/* Pending */
			xstrfmtcat(out, "StepId=%u_%u.TBD ",
				   job_step_ptr->array_job_id,
				   job_step_ptr->array_task_id);
		} else {
			xstrfmtcat(out, "StepId=%u_%u.%u ",
				   job_step_ptr->array_job_id,
				   job_step_ptr->array_task_id,
				   job_step_ptr->step_id);
		}
	} else {
		if (job_step_ptr->step_id == INFINITE) {	/* Pending */
			xstrfmtcat(out, "StepId=%u.TBD ",
				   job_step_ptr->job_id);
		} else {
			xstrfmtcat(out, "StepId=%u.%u ",
				   job_step_ptr->job_id,
				   job_step_ptr->step_id);
		}
	}
	xstrfmtcat(out, "UserId=%u StartTime=%s TimeLimit=%s",
		   job_step_ptr->user_id, time_str, limit_str);
	xstrcat(out, one_liner ? " " : "\n ");

	/****** Line 2 ******/
	xstrfmtcat(out, "State=%s ", job_state_string(job_step_ptr->state));
	if (cluster_flags & CLUSTER_FLAG_BG) {
		/* On BlueGene the node list names midplanes, optionally
		 * qualified by the I/O node range within the midplane. */
		char *io_nodes = NULL;
		select_g_select_jobinfo_get(job_step_ptr->select_jobinfo,
					    SELECT_JOBDATA_IONODES,
					    &io_nodes);
		if (io_nodes) {
			xstrfmtcat(out,
				   "Partition=%s MidplaneList=%s[%s] Gres=%s",
				   job_step_ptr->partition,
				   job_step_ptr->nodes, io_nodes,
				   job_step_ptr->gres);
			xfree(io_nodes);
		} else {
			xstrfmtcat(out,
				   "Partition=%s MidplaneList=%s Gres=%s",
				   job_step_ptr->partition,
				   job_step_ptr->nodes,
				   job_step_ptr->gres);
		}
	} else {
		xstrfmtcat(out, "Partition=%s NodeList=%s Gres=%s",
			   job_step_ptr->partition, job_step_ptr->nodes,
			   job_step_ptr->gres);
	}
	xstrcat(out, one_liner ? " " : "\n ");

	/****** Line 3 ******/
	if (cluster_flags & CLUSTER_FLAG_BGQ) {
		/* On BGQ the real node count comes from the select plugin,
		 * not from the (midplane) node list. */
		uint32_t nodes = 0;
		select_g_select_jobinfo_get(job_step_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &nodes);
		convert_num_unit((float)nodes, tmp_node_cnt,
				 sizeof(tmp_node_cnt), UNIT_NONE);
	} else {
		convert_num_unit((float)_nodes_in_list(job_step_ptr->nodes),
				 tmp_node_cnt, sizeof(tmp_node_cnt),
				 UNIT_NONE);
	}
	xstrfmtcat(out, "Nodes=%s Tasks=%u Name=%s Network=%s",
		   tmp_node_cnt, job_step_ptr->num_tasks, job_step_ptr->name,
		   job_step_ptr->network);
	xstrcat(out, one_liner ? " " : "\n ");

	/****** Line 4 ******/
	xstrfmtcat(out, "ResvPorts=%s Checkpoint=%u CheckpointDir=%s",
		   job_step_ptr->resv_ports,
		   job_step_ptr->ckpt_interval, job_step_ptr->ckpt_dir);
	xstrcat(out, one_liner ? " " : "\n ");

	/****** Line 5 ******/
	if (job_step_ptr->cpu_freq == NO_VAL) {
		xstrcat(out, "CPUFreqReq=Default\n\n");
	} else if (job_step_ptr->cpu_freq & CPU_FREQ_RANGE_FLAG) {
		switch (job_step_ptr->cpu_freq) {
		case CPU_FREQ_LOW:
			xstrcat(out, "CPUFreqReq=Low\n\n");
			break;
		case CPU_FREQ_MEDIUM:
			xstrcat(out, "CPUFreqReq=Medium\n\n");
			break;
		case CPU_FREQ_HIGH:
			xstrcat(out, "CPUFreqReq=High\n\n");
			break;
		case CPU_FREQ_HIGHM1:
			xstrcat(out, "CPUFreqReq=HighM1\n\n");
			break;
		default:
			xstrcat(out, "CPUFreqReq=Unknown\n\n");
		}
	} else {
		xstrfmtcat(out, "CPUFreqReq=%u\n\n", job_step_ptr->cpu_freq);
	}
	return out;
}
/*
 * slurm_get_job_steps - issue RPC to get specific slurm job step
 *	configuration information if changed since update_time.
 *	a job_id value of NO_VAL implies all jobs, a step_id value of
 *	NO_VAL implies all steps
 * IN update_time - time of current configuration data
 * IN job_id - get information for specific job id, NO_VAL for all jobs
 * IN step_id - get information for specific job step id, NO_VAL for all
 *	job steps
 * IN job_info_msg_pptr - place to store a job configuration pointer
 * IN show_flags - job step filtering options
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_job_step_info_response_msg
 */
int
slurm_get_job_steps (time_t update_time, uint32_t job_id, uint32_t step_id,
		     job_step_info_response_msg_t **resp, uint16_t show_flags)
{
	job_step_info_request_msg_t step_req;
	slurm_msg_t request, response;
	int err_code;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&response);

	step_req.last_update = update_time;
	step_req.job_id = job_id;
	step_req.step_id = step_id;
	step_req.show_flags = show_flags;
	request.msg_type = REQUEST_JOB_STEP_INFO;
	request.data = &step_req;

	if (slurm_send_recv_controller_msg(&request, &response) < 0)
		return SLURM_ERROR;

	if (response.msg_type == RESPONSE_JOB_STEP_INFO) {
		/* Caller takes ownership of the response message */
		*resp = (job_step_info_response_msg_t *) response.data;
	} else if (response.msg_type == RESPONSE_SLURM_RC) {
		err_code = ((return_code_msg_t *) response.data)->return_code;
		slurm_free_return_code_msg(response.data);
		if (err_code)
			slurm_seterrno_ret(err_code);
		*resp = NULL;
	} else {
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_job_step_layout_get - get the node/task layout of a job step.
 * IN job_id - job of interest
 * IN step_id - step of interest
 * RET layout on success (free with slurm_job_step_layout_free());
 *	NULL on failure with errno set.
 */
extern slurm_step_layout_t *
slurm_job_step_layout_get(uint32_t job_id, uint32_t step_id)
{
	job_step_id_msg_t layout_req;
	slurm_msg_t request, response;
	int err_code;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&response);

	layout_req.job_id = job_id;
	layout_req.step_id = step_id;
	request.msg_type = REQUEST_STEP_LAYOUT;
	request.data = &layout_req;

	if (slurm_send_recv_controller_msg(&request, &response) < 0)
		return NULL;

	if (response.msg_type == RESPONSE_STEP_LAYOUT) {
		/* Caller takes ownership of the layout */
		return (slurm_step_layout_t *) response.data;
	}

	if (response.msg_type == RESPONSE_SLURM_RC) {
		err_code = ((return_code_msg_t *) response.data)->return_code;
		slurm_free_return_code_msg(response.data);
		errno = err_code;
	} else {
		errno = SLURM_UNEXPECTED_MSG_ERROR;
	}
	return NULL;
}
/*
 * slurm_job_step_stat - status a current step
 *
 * IN job_id
 * IN step_id
 * IN node_list, optional, if NULL then all nodes in step are returned.
 * OUT resp - may point to NULL, in which case a response container is
 *	allocated here; free with slurm_job_step_stat_response_msg_free()
 * RET SLURM_SUCCESS on success SLURM_ERROR else
 */
extern int slurm_job_step_stat(uint32_t job_id, uint32_t step_id,
			       char *node_list,
			       job_step_stat_response_msg_t **resp)
{
	slurm_msg_t req_msg;
	ListIterator itr;
	job_step_id_msg_t req;
	List ret_list = NULL;
	ret_data_info_t *ret_data_info = NULL;
	int rc = SLURM_SUCCESS;
	slurm_step_layout_t *step_layout = NULL;
	job_step_stat_response_msg_t *resp_out;
	bool created = 0;

	xassert(resp);

	/* No explicit node list given: look up the step's layout from the
	 * controller and query every node in the step. */
	if (!node_list) {
		if (!(step_layout =
		      slurm_job_step_layout_get(job_id, step_id))) {
			rc = errno;	/* set by slurm_job_step_layout_get() */
			error("slurm_job_step_stat: "
			      "problem getting step_layout for %u.%u: %s",
			      job_id, step_id, slurm_strerror(rc));
			return rc;
		}
		node_list = step_layout->node_list;
	}

	/* Allocate the response container unless the caller supplied one;
	 * track whether we created it so it can be released on error. */
	if (!*resp) {
		resp_out = xmalloc(sizeof(job_step_stat_response_msg_t));
		*resp = resp_out;
		created = 1;
	} else
		resp_out = *resp;

	debug("slurm_job_step_stat: "
	      "getting pid information of job %u.%u on nodes %s",
	      job_id, step_id, node_list);

	slurm_msg_t_init(&req_msg);

	memset(&req, 0, sizeof(job_step_id_msg_t));
	resp_out->job_id = req.job_id = job_id;
	resp_out->step_id = req.step_id = step_id;

	req_msg.msg_type = REQUEST_JOB_STEP_STAT;
	req_msg.data = &req;

	/* Fan the request out to every node in node_list; ret_list holds
	 * one response record per node. */
	if (!(ret_list = slurm_send_recv_msgs(node_list, &req_msg, 0, false))) {
		error("slurm_job_step_stat: got an error no list returned");
		rc = SLURM_ERROR;
		if (created) {
			slurm_job_step_stat_response_msg_free(resp_out);
			*resp = NULL;
		}
		goto cleanup;
	}

	itr = list_iterator_create(ret_list);
	while ((ret_data_info = list_next(itr))) {
		switch (ret_data_info->type) {
		case RESPONSE_JOB_STEP_STAT:
			if (!resp_out->stats_list)
				resp_out->stats_list = list_create(
					slurm_free_job_step_stat);
			/* Move ownership of the stat record into
			 * stats_list; clear the source pointer so the
			 * record is not freed again with ret_list. */
			list_push(resp_out->stats_list,
				  ret_data_info->data);
			ret_data_info->data = NULL;
			break;
		case RESPONSE_SLURM_RC:
			rc = slurm_get_return_code(ret_data_info->type,
						   ret_data_info->data);
			if (rc == ESLURM_INVALID_JOB_ID) {
				/* Step ended between the layout lookup and
				 * this request; log at debug, not error. */
				debug("slurm_job_step_stat: job step %u.%u "
				      "has already completed",
				      job_id, step_id);
			} else {
				error("slurm_job_step_stat: "
				      "there was an error with the request to "
				      "%s rc = %s",
				      ret_data_info->node_name,
				      slurm_strerror(rc));
			}
			break;
		default:
			rc = slurm_get_return_code(ret_data_info->type,
						   ret_data_info->data);
			error("slurm_job_step_stat: "
			      "unknown return given from %s: %d rc = %s",
			      ret_data_info->node_name, ret_data_info->type,
			      slurm_strerror(rc));
			break;
		}
	}
	list_iterator_destroy(itr);
	list_destroy(ret_list);

	/* Present results in a stable, node-name order */
	if (resp_out->stats_list)
		list_sort(resp_out->stats_list, (ListCmpF)_sort_stats_by_name);

cleanup:
	/* no-op when step_layout is NULL (caller supplied node_list) */
	slurm_step_layout_destroy(step_layout);
	return rc;
}
/*
 * slurm_job_step_get_pids - get the complete list of pids for a given
 * job step
 *
 * IN job_id
 * IN step_id
 * IN node_list, optional, if NULL then all nodes in step are returned.
 * OUT resp - may point to NULL, in which case a response container is
 *	allocated here; free with slurm_job_step_pids_response_msg_free()
 * RET SLURM_SUCCESS on success SLURM_ERROR else
 */
extern int slurm_job_step_get_pids(uint32_t job_id, uint32_t step_id,
				   char *node_list,
				   job_step_pids_response_msg_t **resp)
{
	int rc = SLURM_SUCCESS;
	slurm_msg_t req_msg;
	job_step_id_msg_t req;
	ListIterator itr;
	List ret_list = NULL;
	ret_data_info_t *ret_data_info = NULL;
	slurm_step_layout_t *step_layout = NULL;
	job_step_pids_response_msg_t *resp_out;
	bool created = 0;

	xassert(resp);

	/* No explicit node list given: look up the step's layout from the
	 * controller and query every node in the step. */
	if (!node_list) {
		if (!(step_layout =
		      slurm_job_step_layout_get(job_id, step_id))) {
			rc = errno;	/* set by slurm_job_step_layout_get() */
			error("slurm_job_step_get_pids: "
			      "problem getting step_layout for %u.%u: %s",
			      job_id, step_id, slurm_strerror(rc));
			return rc;
		}
		node_list = step_layout->node_list;
	}

	/* Allocate the response container unless the caller supplied one;
	 * track whether we created it so it can be released on error. */
	if (!*resp) {
		resp_out = xmalloc(sizeof(job_step_pids_response_msg_t));
		*resp = resp_out;
		created = 1;
	} else
		resp_out = *resp;

	debug("slurm_job_step_get_pids: "
	      "getting pid information of job %u.%u on nodes %s",
	      job_id, step_id, node_list);

	slurm_msg_t_init(&req_msg);

	memset(&req, 0, sizeof(job_step_id_msg_t));
	resp_out->job_id = req.job_id = job_id;
	resp_out->step_id = req.step_id = step_id;

	req_msg.msg_type = REQUEST_JOB_STEP_PIDS;
	req_msg.data = &req;

	/* Fan the request out to every node in node_list; ret_list holds
	 * one response record per node. */
	if (!(ret_list = slurm_send_recv_msgs(node_list,
					      &req_msg, 0, false))) {
		error("slurm_job_step_get_pids: got an error no list returned");
		rc = SLURM_ERROR;
		if (created) {
			slurm_job_step_pids_response_msg_free(resp_out);
			*resp = NULL;
		}
		goto cleanup;
	}

	itr = list_iterator_create(ret_list);
	while((ret_data_info = list_next(itr))) {
		switch (ret_data_info->type) {
		case RESPONSE_JOB_STEP_PIDS:
			if (!resp_out->pid_list)
				resp_out->pid_list = list_create(
					slurm_free_job_step_pids);
			/* Move ownership of the pids record into pid_list;
			 * clear the source pointer so the record is not
			 * freed again with ret_list. */
			list_push(resp_out->pid_list,
				  ret_data_info->data);
			ret_data_info->data = NULL;
			break;
		case RESPONSE_SLURM_RC:
			rc = slurm_get_return_code(ret_data_info->type,
						   ret_data_info->data);
			error("slurm_job_step_get_pids: "
			      "there was an error with the "
			      "list pid request rc = %s",
			      slurm_strerror(rc));
			break;
		default:
			rc = slurm_get_return_code(ret_data_info->type,
						   ret_data_info->data);
			error("slurm_job_step_get_pids: "
			      "unknown return given %d rc = %s",
			      ret_data_info->type, slurm_strerror(rc));
			break;
		}
	}
	list_iterator_destroy(itr);
	list_destroy(ret_list);

	/* Present results in a stable, node-name order */
	if (resp_out->pid_list)
		list_sort(resp_out->pid_list, (ListCmpF)_sort_pids_by_name);

cleanup:
	/* no-op when step_layout is NULL (caller supplied node_list) */
	slurm_step_layout_destroy(step_layout);
	return rc;
}
/* Public wrapper: release a layout returned by slurm_job_step_layout_get() */
extern void slurm_job_step_layout_free(slurm_step_layout_t *layout)
{
	slurm_step_layout_destroy(layout);
}
/* Public wrapper: release a single job_step_pids_t record */
extern void slurm_job_step_pids_free(job_step_pids_t *object)
{
	slurm_free_job_step_pids(object);
}
/*
 * Release a response allocated by slurm_job_step_get_pids(), including
 * its list of per-node pid records. Safe to call with NULL.
 */
extern void slurm_job_step_pids_response_msg_free(void *object)
{
	job_step_pids_response_msg_t *msg =
		(job_step_pids_response_msg_t *) object;

	if (!msg)
		return;

	if (msg->pid_list)
		list_destroy(msg->pid_list);
	xfree(msg);
}
/* Public wrapper: release a single job_step_stat_t record */
extern void slurm_job_step_stat_free(job_step_stat_t *object)
{
	slurm_free_job_step_stat(object);
}
/*
 * Release a response allocated by slurm_job_step_stat(), including its
 * list of per-node stat records. Safe to call with NULL.
 */
extern void slurm_job_step_stat_response_msg_free(void *object)
{
	job_step_stat_response_msg_t *msg =
		(job_step_stat_response_msg_t *) object;

	if (!msg)
		return;

	if (msg->stats_list)
		list_destroy(msg->stats_list);
	xfree(msg);
}