blob: 114b64f9b324c92ce0877694415287aa6ca2e1ca [file] [log] [blame]
/*****************************************************************************\
* squeue.c - Report jobs in the slurm system
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2009 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Joey Ekstrom <ekstrom1@llnl.gov>,
* Morris Jette <jette1@llnl.gov>, et. al.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#ifdef HAVE_TERMCAP_H
# include <termcap.h>
#endif
#include <sys/ioctl.h>
#include <termios.h>
#include "slurm/slurm.h"
#include "src/common/read_config.h"
#include "src/common/slurm_time.h"
#include "src/common/xstring.h"
#include "src/interfaces/data_parser.h"
#include "src/interfaces/select.h"
#include "src/squeue/squeue.h"
typedef struct {
int job_ids_count;
slurm_selected_step_t *job_ids;
int index;
} job_state_args_t;
/********************
* Global Variables *
********************/
struct squeue_parameters params;
int max_line_size;
/*************
* Functions *
*************/
static int _get_info(bool clear_old, bool log_cluster_name, int argc,
char **argv);
static int _get_window_width( void );
static int _multi_cluster(list_t *clusters, int argc, char **argv);
static int _print_job(bool clear_old, bool log_cluster_name, int argc,
char **argv);
static int _print_job_steps(bool clear_old, int argc, char **argv);
int
main (int argc, char **argv)
{
log_options_t opts = LOG_OPTS_STDERR_ONLY ;
int error_code = SLURM_SUCCESS;
slurm_init(NULL);
log_init(xbasename(argv[0]), opts, SYSLOG_FACILITY_USER, NULL);
parse_command_line( argc, argv );
if (params.verbose) {
opts.stderr_level += params.verbose;
log_alter(opts, SYSLOG_FACILITY_USER, NULL);
}
max_line_size = _get_window_width( );
if (params.clusters)
working_cluster_rec = list_peek(params.clusters);
while (1) {
if ((!params.no_header) &&
(params.iterate || params.verbose || params.long_list))
print_date();
if (!params.clusters) {
if (_get_info(false, false, argc, argv))
error_code = 1;
} else if (_multi_cluster(params.clusters, argc, argv))
error_code = 1;
if ( params.iterate ) {
printf( "\n");
sleep( params.iterate );
} else
break;
}
if ( error_code != SLURM_SUCCESS )
exit (error_code);
else
exit (0);
}
static int _multi_cluster(list_t *clusters, int argc, char **argv)
{
list_itr_t *itr;
bool log_cluster_name = false, first = true;
int rc = 0, rc2;
if ((list_count(clusters) > 1) && params.no_header)
log_cluster_name = true;
itr = list_iterator_create(clusters);
while ((working_cluster_rec = list_next(itr))) {
if (!params.no_header) {
if (first)
first = false;
else
printf("\n");
printf("CLUSTER: %s\n", working_cluster_rec->name);
}
rc2 = _get_info(true, log_cluster_name, argc, argv);
if (rc2)
rc = 1;
}
list_iterator_destroy(itr);
return rc;
}
static int _get_info(bool clear_old, bool log_cluster_name, int argc,
char **argv)
{
if ( params.step_flag )
return _print_job_steps(clear_old, argc, argv);
else
return _print_job(clear_old, log_cluster_name, argc, argv);
}
/* get_window_width - return the size of the window STDOUT goes to */
static int
_get_window_width( void )
{
int width = 80;
#ifdef TIOCGSIZE
struct ttysize win;
if (ioctl (STDOUT_FILENO, TIOCGSIZE, &win) == 0)
width = win.ts_cols;
#elif defined TIOCGWINSZ
struct winsize win;
if (ioctl (STDOUT_FILENO, TIOCGWINSZ, &win) == 0)
width = win.ws_col;
#else
const char *s;
s = getenv("COLUMNS");
if (s)
width = strtol(s,NULL,10);
#endif
return width;
}
static int _foreach_add_job(void *x, void *arg)
{
job_state_args_t *args = arg;
squeue_job_step_t *job_step_id = x;
slurm_selected_step_t *id = &args->job_ids[args->index];
*id = (slurm_selected_step_t) SLURM_SELECTED_STEP_INITIALIZER;
/* FIXME: squeue_job_step_t doesn't include HetComponent */
id->array_task_id = job_step_id->array_id;
id->step_id = job_step_id->step_id;
args->index++;
return SLURM_SUCCESS;
}
static void _populate_array_job_states(job_state_response_job_t *src,
slurm_job_info_t *job)
{
xassert(src->array_job_id);
job->array_job_id = src->array_job_id;
job->array_task_id = src->array_task_id;
if (!src->array_task_id_bitmap)
return;
job->array_bitmap = bit_copy(src->array_task_id_bitmap);
job->array_task_str = bit_fmt_full(job->array_bitmap);
}
static int _query_job_states(int argc, char **argv)
{
int rc = SLURM_SUCCESS;
job_state_response_msg_t *jsr = NULL;
job_state_args_t args = { 0 };
job_info_msg_t *job_msg = NULL;
if (params.job_list) {
args.job_ids_count = list_count(params.job_list);
args.job_ids = xcalloc(args.job_ids_count,
sizeof(*args.job_ids));
if (list_for_each_ro(params.job_list, _foreach_add_job,
&args) < 0)
fatal("list job_ids should not fail");
}
if ((rc = slurm_load_job_state(args.job_ids_count, args.job_ids, &jsr)))
goto cleanup;
if (params.mimetype) {
openapi_resp_job_state_t resp = {
.jobs = jsr,
};
DATA_DUMP_CLI(OPENAPI_JOB_STATE_RESP, resp, argc, argv, NULL,
params.mimetype, params.data_parser, rc);
goto cleanup;
}
if (!params.format && !params.format_long)
params.format = "%.18i %.2t";
if (!params.format_list) {
if (params.format)
rc = parse_format(params.format);
else if (params.format_long)
rc = parse_long_format(params.format_long);
if (rc)
goto cleanup;
}
job_msg = xmalloc(sizeof(*job_msg));
if (jsr && (jsr->jobs_count > 0)) {
job_msg->record_count = jsr->jobs_count;
job_msg->job_array = xcalloc(job_msg->record_count,
sizeof(*job_msg->job_array));
for (int i = 0; i < jsr->jobs_count; i++) {
job_state_response_job_t *src = &jsr->jobs[i];
slurm_job_info_t *job = &job_msg->job_array[i];
job->job_id = src->job_id;
if (src->array_job_id) {
_populate_array_job_states(src, job);
} else if ((job->het_job_id = src->het_job_id)) {
job->het_job_offset =
(src->job_id - job->het_job_id);
job->array_task_id = NO_VAL;
} else {
job->array_task_id = NO_VAL;
}
job->job_state = src->state;
}
}
print_jobs_array(job_msg->job_array, job_msg->record_count,
params.format_list);
cleanup:
#ifdef MEMORY_LEAK_DEBUG
slurm_free_job_info_msg(job_msg);
xfree(args.job_ids);
slurm_free_job_state_response_msg(jsr);
#endif
return rc;
}
/* _print_job - print the specified job's information */
static int _print_job(bool clear_old, bool log_cluster_name, int argc,
char **argv)
{
static job_info_msg_t *old_job_ptr;
job_info_msg_t *new_job_ptr = NULL;
int error_code;
uint16_t show_flags = 0;
if (params.all_flag || (params.job_list && list_count(params.job_list)))
show_flags |= SHOW_ALL;
if (params.federation_flag)
show_flags |= SHOW_FEDERATION;
if (params.local_flag)
show_flags |= SHOW_LOCAL;
if (params.sibling_flag)
show_flags |= SHOW_FEDERATION | SHOW_SIBLING;
/* We require detail data when CPUs are requested */
if ((params.format && strstr(params.format, "C")) || params.detail_flag)
show_flags |= SHOW_DETAIL;
if (old_job_ptr) {
if (clear_old)
old_job_ptr->last_update = 0;
if (params.job_id) {
error_code = slurm_load_job(
&new_job_ptr, params.job_id,
show_flags);
} else if (params.user_id) {
error_code = slurm_load_job_user(&new_job_ptr,
params.user_id,
show_flags);
} else {
if (params.clusters)
show_flags |= SHOW_LOCAL;
error_code = slurm_load_jobs(
old_job_ptr->last_update,
&new_job_ptr, show_flags);
}
if (error_code == SLURM_SUCCESS)
slurm_free_job_info_msg( old_job_ptr );
else if (errno == SLURM_NO_CHANGE_IN_DATA) {
error_code = SLURM_SUCCESS;
new_job_ptr = old_job_ptr;
}
} else if (params.only_state) {
return (error_code = _query_job_states(argc, argv));
} else if (params.job_id) {
error_code = slurm_load_job(&new_job_ptr, params.job_id,
show_flags);
} else if (params.user_id) {
error_code = slurm_load_job_user(&new_job_ptr, params.user_id,
show_flags);
} else {
error_code = slurm_load_jobs((time_t) NULL, &new_job_ptr,
show_flags);
}
if (error_code) {
slurm_perror ("slurm_load_jobs error");
return SLURM_ERROR;
}
old_job_ptr = new_job_ptr;
if (params.mimetype) {
int rc;
squeue_filter_jobs_for_json(new_job_ptr);
openapi_resp_job_info_msg_t resp = {
.jobs = new_job_ptr,
.last_backfill = new_job_ptr->last_backfill,
.last_update = new_job_ptr->last_update,
};
DATA_DUMP_CLI(OPENAPI_JOB_INFO_RESP, resp, argc, argv, NULL,
params.mimetype, params.data_parser, rc);
#ifdef MEMORY_LEAK_DEBUG
slurm_free_job_info_msg(new_job_ptr);
#endif
return rc;
}
if (params.job_id || params.user_id)
old_job_ptr->last_update = (time_t) 0;
if (params.verbose) {
printf ("last_update_time=%ld records=%u\n",
(long) new_job_ptr->last_update,
new_job_ptr->record_count);
}
if (!params.format && !params.format_long) {
if (log_cluster_name)
xstrcat(params.format_long, "cluster:10 ,");
if (params.long_list) {
xstrcat(params.format_long,
"jobarrayid:.18 ,partition:.9 ,name:.8 ,"
"username:.8 ,state:.8 ,timeused:.10 ,"
"timelimit:.9 ,numnodes:.6 ,reasonlist:0");
} else {
xstrcat(params.format_long,
"jobarrayid:.18 ,partition:.9 ,name:.8 ,"
"username:.8 ,statecompact:.2 ,timeused:.10 ,"
"numnodes:.6 ,reasonlist:0");
}
}
if (!params.format_list) {
if (params.format)
parse_format(params.format);
else if (params.format_long)
parse_long_format(params.format_long);
}
print_jobs_array(new_job_ptr->job_array, new_job_ptr->record_count,
params.format_list) ;
return SLURM_SUCCESS;
}
/* _print_job_step - print the specified job step's information */
static int _print_job_steps(bool clear_old, int argc, char **argv)
{
int error_code;
static job_step_info_response_msg_t * old_step_ptr = NULL;
static job_step_info_response_msg_t * new_step_ptr;
uint16_t show_flags = 0;
if (params.all_flag)
show_flags |= SHOW_ALL;
if (params.local_flag)
show_flags |= SHOW_LOCAL;
if (old_step_ptr) {
if (clear_old)
old_step_ptr->last_update = 0;
/* Use a last_update time of 0 so that we can get an updated
* run_time for jobs rather than just its start_time */
error_code = slurm_get_job_steps((time_t) 0, NO_VAL, NO_VAL,
&new_step_ptr, show_flags);
if (error_code == SLURM_SUCCESS)
slurm_free_job_step_info_response_msg( old_step_ptr );
else if (errno == SLURM_NO_CHANGE_IN_DATA) {
error_code = SLURM_SUCCESS;
new_step_ptr = old_step_ptr;
}
} else {
error_code = slurm_get_job_steps((time_t) 0, NO_VAL, NO_VAL,
&new_step_ptr, show_flags);
}
if (error_code) {
slurm_perror ("slurm_get_job_steps error");
return SLURM_ERROR;
}
old_step_ptr = new_step_ptr;
if (params.mimetype) {
int rc;
openapi_resp_job_step_info_msg_t resp = {
.steps = new_step_ptr,
.last_update = new_step_ptr->last_update,
};
DATA_DUMP_CLI(OPENAPI_STEP_INFO_MSG, resp, argc, argv, NULL,
params.mimetype, params.data_parser, rc);
#ifdef MEMORY_LEAK_DEBUG
slurm_free_job_step_info_response_msg(new_step_ptr);
#endif
return rc;
}
if (params.verbose) {
printf ("last_update_time=%ld records=%u\n",
(long) new_step_ptr->last_update,
new_step_ptr->job_step_count);
}
if (!params.format && !params.format_long)
params.format = "%.15i %.8j %.9P %.8u %.9M %N";
if (!params.format_list) {
if (params.format)
parse_format(params.format);
else if (params.format_long)
parse_long_format(params.format_long);
}
print_steps_array( new_step_ptr->job_steps,
new_step_ptr->job_step_count,
params.format_list );
return SLURM_SUCCESS;
}