blob: 4c3a37ac2519d71e08495783bf222d2380a8362b [file] [log] [blame]
/****************************************************************************\
* sdiag.c - Utility for getting information about slurmctld behavior
*****************************************************************************
* Produced at Barcelona Supercomputing Center, December 2011
* Written by Alejandro Lucero <alucero@bsc.es>
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <stdlib.h>
#include <unistd.h>
#include <slurm/slurm.h>
#include "src/common/macros.h"
#include "src/common/read_config.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/slurm_time.h"
#include "src/common/uid.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/interfaces/data_parser.h"
#include "sdiag.h"
/********************
* Global Variables *
********************/
/*
* One record of RPC statistics, shared by the per-message-type table
* ("types") and the per-user table ("users") so both can use the same
* sort comparators.
* Each field uses the larger type size needed by either table.
*/
typedef struct {
/* RPC message type (types table) or user id (users table) */
uint32_t id;
/* number of RPCs recorded for this id */
uint32_t count;
/* accumulated time for this id, printed as "total_time" */
uint64_t time;
/* time / count; only computed by _sort_rpc() when count is non-zero */
uint64_t average_time;
/*
* The remaining fields are only filled in for the types table, and only
* when the response reports rpc_queue_enabled.
*/
uint16_t queued;
uint64_t dropped;
uint16_t cycle_last;
uint16_t cycle_max;
} rpc_stat_t;
/* Sortable copies of the RPC stats; allocated by _sort_rpc(), freed in main() */
static rpc_stat_t *types = NULL, *users = NULL;
/* Command line options; main() reads mode/mimetype/data_parser, _sort_rpc() reads sort */
struct sdiag_parameters params = {0};
/* Response filled in by slurm_get_statistics() and released in main() */
stats_info_response_msg_t *buf;
static int _print_stats(void);
static void _sort_rpc(void);
/* Request message; main() sets command_id before each call */
stats_info_request_msg_t req;
/* Defined outside this file; NOTE(review): presumably fills "params" - confirm */
extern void parse_command_line(int argc, char **argv);
/*
 * sdiag entry point.
 *
 * Depending on params.mode this either asks for the statistics to be reset,
 * or fetches them and prints them (through the data parser when a mimetype
 * was requested, as plain text otherwise).
 *
 * The process exit status is the return code of the last operation run.
 */
int main(int argc, char **argv)
{
	int rc = 0;

	slurm_init(NULL);
	parse_command_line(argc, argv);

	/* Reset request: nothing to fetch or print afterwards */
	if (params.mode == STAT_COMMAND_RESET) {
		req.command_id = STAT_COMMAND_RESET;
		rc = slurm_reset_statistics(&req);
		if (rc != SLURM_SUCCESS)
			slurm_perror("slurm_reset_statistics");
		else
			printf("Reset scheduling statistics\n");
		exit(rc);
	}

	req.command_id = STAT_COMMAND_GET;
	rc = slurm_get_statistics(&buf, &req);
	if (rc != SLURM_SUCCESS) {
		slurm_perror("slurm_get_statistics");
		exit(rc);
	}

	/* Build the sorted per-type and per-user tables before any output */
	_sort_rpc();

	if (!params.mimetype) {
		rc = _print_stats();
	} else {
		DATA_DUMP_CLI_SINGLE(OPENAPI_DIAG_RESP, buf, argc, argv, NULL,
				     params.mimetype, params.data_parser, rc);
	}

	slurm_free_stats_response_msg(buf);
	xfree(types);
	xfree(users);

	exit(rc);
}
/*
 * _print_stats - write the statistics held in the global "buf" to stdout as
 *	plain text.
 *
 * The per-type and per-user RPC sections are printed from the "types" and
 * "users" tables, so _sort_rpc() must have been run on the same "buf" first.
 *
 * RET 0 on success, -1 if there is no response buffer to print
 */
static int _print_stats(void)
{
	/*
	 * Unsigned index: it is compared against the counts in "buf" and
	 * printed with %u below.
	 */
	uint32_t i;
	time_t elapsed;

	if (!buf) {
		printf("No data available. Probably slurmctld is not working\n");
		return -1;
	}

	/*
	 * time_t has no portable printf conversion, so every timestamp is
	 * cast to long long and printed with %lld.
	 */
	printf("*******************************************************\n");
	printf("sdiag output at %s (%lld)\n",
	       slurm_ctime2(&buf->req_time), (long long) buf->req_time);
	printf("Data since %s (%lld)\n",
	       slurm_ctime2(&buf->req_time_start),
	       (long long) buf->req_time_start);
	printf("*******************************************************\n");

	/* Server and agent state */
	printf("Server thread count: %d\n", buf->server_thread_count);
	printf("RPC queue enabled: %d\n", buf->rpc_queue_enabled);
	printf("Agent queue size: %d\n", buf->agent_queue_size);
	printf("Agent count: %d\n", buf->agent_count);
	printf("Agent thread count: %d\n", buf->agent_thread_count);
	printf("DBD Agent queue size: %d\n\n", buf->dbd_agent_queue_size);

	/* Job counters */
	printf("Jobs submitted: %d\n", buf->jobs_submitted);
	printf("Jobs started: %d\n", buf->jobs_started);
	printf("Jobs completed: %d\n", buf->jobs_completed);
	printf("Jobs canceled: %d\n", buf->jobs_canceled);
	printf("Jobs failed: %d\n\n", buf->jobs_failed);

	printf("Job states ts: %s (%lld)\n",
	       slurm_ctime2(&buf->job_states_ts),
	       (long long) buf->job_states_ts);
	printf("Jobs pending: %d\n", buf->jobs_pending);
	printf("Jobs running: %d\n", buf->jobs_running);

	/* Main scheduler */
	printf("\nMain schedule statistics (microseconds):\n");
	printf("\tLast cycle: %u\n", buf->schedule_cycle_last);
	printf("\tMax cycle: %u\n", buf->schedule_cycle_max);
	printf("\tTotal cycles: %u\n", buf->schedule_cycle_counter);
	/* Means are only defined once at least one cycle has run */
	if (buf->schedule_cycle_counter > 0) {
		printf("\tMean cycle: %u\n",
		       buf->schedule_cycle_sum / buf->schedule_cycle_counter);
		printf("\tMean depth cycle: %u\n",
		       buf->schedule_cycle_depth / buf->schedule_cycle_counter);
	}
	/*
	 * Requiring more than 60 seconds of data guarantees that the
	 * whole-minute divisor below is at least 1.
	 */
	elapsed = buf->req_time - buf->req_time_start;
	if (elapsed > 60) {
		printf("\tCycles per minute: %u\n",
		       (uint32_t) (buf->schedule_cycle_counter /
				   (elapsed / 60)));
	}
	printf("\tLast queue length: %u\n", buf->schedule_queue_len);

	printf("\nMain scheduler exit:\n");
	for (i = 0; i < buf->schedule_exit_cnt; i++) {
		printf("\t%s:%2u\n", schedule_exit2string(i),
		       buf->schedule_exit[i]);
	}

	/* Backfill scheduler */
	if (buf->bf_active) {
		printf("\nBackfilling stats (WARNING: data obtained"
		       " in the middle of backfilling execution.)\n");
	} else {
		printf("\nBackfilling stats\n");
	}

	printf("\tTotal backfilled jobs (since last slurm start): %u\n",
	       buf->bf_backfilled_jobs);
	printf("\tTotal backfilled jobs (since last stats cycle start): %u\n",
	       buf->bf_last_backfilled_jobs);
	printf("\tTotal backfilled heterogeneous job components: %u\n",
	       buf->bf_backfilled_het_jobs);
	printf("\tTotal cycles: %u\n", buf->bf_cycle_counter);
	if (buf->bf_when_last_cycle > 0) {
		printf("\tLast cycle when: %s (%lld)\n",
		       slurm_ctime2(&buf->bf_when_last_cycle),
		       (long long) buf->bf_when_last_cycle);
	} else {
		printf("\tLast cycle when: N/A\n");
	}
	printf("\tLast cycle: %u\n", buf->bf_cycle_last);
	printf("\tMax cycle: %u\n", buf->bf_cycle_max);
	/*
	 * The bf_cycle_counter test is repeated instead of merged so that
	 * each mean stays next to its "last" value in the output.
	 */
	if (buf->bf_cycle_counter > 0) {
		printf("\tMean cycle: %"PRIu64"\n",
		       buf->bf_cycle_sum / buf->bf_cycle_counter);
	}
	printf("\tLast depth cycle: %u\n", buf->bf_last_depth);
	printf("\tLast depth cycle (try sched): %u\n", buf->bf_last_depth_try);
	if (buf->bf_cycle_counter > 0) {
		printf("\tDepth Mean: %u\n",
		       buf->bf_depth_sum / buf->bf_cycle_counter);
		printf("\tDepth Mean (try depth): %u\n",
		       buf->bf_depth_try_sum / buf->bf_cycle_counter);
	}
	printf("\tLast queue length: %u\n", buf->bf_queue_len);
	if (buf->bf_cycle_counter > 0) {
		printf("\tQueue length mean: %u\n",
		       buf->bf_queue_len_sum / buf->bf_cycle_counter);
	}
	printf("\tLast table size: %u\n", buf->bf_table_size);
	if (buf->bf_cycle_counter > 0) {
		printf("\tMean table size: %u\n",
		       buf->bf_table_size_sum / buf->bf_cycle_counter);
	}

	printf("\nBackfill exit\n");
	for (i = 0; i < buf->bf_exit_cnt; i++) {
		printf("\t%s:%2u\n", bf_exit2string(i),
		       buf->bf_exit[i]);
	}

	printf("\nLatency for 1000 calls to gettimeofday(): %d microseconds\n",
	       buf->gettimeofday_latency);

	/* RPC statistics by message type, in the order set by _sort_rpc() */
	printf("\nRemote Procedure Call statistics by message type\n");
	for (i = 0; i < buf->rpc_type_size; i++) {
		/* The queue columns are only filled when the RPC queue is on */
		if (!buf->rpc_queue_enabled)
			printf("\t%-40s(%5u) count:%-6u ave_time:%-6"PRIu64" total_time:%"PRIu64"\n",
			       rpc_num2string(types[i].id), types[i].id,
			       types[i].count, types[i].average_time,
			       types[i].time);
		else
			printf("\t%-40s(%5u) count:%-6u ave_time:%-6"PRIu64" total_time:%-12"PRIu64" queued:%-6u cycle_last:%-6u cycle_max:%-6u dropped:%"PRIu64"\n",
			       rpc_num2string(types[i].id), types[i].id,
			       types[i].count, types[i].average_time,
			       types[i].time, types[i].queued,
			       types[i].cycle_last, types[i].cycle_max,
			       types[i].dropped);
	}
	if (!buf->rpc_type_size)
		printf("\tNo RPCs recorded yet.\n");

	/* RPC statistics by user, in the order set by _sort_rpc() */
	printf("\nRemote Procedure Call statistics by user\n");
	for (i = 0; i < buf->rpc_user_size; i++) {
		/* uid_to_string() result is released with xfree() below */
		char *user = uid_to_string(users[i].id);

		printf("\t%-16s(%8u) count:%-6u ave_time:%-6"PRIu64" total_time:%"PRIu64"\n",
		       user, users[i].id, users[i].count, users[i].average_time,
		       users[i].time);
		xfree(user);
	}

	/* Pending RPC counts per message type */
	printf("\nPending RPC statistics\n");
	if (buf->rpc_queue_type_count == 0)
		printf("\tNo pending RPCs\n");
	for (i = 0; i < buf->rpc_queue_type_count; i++) {
		printf("\t%-40s(%5u) count:%-6u\n",
		       rpc_num2string(buf->rpc_queue_type_id[i]),
		       buf->rpc_queue_type_id[i],
		       buf->rpc_queue_count[i]);
	}

	/* Individual pending RPCs, numbered from 1 */
	if (buf->rpc_dump_count > 0) {
		printf("\nPending RPCs\n");
	}
	for (i = 0; i < buf->rpc_dump_count; i++) {
		printf("\t%2u: %-36s %s\n",
		       i + 1,
		       rpc_num2string(buf->rpc_dump_types[i]),
		       buf->rpc_dump_hostlist[i]);
	}

	return 0;
}
/*
 * qsort() comparator ordering rpc_stat_t records by id, lowest to highest.
 * RET 1, -1 or 0 when the first id is greater than, less than or equal to
 *	the second
 */
static int _sort_id(const void *p1, const void *p2)
{
	const rpc_stat_t *lhs = p1;
	const rpc_stat_t *rhs = p2;

	return (lhs->id > rhs->id) - (lhs->id < rhs->id);
}
/*
 * qsort() comparator ordering rpc_stat_t records by total time, highest to
 * lowest.
 * RET 1, -1 or 0 when the first time is less than, greater than or equal to
 *	the second
 */
static int _sort_time(const void *p1, const void *p2)
{
	const rpc_stat_t *lhs = p1;
	const rpc_stat_t *rhs = p2;

	/* Operands are swapped relative to an ascending sort */
	return (rhs->time > lhs->time) - (rhs->time < lhs->time);
}
/*
 * qsort() comparator ordering rpc_stat_t records by average time, highest to
 * lowest.
 * RET 1, -1 or 0 when the first average is less than, greater than or equal
 *	to the second
 */
static int _sort_average_time(const void *p1, const void *p2)
{
	const rpc_stat_t *lhs = p1;
	const rpc_stat_t *rhs = p2;

	/* Operands are swapped relative to an ascending sort */
	return (rhs->average_time > lhs->average_time) -
	       (rhs->average_time < lhs->average_time);
}
/*
 * qsort() comparator ordering rpc_stat_t records by count, highest to lowest.
 * RET 1, -1 or 0 when the first count is less than, greater than or equal to
 *	the second
 */
static int _sort_count(const void *p1, const void *p2)
{
	const rpc_stat_t *lhs = p1;
	const rpc_stat_t *rhs = p2;

	/* Operands are swapped relative to an ascending sort */
	return (rhs->count > lhs->count) - (rhs->count < lhs->count);
}
/*
 * _sort_rpc - copy the per-type and per-user RPC statistics from the global
 *	"buf" into the "types" and "users" tables and sort both tables.
 *
 * The sort key is selected by params.sort: id (ascending), total time or
 * average time (descending); any other value sorts by count (descending).
 *
 * Both tables are allocated here with xcalloc(); the caller releases them
 * with xfree(). Does nothing when "buf" is NULL.
 */
static void _sort_rpc(void)
{
	int (*sort_function)(const void *, const void *);

	/*
	 * _print_stats() treats a NULL buf as "no data available"; bail out
	 * here as well instead of dereferencing it first.
	 */
	if (!buf)
		return;

	types = xcalloc(buf->rpc_type_size, sizeof(*types));
	for (uint32_t i = 0; i < buf->rpc_type_size; i++) {
		types[i].id = buf->rpc_type_id[i];
		types[i].count = buf->rpc_type_cnt[i];
		types[i].time = buf->rpc_type_time[i];
		/* Skip the division when nothing was counted */
		if (buf->rpc_type_cnt[i])
			types[i].average_time = buf->rpc_type_time[i] /
						buf->rpc_type_cnt[i];
		/* The queue arrays are only read when the queue is enabled */
		if (buf->rpc_queue_enabled) {
			types[i].queued = buf->rpc_type_queued[i];
			types[i].dropped = buf->rpc_type_dropped[i];
			types[i].cycle_last = buf->rpc_type_cycle_last[i];
			types[i].cycle_max = buf->rpc_type_cycle_max[i];
		}
	}

	users = xcalloc(buf->rpc_user_size, sizeof(*users));
	for (uint32_t i = 0; i < buf->rpc_user_size; i++) {
		users[i].id = buf->rpc_user_id[i];
		users[i].count = buf->rpc_user_cnt[i];
		users[i].time = buf->rpc_user_time[i];
		if (buf->rpc_user_cnt[i])
			users[i].average_time = buf->rpc_user_time[i] /
						buf->rpc_user_cnt[i];
	}

	if (params.sort == SORT_ID)
		sort_function = _sort_id;
	else if (params.sort == SORT_TIME)
		sort_function = _sort_time;
	else if (params.sort == SORT_TIME2)
		sort_function = _sort_average_time;
	else
		sort_function = _sort_count;

	/*
	 * Fewer than two entries need no sorting. Skipping the call also
	 * avoids handing qsort() a base pointer that may be NULL for an
	 * empty table (NOTE(review): depends on what xcalloc() returns for a
	 * zero count - confirm).
	 */
	if (buf->rpc_type_size > 1)
		qsort(types, buf->rpc_type_size, sizeof(*types),
		      sort_function);
	if (buf->rpc_user_size > 1)
		qsort(users, buf->rpc_user_size, sizeof(*users),
		      sort_function);
}