|  | /****************************************************************************\ | 
|  | *  sdiag.c - Utility for getting information about slurmctld behavior | 
|  | ***************************************************************************** | 
|  | *  Produced at Barcelona Supercomputing Center, December 2011 | 
|  | *  Written by Alejandro Lucero <alucero@bsc.es> | 
|  | * | 
|  | *  This file is part of Slurm, a resource management program. | 
|  | *  For details, see <https://slurm.schedmd.com/>. | 
|  | *  Please also read the included file: DISCLAIMER. | 
|  | * | 
|  | *  Slurm is free software; you can redistribute it and/or modify it under | 
|  | *  the terms of the GNU General Public License as published by the Free | 
|  | *  Software Foundation; either version 2 of the License, or (at your option) | 
|  | *  any later version. | 
|  | * | 
|  | *  In addition, as a special exception, the copyright holders give permission | 
|  | *  to link the code of portions of this program with the OpenSSL library under | 
|  | *  certain conditions as described in each individual source file, and | 
|  | *  distribute linked combinations including the two. You must obey the GNU | 
|  | *  General Public License in all respects for all of the code used other than | 
|  | *  OpenSSL. If you modify file(s) with this exception, you may extend this | 
|  | *  exception to your version of the file(s), but you are not obligated to do | 
|  | *  so. If you do not wish to do so, delete this exception statement from your | 
|  | *  version.  If you delete this exception statement from all source files in | 
|  | *  the program, then also delete it here. | 
|  | * | 
|  | *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | *  details. | 
|  | * | 
|  | *  You should have received a copy of the GNU General Public License along | 
|  | *  with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | \*****************************************************************************/ | 
|  |  | 
|  | #include "config.h" | 
|  |  | 
|  | #include <stdlib.h> | 
|  | #include <unistd.h> | 
|  |  | 
|  | #include <slurm/slurm.h> | 
|  | #include "src/common/macros.h" | 
|  | #include "src/common/read_config.h" | 
|  | #include "src/common/slurm_protocol_defs.h" | 
|  | #include "src/common/slurm_time.h" | 
|  | #include "src/common/uid.h" | 
|  | #include "src/common/xmalloc.h" | 
|  | #include "src/common/xstring.h" | 
|  |  | 
|  | #include "src/interfaces/data_parser.h" | 
|  |  | 
|  | #include "sdiag.h" | 
|  |  | 
|  | /******************** | 
|  | * Global Variables * | 
|  | ********************/ | 
|  |  | 
/*
 * One row of RPC statistics. The same record layout serves both the
 * per-message-type table and the per-user table, so every field uses the
 * wider of the two source types.
 */
typedef struct {
	uint32_t id;		/* RPC message type, or user id */
	uint32_t count;		/* copied from the matching *_cnt array */
	uint64_t time;		/* copied from the matching *_time array */
	uint64_t average_time;	/* time / count; only set when count != 0 */
	uint16_t queued;	/* per-type only, set when RPC queue enabled */
	uint64_t dropped;	/* per-type only, set when RPC queue enabled */
	uint16_t cycle_last;	/* per-type only, set when RPC queue enabled */
	uint16_t cycle_max;	/* per-type only, set when RPC queue enabled */
} rpc_stat_t;
|  |  | 
|  | static rpc_stat_t *types = NULL, *users = NULL; | 
|  |  | 
|  | struct sdiag_parameters params = {0}; | 
|  |  | 
|  | stats_info_response_msg_t *buf; | 
|  |  | 
|  | static int  _print_stats(void); | 
|  | static void _sort_rpc(void); | 
|  |  | 
|  | stats_info_request_msg_t req; | 
|  |  | 
|  | extern void parse_command_line(int argc, char **argv); | 
|  |  | 
|  | int main(int argc, char **argv) | 
|  | { | 
|  | int rc = 0; | 
|  |  | 
|  | slurm_init(NULL); | 
|  | parse_command_line(argc, argv); | 
|  |  | 
|  | if (params.mode == STAT_COMMAND_RESET) { | 
|  | req.command_id = STAT_COMMAND_RESET; | 
|  | rc = slurm_reset_statistics((stats_info_request_msg_t *)&req); | 
|  | if (rc == SLURM_SUCCESS) | 
|  | printf("Reset scheduling statistics\n"); | 
|  | else | 
|  | slurm_perror("slurm_reset_statistics"); | 
|  | } else { | 
|  | req.command_id = STAT_COMMAND_GET; | 
|  | rc = slurm_get_statistics(&buf, &req); | 
|  | if (rc == SLURM_SUCCESS) { | 
|  | _sort_rpc(); | 
|  |  | 
|  | if (params.mimetype) { | 
|  | DATA_DUMP_CLI_SINGLE(OPENAPI_DIAG_RESP, buf, | 
|  | argc, argv, NULL, | 
|  | params.mimetype, | 
|  | params.data_parser, rc); | 
|  | } else { | 
|  | rc = _print_stats(); | 
|  | } | 
|  | slurm_free_stats_response_msg(buf); | 
|  | xfree(types); | 
|  | xfree(users); | 
|  | } else | 
|  | slurm_perror("slurm_get_statistics"); | 
|  | } | 
|  |  | 
|  | exit(rc); | 
|  | } | 
|  |  | 
|  | static int _print_stats(void) | 
|  | { | 
|  | int i; | 
|  |  | 
|  | if (!buf) { | 
|  | printf("No data available. Probably slurmctld is not working\n"); | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | printf("*******************************************************\n"); | 
|  | printf("sdiag output at %s (%ld)\n", | 
|  | slurm_ctime2(&buf->req_time), buf->req_time); | 
|  | printf("Data since      %s (%ld)\n", | 
|  | slurm_ctime2(&buf->req_time_start), buf->req_time_start); | 
|  | printf("*******************************************************\n"); | 
|  |  | 
|  | printf("Server thread count:  %d\n", buf->server_thread_count); | 
|  | printf("RPC queue enabled:    %d\n", buf->rpc_queue_enabled); | 
|  | printf("Agent queue size:     %d\n", buf->agent_queue_size); | 
|  | printf("Agent count:          %d\n", buf->agent_count); | 
|  | printf("Agent thread count:   %d\n", buf->agent_thread_count); | 
|  | printf("DBD Agent queue size: %d\n\n", buf->dbd_agent_queue_size); | 
|  |  | 
|  | printf("Jobs submitted: %d\n", buf->jobs_submitted); | 
|  | printf("Jobs started:   %d\n", buf->jobs_started); | 
|  | printf("Jobs completed: %d\n", buf->jobs_completed); | 
|  | printf("Jobs canceled:  %d\n", buf->jobs_canceled); | 
|  | printf("Jobs failed:    %d\n\n", buf->jobs_failed); | 
|  |  | 
|  | printf("Job states ts:  %s (%ld)\n", | 
|  | slurm_ctime2(&buf->job_states_ts), buf->job_states_ts); | 
|  | printf("Jobs pending:   %d\n", buf->jobs_pending); | 
|  | printf("Jobs running:   %d\n", buf->jobs_running); | 
|  |  | 
|  | printf("\nMain schedule statistics (microseconds):\n"); | 
|  | printf("\tLast cycle:   %u\n", buf->schedule_cycle_last); | 
|  | printf("\tMax cycle:    %u\n", buf->schedule_cycle_max); | 
|  | printf("\tTotal cycles: %u\n", buf->schedule_cycle_counter); | 
|  | if (buf->schedule_cycle_counter > 0) { | 
|  | printf("\tMean cycle:   %u\n", | 
|  | buf->schedule_cycle_sum / buf->schedule_cycle_counter); | 
|  | printf("\tMean depth cycle:  %u\n", | 
|  | buf->schedule_cycle_depth / buf->schedule_cycle_counter); | 
|  | } | 
|  | if ((buf->req_time - buf->req_time_start) > 60) { | 
|  | printf("\tCycles per minute: %u\n", | 
|  | (uint32_t) (buf->schedule_cycle_counter / | 
|  | ((buf->req_time - buf->req_time_start) / 60))); | 
|  | } | 
|  | printf("\tLast queue length: %u\n", buf->schedule_queue_len); | 
|  |  | 
|  | printf("\nMain scheduler exit:\n"); | 
|  |  | 
|  | for (i = 0; i < buf->schedule_exit_cnt; i++) { | 
|  | printf("\t%s:%2u\n", schedule_exit2string(i), | 
|  | buf->schedule_exit[i]); | 
|  | } | 
|  |  | 
|  | if (buf->bf_active) { | 
|  | printf("\nBackfilling stats (WARNING: data obtained" | 
|  | " in the middle of backfilling execution.)\n"); | 
|  | } else | 
|  | printf("\nBackfilling stats\n"); | 
|  |  | 
|  | printf("\tTotal backfilled jobs (since last slurm start): %u\n", | 
|  | buf->bf_backfilled_jobs); | 
|  | printf("\tTotal backfilled jobs (since last stats cycle start): %u\n", | 
|  | buf->bf_last_backfilled_jobs); | 
|  | printf("\tTotal backfilled heterogeneous job components: %u\n", | 
|  | buf->bf_backfilled_het_jobs); | 
|  | printf("\tTotal cycles: %u\n", buf->bf_cycle_counter); | 
|  | if (buf->bf_when_last_cycle > 0) { | 
|  | printf("\tLast cycle when: %s (%ld)\n", | 
|  | slurm_ctime2(&buf->bf_when_last_cycle), | 
|  | buf->bf_when_last_cycle); | 
|  | } else { | 
|  | printf("\tLast cycle when: N/A\n"); | 
|  | } | 
|  | printf("\tLast cycle: %u\n", buf->bf_cycle_last); | 
|  | printf("\tMax cycle:  %u\n", buf->bf_cycle_max); | 
|  | if (buf->bf_cycle_counter > 0) { | 
|  | printf("\tMean cycle: %"PRIu64"\n", | 
|  | buf->bf_cycle_sum / buf->bf_cycle_counter); | 
|  | } | 
|  | printf("\tLast depth cycle: %u\n", buf->bf_last_depth); | 
|  | printf("\tLast depth cycle (try sched): %u\n", buf->bf_last_depth_try); | 
|  | if (buf->bf_cycle_counter > 0) { | 
|  | printf("\tDepth Mean: %u\n", | 
|  | buf->bf_depth_sum / buf->bf_cycle_counter); | 
|  | printf("\tDepth Mean (try depth): %u\n", | 
|  | buf->bf_depth_try_sum / buf->bf_cycle_counter); | 
|  | } | 
|  | printf("\tLast queue length: %u\n", buf->bf_queue_len); | 
|  | if (buf->bf_cycle_counter > 0) { | 
|  | printf("\tQueue length mean: %u\n", | 
|  | buf->bf_queue_len_sum / buf->bf_cycle_counter); | 
|  | } | 
|  | printf("\tLast table size: %u\n", buf->bf_table_size); | 
|  | if (buf->bf_cycle_counter > 0) { | 
|  | printf("\tMean table size: %u\n", | 
|  | buf->bf_table_size_sum / buf->bf_cycle_counter); | 
|  | } | 
|  | printf("\nBackfill exit\n"); | 
|  |  | 
|  | for (i = 0; i < buf->bf_exit_cnt; i++) { | 
|  | printf("\t%s:%2u\n", bf_exit2string(i), | 
|  | buf->bf_exit[i]); | 
|  | } | 
|  |  | 
|  | printf("\nLatency for 1000 calls to gettimeofday(): %d microseconds\n", | 
|  | buf->gettimeofday_latency); | 
|  |  | 
|  | printf("\nRemote Procedure Call statistics by message type\n"); | 
|  | for (i = 0; i < buf->rpc_type_size; i++) { | 
|  | if (!buf->rpc_queue_enabled) | 
|  | printf("\t%-40s(%5u) count:%-6u ave_time:%-6"PRIu64" total_time:%"PRIu64"\n", | 
|  | rpc_num2string(types[i].id), types[i].id, | 
|  | types[i].count, types[i].average_time, | 
|  | types[i].time); | 
|  | else | 
|  | printf("\t%-40s(%5u) count:%-6u ave_time:%-6"PRIu64" total_time:%-12"PRIu64" queued:%-6u cycle_last:%-6u cycle_max:%-6u dropped:%"PRIu64"\n", | 
|  | rpc_num2string(types[i].id), types[i].id, | 
|  | types[i].count, types[i].average_time, | 
|  | types[i].time, types[i].queued, | 
|  | types[i].cycle_last, types[i].cycle_max, | 
|  | types[i].dropped); | 
|  | } | 
|  | if (!buf->rpc_type_size) | 
|  | printf("\tNo RPCs recorded yet.\n"); | 
|  |  | 
|  | printf("\nRemote Procedure Call statistics by user\n"); | 
|  | for (i = 0; i < buf->rpc_user_size; i++) { | 
|  | char *user = uid_to_string(users[i].id); | 
|  |  | 
|  | printf("\t%-16s(%8u) count:%-6u ave_time:%-6"PRIu64" total_time:%"PRIu64"\n", | 
|  | user, users[i].id, users[i].count, users[i].average_time, | 
|  | users[i].time); | 
|  |  | 
|  | xfree(user); | 
|  | } | 
|  |  | 
|  | printf("\nPending RPC statistics\n"); | 
|  | if (buf->rpc_queue_type_count == 0) | 
|  | printf("\tNo pending RPCs\n"); | 
|  | for (i = 0; i < buf->rpc_queue_type_count; i++){ | 
|  | printf("\t%-40s(%5u) count:%-6u\n", | 
|  | rpc_num2string(buf->rpc_queue_type_id[i]), | 
|  | buf->rpc_queue_type_id[i], | 
|  | buf->rpc_queue_count[i]); | 
|  | } | 
|  |  | 
|  | if (buf->rpc_dump_count > 0) { | 
|  | printf("\nPending RPCs\n"); | 
|  | } | 
|  |  | 
|  | for (i = 0; i < buf->rpc_dump_count; i++) { | 
|  | printf("\t%2u: %-36s %s\n", | 
|  | i+1, | 
|  | rpc_num2string(buf->rpc_dump_types[i]), | 
|  | buf->rpc_dump_hostlist[i]); | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* lowest to highest */ | 
|  | static int _sort_id(const void *p1, const void *p2) | 
|  | { | 
|  | const rpc_stat_t *s1 = p1, *s2 = p2; | 
|  |  | 
|  | if (s1->id > s2->id) | 
|  | return 1; | 
|  | else if (s1->id < s2->id) | 
|  | return -1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* highest to lowest */ | 
|  | static int _sort_time(const void *p1, const void *p2) | 
|  | { | 
|  | const rpc_stat_t *s1 = p1, *s2 = p2; | 
|  |  | 
|  | if (s1->time < s2->time) | 
|  | return 1; | 
|  | else if (s1->time > s2->time) | 
|  | return -1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* highest to lowest */ | 
|  | static int _sort_average_time(const void *p1, const void *p2) | 
|  | { | 
|  | const rpc_stat_t *s1 = p1, *s2 = p2; | 
|  |  | 
|  | if (s1->average_time < s2->average_time) | 
|  | return 1; | 
|  | else if (s1->average_time > s2->average_time) | 
|  | return -1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* highest to lowest */ | 
|  | static int _sort_count(const void *p1, const void *p2) | 
|  | { | 
|  | const rpc_stat_t *s1 = p1, *s2 = p2; | 
|  |  | 
|  | if (s1->count < s2->count) | 
|  | return 1; | 
|  | else if (s1->count > s2->count) | 
|  | return -1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static void _sort_rpc(void) | 
|  | { | 
|  | int (*sort_function)(const void *, const void *) = _sort_count; | 
|  |  | 
|  | types = xcalloc(buf->rpc_type_size, sizeof(rpc_stat_t)); | 
|  | for (int i = 0; i < buf->rpc_type_size; i++) { | 
|  | types[i].id = buf->rpc_type_id[i]; | 
|  | types[i].count = buf->rpc_type_cnt[i]; | 
|  | types[i].time = buf->rpc_type_time[i]; | 
|  | if (buf->rpc_type_cnt[i]) | 
|  | types[i].average_time = buf->rpc_type_time[i] / | 
|  | buf->rpc_type_cnt[i]; | 
|  | if (buf->rpc_queue_enabled) { | 
|  | types[i].queued = buf->rpc_type_queued[i]; | 
|  | types[i].dropped = buf->rpc_type_dropped[i]; | 
|  | types[i].cycle_last = buf->rpc_type_cycle_last[i]; | 
|  | types[i].cycle_max = buf->rpc_type_cycle_max[i]; | 
|  | } | 
|  | } | 
|  |  | 
|  | users = xcalloc(buf->rpc_user_size, sizeof(rpc_stat_t)); | 
|  | for (int i = 0; i < buf->rpc_user_size; i++) { | 
|  | users[i].id = buf->rpc_user_id[i]; | 
|  | users[i].count = buf->rpc_user_cnt[i]; | 
|  | users[i].time = buf->rpc_user_time[i]; | 
|  | if (buf->rpc_user_cnt[i]) | 
|  | users[i].average_time = buf->rpc_user_time[i] / | 
|  | buf->rpc_user_cnt[i]; | 
|  | } | 
|  |  | 
|  | if (params.sort == SORT_ID) | 
|  | sort_function = _sort_id; | 
|  | else if (params.sort == SORT_TIME) | 
|  | sort_function = _sort_time; | 
|  | else if (params.sort == SORT_TIME2) | 
|  | sort_function = _sort_average_time; | 
|  | else | 
|  | sort_function = _sort_count; | 
|  |  | 
|  | qsort(types, buf->rpc_type_size, sizeof(rpc_stat_t), sort_function); | 
|  | qsort(users, buf->rpc_user_size, sizeof(rpc_stat_t), sort_function); | 
|  | } |