| /*****************************************************************************\ |
| * sstat.c - job accounting reports for Slurm's slurmdb/log plugin |
| ***************************************************************************** |
| * Copyright (C) 2008 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include "sstat.h" |
| |
| /* |
| * Globals |
| */ |
| sstat_parameters_t params; |
| print_field_t fields[] = { |
| {10, "AllocTRES", print_fields_str, PRINT_TRESA}, |
| {10, "AveCPU", print_fields_str, PRINT_AVECPU}, |
| {10, "AveCPUFreq", print_fields_str, PRINT_ACT_CPUFREQ}, |
| {12, "AveDiskRead", print_fields_str, PRINT_AVEDISKREAD}, |
| {12, "AveDiskWrite", print_fields_str, PRINT_AVEDISKWRITE}, |
| {10, "AvePages", print_fields_str, PRINT_AVEPAGES}, |
| {10, "AveRSS", print_fields_str, PRINT_AVERSS}, |
| {10, "AveVMSize", print_fields_str, PRINT_AVEVSIZE}, |
| {14, "ConsumedEnergy", print_fields_str, PRINT_CONSUMED_ENERGY}, |
| {17, "ConsumedEnergyRaw", print_fields_uint64, |
| PRINT_CONSUMED_ENERGY_RAW}, |
| {-12, "JobID", print_fields_str, PRINT_JOBID}, |
| {12, "MaxDiskRead", print_fields_str, PRINT_MAXDISKREAD}, |
| {15, "MaxDiskReadNode", print_fields_str, PRINT_MAXDISKREADNODE}, |
| {15, "MaxDiskReadTask", print_fields_uint, PRINT_MAXDISKREADTASK}, |
| {12, "MaxDiskWrite", print_fields_str, PRINT_MAXDISKWRITE}, |
| {16, "MaxDiskWriteNode", print_fields_str, PRINT_MAXDISKWRITENODE}, |
| {16, "MaxDiskWriteTask", print_fields_uint, PRINT_MAXDISKWRITETASK}, |
| {8, "MaxPages", print_fields_str, PRINT_MAXPAGES}, |
| {12, "MaxPagesNode", print_fields_str, PRINT_MAXPAGESNODE}, |
| {14, "MaxPagesTask", print_fields_uint, PRINT_MAXPAGESTASK}, |
| {10, "MaxRSS", print_fields_str, PRINT_MAXRSS}, |
| {10, "MaxRSSNode", print_fields_str, PRINT_MAXRSSNODE}, |
| {10, "MaxRSSTask", print_fields_uint, PRINT_MAXRSSTASK}, |
| {10, "MaxVMSize", print_fields_str, PRINT_MAXVSIZE}, |
| {14, "MaxVMSizeNode", print_fields_str, PRINT_MAXVSIZENODE}, |
| {14, "MaxVMSizeTask", print_fields_uint, PRINT_MAXVSIZETASK}, |
| {10, "MinCPU", print_fields_str, PRINT_MINCPU}, |
| {10, "MinCPUNode", print_fields_str, PRINT_MINCPUNODE}, |
| {10, "MinCPUTask", print_fields_uint, PRINT_MINCPUTASK}, |
| {20, "Nodelist", print_fields_str, PRINT_NODELIST}, |
| {8, "NTasks", print_fields_uint, PRINT_NTASKS}, |
| {20, "Pids", print_fields_str, PRINT_PIDS}, |
| {10, "ReqCPUFreq", print_fields_str, PRINT_REQ_CPUFREQ_MIN}, /*vestigial*/ |
| {13, "ReqCPUFreqMin", print_fields_str, PRINT_REQ_CPUFREQ_MIN}, |
| {13, "ReqCPUFreqMax", print_fields_str, PRINT_REQ_CPUFREQ_MAX}, |
| {13, "ReqCPUFreqGov", print_fields_str, PRINT_REQ_CPUFREQ_GOV}, |
| {14, "TRESUsageInAve", print_fields_str, PRINT_TRESUIA}, |
| {14, "TRESUsageInMax", print_fields_str, PRINT_TRESUIM}, |
| {18, "TRESUsageInMaxNode", print_fields_str, PRINT_TRESUIMN}, |
| {18, "TRESUsageInMaxTask", print_fields_str, PRINT_TRESUIMT}, |
| {14, "TRESUsageInMin", print_fields_str, PRINT_TRESUIMI}, |
| {18, "TRESUsageInMinNode", print_fields_str, PRINT_TRESUIMIN}, |
| {18, "TRESUsageInMinTask", print_fields_str, PRINT_TRESUIMIT}, |
| {14, "TRESUsageInTot", print_fields_str, PRINT_TRESUIT}, |
| {15, "TRESUsageOutAve", print_fields_str, PRINT_TRESUOA}, |
| {15, "TRESUsageOutMax", print_fields_str, PRINT_TRESUOM}, |
| {19, "TRESUsageOutMaxNode", print_fields_str, PRINT_TRESUOMN}, |
| {19, "TRESUsageOutMaxTask", print_fields_str, PRINT_TRESUOMT}, |
| {15, "TRESUsageOutMin", print_fields_str, PRINT_TRESUOMI}, |
| {19, "TRESUsageOutMinNode", print_fields_str, PRINT_TRESUOMIN}, |
| {19, "TRESUsageOutMinTask", print_fields_str, PRINT_TRESUOMIT}, |
| {15, "TRESUsageOutTot", print_fields_str, PRINT_TRESUOT}, |
| {0, NULL, NULL, 0}}; |
| |
| list_t *jobs = NULL; |
| slurmdb_job_rec_t job; |
| slurmdb_step_rec_t step; |
| list_t *print_fields_list = NULL; |
| list_itr_t *print_fields_itr = NULL; |
| int field_count = 0; |
| |
| int _do_stat(slurm_step_id_t *step_id, char *nodelist, uint32_t req_cpufreq_min, |
| uint32_t req_cpufreq_max, uint32_t req_cpufreq_gov, |
| uint16_t use_protocol_ver, char *tres_alloc_str) |
| { |
| job_step_stat_response_msg_t *step_stat_response = NULL; |
| int rc = SLURM_SUCCESS; |
| list_itr_t *itr; |
| jobacctinfo_t *total_jobacct = NULL; |
| job_step_stat_t *step_stat = NULL; |
| int ntasks = 0; |
| int tot_tasks = 0; |
| hostlist_t *hl = NULL; |
| char *ave_usage_tmp = NULL; |
| |
| debug("requesting info for %ps", step_id); |
| if ((rc = slurm_job_step_stat(step_id, |
| nodelist, use_protocol_ver, |
| &step_stat_response)) != SLURM_SUCCESS) { |
| if (rc == ESLURM_INVALID_JOB_ID) { |
| debug("%ps has already completed", |
| step_id); |
| } else { |
| error("problem getting step_layout for %ps: %s", |
| step_id, slurm_strerror(rc)); |
| } |
| slurm_job_step_pids_response_msg_free(step_stat_response); |
| return rc; |
| } |
| |
| memset(&job, 0, sizeof(slurmdb_job_rec_t)); |
| job.jobid = step_id->job_id; |
| |
| memset(&step, 0, sizeof(slurmdb_step_rec_t)); |
| |
| memset(&step.stats, 0, sizeof(slurmdb_stats_t)); |
| |
| step.job_ptr = &job; |
| memcpy(&step.step_id, step_id, sizeof(step.step_id)); |
| step.nodes = xmalloc(BUF_SIZE); |
| step.req_cpufreq_min = req_cpufreq_min; |
| step.req_cpufreq_max = req_cpufreq_max; |
| step.req_cpufreq_gov = req_cpufreq_gov; |
| step.stepname = NULL; |
| step.state = JOB_RUNNING; |
| step.tres_alloc_str = tres_alloc_str; |
| hl = hostlist_create(NULL); |
| itr = list_iterator_create(step_stat_response->stats_list); |
| while ((step_stat = list_next(itr))) { |
| if (!step_stat->step_pids || !step_stat->step_pids->node_name) |
| continue; |
| if (step_stat->step_pids->pid_cnt > 0 ) { |
| int i; |
| for(i=0; i<step_stat->step_pids->pid_cnt; i++) { |
| if (step.pid_str) |
| xstrcat(step.pid_str, ","); |
| xstrfmtcat(step.pid_str, "%u", |
| step_stat->step_pids->pid[i]); |
| } |
| } |
| |
| if (params.pid_format) { |
| step.nodes = step_stat->step_pids->node_name; |
| print_fields(&step); |
| xfree(step.pid_str); |
| } else { |
| hostlist_push_host(hl, step_stat->step_pids->node_name); |
| ntasks += step_stat->num_tasks; |
| if (step_stat->jobacct) { |
| if (!assoc_mgr_tres_list && |
| step_stat->jobacct->tres_list) { |
| assoc_mgr_lock_t locks = |
| { .tres = WRITE_LOCK }; |
| assoc_mgr_lock(&locks); |
| assoc_mgr_post_tres_list( |
| step_stat->jobacct->tres_list); |
| assoc_mgr_unlock(&locks); |
| /* |
| * assoc_mgr_post_tres_list destroys the |
| * input list |
| */ |
| step_stat->jobacct->tres_list = NULL; |
| } |
| |
| /* |
| * total_jobacct has to be created after |
| * assoc_mgr is set up. |
| */ |
| if (!total_jobacct) |
| total_jobacct = |
| jobacctinfo_create(NULL); |
| |
| jobacctinfo_aggregate(total_jobacct, |
| step_stat->jobacct); |
| } |
| } |
| } |
| list_iterator_destroy(itr); |
| |
| if (total_jobacct) { |
| jobacctinfo_2_stats(&step.stats, total_jobacct); |
| jobacctinfo_destroy(total_jobacct); |
| } |
| |
| slurm_job_step_pids_response_msg_free(step_stat_response); |
| /* we printed it out already */ |
| if (params.pid_format) |
| goto getout; |
| |
| hostlist_sort(hl); |
| hostlist_ranged_string(hl, BUF_SIZE, step.nodes); |
| hostlist_destroy(hl); |
| tot_tasks += ntasks; |
| |
| if (tot_tasks) { |
| step.stats.act_cpufreq /= (double)tot_tasks; |
| |
| ave_usage_tmp = step.stats.tres_usage_in_ave; |
| step.stats.tres_usage_in_ave = slurmdb_ave_tres_usage( |
| ave_usage_tmp, tot_tasks); |
| xfree(ave_usage_tmp); |
| ave_usage_tmp = step.stats.tres_usage_out_ave; |
| step.stats.tres_usage_out_ave = slurmdb_ave_tres_usage( |
| ave_usage_tmp, tot_tasks); |
| xfree(ave_usage_tmp); |
| |
| step.ntasks = tot_tasks; |
| } |
| |
| print_fields(&step); |
| |
| getout: |
| |
| xfree(step.stats.tres_usage_in_max); |
| xfree(step.stats.tres_usage_out_max); |
| xfree(step.stats.tres_usage_in_max_taskid); |
| xfree(step.stats.tres_usage_out_max_taskid); |
| xfree(step.stats.tres_usage_in_max_nodeid); |
| xfree(step.stats.tres_usage_out_max_nodeid); |
| xfree(step.stats.tres_usage_in_ave); |
| xfree(step.stats.tres_usage_out_ave); |
| |
| return rc; |
| } |
| |
| int main(int argc, char **argv) |
| { |
| list_itr_t *itr = NULL; |
| slurm_step_id_t step_id = { |
| .job_id = 0, |
| .step_id = NO_VAL, |
| .step_het_comp = NO_VAL, |
| }; |
| slurm_selected_step_t *selected_step = NULL; |
| |
| slurm_init(NULL); |
| |
| print_fields_list = list_create(NULL); |
| print_fields_itr = list_iterator_create(print_fields_list); |
| |
| parse_command_line(argc, argv); |
| if (!params.opt_job_list || !list_count(params.opt_job_list)) { |
| error("You didn't give me any jobs to stat."); |
| return 1; |
| } |
| |
| print_fields_header(print_fields_list); |
| itr = list_iterator_create(params.opt_job_list); |
| while ((selected_step = list_next(itr))) { |
| job_step_info_response_msg_t *step_info = NULL; |
| |
| memcpy(&step_id, &selected_step->step_id, sizeof(step_id)); |
| |
| if (slurm_get_job_steps(0, step_id.job_id, step_id.step_id, |
| &step_info, SHOW_ALL)) { |
| error("couldn't get steps for job %u", |
| selected_step->step_id.job_id); |
| continue; |
| } else if (!step_info->job_step_count) { |
| if (step_id.step_id == NO_VAL) |
| error("No steps running for job %u", |
| selected_step->step_id.job_id); |
| else |
| error("%ps not found running.", |
| &selected_step->step_id); |
| |
| continue; |
| } |
| |
| for (int i = 0; i < step_info->job_step_count; i++) { |
| /* If no stepid was requested set it to the first one */ |
| if (step_id.step_id == NO_VAL) { |
| /* |
| * If asking for no particular step skip the |
| * special steps. |
| */ |
| if (!params.opt_all_steps && |
| (step_info->job_steps[i].step_id.step_id > |
| SLURM_MAX_NORMAL_STEP_ID)) |
| continue; |
| step_id.step_id = step_info->job_steps[i]. |
| step_id.step_id; |
| } |
| |
| if (!params.opt_all_steps && |
| !verify_step_id(&step_info->job_steps[i].step_id, |
| &step_id)) |
| continue; |
| |
| _do_stat(&step_info->job_steps[i].step_id, |
| step_info->job_steps[i].nodes, |
| step_info->job_steps[i].cpu_freq_min, |
| step_info->job_steps[i].cpu_freq_max, |
| step_info->job_steps[i].cpu_freq_gov, |
| step_info->job_steps[i].start_protocol_ver, |
| step_info->job_steps[i].tres_fmt_alloc_str); |
| } |
| } |
| list_iterator_destroy(itr); |
| |
| xfree(params.opt_field_list); |
| FREE_NULL_LIST(params.opt_job_list); |
| if (print_fields_itr) |
| list_iterator_destroy(print_fields_itr); |
| FREE_NULL_LIST(print_fields_list); |
| |
| return 0; |
| } |