/* blob: fb4e7afd10c3f21463bfe9a8fd98719860e94506 [file] [log] [blame] (source-viewer residue, not part of the program) */
/*****************************************************************************\
* sstat.c - report status information for running Slurm job steps
*****************************************************************************
* Copyright (C) 2008 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include "sstat.h"
/*
* Globals
*/
/*
 * Command line options. main() reads params.opt_job_list,
 * params.opt_all_steps and params.opt_field_list after calling
 * parse_command_line(); _do_stat() reads params.pid_format.
 * NOTE(review): presumably filled in by parse_command_line() - confirm.
 */
sstat_parameters_t params;
/*
 * Table of the columns sstat knows how to print.
 *
 * Each entry holds, in order: a number, the column title, the print_fields_*
 * routine used for the value, and the PRINT_* identifier of the column.
 * The number tracks the length of the title, so it is presumably the column
 * width (JobID is the only negative one; NOTE(review): presumably that
 * selects left justification - confirm against print_field_t).
 * The table is terminated by an all-zero/NULL entry.
 */
print_field_t fields[] = {
{10, "AllocTRES", print_fields_str, PRINT_TRESA},
{10, "AveCPU", print_fields_str, PRINT_AVECPU},
{10, "AveCPUFreq", print_fields_str, PRINT_ACT_CPUFREQ},
{12, "AveDiskRead", print_fields_str, PRINT_AVEDISKREAD},
{12, "AveDiskWrite", print_fields_str, PRINT_AVEDISKWRITE},
{10, "AvePages", print_fields_str, PRINT_AVEPAGES},
{10, "AveRSS", print_fields_str, PRINT_AVERSS},
{10, "AveVMSize", print_fields_str, PRINT_AVEVSIZE},
{14, "ConsumedEnergy", print_fields_str, PRINT_CONSUMED_ENERGY},
{17, "ConsumedEnergyRaw", print_fields_uint64,
PRINT_CONSUMED_ENERGY_RAW},
{-12, "JobID", print_fields_str, PRINT_JOBID},
{12, "MaxDiskRead", print_fields_str, PRINT_MAXDISKREAD},
{15, "MaxDiskReadNode", print_fields_str, PRINT_MAXDISKREADNODE},
{15, "MaxDiskReadTask", print_fields_uint, PRINT_MAXDISKREADTASK},
{12, "MaxDiskWrite", print_fields_str, PRINT_MAXDISKWRITE},
{16, "MaxDiskWriteNode", print_fields_str, PRINT_MAXDISKWRITENODE},
{16, "MaxDiskWriteTask", print_fields_uint, PRINT_MAXDISKWRITETASK},
{8, "MaxPages", print_fields_str, PRINT_MAXPAGES},
{12, "MaxPagesNode", print_fields_str, PRINT_MAXPAGESNODE},
{14, "MaxPagesTask", print_fields_uint, PRINT_MAXPAGESTASK},
{10, "MaxRSS", print_fields_str, PRINT_MAXRSS},
{10, "MaxRSSNode", print_fields_str, PRINT_MAXRSSNODE},
{10, "MaxRSSTask", print_fields_uint, PRINT_MAXRSSTASK},
{10, "MaxVMSize", print_fields_str, PRINT_MAXVSIZE},
{14, "MaxVMSizeNode", print_fields_str, PRINT_MAXVSIZENODE},
{14, "MaxVMSizeTask", print_fields_uint, PRINT_MAXVSIZETASK},
{10, "MinCPU", print_fields_str, PRINT_MINCPU},
{10, "MinCPUNode", print_fields_str, PRINT_MINCPUNODE},
{10, "MinCPUTask", print_fields_uint, PRINT_MINCPUTASK},
{20, "Nodelist", print_fields_str, PRINT_NODELIST},
{8, "NTasks", print_fields_uint, PRINT_NTASKS},
{20, "Pids", print_fields_str, PRINT_PIDS},
/* "ReqCPUFreq" is an older title kept as an alias of "ReqCPUFreqMin". */
{10, "ReqCPUFreq", print_fields_str, PRINT_REQ_CPUFREQ_MIN}, /*vestigial*/
{13, "ReqCPUFreqMin", print_fields_str, PRINT_REQ_CPUFREQ_MIN},
{13, "ReqCPUFreqMax", print_fields_str, PRINT_REQ_CPUFREQ_MAX},
{13, "ReqCPUFreqGov", print_fields_str, PRINT_REQ_CPUFREQ_GOV},
{14, "TRESUsageInAve", print_fields_str, PRINT_TRESUIA},
{14, "TRESUsageInMax", print_fields_str, PRINT_TRESUIM},
{18, "TRESUsageInMaxNode", print_fields_str, PRINT_TRESUIMN},
{18, "TRESUsageInMaxTask", print_fields_str, PRINT_TRESUIMT},
{14, "TRESUsageInMin", print_fields_str, PRINT_TRESUIMI},
{18, "TRESUsageInMinNode", print_fields_str, PRINT_TRESUIMIN},
{18, "TRESUsageInMinTask", print_fields_str, PRINT_TRESUIMIT},
{14, "TRESUsageInTot", print_fields_str, PRINT_TRESUIT},
{15, "TRESUsageOutAve", print_fields_str, PRINT_TRESUOA},
{15, "TRESUsageOutMax", print_fields_str, PRINT_TRESUOM},
{19, "TRESUsageOutMaxNode", print_fields_str, PRINT_TRESUOMN},
{19, "TRESUsageOutMaxTask", print_fields_str, PRINT_TRESUOMT},
{15, "TRESUsageOutMin", print_fields_str, PRINT_TRESUOMI},
{19, "TRESUsageOutMinNode", print_fields_str, PRINT_TRESUOMIN},
{19, "TRESUsageOutMinTask", print_fields_str, PRINT_TRESUOMIT},
{15, "TRESUsageOutTot", print_fields_str, PRINT_TRESUOT},
/* End-of-table marker. */
{0, NULL, NULL, 0}};
/* Not referenced by the code in this part of the file. */
list_t *jobs = NULL;
/*
 * Scratch job and step records: _do_stat() zeroes and refills both for every
 * step it reports, links step.job_ptr to job, and passes &step to
 * print_fields().
 */
slurmdb_job_rec_t job;
slurmdb_step_rec_t step;
/*
 * List of columns to print and an iterator over it; both are created in
 * main() before the command line is parsed and destroyed before main()
 * returns. The list is handed to print_fields_header().
 */
list_t *print_fields_list = NULL;
list_itr_t *print_fields_itr = NULL;
/* Not referenced by the code in this part of the file. */
int field_count = 0;
/*
 * _do_stat - fetch the live statistics of one job step and print them.
 *
 * The global `job` and `step` records are zeroed and refilled on every call
 * and `step` is handed to print_fields().
 *
 * When params.pid_format is set, one line is printed per responding node
 * (node name plus the pids reported for that node).  Otherwise the per node
 * accounting data is aggregated and a single line is printed for the step.
 *
 * IN step_id          - step to query
 * IN nodelist         - node list passed to slurm_job_step_stat()
 * IN req_cpufreq_min  - copied into the step record
 * IN req_cpufreq_max  - copied into the step record
 * IN req_cpufreq_gov  - copied into the step record
 * IN use_protocol_ver - protocol version passed to slurm_job_step_stat()
 * IN tres_alloc_str   - borrowed by the step record while printing; it is
 *                       neither copied nor freed here
 * RET SLURM_SUCCESS, or the error code returned by slurm_job_step_stat()
 */
int _do_stat(slurm_step_id_t *step_id, char *nodelist, uint32_t req_cpufreq_min,
	     uint32_t req_cpufreq_max, uint32_t req_cpufreq_gov,
	     uint16_t use_protocol_ver, char *tres_alloc_str)
{
	job_step_stat_response_msg_t *step_stat_response = NULL;
	int rc = SLURM_SUCCESS;
	list_itr_t *itr;
	jobacctinfo_t *total_jobacct = NULL;
	job_step_stat_t *step_stat = NULL;
	int tot_tasks = 0;
	hostlist_t *hl = NULL;
	char *nodes_buf = NULL;
	char *ave_usage_tmp = NULL;

	debug("requesting info for %ps", step_id);
	rc = slurm_job_step_stat(step_id, nodelist, use_protocol_ver,
				 &step_stat_response);
	if (rc != SLURM_SUCCESS) {
		if (rc == ESLURM_INVALID_JOB_ID) {
			debug("%ps has already completed",
			      step_id);
		} else {
			error("problem getting step_layout for %ps: %s",
			      step_id, slurm_strerror(rc));
		}
		/* Use the free routine that matches the response type. */
		slurm_job_step_stat_response_msg_free(step_stat_response);
		return rc;
	}

	memset(&job, 0, sizeof(slurmdb_job_rec_t));
	job.jobid = step_id->job_id;

	/* Zeroing the whole record also clears step.stats. */
	memset(&step, 0, sizeof(slurmdb_step_rec_t));
	step.job_ptr = &job;
	memcpy(&step.step_id, step_id, sizeof(step.step_id));

	/*
	 * Keep our own handle on the node list buffer: step.nodes is pointed
	 * at response-owned node names in pid_format mode, which would
	 * otherwise lose (and leak) the allocation.
	 */
	nodes_buf = xmalloc(BUF_SIZE);
	step.nodes = nodes_buf;
	step.req_cpufreq_min = req_cpufreq_min;
	step.req_cpufreq_max = req_cpufreq_max;
	step.req_cpufreq_gov = req_cpufreq_gov;
	step.stepname = NULL;
	step.state = JOB_RUNNING;
	step.tres_alloc_str = tres_alloc_str;

	hl = hostlist_create(NULL);
	itr = list_iterator_create(step_stat_response->stats_list);
	while ((step_stat = list_next(itr))) {
		if (!step_stat->step_pids || !step_stat->step_pids->node_name)
			continue;

		/* Append this node's pids as a comma separated list. */
		for (int i = 0; i < step_stat->step_pids->pid_cnt; i++) {
			if (step.pid_str)
				xstrcat(step.pid_str, ",");
			xstrfmtcat(step.pid_str, "%u",
				   step_stat->step_pids->pid[i]);
		}

		if (params.pid_format) {
			/* One output line per node; start a fresh pid list. */
			step.nodes = step_stat->step_pids->node_name;
			print_fields(&step);
			xfree(step.pid_str);
			continue;
		}

		hostlist_push_host(hl, step_stat->step_pids->node_name);
		tot_tasks += step_stat->num_tasks;

		if (!step_stat->jobacct)
			continue;

		if (!assoc_mgr_tres_list && step_stat->jobacct->tres_list) {
			assoc_mgr_lock_t locks = { .tres = WRITE_LOCK };

			assoc_mgr_lock(&locks);
			assoc_mgr_post_tres_list(
				step_stat->jobacct->tres_list);
			assoc_mgr_unlock(&locks);
			/*
			 * assoc_mgr_post_tres_list destroys the
			 * input list
			 */
			step_stat->jobacct->tres_list = NULL;
		}

		/*
		 * total_jobacct has to be created after
		 * assoc_mgr is set up.
		 */
		if (!total_jobacct)
			total_jobacct = jobacctinfo_create(NULL);
		jobacctinfo_aggregate(total_jobacct, step_stat->jobacct);
	}
	list_iterator_destroy(itr);

	if (total_jobacct) {
		jobacctinfo_2_stats(&step.stats, total_jobacct);
		jobacctinfo_destroy(total_jobacct);
	}

	slurm_job_step_stat_response_msg_free(step_stat_response);

	/* we printed it out already */
	if (params.pid_format)
		goto getout;

	hostlist_sort(hl);
	hostlist_ranged_string(hl, BUF_SIZE, step.nodes);

	if (tot_tasks) {
		/* Turn the aggregated totals into per task averages. */
		step.stats.act_cpufreq /= (double)tot_tasks;

		ave_usage_tmp = step.stats.tres_usage_in_ave;
		step.stats.tres_usage_in_ave = slurmdb_ave_tres_usage(
			ave_usage_tmp, tot_tasks);
		xfree(ave_usage_tmp);

		ave_usage_tmp = step.stats.tres_usage_out_ave;
		step.stats.tres_usage_out_ave = slurmdb_ave_tres_usage(
			ave_usage_tmp, tot_tasks);
		xfree(ave_usage_tmp);

		step.ntasks = tot_tasks;
	}

	print_fields(&step);

getout:
	/* Released on both paths; the pid_format path used to leak these. */
	hostlist_destroy(hl);
	xfree(step.pid_str);
	/*
	 * step.nodes refers either to nodes_buf or to memory owned by the
	 * already released response, and tres_alloc_str belongs to the
	 * caller: do not leave stale pointers behind in the global record.
	 */
	step.nodes = NULL;
	step.tres_alloc_str = NULL;
	xfree(nodes_buf);

	xfree(step.stats.tres_usage_in_max);
	xfree(step.stats.tres_usage_out_max);
	xfree(step.stats.tres_usage_in_max_taskid);
	xfree(step.stats.tres_usage_out_max_taskid);
	xfree(step.stats.tres_usage_in_max_nodeid);
	xfree(step.stats.tres_usage_out_max_nodeid);
	xfree(step.stats.tres_usage_in_ave);
	xfree(step.stats.tres_usage_out_ave);

	return rc;
}
/*
 * main - sstat entry point.
 *
 * Parses the command line, prints the column header, and for every requested
 * job/step asks the controller for the matching steps and reports each
 * selected one through _do_stat().
 *
 * RET 1 when no job was given on the command line, 0 otherwise (errors for
 *     individual jobs/steps are logged and do not change the exit code).
 */
int main(int argc, char **argv)
{
	list_itr_t *itr = NULL;
	slurm_step_id_t step_id = {
		.job_id = 0,
		.step_id = NO_VAL,
		.step_het_comp = NO_VAL,
	};
	slurm_selected_step_t *selected_step = NULL;

	slurm_init(NULL);

	print_fields_list = list_create(NULL);
	print_fields_itr = list_iterator_create(print_fields_list);

	parse_command_line(argc, argv);
	if (!params.opt_job_list || !list_count(params.opt_job_list)) {
		error("You didn't give me any jobs to stat.");
		return 1;
	}

	print_fields_header(print_fields_list);

	itr = list_iterator_create(params.opt_job_list);
	while ((selected_step = list_next(itr))) {
		job_step_info_response_msg_t *step_info = NULL;

		memcpy(&step_id, &selected_step->step_id, sizeof(step_id));

		if (slurm_get_job_steps(0, step_id.job_id, step_id.step_id,
					&step_info, SHOW_ALL)) {
			error("couldn't get steps for job %u",
			      selected_step->step_id.job_id);
		} else if (!step_info->job_step_count) {
			if (step_id.step_id == NO_VAL)
				error("No steps running for job %u",
				      selected_step->step_id.job_id);
			else
				error("%ps not found running.",
				      &selected_step->step_id);
		} else {
			for (int i = 0; i < step_info->job_step_count; i++) {
				/*
				 * If no stepid was requested set it to the
				 * first one
				 */
				if (step_id.step_id == NO_VAL) {
					/*
					 * If asking for no particular step
					 * skip the special steps.
					 */
					if (!params.opt_all_steps &&
					    (step_info->job_steps[i].step_id.
					     step_id >
					     SLURM_MAX_NORMAL_STEP_ID))
						continue;
					step_id.step_id =
						step_info->job_steps[i].
						step_id.step_id;
				}

				if (!params.opt_all_steps &&
				    !verify_step_id(
					    &step_info->job_steps[i].step_id,
					    &step_id))
					continue;

				_do_stat(&step_info->job_steps[i].step_id,
					 step_info->job_steps[i].nodes,
					 step_info->job_steps[i].cpu_freq_min,
					 step_info->job_steps[i].cpu_freq_max,
					 step_info->job_steps[i].cpu_freq_gov,
					 step_info->job_steps[i].
					 start_protocol_ver,
					 step_info->job_steps[i].
					 tres_fmt_alloc_str);
			}
		}

		/*
		 * The reply is allocated by slurm_get_job_steps() for every
		 * requested job; release it before moving on (it is still
		 * NULL when the request failed).
		 */
		slurm_free_job_step_info_response_msg(step_info);
	}
	list_iterator_destroy(itr);

	xfree(params.opt_field_list);
	FREE_NULL_LIST(params.opt_job_list);
	if (print_fields_itr)
		list_iterator_destroy(print_fields_itr);
	FREE_NULL_LIST(print_fields_list);

	return 0;
}