blob: 5c4b298122fd1b06c55497b957452b955ef08a16 [file] [log] [blame]
/*****************************************************************************\
* sacct_stat.c - stat slurmd for precise job information
*
* $Id: options.c 7541 2006-03-18 01:44:58Z da $
*****************************************************************************
* Copyright (C) 2006 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Danny Auble <da@llnl.gov>.
* UCRL-CODE-226842.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "sacct.h"
#include <pthread.h>
#include "src/common/slurm_jobacct.h"
#include "src/common/forward.h"
#include "src/common/slurm_auth.h"
/* Record for the step being queried: filled in by _sacct_query() and handed
 * to print_fields() by _process_results(). */
step_rec_t step;
/* NOTE(review): not referenced in the visible part of this file; presumably a
 * leftover from a threaded implementation -- confirm before relying on it. */
int thr_finished = 0;
/* NOTE(review): no definition of _stat_thread is visible in this file --
 * confirm it is still needed. */
void *_stat_thread(void *args);
/* Send a stat request to the nodes of step_layout and aggregate the replies
 * into the file-scope `step` record. */
int _sacct_query(slurm_step_layout_t *step_layout, uint32_t job_id,
uint32_t step_id);
/* Print the file-scope `step` record. */
int _process_results();
/*
 * _sacct_query - send a MESSAGE_STAT_JOBACCT request to the nodes listed in
 *	step_layout->node_list and fold every reply into the file-scope
 *	`step` record.
 * IN step_layout - layout of the step; node_list and node_cnt are read
 * IN job_id      - job to stat
 * IN step_id     - step of that job to stat
 * RET SLURM_SUCCESS when a reply list was received and no reply carried a
 *	non-success return code; SLURM_ERROR when no reply list came back;
 *	otherwise the most recent non-success return code taken from a reply.
 *
 * NOTE: `step.nodes` aliases step_layout->node_list (no copy is made), so the
 * layout must outlive any use of `step`.
 */
int _sacct_query(slurm_step_layout_t *step_layout, uint32_t job_id,
		 uint32_t step_id)
{
	slurm_msg_t msg;
	stat_jobacct_msg_t r;
	stat_jobacct_msg_t *jobacct_msg = NULL;
	ListIterator itr;
	List ret_list = NULL;
	sacct_t temp_sacct;
	ret_data_info_t *ret_data_info = NULL;
	int rc = SLURM_SUCCESS;
	int node_rc = SLURM_SUCCESS;
	int ntasks = 0;

	/* job_id is unsigned, so print it with %u */
	debug("getting the stat of job %u on %d nodes",
	      job_id, step_layout->node_cnt);

	/* Start both accumulators from a known state.  min_cpu starts at
	 * NO_VAL, presumably so aggregation can take a minimum -- confirm
	 * against aggregate_sacct(). */
	memset(&temp_sacct, 0, sizeof(sacct_t));
	temp_sacct.min_cpu = (float)NO_VAL;
	memset(&step.sacct, 0, sizeof(sacct_t));
	step.sacct.min_cpu = (float)NO_VAL;

	step.header.jobnum = job_id;
	step.header.partition = NULL;
	step.header.blockid = NULL;
	step.stepnum = step_id;
	step.nodes = step_layout->node_list;
	step.stepname = NULL;
	step.status = JOB_RUNNING;
	step.ntasks = 0;

	slurm_msg_t_init(&msg);

	/* Common message contents.  Zero the request first so that fields
	 * not set below are not sent out holding stack garbage. */
	memset(&r, 0, sizeof(r));
	r.job_id = job_id;
	r.step_id = step_id;
	r.jobacct = jobacct_g_alloc(NULL);
	msg.msg_type = MESSAGE_STAT_JOBACCT;
	msg.data = &r;

	ret_list = slurm_send_recv_msgs(step_layout->node_list, &msg, 0);
	if (!ret_list) {
		error("got an error no list returned");
		/* report the failure instead of silently succeeding */
		rc = SLURM_ERROR;
		goto cleanup;
	}

	itr = list_iterator_create(ret_list);
	while ((ret_data_info = list_next(itr))) {
		switch (ret_data_info->type) {
		case MESSAGE_STAT_JOBACCT:
			jobacct_msg = (stat_jobacct_msg_t *)
				ret_data_info->data;
			if (jobacct_msg) {
				debug2("got it back for job %d",
				       jobacct_msg->job_id);
				jobacct_g_2_sacct(
					&temp_sacct,
					jobacct_msg->jobacct);
				ntasks += jobacct_msg->num_tasks;
				aggregate_sacct(&step.sacct, &temp_sacct);
			}
			break;
		case RESPONSE_SLURM_RC:
			node_rc = slurm_get_return_code(ret_data_info->type,
							ret_data_info->data);
			error("there was an error with the request rc = %s",
			      slurm_strerror(node_rc));
			/* keep a failure; a later clean reply must not
			 * overwrite it */
			if (node_rc != SLURM_SUCCESS)
				rc = node_rc;
			break;
		default:
			node_rc = slurm_get_return_code(ret_data_info->type,
							ret_data_info->data);
			error("unknown return given %d rc = %s",
			      ret_data_info->type, slurm_strerror(node_rc));
			if (node_rc != SLURM_SUCCESS)
				rc = node_rc;
			break;
		}
	}
	list_iterator_destroy(itr);
	list_destroy(ret_list);

	step.ntasks += ntasks;

cleanup:
	/* Only scale and average when at least one task reported; this also
	 * guards the divisions below against a zero task count. */
	if (step.ntasks) {
		/* NOTE(review): the factor 1024 presumably converts KB to
		 * bytes and the divisor 100 presumably converts ticks to
		 * seconds -- confirm against the jobacct plugin. */
		step.sacct.ave_rss *= 1024;
		step.sacct.max_rss *= 1024;
		step.sacct.ave_vsize *= 1024;
		step.sacct.max_vsize *= 1024;

		step.sacct.ave_cpu /= step.ntasks;
		step.sacct.ave_cpu /= 100;
		step.sacct.min_cpu /= 100;
		step.sacct.ave_rss /= step.ntasks;
		step.sacct.ave_vsize /= step.ntasks;
		step.sacct.ave_pages /= step.ntasks;
	}
	jobacct_g_free(r.jobacct);
	return rc;
}
int _process_results()
{
print_fields(JOBSTEP, &step);
return SLURM_SUCCESS;
}
/*
 * sacct_stat - request the layout of a job step from the controller, query
 *	the nodes in that layout for accounting data and print the result.
 * IN jobid  - job to stat
 * IN stepid - step of that job to stat
 * RET SLURM_SUCCESS on success.  SLURM_ERROR when the controller could not
 *	be reached or answered without a step layout.  Otherwise the value
 *	produced by the slurm_seterrno_ret() error paths or returned by
 *	_sacct_query().
 */
int sacct_stat(uint32_t jobid, uint32_t stepid)
{
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	job_step_id_msg_t req;
	slurm_step_layout_t *step_layout = NULL;
	int rc = SLURM_SUCCESS;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	debug("requesting info for job %u.%u", jobid, stepid);
	req.job_id = jobid;
	req.step_id = stepid;
	req_msg.msg_type = REQUEST_STEP_LAYOUT;
	req_msg.data = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) {
		return SLURM_ERROR;
	}

	switch (resp_msg.msg_type) {
	case RESPONSE_STEP_LAYOUT:
		step_layout = (slurm_step_layout_t *)resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		printf("problem getting job: %s\n", slurm_strerror(rc));
		/* NOTE(review): slurm_seterrno_ret() presumably returns from
		 * this function (per its name); the break is kept in case it
		 * does not -- confirm against its definition. */
		slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	if (!step_layout) {
		error("didn't get the job record rc = %s", slurm_strerror(rc));
		/* Never report success when there is no layout to stat. */
		if (rc == SLURM_SUCCESS)
			rc = SLURM_ERROR;
		return rc;
	}

	/* Propagate the query status instead of discarding it. */
	rc = _sacct_query(step_layout, jobid, stepid);

	slurm_step_layout_destroy(step_layout);

	/* Print whatever was gathered, even if the query reported an error. */
	_process_results();

	return rc;
}