blob: 997d02505710a629cb1428c584ea6b3b0c393390 [file] [log] [blame] [edit]
/*****************************************************************************\
* jobacct_common.c - common functions for almost all jobacct plugins.
*****************************************************************************
*
* Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
* Written by Danny Auble, <da@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of SLURM, a resource management program.
* For details, see <https://computing.llnl.gov/linux/slurm/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* This file is patterned after jobcomp_linux.c, written by Morris Jette and
* Copyright (C) 2002 The Regents of the University of California.
\*****************************************************************************/
#include "jobacct_common.h"
/*
** Define slurm-specific aliases for use by plugins, see slurm_xlator.h
** for details.
*/
strong_alias(jobacct_common_pack, slurm_jobacct_common_pack);
strong_alias(jobacct_common_unpack, slurm_jobacct_common_unpack);
strong_alias(jobacct_common_alloc_jobacct, slurm_jobacct_common_alloc_jobacct);
strong_alias(jobacct_common_free_jobacct, slurm_jobacct_common_free_jobacct);
/*
 * Mutex taken by the functions in this file around reads/writes of
 * jobacctinfo records and around task list manipulation.
 */
pthread_mutex_t jobacct_lock = PTHREAD_MUTEX_INITIALIZER;
/* Job and step ids recorded by jobacct_common_set_mem_limit(). */
uint32_t jobacct_job_id = 0;
uint32_t jobacct_step_id = 0;
/* Memory limit in KB (set_mem_limit converts its MB argument); 0 until set. */
uint32_t jobacct_mem_limit = 0;
/*
 * Virtual memory limit: jobacct_mem_limit scaled by
 * slurm_get_vsize_factor() / 100 in jobacct_common_set_mem_limit().
 */
uint32_t jobacct_vmem_limit = 0;
/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * a scaling factor consumed by the individual jobacct plugins -- confirm.
 */
uint32_t mult = 1000;
/*
 * Serialize one jobacct_id_t into buffer: the node id as a 32-bit value
 * followed by the task id as a 16-bit value.
 *
 * jobacct_id  - id to pack (must not be NULL)
 * rpc_version - accepted for interface symmetry; not consulted here
 * buffer      - destination pack buffer
 */
static void _pack_jobacct_id(jobacct_id_t *jobacct_id,
			     uint16_t rpc_version, Buf buffer)
{
	uint32_t node = (uint32_t)jobacct_id->nodeid;
	uint16_t task = (uint16_t)jobacct_id->taskid;

	pack32(node, buffer);
	pack16(task, buffer);
}
/*
 * Read one jobacct_id_t back from buffer, in the order written by
 * _pack_jobacct_id(): 32-bit node id, then 16-bit task id.
 *
 * jobacct_id  - destination (must not be NULL)
 * rpc_version - accepted for interface symmetry; not consulted here
 * buffer      - source pack buffer
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR if either safe_unpack fails.
 */
static int _unpack_jobacct_id(jobacct_id_t *jobacct_id,
			      uint16_t rpc_version, Buf buffer)
{
	int status = SLURM_ERROR;

	/* the safe_unpack macros jump to unpack_error on failure */
	safe_unpack32(&jobacct_id->nodeid, buffer);
	safe_unpack16(&jobacct_id->taskid, buffer);
	status = SLURM_SUCCESS;

unpack_error:
	return status;
}
/*
 * Reset a jobacctinfo record to its initial state.
 *
 * jobacct    - record to initialize (must not be NULL); fully overwritten
 * jobacct_id - node/task id stored as the owner of every max/min value;
 *              if NULL, an id with both fields set to NO_VAL is used
 *
 * All counters start at zero except min_cpu, which starts at NO_VAL.
 * Always returns SLURM_SUCCESS.
 */
extern int jobacct_common_init_struct(struct jobacctinfo *jobacct,
				      jobacct_id_t *jobacct_id)
{
	/*
	 * Fallback id.  It lives at function scope because jobacct_id may
	 * point at it for the rest of the function; a variable local to the
	 * if-block would already be out of scope when it is copied below.
	 */
	jobacct_id_t temp_id;

	if (!jobacct_id) {
		/* zero first so padding bytes copied below are defined */
		memset(&temp_id, 0, sizeof(jobacct_id_t));
		temp_id.taskid = (uint16_t)NO_VAL;
		temp_id.nodeid = (uint32_t)NO_VAL;
		jobacct_id = &temp_id;
	}

	/* clears every cpu time, max_* and tot_* counter in one go */
	memset(jobacct, 0, sizeof(struct jobacctinfo));

	/* the only counter whose "unset" value is not zero */
	jobacct->min_cpu = (uint32_t)NO_VAL;

	memcpy(&jobacct->max_vsize_id, jobacct_id, sizeof(jobacct_id_t));
	memcpy(&jobacct->max_rss_id, jobacct_id, sizeof(jobacct_id_t));
	memcpy(&jobacct->max_pages_id, jobacct_id, sizeof(jobacct_id_t));
	memcpy(&jobacct->min_cpu_id, jobacct_id, sizeof(jobacct_id_t));

	return SLURM_SUCCESS;
}
/*
 * Allocate a jobacctinfo record with xmalloc() and initialize it through
 * jobacct_common_init_struct().
 *
 * jobacct_id - id handed to the initializer (may be NULL)
 *
 * Returns the new record; release it with jobacct_common_free_jobacct().
 */
extern struct jobacctinfo *jobacct_common_alloc_jobacct(
	jobacct_id_t *jobacct_id)
{
	struct jobacctinfo *new_acct;

	new_acct = xmalloc(sizeof(struct jobacctinfo));
	jobacct_common_init_struct(new_acct, jobacct_id);

	return new_acct;
}
/*
 * Release a jobacctinfo record.  Takes a void pointer so it can be used
 * as a generic destructor callback.
 *
 * object - record to free, passed straight to xfree()
 */
extern void jobacct_common_free_jobacct(void *object)
{
	struct jobacctinfo *doomed = object;

	xfree(doomed);
}
extern int jobacct_common_setinfo(struct jobacctinfo *jobacct,
enum jobacct_data_type type, void *data)
{
int rc = SLURM_SUCCESS;
int *fd = (int *)data;
struct rusage *rusage = (struct rusage *)data;
uint32_t *uint32 = (uint32_t *) data;
jobacct_id_t *jobacct_id = (jobacct_id_t *) data;
struct jobacctinfo *send = (struct jobacctinfo *) data;
slurm_mutex_lock(&jobacct_lock);
switch (type) {
case JOBACCT_DATA_TOTAL:
memcpy(jobacct, send, sizeof(struct jobacctinfo));
break;
case JOBACCT_DATA_PIPE:
safe_write(*fd, jobacct, sizeof(struct jobacctinfo));
break;
case JOBACCT_DATA_RUSAGE:
jobacct->user_cpu_sec = rusage->ru_utime.tv_sec;
jobacct->user_cpu_usec = rusage->ru_utime.tv_usec;
jobacct->sys_cpu_sec = rusage->ru_stime.tv_sec;
jobacct->sys_cpu_usec = rusage->ru_stime.tv_usec;
break;
case JOBACCT_DATA_MAX_RSS:
jobacct->max_rss = *uint32;
break;
case JOBACCT_DATA_MAX_RSS_ID:
jobacct->max_rss_id = *jobacct_id;
break;
case JOBACCT_DATA_TOT_RSS:
jobacct->tot_rss = *uint32;
break;
case JOBACCT_DATA_MAX_VSIZE:
jobacct->max_vsize = *uint32;
break;
case JOBACCT_DATA_MAX_VSIZE_ID:
jobacct->max_vsize_id = *jobacct_id;
break;
case JOBACCT_DATA_TOT_VSIZE:
jobacct->tot_vsize = *uint32;
break;
case JOBACCT_DATA_MAX_PAGES:
jobacct->max_pages = *uint32;
break;
case JOBACCT_DATA_MAX_PAGES_ID:
jobacct->max_pages_id = *jobacct_id;
break;
case JOBACCT_DATA_TOT_PAGES:
jobacct->tot_pages = *uint32;
break;
case JOBACCT_DATA_MIN_CPU:
jobacct->min_cpu = *uint32;
break;
case JOBACCT_DATA_MIN_CPU_ID:
jobacct->min_cpu_id = *jobacct_id;
break;
case JOBACCT_DATA_TOT_CPU:
jobacct->tot_cpu = *uint32;
break;
default:
debug("jobacct_g_set_setinfo data_type %d invalid", type);
}
slurm_mutex_unlock(&jobacct_lock);
return rc;
rwfail:
slurm_mutex_unlock(&jobacct_lock);
return SLURM_ERROR;
}
/*
 * Fetch one piece of accounting data from a jobacctinfo record, under
 * jobacct_lock.
 *
 * jobacct - record to read (for JOBACCT_DATA_PIPE: record that is filled)
 * type    - selects which field is fetched and how data is interpreted
 * data    - output location, interpreted according to type:
 *           JOBACCT_DATA_TOTAL  struct jobacctinfo * receiving a full copy
 *           JOBACCT_DATA_PIPE   int * file descriptor; a whole record is
 *                               read from it INTO jobacct
 *           JOBACCT_DATA_RUSAGE struct rusage *, zeroed and then given the
 *                               user/system cpu times
 *           *_ID types          jobacct_id_t *
 *           all other types     uint32_t *
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when the pipe read fails.  An
 * unknown type is only logged and still yields SLURM_SUCCESS.
 */
extern int jobacct_common_getinfo(struct jobacctinfo *jobacct,
				  enum jobacct_data_type type, void *data)
{
	int rc = SLURM_SUCCESS;
	/* typed views of the caller's opaque pointer */
	int *fd = (int *)data;
	uint32_t *uint32 = (uint32_t *) data;
	jobacct_id_t *jobacct_id = (jobacct_id_t *) data;
	struct rusage *rusage = (struct rusage *)data;
	struct jobacctinfo *send = (struct jobacctinfo *) data;

	slurm_mutex_lock(&jobacct_lock);
	switch (type) {
	case JOBACCT_DATA_TOTAL:
		memcpy(send, jobacct, sizeof(struct jobacctinfo));
		break;
	case JOBACCT_DATA_PIPE:
		/* safe_read jumps to rwfail on error */
		safe_read(*fd, jobacct, sizeof(struct jobacctinfo));
		break;
	case JOBACCT_DATA_RUSAGE:
		memset(rusage, 0, sizeof(struct rusage));
		rusage->ru_utime.tv_sec = jobacct->user_cpu_sec;
		rusage->ru_utime.tv_usec = jobacct->user_cpu_usec;
		rusage->ru_stime.tv_sec = jobacct->sys_cpu_sec;
		rusage->ru_stime.tv_usec = jobacct->sys_cpu_usec;
		break;
	case JOBACCT_DATA_MAX_RSS:
		*uint32 = jobacct->max_rss;
		break;
	case JOBACCT_DATA_MAX_RSS_ID:
		*jobacct_id = jobacct->max_rss_id;
		break;
	case JOBACCT_DATA_TOT_RSS:
		*uint32 = jobacct->tot_rss;
		break;
	case JOBACCT_DATA_MAX_VSIZE:
		*uint32 = jobacct->max_vsize;
		break;
	case JOBACCT_DATA_MAX_VSIZE_ID:
		*jobacct_id = jobacct->max_vsize_id;
		break;
	case JOBACCT_DATA_TOT_VSIZE:
		*uint32 = jobacct->tot_vsize;
		break;
	case JOBACCT_DATA_MAX_PAGES:
		*uint32 = jobacct->max_pages;
		break;
	case JOBACCT_DATA_MAX_PAGES_ID:
		*jobacct_id = jobacct->max_pages_id;
		break;
	case JOBACCT_DATA_TOT_PAGES:
		*uint32 = jobacct->tot_pages;
		break;
	case JOBACCT_DATA_MIN_CPU:
		*uint32 = jobacct->min_cpu;
		break;
	case JOBACCT_DATA_MIN_CPU_ID:
		*jobacct_id = jobacct->min_cpu_id;
		break;
	case JOBACCT_DATA_TOT_CPU:
		*uint32 = jobacct->tot_cpu;
		break;
	default:
		/* name this function so the log points at the right call */
		debug("jobacct_common_getinfo data_type %d invalid", type);
	}
	slurm_mutex_unlock(&jobacct_lock);
	return rc;

rwfail:
	slurm_mutex_unlock(&jobacct_lock);
	return SLURM_ERROR;
}
/*
 * Fold the accounting data of one record into another, under jobacct_lock.
 *
 * dest - accumulator; receives larger maxima, smaller minimum and sums
 * from - record being merged in.  Note that from->min_cpu is rewritten to
 *        0 when it was NO_VAL and ends up being selected as the minimum.
 *
 * For every max/min value, the node/task id travels with the value.  If
 * dest still holds a NO_VAL task id for a metric afterwards, the id from
 * "from" is taken.  Microsecond totals are carried over into seconds.
 */
extern void jobacct_common_aggregate(struct jobacctinfo *dest,
				     struct jobacctinfo *from)
{
	xassert(dest);
	xassert(from);

	slurm_mutex_lock(&jobacct_lock);

	/* virtual memory size */
	if (from->max_vsize > dest->max_vsize) {
		dest->max_vsize = from->max_vsize;
		dest->max_vsize_id = from->max_vsize_id;
	}
	dest->tot_vsize += from->tot_vsize;
	if (dest->max_vsize_id.taskid == (uint16_t)NO_VAL)
		dest->max_vsize_id = from->max_vsize_id;

	/* resident set size */
	if (from->max_rss > dest->max_rss) {
		dest->max_rss = from->max_rss;
		dest->max_rss_id = from->max_rss_id;
	}
	dest->tot_rss += from->tot_rss;
	if (dest->max_rss_id.taskid == (uint16_t)NO_VAL)
		dest->max_rss_id = from->max_rss_id;

	/* pages */
	if (from->max_pages > dest->max_pages) {
		dest->max_pages = from->max_pages;
		dest->max_pages_id = from->max_pages_id;
	}
	dest->tot_pages += from->tot_pages;
	if (dest->max_pages_id.taskid == (uint16_t)NO_VAL)
		dest->max_pages_id = from->max_pages_id;

	/* cpu: keep the minimum; an unset dest always takes from's value */
	if ((dest->min_cpu == (uint32_t)NO_VAL)
	    || (from->min_cpu < dest->min_cpu)) {
		if (from->min_cpu == (uint32_t)NO_VAL)
			from->min_cpu = 0;
		dest->min_cpu = from->min_cpu;
		dest->min_cpu_id = from->min_cpu_id;
	}
	dest->tot_cpu += from->tot_cpu;
	if (dest->min_cpu_id.taskid == (uint16_t)NO_VAL)
		dest->min_cpu_id = from->min_cpu_id;

	/* user cpu time, carrying whole seconds out of the usec part */
	dest->user_cpu_sec += from->user_cpu_sec;
	dest->user_cpu_usec += from->user_cpu_usec;
	while (dest->user_cpu_usec >= 1E6) {
		dest->user_cpu_usec -= 1E6;
		dest->user_cpu_sec++;
	}

	/* system cpu time, same carry rule */
	dest->sys_cpu_sec += from->sys_cpu_sec;
	dest->sys_cpu_usec += from->sys_cpu_usec;
	while (dest->sys_cpu_usec >= 1E6) {
		dest->sys_cpu_usec -= 1E6;
		dest->sys_cpu_sec++;
	}

	slurm_mutex_unlock(&jobacct_lock);
}
/*
 * Copy the contents of a jobacctinfo record into a slurmdb_stats_t, under
 * jobacct_lock.
 *
 * stats   - destination (must not be NULL)
 * jobacct - source record (must not be NULL)
 *
 * The *_ave fields receive the raw tot_* sums converted to double; no
 * division is performed here.
 */
extern void jobacct_common_2_stats(slurmdb_stats_t *stats,
				   struct jobacctinfo *jobacct)
{
	xassert(jobacct);
	xassert(stats);

	slurm_mutex_lock(&jobacct_lock);

	/* extreme values */
	stats->vsize_max = jobacct->max_vsize;
	stats->rss_max = jobacct->max_rss;
	stats->pages_max = jobacct->max_pages;
	stats->cpu_min = jobacct->min_cpu;

	/* node on which each extreme was observed */
	stats->vsize_max_nodeid = jobacct->max_vsize_id.nodeid;
	stats->rss_max_nodeid = jobacct->max_rss_id.nodeid;
	stats->pages_max_nodeid = jobacct->max_pages_id.nodeid;
	stats->cpu_min_nodeid = jobacct->min_cpu_id.nodeid;

	/* task for which each extreme was observed */
	stats->vsize_max_taskid = jobacct->max_vsize_id.taskid;
	stats->rss_max_taskid = jobacct->max_rss_id.taskid;
	stats->pages_max_taskid = jobacct->max_pages_id.taskid;
	stats->cpu_min_taskid = jobacct->min_cpu_id.taskid;

	/* totals, stored in the "ave" slots as doubles */
	stats->vsize_ave = (double)jobacct->tot_vsize;
	stats->rss_ave = (double)jobacct->tot_rss;
	stats->pages_ave = (double)jobacct->tot_pages;
	stats->cpu_ave = (double)jobacct->tot_cpu;

	slurm_mutex_unlock(&jobacct_lock);
}
/*
 * Serialize a jobacctinfo record into buffer.
 *
 * jobacct     - record to pack; if NULL, zero placeholders are packed
 *               instead (sixteen 32-bit and four 16-bit values, the same
 *               number of each width that a real record produces)
 * rpc_version - forwarded to _pack_jobacct_id()
 * buffer      - destination pack buffer
 *
 * A real record is written as twelve 32-bit counters followed by the four
 * node/task ids; the record is read under jobacct_lock.
 */
extern void jobacct_common_pack(struct jobacctinfo *jobacct,
				uint16_t rpc_version, Buf buffer)
{
	jobacct_id_t *ids[4];
	int left;

	if (jobacct == NULL) {
		for (left = 16; left > 0; left--)
			pack32((uint32_t) 0, buffer);
		for (left = 4; left > 0; left--)
			pack16((uint16_t) 0, buffer);
		return;
	}

	/* ids in wire order */
	ids[0] = &jobacct->max_vsize_id;
	ids[1] = &jobacct->max_rss_id;
	ids[2] = &jobacct->max_pages_id;
	ids[3] = &jobacct->min_cpu_id;

	slurm_mutex_lock(&jobacct_lock);

	/* cpu times */
	pack32((uint32_t)jobacct->user_cpu_sec, buffer);
	pack32((uint32_t)jobacct->user_cpu_usec, buffer);
	pack32((uint32_t)jobacct->sys_cpu_sec, buffer);
	pack32((uint32_t)jobacct->sys_cpu_usec, buffer);

	/* max/total pairs, then min/total cpu */
	pack32((uint32_t)jobacct->max_vsize, buffer);
	pack32((uint32_t)jobacct->tot_vsize, buffer);
	pack32((uint32_t)jobacct->max_rss, buffer);
	pack32((uint32_t)jobacct->tot_rss, buffer);
	pack32((uint32_t)jobacct->max_pages, buffer);
	pack32((uint32_t)jobacct->tot_pages, buffer);
	pack32((uint32_t)jobacct->min_cpu, buffer);
	pack32((uint32_t)jobacct->tot_cpu, buffer);

	for (left = 0; left < 4; left++)
		_pack_jobacct_id(ids[left], rpc_version, buffer);

	slurm_mutex_unlock(&jobacct_lock);
}
/*
 * Deserialize a jobacctinfo record from buffer, reading the fields in the
 * order jobacct_common_pack() writes them.
 *
 * jobacct     - out: receives a record allocated here with xmalloc(); the
 *               caller must xfree it.  On failure the record is released
 *               with xfree(*jobacct) before returning.
 * rpc_version - forwarded to _unpack_jobacct_id()
 * buffer      - source pack buffer
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR if any field cannot be unpacked.
 */
extern int jobacct_common_unpack(struct jobacctinfo **jobacct,
				 uint16_t rpc_version, Buf buffer)
{
	struct jobacctinfo *acct;
	jobacct_id_t *ids[4];
	uint32_t word;
	int i;

	acct = xmalloc(sizeof(struct jobacctinfo));
	*jobacct = acct;

	/* cpu times go through a temporary before being stored */
	safe_unpack32(&word, buffer);
	acct->user_cpu_sec = word;
	safe_unpack32(&word, buffer);
	acct->user_cpu_usec = word;
	safe_unpack32(&word, buffer);
	acct->sys_cpu_sec = word;
	safe_unpack32(&word, buffer);
	acct->sys_cpu_usec = word;

	/* remaining counters are unpacked in place */
	safe_unpack32(&acct->max_vsize, buffer);
	safe_unpack32(&acct->tot_vsize, buffer);
	safe_unpack32(&acct->max_rss, buffer);
	safe_unpack32(&acct->tot_rss, buffer);
	safe_unpack32(&acct->max_pages, buffer);
	safe_unpack32(&acct->tot_pages, buffer);
	safe_unpack32(&acct->min_cpu, buffer);
	safe_unpack32(&acct->tot_cpu, buffer);

	/* node/task ids, in wire order */
	ids[0] = &acct->max_vsize_id;
	ids[1] = &acct->max_rss_id;
	ids[2] = &acct->max_pages_id;
	ids[3] = &acct->min_cpu_id;
	for (i = 0; i < 4; i++) {
		if (_unpack_jobacct_id(ids[i], rpc_version, buffer)
		    != SLURM_SUCCESS)
			goto unpack_error;
	}

	return SLURM_SUCCESS;

unpack_error:
	xfree(*jobacct);
	return SLURM_ERROR;
}
/*
 * Record the job/step being accounted and its memory limits in the
 * file-scope variables jobacct_job_id, jobacct_step_id, jobacct_mem_limit
 * and jobacct_vmem_limit.
 *
 * job_id    - job id; 0 is rejected
 * step_id   - step id
 * mem_limit - memory limit in MB; 0 is rejected
 *
 * jobacct_mem_limit is stored in KB.  jobacct_vmem_limit is that value
 * scaled by slurm_get_vsize_factor() / 100.  Both values saturate at the
 * largest uint32_t instead of wrapping around when the result does not fit.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR (after logging) on a zero job_id
 * or mem_limit, in which case nothing is modified.
 */
extern int jobacct_common_set_mem_limit(uint32_t job_id, uint32_t step_id,
					uint32_t mem_limit)
{
	const uint32_t limit_max = (uint32_t)-1;	/* all bits set */
	double vmem_kb;

	if ((job_id == 0) || (mem_limit == 0)) {
		error("jobacct_common_set_mem_limit: jobid:%u mem_limit:%u",
		      job_id, mem_limit);
		return SLURM_ERROR;
	}

	jobacct_job_id = job_id;
	jobacct_step_id = step_id;

	/* MB to KB; a wrapped product would yield a bogus, tiny limit */
	if (mem_limit > (limit_max / 1024))
		jobacct_mem_limit = limit_max;
	else
		jobacct_mem_limit = mem_limit * 1024;

	/*
	 * Scale in floating point, then range-check before converting back:
	 * an out-of-range double to uint32_t conversion is undefined.
	 */
	vmem_kb = jobacct_mem_limit * (slurm_get_vsize_factor() / 100.0);
	if (vmem_kb >= (double)limit_max)
		jobacct_vmem_limit = limit_max;
	else
		jobacct_vmem_limit = (uint32_t)vmem_kb;

	return SLURM_SUCCESS;
}
/*
 * Create an accounting record for a new task and push it onto task_list.
 *
 * pid        - process id of the task; must be > 0
 * jobacct_id - node/task id for the record; may be NULL, in which case
 *              jobacct_common_alloc_jobacct() falls back to NO_VAL ids
 * task_list  - list receiving the record; must not be NULL
 *
 * The record is allocated before the arguments are validated and is freed
 * again on the error path.  On success the list holds the record.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR (after logging) for an invalid
 * pid or a missing list.
 */
extern int jobacct_common_add_task(pid_t pid, jobacct_id_t *jobacct_id,
				   List task_list)
{
	struct jobacctinfo *jobacct = jobacct_common_alloc_jobacct(jobacct_id);

	slurm_mutex_lock(&jobacct_lock);
	if (pid <= 0) {
		error("invalid pid given (%d) for task acct", pid);
		goto error;
	} else if (!task_list) {
		error("no task list created!");
		goto error;
	}

	jobacct->pid = pid;
	jobacct->min_cpu = 0;

	/*
	 * Log from the id stored in the record rather than from jobacct_id:
	 * the initializer copied it there (or NO_VAL defaults), so this is
	 * safe even when the caller passed a NULL jobacct_id.
	 */
	debug2("adding task %u pid %d on node %u to jobacct",
	       jobacct->max_vsize_id.taskid, pid,
	       jobacct->max_vsize_id.nodeid);
	list_push(task_list, jobacct);
	slurm_mutex_unlock(&jobacct_lock);

	return SLURM_SUCCESS;

error:
	slurm_mutex_unlock(&jobacct_lock);
	jobacct_common_free_jobacct(jobacct);
	return SLURM_ERROR;
}
/*
 * Look up the accounting record of a task and return a snapshot of it.
 *
 * pid       - process id to search for
 * task_list - list of struct jobacctinfo records; NULL is logged as an
 *             error
 *
 * Returns an xmalloc'ed copy of the first record whose pid matches (the
 * list entry itself is left in place), or NULL when the list is missing
 * or no record matches.  The search and copy happen under jobacct_lock.
 */
extern struct jobacctinfo *jobacct_common_stat_task(pid_t pid, List task_list)
{
	struct jobacctinfo *match = NULL;
	struct jobacctinfo *snapshot = NULL;
	ListIterator iter = NULL;

	slurm_mutex_lock(&jobacct_lock);

	if (task_list) {
		iter = list_iterator_create(task_list);
		while ((match = list_next(iter)) != NULL) {
			if (match->pid == pid)
				break;
		}
		list_iterator_destroy(iter);

		if (match != NULL) {
			snapshot = xmalloc(sizeof(struct jobacctinfo));
			memcpy(snapshot, match, sizeof(struct jobacctinfo));
		}
	} else {
		error("no task list created!");
	}

	slurm_mutex_unlock(&jobacct_lock);
	return snapshot;
}
/*
 * Detach the accounting record of a task from task_list.
 *
 * pid       - process id to search for
 * task_list - list of struct jobacctinfo records; NULL is logged as an
 *             error
 *
 * Returns the record that was removed from the list (it is not freed
 * here), or NULL when the list is missing or no record matches.  The
 * search and removal happen under jobacct_lock.
 */
extern struct jobacctinfo *jobacct_common_remove_task(pid_t pid, List task_list)
{
	struct jobacctinfo *removed = NULL;
	ListIterator iter = NULL;

	slurm_mutex_lock(&jobacct_lock);

	if (!task_list) {
		error("no task list created!");
	} else {
		iter = list_iterator_create(task_list);
		while ((removed = list_next(iter)) != NULL) {
			if (removed->pid != pid)
				continue;
			list_remove(iter);
			break;
		}
		list_iterator_destroy(iter);

		if (removed == NULL) {
			debug2("pid(%d) not being watched in jobacct!", pid);
		} else {
			debug2("removing task %u pid %d from jobacct",
			       removed->max_vsize_id.taskid, removed->pid);
		}
	}

	slurm_mutex_unlock(&jobacct_lock);
	return removed;
}