| /*****************************************************************************\ |
| * options.c - option functions for sacct |
| * |
| * $Id: options.c 7541 2006-03-18 01:44:58Z da $ |
| ***************************************************************************** |
| * Copyright (C) 2006 The Regents of the University of California. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Danny Auble <da@llnl.gov>. |
| * UCRL-CODE-226842. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <http://www.llnl.gov/linux/slurm/>. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "src/common/read_config.h" |
| #include "sacct.h" |
| #include <time.h> |
| |
| typedef struct expired_rec { /* table of expired jobs */ |
| uint32_t job; |
| time_t job_submit; |
| char *line; |
| } expired_rec_t; |
| |
| void _destroy_parts(void *object); |
| void _destroy_steps(void *object); |
| void _destroy_exp(void *object); |
| char *_convert_type(int rec_type); |
| int _cmp_jrec(const void *a1, const void *a2); |
| void _dump_header(acct_header_t header); |
| FILE *_open_log_file(void); |
| void _help_fields_msg(void); |
| void _help_msg(void); |
| void _usage(void); |
| void _init_params(); |
| char *_prefix_filename(char *path, char *prefix); |
| |
| int selected_status[STATUS_COUNT]; |
| List selected_parts = NULL; |
| List selected_steps = NULL; |
| |
| void _destroy_parts(void *object) |
| { |
| char *part = (char *)object; |
| xfree(part); |
| } |
| |
| void _destroy_steps(void *object) |
| { |
| selected_step_t *step = (selected_step_t *)object; |
| if(step) { |
| xfree(step->job); |
| xfree(step->step); |
| xfree(step); |
| } |
| } |
| void _destroy_exp(void *object) |
| { |
| expired_rec_t *exp_rec = (expired_rec_t *)object; |
| if(exp_rec) { |
| xfree(exp_rec->line); |
| xfree(exp_rec); |
| } |
| } |
| |
| char *_convert_type(int rec_type) |
| { |
| switch(rec_type) { |
| case JOB_START: |
| return "JOB_START"; |
| case JOB_STEP: |
| return "JOB_STEP"; |
| case JOB_TERMINATED: |
| return "JOB_TERMINATED"; |
| default: |
| return "UNKNOWN"; |
| } |
| } |
| |
| void _show_rec(char *f[]) |
| { |
| int i; |
| fprintf(stderr, "rec>"); |
| for (i=0; f[i]; i++) |
| fprintf(stderr, " %s", f[i]); |
| fprintf(stderr, "\n"); |
| return; |
| } |
| |
| int _cmp_jrec(const void *a1, const void *a2) { |
| expired_rec_t *j1 = (expired_rec_t *) a1; |
| expired_rec_t *j2 = (expired_rec_t *) a2; |
| |
| if (j1->job < j2->job) |
| return -1; |
| else if (j1->job == j2->job) { |
| if(j1->job_submit == j2->job_submit) |
| return 0; |
| else |
| return 1; |
| } |
| return 1; |
| } |
| |
| /* _dump_header() -- dump the common fields of a record |
| * |
| * In: Index into the jobs table |
| * Out: Nothing. |
| */ |
| void _dump_header(acct_header_t header) |
| { |
| struct tm ts; |
| gmtime_r(&header.timestamp, &ts); |
| printf("%u %s %04d%02d%02d%02d%02d%02d %d %s %s ", |
| header.jobnum, |
| header.partition, |
| 1900+(ts.tm_year), |
| 1+(ts.tm_mon), |
| ts.tm_mday, |
| ts.tm_hour, |
| ts.tm_min, |
| ts.tm_sec, |
| (int)header.job_submit, |
| header.blockid, /* block id */ |
| "-"); /* reserved 1 */ |
| } |
| /* _open_log_file() -- find the current or specified log file, and open it |
| * |
| * IN: Nothing |
| * RETURNS: Nothing |
| * |
| * Side effects: |
| * - Sets opt_filein to the current system accounting log unless |
| * the user specified another file. |
| */ |
| |
| FILE *_open_log_file(void) |
| { |
| FILE *fd = fopen(params.opt_filein, "r"); |
| if (fd == NULL) { |
| perror(params.opt_filein); |
| exit(1); |
| } |
| return fd; |
| } |
| |
| void _help_fields_msg(void) |
| { |
| int i; |
| |
| for (i = 0; fields[i].name; i++) { |
| if (i & 3) |
| printf(" "); |
| else |
| printf("\n"); |
| printf("%-10s", fields[i].name); |
| } |
| printf("\n"); |
| return; |
| } |
| |
| void _help_msg(void) |
| { |
| slurm_ctl_conf_t *conf = slurm_conf_lock(); |
| printf("\n" |
| "By default, sacct displays accounting data for all jobs and job\n" |
| "steps that are present in the log.\n" |
| "\n" |
| "Notes:\n" |
| "\n" |
| " * If --dump is specified,\n" |
| " * The field selection options (--brief, --fields, ...)\n" |
| " have no effect\n" |
| " * Elapsed time fields are presented as 2 fields, integral\n" |
| " seconds and integral microseconds\n" |
| " * If --dump is not specified, elapsed time fields are presented\n" |
| " as [[days-]hours:]minutes:seconds.hundredths\n" |
| " * The default input file is the file named in the \"jobacct_logfile\"\n" |
| " parameter in %s.\n" |
| "\n" |
| "Options:\n" |
| "\n" |
| "-A, --Account\n" |
| " Equivalent to \"--fields=jobid,jobname,start,end,cpu,\n" |
| " vsize_short,status,exitcode\". This option has no effect\n" |
| " if --dump is specified.\n" |
| "-a, --all\n" |
| " Display job accounting data for all users. By default, only\n" |
| " data for the current user is displayed for users other than\n" |
| " root.\n" |
| "-b, --brief\n" |
| " Equivalent to \"--fields=jobstep,status,error\". This option\n" |
| " has no effect if --dump is specified.\n" |
| "-d, --dump\n" |
| " Dump the raw data records\n" |
| "--duplicates\n" |
| " If SLURM job ids are reset, but the job accounting log file\n" |
| " isn't reset at the same time (with -e, for example), some\n" |
| " job numbers will probably appear more than once in the\n" |
| " accounting log file to refer to different jobs; such jobs\n" |
| " can be distinguished by the \"job_submit\" time stamp in the\n" |
| " data records.\n" |
| " When data for specific jobs are requested with\n" |
| " the --jobs option, we assume that the user\n" |
| " wants to see only the most recent job with that number. This\n" |
| " behavior can be overridden by specifying --duplicates, in\n" |
| " which case all records that match the selection criteria\n" |
| " will be returned.\n" |
| " When --jobs is not specified, we report\n" |
| " data for all jobs that match the selection criteria, even if\n" |
| " some of the job numbers are reused. Specify that you only\n" |
| " want the most recent job for each selected job number with\n" |
| " the --noduplicates option.\n" |
| "-e <timespec>, --expire=<timespec>\n" |
| " Remove jobs from SLURM's current accounting log file (or the\n" |
| " file specified with --file) that completed more than <timespec>\n" |
| " ago. If <timespec> is an integer, it is interpreted as\n" |
| " minutes. If <timespec> is an integer followed by \"h\", it is\n" |
| " interpreted as a number of hours. If <timespec> is an integer\n" |
| " followed by \"d\", it is interpreted as number of days. For\n" |
| " example, \"--expire=14d\" means that you wish to purge the job\n" |
| " accounting log of all jobs that completed more than 14 days ago.\n" |
| "-F <field-list>, --fields=<field-list>\n" |
| " Display the specified data (use \"--help-fields\" for a\n" |
| " list of available fields). If no field option is specified,\n" |
| " we use \"--fields=jobstep,jobname,partition,ncpus,status,error\".\n" |
| "-f<file>, --file=<file>\n" |
| " Read data from the specified file, rather than SLURM's current\n" |
| " accounting log file.\n" |
| "-l, --long\n" |
| " Equivalent to specifying\n" |
| " \"--fields=jobstep,usercpu,systemcpu,minflt,majflt,nprocs,\n" |
| " ncpus,elapsed,status,exitcode\"\n" |
| "-O, --formatted_dump\n" |
| " Dump accounting records in an easy-to-read format, primarily\n" |
| " for debugging.\n" |
| "-g <gid>, --gid <gid>\n" |
| " Select only jobs submitted from the <gid> group.\n" |
| "-h, --help\n" |
| " Print a general help message.\n" |
| "--help-fields\n" |
| " Print a list of fields that can be specified with the\n" |
| " \"--fields\" option\n" |
| "-j <job(.step)>, --jobs=<job(.step)>\n" |
| " Display information about this job or comma-separated\n" |
| " list of jobs. The default is all jobs. Adding .step will\n" |
| " display the specfic job step of that job.\n" |
| "--noduplicates\n" |
| " See the discussion under --duplicates.\n" |
| "--noheader\n" |
| " Print (or don't print) a header. The default is to print a\n" |
| " header; the option has no effect if --dump is specified\n" |
| "-p <part_list>, --partition=<part_list>\n" |
| " Display or purge information about jobs and job steps in the\n" |
| " <part_list> partition(s). The default is all partitions.\n" |
| "-P --purge\n" |
| " Used in conjunction with --expire to remove invalid data\n" |
| " from the job accounting log.\n" |
| "-r --raw\n" |
| " don't format data leave in raw format\n" |
| "-s <state-list>, --state=<state-list>\n" |
| " Select jobs based on their current status: running (r),\n" |
| " completed (cd), failed (f), timeout (to), and node_fail (nf).\n" |
| "-S, --stat\n" |
| " Get real time status of a jobstep supplied by the -j\n" |
| " option\n" |
| "-t, --total\n" |
| " Only show cumulative statistics for each job, not the\n" |
| " intermediate steps\n" |
| "-u <uid>, --uid <uid>\n" |
| " Select only jobs submitted by the user with uid <uid>. Only\n" |
| " root users are allowed to specify a uid other than their own.\n" |
| "--usage\n" |
| " Pointer to this message.\n" |
| "-v, --verbose\n" |
| " Primarily for debugging purposes, report the state of various\n" |
| " variables during processing.\n", conf->slurm_conf); |
| |
| slurm_conf_unlock(); |
| |
| return; |
| } |
| |
| void _usage(void) |
| { |
| printf("\nUsage: sacct [options]\n\tUse --help for help\n"); |
| } |
| |
| void _init_params() |
| { |
| params.opt_dump = 0; /* --dump */ |
| params.opt_dup = -1; /* --duplicates; +1 = explicitly set */ |
| params.opt_fdump = 0; /* --formattted_dump */ |
| params.opt_stat = 0; /* --stat */ |
| params.opt_gid = -1; /* --gid (-1=wildcard, 0=root) */ |
| params.opt_header = 1; /* can only be cleared */ |
| params.opt_help = 0; /* --help */ |
| params.opt_long = 0; /* --long */ |
| params.opt_lowmem = 0; /* --low_memory */ |
| params.opt_purge = 0; /* --purge */ |
| params.opt_raw = 0; /* --raw */ |
| params.opt_total = 0; /* --total */ |
| params.opt_uid = -1; /* --uid (-1=wildcard, 0=root) */ |
| params.opt_verbose = 0; /* --verbose */ |
| params.opt_expire_timespec = NULL; /* --expire= */ |
| params.opt_field_list = NULL; /* --fields= */ |
| params.opt_filein = NULL; /* --file */ |
| params.opt_job_list = NULL; /* --jobs */ |
| params.opt_partition_list = NULL;/* --partitions */ |
| params.opt_state_list = NULL; /* --states */ |
| } |
| |
| /* prefix_filename() -- insert a filename prefix into a path |
| * |
| * IN: path = fully-qualified path+file name |
| * prefix = the prefix to insert into the file name |
| * RETURNS: pointer to the updated path+file name |
| */ |
| |
| char *_prefix_filename(char *path, char *prefix) { |
| char *out; |
| int i, |
| plen; |
| |
| plen = strlen(path); |
| out = xmalloc(plen+strlen(prefix)+1); |
| for (i=plen-1; i>=0; i--) |
| if (path[i]=='/') { |
| break; |
| } |
| i++; |
| *out = 0; |
| strncpy(out, path, i); |
| out[i] = 0; |
| strcat(out, prefix); |
| strcat(out, path+i); |
| return(out); |
| } |
| |
| int decode_status_char(char *status) |
| { |
| if (!strcasecmp(status, "p")) |
| return JOB_PENDING; /* we should never see this */ |
| else if (!strcasecmp(status, "r")) |
| return JOB_RUNNING; |
| else if (!strcasecmp(status, "su")) |
| return JOB_SUSPENDED; |
| else if (!strcasecmp(status, "cd")) |
| return JOB_COMPLETE; |
| else if (!strcasecmp(status, "ca")) |
| return JOB_CANCELLED; |
| else if (!strcasecmp(status, "f")) |
| return JOB_FAILED; |
| else if (!strcasecmp(status, "to")) |
| return JOB_TIMEOUT; |
| else if (!strcasecmp(status, "nf")) |
| return JOB_NODE_FAIL; |
| else |
| return -1; // unknown |
| } |
| |
| char *decode_status_int(int status) |
| { |
| switch(status & ~JOB_COMPLETING) { |
| case JOB_PENDING: |
| return "PENDING"; /* we should never see this */ |
| case JOB_RUNNING: |
| return "RUNNING"; |
| case JOB_SUSPENDED: |
| return "SUSPENDED"; |
| case JOB_COMPLETE: |
| return "COMPLETED"; |
| case JOB_CANCELLED: |
| return "CANCELLED"; |
| case JOB_FAILED: |
| return "FAILED"; |
| case JOB_TIMEOUT: |
| return "TIMEOUT"; |
| case JOB_NODE_FAIL: |
| return "NODE_FAILED"; |
| default: |
| return "UNKNOWN"; |
| } |
| } |
| |
| char *decode_status_int_abbrev(int status) |
| { |
| switch(status & ~JOB_COMPLETING) { |
| case JOB_PENDING: |
| return "PD"; /* we should never see this */ |
| case JOB_RUNNING: |
| return "R"; |
| case JOB_SUSPENDED: |
| return "S"; |
| case JOB_COMPLETE: |
| return "CD"; |
| case JOB_CANCELLED: |
| return "CA"; |
| case JOB_FAILED: |
| return "F"; |
| case JOB_TIMEOUT: |
| return "TO"; |
| case JOB_NODE_FAIL: |
| return "NF"; |
| case JOB_END: |
| return "JOB_END"; |
| default: |
| return "UNKNOWN"; |
| } |
| } |
| |
| int get_data(void) |
| { |
| char line[BUFFER_SIZE]; |
| char *f[MAX_RECORD_FIELDS+1]; /* End list with null entry and, |
| possibly, more data than we |
| expected */ |
| char *fptr; |
| int i; |
| FILE *fd = NULL; |
| int lc = 0; |
| int rec_type = -1; |
| selected_step_t *selected_step = NULL; |
| char *selected_part = NULL; |
| ListIterator itr = NULL; |
| int show_full = 0; |
| |
| fd = _open_log_file(); |
| |
| while (fgets(line, BUFFER_SIZE, fd)) { |
| lc++; |
| fptr = line; /* break the record into NULL- |
| terminated strings */ |
| |
| for (i = 0; i < MAX_RECORD_FIELDS; i++) { |
| f[i] = fptr; |
| fptr = strstr(fptr, " "); |
| if (fptr == NULL) { |
| fptr = strstr(f[i], "\n"); |
| if (fptr) |
| *fptr = 0; |
| break; |
| } else |
| *fptr++ = 0; |
| } |
| f[++i] = 0; |
| |
| if(i < HEADER_LENGTH) { |
| continue; |
| } |
| |
| rec_type = atoi(f[F_RECTYPE]); |
| |
| if (list_count(selected_steps)) { |
| itr = list_iterator_create(selected_steps); |
| while((selected_step = list_next(itr))) { |
| if (strcmp(selected_step->job, f[F_JOB])) |
| continue; |
| /* job matches; does the step? */ |
| if(selected_step->step == NULL) { |
| show_full = 1; |
| list_iterator_destroy(itr); |
| goto foundjob; |
| } else if (rec_type != JOB_STEP |
| || !strcmp(f[F_JOBSTEP], |
| selected_step->step)) { |
| list_iterator_destroy(itr); |
| goto foundjob; |
| } |
| } |
| list_iterator_destroy(itr); |
| continue; /* no match */ |
| } else { |
| show_full = 1; |
| } |
| foundjob: |
| |
| if (list_count(selected_parts)) { |
| itr = list_iterator_create(selected_parts); |
| while((selected_part = list_next(itr))) |
| if (!strcasecmp(f[F_PARTITION], |
| selected_part)) { |
| list_iterator_destroy(itr); |
| goto foundp; |
| } |
| list_iterator_destroy(itr); |
| continue; /* no match */ |
| } |
| foundp: |
| |
| if (params.opt_fdump) { |
| do_fdump(f, lc); |
| continue; |
| } |
| |
| /* Build suitable tables with all the data */ |
| switch(rec_type) { |
| case JOB_START: |
| if(i < F_JOB_ACCOUNT) { |
| printf("Bad data on a Job Start\n"); |
| _show_rec(f); |
| } else |
| process_start(f, lc, show_full, i); |
| break; |
| case JOB_STEP: |
| if(i < F_MAX_VSIZE) { |
| printf("Bad data on a Step entry\n"); |
| _show_rec(f); |
| } else |
| process_step(f, lc, show_full, i); |
| break; |
| case JOB_SUSPEND: |
| if(i < JOB_TERM_LENGTH) { |
| printf("Bad data on a Suspend entry\n"); |
| _show_rec(f); |
| } else |
| process_suspend(f, lc, show_full, i); |
| break; |
| case JOB_TERMINATED: |
| if(i < JOB_TERM_LENGTH) { |
| printf("Bad data on a Job Term\n"); |
| _show_rec(f); |
| } else |
| process_terminated(f, lc, show_full, i); |
| break; |
| default: |
| if (params.opt_verbose > 1) |
| fprintf(stderr, |
| "Invalid record at line %d of " |
| "input file\n", |
| lc); |
| if (params.opt_verbose > 2) |
| _show_rec(f); |
| input_error++; |
| break; |
| } |
| } |
| |
| if (ferror(fd)) { |
| perror(params.opt_filein); |
| exit(1); |
| } |
| fclose(fd); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| void parse_command_line(int argc, char **argv) |
| { |
| extern int optind; |
| int c, i, optionIndex = 0; |
| char *end = NULL, *start = NULL, *acct_type = NULL; |
| selected_step_t *selected_step = NULL; |
| ListIterator itr = NULL; |
| struct stat stat_buf; |
| char *dot = NULL; |
| |
| static struct option long_options[] = { |
| {"all", 0,0, 'a'}, |
| {"Account", 0,0, 'A'}, |
| {"brief", 0, 0, 'b'}, |
| {"duplicates", 0, ¶ms.opt_dup, 1}, |
| {"dump", 0, 0, 'd'}, |
| {"expire", 1, 0, 'e'}, |
| {"fields", 1, 0, 'F'}, |
| {"file", 1, 0, 'f'}, |
| {"formatted_dump", 0, 0, 'O'}, |
| {"stat", 0, 0, 'S'}, |
| {"gid", 1, 0, 'g'}, |
| {"group", 1, 0, 'g'}, |
| {"help", 0, ¶ms.opt_help, 1}, |
| {"help-fields", 0, ¶ms.opt_help, 2}, |
| {"jobs", 1, 0, 'j'}, |
| {"long", 0, 0, 'l'}, |
| {"big_logfile", 0, ¶ms.opt_lowmem, 1}, |
| {"noduplicates", 0, ¶ms.opt_dup, 0}, |
| {"noheader", 0, ¶ms.opt_header, 0}, |
| {"partition", 1, 0, 'p'}, |
| {"purge", 0, 0, 'P'}, |
| {"raw", 0, 0, 'r'}, |
| {"state", 1, 0, 's'}, |
| {"total", 0, 0, 't'}, |
| {"uid", 1, 0, 'u'}, |
| {"usage", 0, ¶ms.opt_help, 3}, |
| {"user", 1, 0, 'u'}, |
| {"verbose", 0, 0, 'v'}, |
| {"version", 0, 0, 'V'}, |
| {0, 0, 0, 0}}; |
| |
| _init_params(); |
| |
| if ((i=getuid())) /* default to current user unless root*/ |
| params.opt_uid = i; |
| |
| opterr = 1; /* Let getopt report problems to the user */ |
| |
| while (1) { /* now cycle through the command line */ |
| c = getopt_long(argc, argv, "aAbde:F:f:g:hj:J:lOPp:rs:StUu:Vv", |
| long_options, &optionIndex); |
| if (c == -1) |
| break; |
| switch (c) { |
| case 'a': |
| params.opt_uid = -1; |
| break; |
| case 'A': |
| params.opt_field_list = |
| xrealloc(params.opt_field_list, |
| (params.opt_field_list==NULL? 0 : |
| sizeof(params.opt_field_list)) + |
| sizeof(ACCOUNT_FIELDS)+1); |
| strcat(params.opt_field_list, ACCOUNT_FIELDS); |
| strcat(params.opt_field_list, ","); |
| break; |
| |
| case 'b': |
| params.opt_field_list = |
| xrealloc(params.opt_field_list, |
| (params.opt_field_list==NULL? 0 : |
| sizeof(params.opt_field_list)) + |
| sizeof(BRIEF_FIELDS)+1); |
| strcat(params.opt_field_list, BRIEF_FIELDS); |
| strcat(params.opt_field_list, ","); |
| break; |
| |
| case 'd': |
| params.opt_dump = 1; |
| break; |
| |
| case 'e': |
| { /* decode the time spec */ |
| long acc=0; |
| params.opt_expire_timespec = strdup(optarg); |
| for (i=0; params.opt_expire_timespec[i]; i++) { |
| char c = params.opt_expire_timespec[i]; |
| if (isdigit(c)) { |
| acc = (acc*10)+(c-'0'); |
| continue; |
| } |
| switch (c) { |
| case 'D': |
| case 'd': |
| params.opt_expire += |
| acc*SECONDS_IN_DAY; |
| acc=0; |
| break; |
| case 'H': |
| case 'h': |
| params.opt_expire += |
| acc*SECONDS_IN_HOUR; |
| acc=0; |
| break; |
| case 'M': |
| case 'm': |
| params.opt_expire += |
| acc*SECONDS_IN_MINUTE; |
| acc=0; |
| break; |
| default: |
| params.opt_expire = -1; |
| goto bad_timespec; |
| } |
| } |
| params.opt_expire += acc*SECONDS_IN_MINUTE; |
| bad_timespec: |
| if (params.opt_expire <= 0) { |
| fprintf(stderr, |
| "Invalid timspec for " |
| "--expire: \"%s\"\n", |
| params.opt_expire_timespec); |
| exit(1); |
| } |
| } |
| params.opt_uid = -1; /* fix default; can't purge by uid */ |
| break; |
| |
| case 'F': |
| if(params.opt_stat) |
| xfree(params.opt_field_list); |
| |
| params.opt_field_list = |
| xrealloc(params.opt_field_list, |
| (params.opt_field_list==NULL? 0 : |
| strlen(params.opt_field_list)) + |
| strlen(optarg) + 1); |
| strcat(params.opt_field_list, optarg); |
| strcat(params.opt_field_list, ","); |
| break; |
| |
| case 'f': |
| params.opt_filein = |
| xrealloc(params.opt_filein, strlen(optarg)+1); |
| strcpy(params.opt_filein, optarg); |
| break; |
| |
| case 'g': |
| if (isdigit((int) *optarg)) |
| params.opt_gid = atoi(optarg); |
| else { |
| struct group *grp; |
| if ((grp=getgrnam(optarg))==NULL) { |
| fprintf(stderr, |
| "Invalid group id: %s\n", |
| optarg); |
| exit(1); |
| } |
| params.opt_gid=grp->gr_gid; |
| } |
| break; |
| |
| case 'h': |
| params.opt_help = 1; |
| break; |
| |
| case 'j': |
| if ((strspn(optarg, "0123456789, ") < strlen(optarg)) |
| && (strspn(optarg, ".0123456789, ") |
| < strlen(optarg))) { |
| fprintf(stderr, "Invalid jobs list: %s\n", |
| optarg); |
| exit(1); |
| } |
| params.opt_job_list = |
| xrealloc(params.opt_job_list, |
| (params.opt_job_list==NULL? 0 : |
| strlen(params.opt_job_list)) + |
| strlen(optarg) + 1); |
| strcat(params.opt_job_list, optarg); |
| strcat(params.opt_job_list, ","); |
| break; |
| |
| case 'l': |
| params.opt_field_list = |
| xrealloc(params.opt_field_list, |
| (params.opt_field_list==NULL? 0 : |
| strlen(params.opt_field_list)) + |
| sizeof(LONG_FIELDS)+1); |
| strcat(params.opt_field_list, LONG_FIELDS); |
| strcat(params.opt_field_list, ","); |
| break; |
| |
| case 'O': |
| params.opt_fdump = 1; |
| break; |
| |
| case 'P': |
| params.opt_purge = 1; |
| break; |
| |
| case 'p': |
| params.opt_partition_list = |
| xrealloc(params.opt_partition_list, |
| (params.opt_partition_list==NULL? 0 : |
| strlen(params.opt_partition_list)) + |
| strlen(optarg) + 1); |
| strcat(params.opt_partition_list, optarg); |
| strcat(params.opt_partition_list, ","); |
| break; |
| case 'r': |
| params.opt_raw = 1; |
| break; |
| case 's': |
| params.opt_state_list = |
| xrealloc(params.opt_state_list, |
| (params.opt_state_list==NULL? 0 : |
| strlen(params.opt_state_list)) + |
| strlen(optarg) + 1); |
| strcat(params.opt_state_list, optarg); |
| strcat(params.opt_state_list, ","); |
| break; |
| |
| case 'S': |
| params.opt_field_list = |
| xrealloc(params.opt_field_list, |
| (params.opt_field_list==NULL? 0 : |
| strlen(params.opt_field_list)) + |
| sizeof(STAT_FIELDS)+1); |
| |
| strcat(params.opt_field_list, STAT_FIELDS); |
| strcat(params.opt_field_list, ","); |
| |
| params.opt_stat = 1; |
| break; |
| |
| case 't': |
| params.opt_total = 1; |
| break; |
| |
| case 'U': |
| params.opt_help = 3; |
| break; |
| |
| case 'u': |
| if (isdigit((int) *optarg)) |
| params.opt_uid = atoi(optarg); |
| else { |
| struct passwd *pwd; |
| if ((pwd=getpwnam(optarg))==NULL) { |
| fprintf(stderr, |
| "Invalid user id: %s\n", |
| optarg); |
| exit(1); |
| } |
| params.opt_uid=pwd->pw_uid; |
| } |
| break; |
| |
| case 'v': |
| /* Handle -vvv thusly... |
| * 0 - report only normal messages and errors |
| * 1 - report options selected and major operations |
| * 2 - report data anomalies probably not errors |
| * 3 - blather on and on |
| */ |
| params.opt_verbose++; |
| break; |
| |
| case 'V': |
| { |
| char obuf[20]; /* should be long enough */ |
| char *rev="$Revision: 7267 $"; |
| char *s; |
| |
| s=strstr(rev, " ")+1; |
| for (i=0; s[i]!=' '; i++) |
| obuf[i]=s[i]; |
| obuf[i] = 0; |
| printf("%s: %s\n", argv[0], obuf); |
| exit(0); |
| } |
| |
| case ':': |
| case '?': /* getopt() has explained it */ |
| exit(1); |
| } |
| } |
| |
| /* Now set params.opt_dup, unless they've already done so */ |
| if (params.opt_dup < 0) /* not already set explicitly */ |
| if (params.opt_job_list) |
| /* They probably want the most recent job N if |
| * they requested specific jobs or steps. */ |
| params.opt_dup = 0; |
| |
| if (params.opt_verbose) { |
| fprintf(stderr, "Options selected:\n" |
| "\topt_dump=%d\n" |
| "\topt_dup=%d\n" |
| "\topt_expire=%s (%lu seconds)\n" |
| "\topt_fdump=%d\n" |
| "\topt_stat=%d\n" |
| "\topt_field_list=%s\n" |
| "\topt_filein=%s\n" |
| "\topt_header=%d\n" |
| "\topt_help=%d\n" |
| "\topt_job_list=%s\n" |
| "\topt_long=%d\n" |
| "\topt_lowmem=%d\n" |
| "\topt_partition_list=%s\n" |
| "\topt_purge=%d\n" |
| "\topt_raw=%d\n" |
| "\topt_state_list=%s\n" |
| "\topt_total=%d\n" |
| "\topt_uid=%d\n" |
| "\topt_verbose=%d\n", |
| params.opt_dump, |
| params.opt_dup, |
| params.opt_expire_timespec, params.opt_expire, |
| params.opt_fdump, |
| params.opt_stat, |
| params.opt_field_list, |
| params.opt_filein, |
| params.opt_header, |
| params.opt_help, |
| params.opt_job_list, |
| params.opt_long, |
| params.opt_lowmem, |
| params.opt_partition_list, |
| params.opt_purge, |
| params.opt_raw, |
| params.opt_state_list, |
| params.opt_total, |
| params.opt_uid, |
| params.opt_verbose); |
| } |
| |
| /* check if we have accounting data to view */ |
| if (params.opt_filein == NULL) |
| params.opt_filein = slurm_get_jobacct_loc(); |
| acct_type = slurm_get_jobacct_type(); |
| if ((strcmp(acct_type, "jobacct/none") == 0) |
| && (stat(params.opt_filein, &stat_buf) != 0)) { |
| fprintf(stderr, "SLURM accounting is disabled\n"); |
| exit(1); |
| } |
| xfree(acct_type); |
| |
| /* specific partitions requested? */ |
| if (params.opt_partition_list) { |
| |
| start = params.opt_partition_list; |
| while ((end = strstr(start, ",")) && start) { |
| *end = 0; |
| while (isspace(*start)) |
| start++; /* discard whitespace */ |
| if(!(int)*start) |
| continue; |
| acct_type = xstrdup(start); |
| list_append(selected_parts, acct_type); |
| start = end + 1; |
| } |
| if (params.opt_verbose) { |
| fprintf(stderr, "Partitions requested:\n"); |
| itr = list_iterator_create(selected_parts); |
| while((start = list_next(itr))) |
| fprintf(stderr, "\t: %s\n", start); |
| list_iterator_destroy(itr); |
| } |
| } |
| |
| /* specific jobs requested? */ |
| if (params.opt_job_list) { |
| start = params.opt_job_list; |
| while ((end = strstr(start, ",")) && start) { |
| *end = 0; |
| while (isspace(*start)) |
| start++; /* discard whitespace */ |
| if(!(int)*start) |
| continue; |
| selected_step = xmalloc(sizeof(selected_step_t)); |
| list_append(selected_steps, selected_step); |
| |
| dot = strstr(start, "."); |
| if (dot == NULL) { |
| debug2("No jobstep requested"); |
| selected_step->step = NULL; |
| } else { |
| *dot++ = 0; |
| selected_step->step = xstrdup(dot); |
| } |
| selected_step->job = xstrdup(start); |
| start = end + 1; |
| } |
| if (params.opt_verbose) { |
| fprintf(stderr, "Jobs requested:\n"); |
| itr = list_iterator_create(selected_steps); |
| while((selected_step = list_next(itr))) { |
| if(selected_step->step) |
| fprintf(stderr, "\t: %s.%s\n", |
| selected_step->job, |
| selected_step->step); |
| else |
| fprintf(stderr, "\t: %s\n", |
| selected_step->job); |
| } |
| list_iterator_destroy(itr); |
| } |
| } |
| |
| /* specific states (completion status) requested? */ |
| if (params.opt_state_list) { |
| start = params.opt_state_list; |
| while ((end = strstr(start, ",")) && start) { |
| int c; |
| *end = 0; |
| while (isspace(*start)) |
| start++; /* discard whitespace */ |
| if(!(int)*start) |
| continue; |
| c = decode_status_char(start); |
| if (c == -1) |
| fatal("unrecognized job state value"); |
| selected_status[c] = 1; |
| start = end + 1; |
| } |
| if (params.opt_verbose) { |
| fprintf(stderr, "States requested:\n"); |
| for(i=0; i< STATUS_COUNT; i++) { |
| if(selected_status[i]) { |
| fprintf(stderr, "\t: %s\n", |
| decode_status_int(i)); |
| break; |
| } |
| } |
| } |
| } |
| |
| /* select the output fields */ |
| if (params.opt_field_list==NULL) { |
| if (params.opt_dump || params.opt_expire) |
| goto endopt; |
| params.opt_field_list = xmalloc(sizeof(DEFAULT_FIELDS)+1); |
| strcpy(params.opt_field_list, DEFAULT_FIELDS); |
| strcat(params.opt_field_list, ","); |
| } |
| start = params.opt_field_list; |
| while ((end = strstr(start, ","))) { |
| *end = 0; |
| while (isspace(*start)) |
| start++; /* discard whitespace */ |
| if(!(int)*start) |
| continue; |
| for (i = 0; fields[i].name; i++) { |
| if (!strcasecmp(fields[i].name, start)) |
| goto foundfield; |
| } |
| fprintf(stderr, |
| "Invalid field requested: \"%s\"\n", |
| start); |
| exit(1); |
| foundfield: |
| printfields[nprintfields++] = i; |
| start = end + 1; |
| } |
| if (params.opt_verbose) { |
| fprintf(stderr, "%d field%s selected:\n", |
| nprintfields, |
| (nprintfields==1? "" : "s")); |
| for (i = 0; i < nprintfields; i++) |
| fprintf(stderr, |
| "\t%s\n", |
| fields[printfields[i]].name); |
| } |
| endopt: |
| if (optind < argc) { |
| fprintf(stderr, "Error: Unknown arguments:"); |
| for (i=optind; i<argc; i++) |
| fprintf(stderr, " %s", argv[i]); |
| fprintf(stderr, "\n"); |
| exit(1); |
| } |
| return; |
| } |
| |
| /* Note: do_dump() strives to present data in an upward-compatible |
| * manner so that apps written to use data from `sacct -d` in slurm |
| * v1.0 will continue to work in v1.1 and later. |
| * |
| * To help ensure this compatibility, |
| * a. The meaning of an existing field never changes |
| * b. New fields are appended to the end of a record |
| * |
| * The "numfields" field of the record can be used as a sub-version |
| * number, as it will never decrease for the life of the current |
| * record version number (currently 1). For example, if your app needs |
| * to use field 28, a record with numfields<28 is too old a version |
| * for you, while numfields>=28 will provide what you are expecting. |
| */ |
| void do_dump(void) |
| { |
| ListIterator itr = NULL; |
| ListIterator itr_step = NULL; |
| job_rec_t *job = NULL; |
| step_rec_t *step = NULL; |
| struct tm ts; |
| |
| itr = list_iterator_create(jobs); |
| while((job = list_next(itr))) { |
| if (!params.opt_dup) |
| if (job->jobnum_superseded) { |
| if (params.opt_verbose > 1) |
| fprintf(stderr, |
| "Note: Skipping older" |
| " job %u dated %d\n", |
| job->header.jobnum, |
| (int)job->header.job_submit); |
| continue; |
| } |
| if (params.opt_uid>=0) |
| if (job->header.uid != params.opt_uid) |
| continue; |
| if(job->sacct.min_cpu == (float)NO_VAL) |
| job->sacct.min_cpu = 0; |
| |
| if(list_count(job->steps)) { |
| job->sacct.ave_cpu /= list_count(job->steps); |
| job->sacct.ave_rss /= list_count(job->steps); |
| job->sacct.ave_vsize /= list_count(job->steps); |
| job->sacct.ave_pages /= list_count(job->steps); |
| if(!job->track_steps) { |
| if(list_count(job->steps) > 1) |
| job->track_steps = 1; |
| else { |
| step = list_peek(job->steps); |
| if(step && step->stepname |
| && job->jobname |
| && strcmp(step->stepname, |
| job->jobname)) |
| job->track_steps = 1; |
| } |
| } |
| } |
| |
| /* JOB_START */ |
| if (job->show_full) { |
| if (!job->job_start_seen && job->job_step_seen) { |
| /* If we only saw JOB_TERMINATED, the |
| * job was probably canceled. */ |
| fprintf(stderr, |
| "Error: No JOB_START record for " |
| "job %u\n", |
| job->header.jobnum); |
| } |
| _dump_header(job->header); |
| printf("JOB_START 1 16 %d %d %s %d %d %d %s %s\n", |
| job->header.uid, |
| job->header.gid, |
| job->jobname, |
| job->track_steps, |
| job->priority, |
| job->ncpus, |
| job->nodes, |
| job->account); |
| } |
| /* JOB_STEP */ |
| itr_step = list_iterator_create(job->steps); |
| while((step = list_next(itr_step))) { |
| if (step->status == JOB_RUNNING && |
| job->job_terminated_seen) { |
| step->status = JOB_FAILED; |
| step->exitcode=1; |
| } |
| _dump_header(step->header); |
| if(step->end == 0) |
| step->end = job->end; |
| |
| gmtime_r(&step->end, &ts); |
| printf("JOB_STEP 1 50 %u %04d%02d%02d%02d%02d%02d ", |
| step->stepnum, |
| 1900+(ts.tm_year), 1+(ts.tm_mon), ts.tm_mday, |
| ts.tm_hour, ts.tm_min, ts.tm_sec); |
| printf("%s %d %d %d %d ", |
| decode_status_int_abbrev(step->status), |
| step->exitcode, |
| step->ntasks, |
| step->ncpus, |
| step->elapsed); |
| printf("%d %d %d %d %d %d ", |
| step->tot_cpu_sec, |
| step->tot_cpu_usec, |
| (int)step->rusage.ru_utime.tv_sec, |
| (int)step->rusage.ru_utime.tv_usec, |
| (int)step->rusage.ru_stime.tv_sec, |
| (int)step->rusage.ru_stime.tv_usec); |
| printf("%d %d %d %d %d %d %d %d %d " |
| "%d %d %d %d %d %d %d ", |
| (int)step->rusage.ru_maxrss, |
| (int)step->rusage.ru_ixrss, |
| (int)step->rusage.ru_idrss, |
| (int)step->rusage.ru_isrss, |
| (int)step->rusage.ru_minflt, |
| (int)step->rusage.ru_majflt, |
| (int)step->rusage.ru_nswap, |
| (int)step->rusage.ru_inblock, |
| (int)step->rusage.ru_oublock, |
| (int)step->rusage.ru_msgsnd, |
| (int)step->rusage.ru_msgrcv, |
| (int)step->rusage.ru_nsignals, |
| (int)step->rusage.ru_nvcsw, |
| (int)step->rusage.ru_nivcsw, |
| step->sacct.max_vsize/1024, |
| step->sacct.max_rss/1024); |
| /* Data added in Slurm v1.1 */ |
| printf("%u %u %.2f %u %u %.2f %d %u %u %.2f " |
| "%.2f %u %u %.2f %s %s %s\n", |
| step->sacct.max_vsize_id.nodeid, |
| step->sacct.max_vsize_id.taskid, |
| step->sacct.ave_vsize/1024, |
| step->sacct.max_rss_id.nodeid, |
| step->sacct.max_rss_id.taskid, |
| step->sacct.ave_rss/1024, |
| step->sacct.max_pages, |
| step->sacct.max_pages_id.nodeid, |
| step->sacct.max_pages_id.taskid, |
| step->sacct.ave_pages, |
| step->sacct.min_cpu, |
| step->sacct.min_cpu_id.nodeid, |
| step->sacct.min_cpu_id.taskid, |
| step->sacct.ave_cpu, |
| step->stepname, |
| step->nodes, |
| job->account); |
| } |
| list_iterator_destroy(itr_step); |
| /* JOB_TERMINATED */ |
| if (job->show_full) { |
| _dump_header(job->header); |
| gmtime_r(&job->end, &ts); |
| printf("JOB_TERMINATED 1 50 %d ", |
| job->elapsed); |
| printf("%04d%02d%02d%02d%02d%02d ", |
| 1900+(ts.tm_year), 1+(ts.tm_mon), ts.tm_mday, |
| ts.tm_hour, ts.tm_min, ts.tm_sec); |
| printf("%s %d %d %d %d ", |
| decode_status_int_abbrev(job->status), |
| job->exitcode, |
| job->ntasks, |
| job->ncpus, |
| job->elapsed); |
| printf("%d %d %d %d %d %d ", |
| job->tot_cpu_sec, |
| job->tot_cpu_usec, |
| (int)job->rusage.ru_utime.tv_sec, |
| (int)job->rusage.ru_utime.tv_usec, |
| (int)job->rusage.ru_stime.tv_sec, |
| (int)job->rusage.ru_stime.tv_usec); |
| printf("%d %d %d %d %d %d %d %d %d " |
| "%d %d %d %d %d %d %d ", |
| (int)job->rusage.ru_maxrss, |
| (int)job->rusage.ru_ixrss, |
| (int)job->rusage.ru_idrss, |
| (int)job->rusage.ru_isrss, |
| (int)job->rusage.ru_minflt, |
| (int)job->rusage.ru_majflt, |
| (int)job->rusage.ru_nswap, |
| (int)job->rusage.ru_inblock, |
| (int)job->rusage.ru_oublock, |
| (int)job->rusage.ru_msgsnd, |
| (int)job->rusage.ru_msgrcv, |
| (int)job->rusage.ru_nsignals, |
| (int)job->rusage.ru_nvcsw, |
| (int)job->rusage.ru_nivcsw, |
| job->sacct.max_vsize/1024, |
| job->sacct.max_rss/1024); |
| /* Data added in Slurm v1.1 */ |
| printf("%u %u %.2f %u %u %.2f %d %u %u %.2f " |
| "%.2f %u %u %.2f %s %s %s %d\n", |
| job->sacct.max_vsize_id.nodeid, |
| job->sacct.max_vsize_id.taskid, |
| job->sacct.ave_vsize/1024, |
| job->sacct.max_rss_id.nodeid, |
| job->sacct.max_rss_id.taskid, |
| job->sacct.ave_rss/1024, |
| job->sacct.max_pages, |
| job->sacct.max_pages_id.nodeid, |
| job->sacct.max_pages_id.taskid, |
| job->sacct.ave_pages, |
| job->sacct.min_cpu, |
| job->sacct.min_cpu_id.nodeid, |
| job->sacct.min_cpu_id.taskid, |
| job->sacct.ave_cpu, |
| "-", |
| job->nodes, |
| job->account, |
| job->requid); |
| } |
| } |
| list_iterator_destroy(itr); |
| } |
| |
| /* do_expire() -- purge expired data from the accounting log file |
| * |
| * What we're doing: |
| * 1. Open logfile.orig |
| * 2. stat logfile.orig |
| * - confirm that it's not a sym link |
| * - capture the ownership and permissions |
| * 3. scan logfile.orig for JOB_TERMINATED records with F_TIMESTAMP dates |
| * that precede the specified expiration date. Build exp_table as |
| * a list of expired jobs. |
| * 4. Open logfile.expired for append |
| * 5. Create logfile.new as ".new.<logfile>" (output with line buffering) |
| * 6. Re-scan logfile.orig, writing |
| * - Expired job records to logfile.expired |
| * - Other job records to logfile.new |
| * 7. Rename logfile.orig as ".old.<logfile>" |
| * 8. Rename logfile.new as "<logfile>" |
| * 9. Execute "scontrol reconfigure" which will cause slurmctld to |
| * start writing to logfile.new |
| * 10. fseek(ftell(logfile.orig)) to clear EOF |
| * 11. Copy any new records from logfile.orig to logfile.new |
| * 12. Close logfile.expired, logfile.new |
| * 13. Unlink .old.<logfile> |
| */ |
| |
| void do_expire(void) |
| { |
| char line[BUFFER_SIZE], |
| *f[EXPIRE_READ_LENGTH], |
| *fptr = NULL, |
| *logfile_name = NULL, |
| *old_logfile_name = NULL; |
| int file_err=0, |
| new_file, |
| i = 0; |
| expired_rec_t *exp_rec = NULL; |
| expired_rec_t *exp_rec2 = NULL; |
| List keep_list = list_create(_destroy_exp); |
| List exp_list = list_create(_destroy_exp); |
| List other_list = list_create(_destroy_exp); |
| struct stat statbuf; |
| mode_t prot = 0600; |
| uid_t uid; |
| gid_t gid; |
| FILE *expired_logfile = NULL, |
| *new_logfile = NULL; |
| FILE *fd = NULL; |
| int lc=0; |
| int rec_type = -1; |
| ListIterator itr = NULL; |
| ListIterator itr2 = NULL; |
| char *temp = NULL; |
| |
| /* Figure out our expiration date */ |
| time_t expiry; |
| expiry = time(NULL)-params.opt_expire; |
| if (params.opt_verbose) |
| fprintf(stderr, "Purging jobs completed prior to %d\n", |
| (int)expiry); |
| |
| /* Open the current or specified logfile, or quit */ |
| fd = _open_log_file(); |
| if (stat(params.opt_filein, &statbuf)) { |
| perror("stat'ing logfile"); |
| goto finished; |
| } |
| if ((statbuf.st_mode & S_IFLNK) == S_IFLNK) { |
| fprintf(stderr, "%s is a symbolic link; --expire requires " |
| "a hard-linked file name\n", params.opt_filein); |
| goto finished; |
| } |
| if (!(statbuf.st_mode & S_IFREG)) { |
| fprintf(stderr, "%s is not a regular file; --expire " |
| "only works on accounting log files\n", |
| params.opt_filein); |
| goto finished; |
| } |
| prot = statbuf.st_mode & 0777; |
| gid = statbuf.st_gid; |
| uid = statbuf.st_uid; |
| old_logfile_name = _prefix_filename(params.opt_filein, ".old."); |
| if (stat(old_logfile_name, &statbuf)) { |
| if (errno != ENOENT) { |
| fprintf(stderr,"Error checking for %s: ", |
| old_logfile_name); |
| perror(""); |
| goto finished; |
| } |
| } else { |
| fprintf(stderr, "Warning! %s exists -- please remove " |
| "or rename it before proceeding\n", |
| old_logfile_name); |
| goto finished; |
| } |
| |
| /* create our initial buffer */ |
| while (fgets(line, BUFFER_SIZE, fd)) { |
| lc++; |
| fptr = line; /* break the record into NULL- |
| terminated strings */ |
| exp_rec = xmalloc(sizeof(expired_rec_t)); |
| exp_rec->line = xstrdup(line); |
| |
| for (i = 0; i < EXPIRE_READ_LENGTH; i++) { |
| f[i] = fptr; |
| fptr = strstr(fptr, " "); |
| if (fptr == NULL) |
| break; |
| else |
| *fptr++ = 0; |
| } |
| |
| exp_rec->job = atoi(f[F_JOB]); |
| exp_rec->job_submit = atoi(f[F_JOB_SUBMIT]); |
| |
| rec_type = atoi(f[F_RECTYPE]); |
| /* Odd, but complain some other time */ |
| if (rec_type == JOB_TERMINATED) { |
| if (expiry < atoi(f[F_TIMESTAMP])) { |
| list_append(keep_list, exp_rec); |
| continue; |
| } |
| if (list_count(selected_parts)) { |
| itr = list_iterator_create(selected_parts); |
| while((temp = list_next(itr))) |
| if(!strcasecmp(f[F_PARTITION], temp)) |
| break; |
| list_iterator_destroy(itr); |
| if(!temp) { |
| list_append(keep_list, exp_rec); |
| continue; |
| } /* no match */ |
| } |
| list_append(exp_list, exp_rec); |
| if (params.opt_verbose > 2) |
| fprintf(stderr, "Selected: %8d %d\n", |
| exp_rec->job, |
| (int)exp_rec->job_submit); |
| } else { |
| list_append(other_list, exp_rec); |
| } |
| } |
| if (!list_count(exp_list)) { |
| printf("No job records were purged.\n"); |
| goto finished; |
| } |
| logfile_name = xmalloc(strlen(params.opt_filein)+sizeof(".expired")); |
| sprintf(logfile_name, "%s.expired", params.opt_filein); |
| new_file = stat(logfile_name, &statbuf); |
| if ((expired_logfile = fopen(logfile_name, "a"))==NULL) { |
| fprintf(stderr, "Error while opening %s", |
| logfile_name); |
| perror(""); |
| xfree(logfile_name); |
| goto finished; |
| } |
| |
| if (new_file) { /* By default, the expired file looks like the log */ |
| chmod(logfile_name, prot); |
| chown(logfile_name, uid, gid); |
| } |
| xfree(logfile_name); |
| |
| logfile_name = _prefix_filename(params.opt_filein, ".new."); |
| if ((new_logfile = fopen(logfile_name, "w"))==NULL) { |
| fprintf(stderr, "Error while opening %s", |
| logfile_name); |
| perror(""); |
| fclose(expired_logfile); |
| goto finished; |
| } |
| chmod(logfile_name, prot); /* preserve file protection */ |
| chown(logfile_name, uid, gid); /* and ownership */ |
| /* Use line buffering to allow us to safely write |
| * to the log file at the same time as slurmctld. */ |
| if (setvbuf(new_logfile, NULL, _IOLBF, 0)) { |
| perror("setvbuf()"); |
| fclose(expired_logfile); |
| goto finished2; |
| } |
| |
| list_sort(exp_list, (ListCmpF) _cmp_jrec); |
| list_sort(keep_list, (ListCmpF) _cmp_jrec); |
| |
| if (params.opt_verbose > 2) { |
| fprintf(stderr, "--- contents of exp_list ---"); |
| itr = list_iterator_create(exp_list); |
| while((exp_rec = list_next(itr))) { |
| if (!(i%5)) |
| fprintf(stderr, "\n"); |
| else |
| fprintf(stderr, "\t"); |
| fprintf(stderr, "%d", exp_rec->job); |
| } |
| fprintf(stderr, "\n---- end of exp_list ---\n"); |
| list_iterator_destroy(itr); |
| } |
| /* write the expired file */ |
| itr = list_iterator_create(exp_list); |
| while((exp_rec = list_next(itr))) { |
| itr2 = list_iterator_create(other_list); |
| while((exp_rec2 = list_next(itr2))) { |
| if((exp_rec2->job != exp_rec->job) |
| || (exp_rec2->job_submit != exp_rec->job_submit)) |
| continue; |
| if (fputs(exp_rec2->line, expired_logfile)<0) { |
| perror("writing expired_logfile"); |
| list_iterator_destroy(itr2); |
| list_iterator_destroy(itr); |
| fclose(expired_logfile); |
| goto finished2; |
| } |
| list_remove(itr2); |
| _destroy_exp(exp_rec2); |
| } |
| list_iterator_destroy(itr2); |
| if (fputs(exp_rec->line, expired_logfile)<0) { |
| perror("writing expired_logfile"); |
| list_iterator_destroy(itr); |
| fclose(expired_logfile); |
| goto finished2; |
| } |
| } |
| list_iterator_destroy(itr); |
| fclose(expired_logfile); |
| |
| /* write the new log */ |
| itr = list_iterator_create(keep_list); |
| while((exp_rec = list_next(itr))) { |
| itr2 = list_iterator_create(other_list); |
| while((exp_rec2 = list_next(itr2))) { |
| if(exp_rec2->job != exp_rec->job) |
| continue; |
| if (fputs(exp_rec2->line, new_logfile)<0) { |
| perror("writing keep_logfile"); |
| list_iterator_destroy(itr2); |
| list_iterator_destroy(itr); |
| goto finished2; |
| } |
| list_remove(itr2); |
| _destroy_exp(exp_rec2); |
| } |
| list_iterator_destroy(itr2); |
| if (fputs(exp_rec->line, new_logfile)<0) { |
| perror("writing keep_logfile"); |
| list_iterator_destroy(itr); |
| goto finished2; |
| } |
| } |
| list_iterator_destroy(itr); |
| |
| /* write records in other_list to new log */ |
| itr = list_iterator_create(other_list); |
| while((exp_rec = list_next(itr))) { |
| if (fputs(exp_rec->line, new_logfile)<0) { |
| perror("writing keep_logfile"); |
| list_iterator_destroy(itr); |
| goto finished2; |
| } |
| } |
| list_iterator_destroy(itr); |
| |
| if (rename(params.opt_filein, old_logfile_name)) { |
| perror("renaming logfile to .old."); |
| goto finished2; |
| } |
| if (rename(logfile_name, params.opt_filein)) { |
| perror("renaming new logfile"); |
| /* undo it? */ |
| if (!rename(old_logfile_name, params.opt_filein)) |
| fprintf(stderr, "Please correct the problem " |
| "and try again"); |
| else |
| fprintf(stderr, "SEVERE ERROR: Current accounting " |
| "log may have been renamed %s;\n" |
| "please rename it to \"%s\" if necessary, " |
| "and try again\n", |
| old_logfile_name, params.opt_filein); |
| goto finished2; |
| } |
| fflush(new_logfile); /* Flush the buffers before forking */ |
| fflush(fd); |
| |
| file_err = slurm_reconfigure (); |
| |
| if (file_err) { |
| file_err = 1; |
| fprintf(stderr, "Error: Attempt to reconfigure " |
| "SLURM failed.\n"); |
| if (rename(old_logfile_name, params.opt_filein)) { |
| perror("renaming logfile from .old."); |
| goto finished2; |
| } |
| |
| } |
| if (fseek(fd, 0, SEEK_CUR)) { /* clear EOF */ |
| perror("looking for late-arriving records"); |
| goto finished2; |
| } |
| |
| /* reopen new logfile in append mode, since slurmctld may write it */ |
| if (freopen(params.opt_filein, "a", new_logfile) == NULL) { |
| perror("reopening new logfile"); |
| goto finished2; |
| } |
| |
| while (fgets(line, BUFFER_SIZE, fd)) { |
| if (fputs(line, new_logfile)<0) { |
| perror("writing final records"); |
| goto finished2; |
| } |
| } |
| |
| printf("%d jobs expired.\n", list_count(exp_list)); |
| finished2: |
| fclose(new_logfile); |
| if (!file_err) { |
| if (unlink(old_logfile_name) == -1) |
| error("Unable to unlink old logfile %s: %m", |
| old_logfile_name); |
| } |
| finished: |
| fclose(fd); |
| list_destroy(exp_list); |
| list_destroy(keep_list); |
| list_destroy(other_list); |
| xfree(old_logfile_name); |
| xfree(logfile_name); |
| } |
| |
| void do_fdump(char* f[], int lc) |
| { |
| int i=0, j=0; |
| char **type; |
| char *header[] = {"job", /* F_JOB */ |
| "partition", /* F_PARTITION */ |
| "job_submit", /* F_JOB_SUBMIT */ |
| "timestamp", /* F_TIMESTAMP */ |
| "uid", /* F_UIDGID */ |
| "gid", /* F_UIDGID */ |
| "BlockID", /* F_BLOCKID */ |
| "reserved-2",/* F_RESERVED1 */ |
| "recordType",/* F_RECTYPE */ |
| NULL}; |
| |
| char *start[] = {"jobName", /* F_JOBNAME */ |
| "TrackSteps", /* F_TRACK_STEPS */ |
| "priority", /* F_PRIORITY */ |
| "ncpus", /* F_NCPUS */ |
| "nodeList", /* F_NODES */ |
| "account", /* F_JOB_ACCOUNT */ |
| NULL}; |
| |
| char *step[] = {"jobStep", /* F_JOBSTEP */ |
| "status", /* F_STATUS */ |
| "exitcode", /* F_EXITCODE */ |
| "ntasks", /* F_NTASKS */ |
| "ncpus", /* F_STEPNCPUS */ |
| "elapsed", /* F_ELAPSED */ |
| "cpu_sec", /* F_CPU_SEC */ |
| "cpu_usec", /* F_CPU_USEC */ |
| "user_sec", /* F_USER_SEC */ |
| "user_usec", /* F_USER_USEC */ |
| "sys_sec", /* F_SYS_SEC */ |
| "sys_usec", /* F_SYS_USEC */ |
| "rss", /* F_RSS */ |
| "ixrss", /* F_IXRSS */ |
| "idrss", /* F_IDRSS */ |
| "isrss", /* F_ISRSS */ |
| "minflt", /* F_MINFLT */ |
| "majflt", /* F_MAJFLT */ |
| "nswap", /* F_NSWAP */ |
| "inblocks", /* F_INBLOCKS */ |
| "oublocks", /* F_OUTBLOCKS */ |
| "msgsnd", /* F_MSGSND */ |
| "msgrcv", /* F_MSGRCV */ |
| "nsignals", /* F_NSIGNALS */ |
| "nvcsw", /* F_VCSW */ |
| "nivcsw", /* F_NIVCSW */ |
| "max_vsize", /* F_MAX_VSIZE */ |
| "max_vsize_task", /* F_MAX_VSIZE_TASK */ |
| "ave_vsize", /* F_AVE_VSIZE */ |
| "max_rss", /* F_MAX_RSS */ |
| "max_rss_task", /* F_MAX_RSS_TASK */ |
| "ave_rss", /* F_AVE_RSS */ |
| "max_pages", /* F_MAX_PAGES */ |
| "max_pages_task", /* F_MAX_PAGES_TASK */ |
| "ave_pages", /* F_AVE_PAGES */ |
| "min_cputime", /* F_MIN_CPU */ |
| "min_cputime_task", /* F_MIN_CPU_TASK */ |
| "ave_cputime", /* F_AVE_RSS */ |
| "StepName", /* F_STEPNAME */ |
| "StepNodes", /* F_STEPNODES */ |
| "max_vsize_node", /* F_MAX_VSIZE_NODE */ |
| "max_rss_node", /* F_MAX_RSS_NODE */ |
| "max_pages_node", /* F_MAX_PAGES_NODE */ |
| "min_cputime_node", /* F_MIN_CPU_NODE */ |
| "account", /* F_STEP_ACCOUNT */ |
| "requid", /* F_STEP_REQUID */ |
| NULL}; |
| |
| char *suspend[] = {"Suspend/Run time", /* F_TOT_ELAPSED */ |
| "status", /* F_STATUS */ |
| NULL}; |
| |
| char *term[] = {"totElapsed", /* F_TOT_ELAPSED */ |
| "status", /* F_STATUS */ |
| "requid", /* F_JOB_REQUID */ |
| NULL}; |
| |
| i = atoi(f[F_RECTYPE]); |
| printf("\n------- Line %d %s -------\n", lc, _convert_type(i)); |
| |
| for(j=0; j < HEADER_LENGTH; j++) |
| printf("%12s: %s\n", header[j], f[j]); |
| switch(i) { |
| case JOB_START: |
| type = start; |
| j = JOB_START_LENGTH; |
| break; |
| case JOB_STEP: |
| type = step; |
| j = JOB_STEP_LENGTH; |
| break; |
| case JOB_SUSPEND: |
| type = suspend; |
| j = JOB_TERM_LENGTH; |
| case JOB_TERMINATED: |
| type = term; |
| j = JOB_TERM_LENGTH; |
| break; |
| default: |
| while(f[j]) { |
| printf(" Field[%02d]: %s\n", j, f[j]); |
| j++; |
| } |
| return; |
| } |
| |
| for(i=HEADER_LENGTH; i < j; i++) |
| printf("%12s: %s\n", type[i-HEADER_LENGTH], f[i]); |
| } |
| |
| void do_help(void) |
| { |
| switch (params.opt_help) { |
| case 1: |
| _help_msg(); |
| break; |
| case 2: |
| _help_fields_msg(); |
| break; |
| case 3: |
| _usage(); |
| break; |
| default: |
| fprintf(stderr, "sacct bug: params.opt_help=%d\n", |
| params.opt_help); |
| } |
| } |
| |
| /* do_list() -- List the assembled data |
| * |
| * In: Nothing explicit. |
| * Out: void. |
| * |
| * At this point, we have already selected the desired data, |
| * so we just need to print it for the user. |
| */ |
| void do_list(void) |
| { |
| int do_jobsteps = 1; |
| int rc = 0; |
| |
| ListIterator itr = NULL; |
| ListIterator itr_step = NULL; |
| job_rec_t *job = NULL; |
| step_rec_t *step = NULL; |
| |
| if (params.opt_total) |
| do_jobsteps = 0; |
| |
| itr = list_iterator_create(jobs); |
| while((job = list_next(itr))) { |
| if (!params.opt_dup) |
| if (job->jobnum_superseded) { |
| if (params.opt_verbose > 1) |
| fprintf(stderr, |
| "Note: Skipping older" |
| " job %u dated %d\n", |
| job->header.jobnum, |
| (int)job->header.job_submit); |
| continue; |
| } |
| if (!job->job_start_seen && job->job_step_seen) { |
| /* If we only saw JOB_TERMINATED, the job was |
| * probably canceled. */ |
| fprintf(stderr, |
| "Error: No JOB_START record for job %u\n", |
| job->header.jobnum); |
| if (rc<ERROR) |
| rc = ERROR; |
| } |
| if (params.opt_verbose > 1) { |
| if (!job->job_start_seen) |
| fprintf(stderr, |
| "Note: No JOB_START record for " |
| "job %u\n", |
| job->header.jobnum); |
| if (!job->job_step_seen) |
| fprintf(stderr, |
| "Note: No JOB_STEP record for " |
| "job %u\n", |
| job->header.jobnum); |
| if (!job->job_terminated_seen) |
| fprintf(stderr, |
| "Note: No JOB_TERMINATED record for " |
| "job %u\n", |
| job->header.jobnum); |
| } |
| if (params.opt_uid >= 0 && (job->header.uid != params.opt_uid)) |
| continue; |
| if (params.opt_gid >= 0 && (job->header.gid != params.opt_gid)) |
| continue; |
| if(job->sacct.min_cpu == NO_VAL) |
| job->sacct.min_cpu = 0; |
| |
| if(list_count(job->steps)) { |
| job->sacct.ave_cpu /= list_count(job->steps); |
| job->sacct.ave_rss /= list_count(job->steps); |
| job->sacct.ave_vsize /= list_count(job->steps); |
| job->sacct.ave_pages /= list_count(job->steps); |
| } |
| |
| if (job->show_full) { |
| if (params.opt_state_list) { |
| if(!selected_status[job->status]) |
| continue; |
| } |
| print_fields(JOB, job); |
| } |
| |
| if(!job->track_steps) { |
| /* If we don't have track_steps we want to see |
| if we have multiple steps. If we only have |
| 1 step check the job name against the step |
| name in most all cases it will be |
| different. If it is different print out |
| the step separate. |
| */ |
| if(list_count(job->steps) > 1) |
| job->track_steps = 1; |
| else { |
| step = list_peek(job->steps); |
| if(step && step->stepname |
| && job->jobname |
| && strcmp(step->stepname, |
| job->jobname)) |
| job->track_steps = 1; |
| } |
| } |
| |
| if (do_jobsteps && (job->track_steps || !job->show_full)) { |
| itr_step = list_iterator_create(job->steps); |
| while((step = list_next(itr_step))) { |
| if (step->status == JOB_RUNNING |
| && job->job_terminated_seen) { |
| step->status = JOB_FAILED; |
| } |
| if (params.opt_state_list) { |
| if(!selected_status[step->status]) |
| continue; |
| } |
| if(step->end == 0) |
| step->end = job->end; |
| print_fields(JOBSTEP, step); |
| } |
| list_iterator_destroy(itr_step); |
| } |
| } |
| list_iterator_destroy(itr); |
| } |
| |
| void do_stat() |
| { |
| ListIterator itr = NULL; |
| uint32_t jobid = 0; |
| uint32_t stepid = 0; |
| selected_step_t *selected_step = NULL; |
| |
| itr = list_iterator_create(selected_steps); |
| while((selected_step = list_next(itr))) { |
| jobid = atoi(selected_step->job); |
| if(selected_step->step) |
| stepid = atoi(selected_step->step); |
| else |
| stepid = 0; |
| sacct_stat(jobid, stepid); |
| } |
| list_iterator_destroy(itr); |
| } |
| void sacct_init() |
| { |
| int i=0; |
| jobs = list_create(destroy_job); |
| selected_parts = list_create(_destroy_parts); |
| selected_steps = list_create(_destroy_steps); |
| for(i=0; i<STATUS_COUNT; i++) |
| selected_status[i] = 0; |
| } |
| |
| void sacct_fini() |
| { |
| list_destroy(jobs); |
| list_destroy(selected_parts); |
| list_destroy(selected_steps); |
| } |