| #!/usr/bin/perl |
| ############################################################################### |
| # |
| # sjstat - List attributes of jobs under SLURM control |
| # |
| ############################################################################### |
| # Copyright (C) 2007 The Regents of the University of California. |
| # Copyright (C) 2008-2009 Lawrence Livermore National Security. |
| # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| # Written by Phil Eckert <eckert21@llnl.gov>. |
| # CODE-OCEC-09-009. All rights reserved. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # SLURM is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # In addition, as a special exception, the copyright holders give permission |
| # to link the code of portions of this program with the OpenSSL library under |
| # certain conditions as described in each individual source file, and |
| # distribute linked combinations including the two. You must obey the GNU |
| # General Public License in all respects for all of the code used other than |
| # OpenSSL. If you modify file(s) with this exception, you may extend this |
| # exception to your version of the file(s), but you are not obligated to do |
| # so. If you do not wish to do so, delete this exception statement from your |
| # version. If you delete this exception statement from all source files in |
| # the program, then also delete it here. |
| # |
| # SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with SLURM; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| # |
| # Based off code with permission copyright 2006, 2007 Cluster Resources, Inc. |
| ############################################################################### |
| |
#
# Man page support.
#
# Runs at compile time, before any option parsing, so that "--roff" can
# dump this script's POD in *roff format and exit straight away.
#
BEGIN {
    foreach my $arg (@ARGV) {
        # A bare "--" ends the option list; stop looking.
        last if ($arg eq "--");
        next unless ($arg eq "--roff");

        # Load Pod::Man only when it is actually needed. A "use"
        # statement here would be executed at compile time on every
        # run, whether or not --roff was given.
        require Pod::Man;
        my $parser = Pod::Man->new(section => 1);
        $parser->parse_from_file($0, \*STDOUT);
        exit 0;
    }
}
| |
| use strict; |
| use Getopt::Long 2.24 qw(:config no_ignore_case); |
| use autouse 'Pod::Usage' => qw(pod2usage); |
| |
#
# Global Variables.
#
# Option flags, set by get_options():
#   $help     print the usage text and exit
#   $man      show the man page and exit
#   $pool     show the scheduling pool data only (-c)
#   $running  leave pending jobs out of the job table (-r)
#   $verbose  print the additional columns (-v)
#
# %MaxNodes and %MaxTime hold the per-partition limits, keyed by
# partition name, and are filled in by do_scontrol_part().
#
my ($help, $man, $pool, $running, $verbose);
my (%MaxNodes, %MaxTime);

#
# Get user options first, so that -h and --man still work (and bad
# options are still reported) when SLURM itself is not responding.
#
get_options();

#
# Check SLURM status.
#
isslurmup();

#
# Get partition information from scontrol, used
# currently in conjunction with the sinfo data.
#
do_scontrol_part();

#
# Get and display the sinfo data.
#
do_sinfo();

#
# If the -c option was entered, stop here.
#
exit if ($pool);

#
# Get and display the squeue data.
#
do_squeue();

exit;
| |
| |
#
# Get the SLURM partition and node information from sinfo and print
# the "Scheduling pool data" table.
#
# Reads the file-level $verbose flag. In verbose mode the node and time
# limits are looked up in %MaxNodes and %MaxTime, keyed by the partition
# name with the "*" marker removed.
#
sub do_sinfo
{
    #
    # Fields requested from sinfo: partition, memory, cpus, the node
    # counts as four "/"-separated numbers, and the features.
    #
    my $options = "\"%9P %7m %.4c %.22F %f\"";

    my @pools;
    foreach my $tmp (`sinfo -e -o $options`) {
        next if ($tmp =~ /^PARTITION/);
        chomp $tmp;
        my @line = split(' ', $tmp);

        #
        # Skip blank or truncated lines instead of printing a row
        # made of empty values.
        #
        next if (@line < 4);

        #
        # Split the status into its components. The first count is
        # not shown in the table, so it is not kept.
        #
        my (undef, $idle, $out, $total) = split(/\//, $line[3]);
        next unless (defined $total);

        #
        # The features column may be missing, or hold "(null)".
        #
        my $feat = defined $line[4] ? $line[4] : "";
        $feat .= " ";
        $feat =~ s/\(null\)//g;

        push(@pools, {
            part   => $line[0],
            mem    => $line[1],
            cpu    => $line[2],
            idle   => $idle,
            total  => $total,
            usable => $total - $out,
            feat   => $feat,
        });
    }

    #
    # The heading and the rows share their column widths, so the
    # titles always sit above the data. The memory column is
    # "%7d" plus "Mb", i.e. nine characters wide.
    #
    print("\nScheduling pool data:\n");
    if ($verbose) {
        my $rule = "----------------------------------------------------------------------------------\n";
        my $head = "%-9s %9s %5s %6s %7s %6s %6s %10s %-s\n";
        print($rule);
        printf($head, "", "", "", "Total", "Usable", "Free",
            "Node", "Time", "Other");
        printf($head, "Pool", "Memory", "Cpus", "Nodes", "Nodes", "Nodes",
            "Limit", "Limit", "traits");
        print($rule);
    } else {
        my $rule = "-------------------------------------------------------------\n";
        print($rule);
        printf("%-9s %9s %5s %6s %6s %6s %-s\n",
            "Pool", "Memory", "Cpus", "Total", "Usable", "Free",
            "Other Traits");
        print($rule);
    }

    foreach my $entry (@pools) {
        if ($verbose) {
            #
            # The default partition is shown with a trailing "*",
            # which is not part of the key used for the limits.
            #
            my $p = $entry->{part};
            $p =~ s/\*//;
            my $max_nodes = defined $MaxNodes{$p} ? $MaxNodes{$p} : "";
            my $max_time  = defined $MaxTime{$p}  ? $MaxTime{$p}  : "";

            printf("%-9s %7dMb %5s %6s %7s %6s %6s %10s %-s\n",
                $entry->{part}, $entry->{mem}, $entry->{cpu},
                $entry->{total}, $entry->{usable},
                $entry->{idle}, $max_nodes,
                $max_time, $entry->{feat});
        } else {
            printf("%-9s %7dMb %5s %6s %6s %6s %-s\n",
                $entry->{part}, $entry->{mem}, $entry->{cpu},
                $entry->{total}, $entry->{usable},
                $entry->{idle}, $entry->{feat});
        }
    }
    print("\n");

    return;
}
| |
| |
#
# Get the SLURM job queue from squeue and print the "Running job data"
# table.
#
# Reads the file-level $running flag (leave out pending jobs) and
# $verbose flag (add the time limit and start time columns). Time
# limits are reformatted with convert_time().
#
sub do_squeue
{
    #
    # Base options on whether this cluster is node or process
    # scheduled. Both select/cons_res and select/cons_tres allocate
    # individual processors, so either one selects the "Procs" layout.
    # system() returns non-zero when grep finds no match.
    #
    my ($type, $count_field);
    my $rval = system("scontrol show config | grep -E 'cons_t?res' >> /dev/null");
    if ($rval) {
        $type = "Nodes";
        $count_field = "D";
    } else {
        $type = "Procs";
        $count_field = "C";
    }
    my $options = "\"%8i %8u %.6${count_field} %2t %S %.12l %.9P %.11M %1000R\"";

    #
    # Get the job information.
    #
    my @jobs;
    foreach my $tmp (`squeue -o $options`) {
        next if ($tmp =~ /^JOBID/);
        next if ($running && $tmp =~ / PD /);
        chomp $tmp;
        my @line = split(' ', $tmp);

        #
        # Skip blank or truncated lines. Only the last column
        # (node list or reason) is allowed to be missing.
        #
        next if (@line < 8);

        #
        # Drop the first five characters of the start time (the
        # "YYYY-" prefix in the example output); a pending job has
        # no start time to show.
        #
        my $begin = $line[4];
        $begin =~ s/^.....//;
        $begin = "N/A" if ($line[3] =~ /PD/);

        my $limit = ($line[5] eq "UNLIMITED")
            ? $line[5]
            : convert_time($line[5]);

        #
        # Only keep the master node from the nodes list.
        #
        my $master = defined $line[8] ? $line[8] : "";
        $master =~ s/\[([0-9.]*).*/$1/;

        push(@jobs, {
            job    => $line[0],
            user   => $line[1],
            nodes  => $line[2],
            status => $line[3],
            begin  => $begin,
            limit  => $limit,
            pool   => $line[6],
            used   => $line[7],
            master => $master,
        });
    }

    #
    # The heading and the rows share their column widths, so the
    # titles always sit above the data.
    #
    print("Running job data:\n");
    if ($verbose) {
        my $rule = "---------------------------------------------------------------------------------------------------\n";
        my $head = "%-8s %-8s %6s %-9s %-7s %10s %11s %14s %s\n";
        print($rule);
        printf($head, "", "", "", "", "", "Time", "Time", "Time", "");
        printf($head, "JobID", "User", $type, "Pool", "Status",
            "Used", "Limit", "Started", "Master/Other");
        print($rule);
    } else {
        my $rule = "----------------------------------------------------------------------\n";
        print($rule);
        printf("%-8s %-8s %6s %-9s %-7s %10s %s\n",
            "JobID", "User", $type, "Pool", "Status",
            "Used", "Master/Other");
        print($rule);
    }

    foreach my $entry (@jobs) {
        if ($verbose) {
            printf("%-8s %-8s %6s %-9s %-7s %10s %11s %14s %.12s\n",
                $entry->{job}, $entry->{user}, $entry->{nodes},
                $entry->{pool}, $entry->{status},
                $entry->{used}, $entry->{limit}, $entry->{begin},
                $entry->{master});
        } else {
            printf("%-8s %-8s %6s %-9s %-7s %10s %.12s\n",
                $entry->{job}, $entry->{user}, $entry->{nodes},
                $entry->{pool}, $entry->{status},
                $entry->{used}, $entry->{master});
        }
    }
    print("\n");

    return;
}
| |
#
# Get the SLURM partition limits.
#
# Reads the output of "scontrol show part" and records, for every
# partition, its MaxTime and MaxNodes values in the file-level %MaxTime
# and %MaxNodes hashes (keyed by partition name). "UNLIMITED" is
# shortened to "UNLIM" so that it fits the table columns.
#
sub do_scontrol_part
{
    my $part;
    foreach my $tmp (`scontrol show part`) {
        chomp $tmp;

        #
        # A PartitionName entry starts a new partition; every
        # following line belongs to it until the next one. The
        # word boundary keeps a longer key that merely ends in the
        # same text from matching.
        #
        if ($tmp =~ m/\bPartitionName=(\S+)/) {
            $part = $1;
        }

        #
        # Ignore anything that comes before the first partition.
        #
        next unless (defined $part);

        #
        # No trailing whitespace is required after the value, so a
        # key that is the last one on its line is still picked up.
        #
        if ($tmp =~ m/\bMaxTime=(\S+)/) {
            ($MaxTime{$part} = $1) =~ s/UNLIMITED/UNLIM/;
        }
        if ($tmp =~ m/\bMaxNodes=(\S+)/) {
            ($MaxNodes{$part} = $1) =~ s/UNLIMITED/UNLIM/;
        }
    }

    return;
}
| |
| |
#
# Show the man page.
#
# Refuses to run for root. For everybody else the effective uid is set
# to the real uid and the environment and $0 are cleaned up before
# pod2usage() is asked for the full manual.
#
sub show_man
{
    my $real_uid = $<;

    #
    # perldoc cannot be invoked as root.
    #
    if ($real_uid == 0) {
        my $id;
        foreach my $account ("nobody", "nouser") {
            $id = eval { getpwnam($account) };
            last if (defined $id);
        }
        $id = -2 unless (defined $id);
        $< = $id;
        printf("\n You can not do this as root!\n\n");
        exit 1;
    }

    #
    # Disengage setuid.
    #
    $> = $real_uid;

    #
    # Untaint PATH and drop the variables a shell would act upon.
    #
    $ENV{PATH} = "/bin:/usr/bin";
    foreach my $name ('IFS', 'CDPATH', 'ENV', 'BASH_ENV') {
        delete $ENV{$name};
    }

    #
    # Untaint $0.
    #
    my ($script) = ($0 =~ /^([-\/\w\.]+)$/);
    die "Illegal characters were found in \$0 ($0)\n"
        unless (defined $script);
    $0 = $script;

    pod2usage(-exitstatus => 0, -verbose => 2);

    return;
}
| |
| |
#
# Convert a time value to a better format.
#
# A value that splits on "-" and ":" into exactly four fields is taken
# as days-hours:minutes:seconds and returned as hours:minutes:seconds,
# with the days folded into the hours. Anything else is returned as it
# came in, right justified in a field of at least eight characters.
#
sub convert_time
{
    my ($val) = @_;

    my @parts = split(/-|:/, $val);
    return sprintf("%8s", $val) unless (scalar(@parts) == 4);

    my ($days, $hours, $mins, $secs) = @parts;
    my $total_hours = ($days * 24) + $hours;

    return join(':', $total_hours, $mins, $secs);
}
| |
| |
#
# Get options.
#
# Parses the command line into the file-level option flags. An option
# that cannot be parsed ends the program through usage(1); --man and
# the help options are acted upon right away.
#
sub get_options
{
    my @spec = (
        'help|h|?' => \$help,
        'man'      => \$man,
        'v'        => \$verbose,
        'r'        => \$running,
        'c'        => \$pool,
    );

    unless (GetOptions(@spec)) {
        usage(1);
    }

    if ($man) {
        show_man();
    }
    if ($help) {
        usage(0);
    }

    return;
}
| |
| |
#
# Usage.
#
# Prints the usage instructions and exits with the status passed in.
#
# The text goes to STDOUT when the status is 0 (help was asked for) and
# to STDERR otherwise (the command line was wrong), so the synopsis and
# the option list always end up on the same stream.
#
sub usage
{
    my $eval = shift(@_);

    my $out = $eval ? \*STDERR : \*STDOUT;

    print {$out} "\nUsage: sjstat [-h] [-c] [-man] [-r] [-v]\n";

    print {$out}
        "\n",
        "-h shows usage.\n",
        "-c shows computing resources info only.\n",
        "-man shows man page.\n",
        "-r show only running jobs.\n",
        "-v is for the verbose mode.\n",
        "\n\n",
        "Output is very similar to that of squeue.\n",
        "\n\n";

    exit($eval);
}
| |
| |
#
# Determine if SLURM is available.
#
# Runs "scontrol show part" with its output (standard error included)
# captured and thrown away. When the command leaves a non-zero status
# in $?, a notice is printed and the program exits with status 1.
#
sub isslurmup
{
    my $discarded = qx{scontrol show part 2>&1};

    return unless ($?);

    printf("\n SLURM is not communicating.\n\n");
    exit(1);
}
| |
| |
| __END__ |
| |
| =head1 NAME |
| |
| B<sjstat> - List attributes of jobs under SLURM control |
| |
| =head1 SYNOPSIS |
| |
| B<sjstat> [B<-h>] [B<-c>] [B<-man>] [B<-r>] [B<-v>] |
| |
| =head1 DESCRIPTION |
| |
| The B<sjstat> command is used to display statistics of jobs under control of SLURM. |
| The output is designed to give information on the resource usage and availability, |
| as well as information about jobs that are currently active on the machine. This output |
| is built using the SLURM utilities sinfo, squeue and scontrol; the man pages for these |
| utilities will provide more information and greater depth of understanding. |
| |
| =head1 OPTIONS |
| |
| =over 4 |
| |
| =item B<-h> |
| |
| Display a brief help message |
| |
| =item B<-c> |
| |
| Display the computing resource information only. |
| |
| =item B<-man> |
| |
| Show the man page. |
| |
| =item B<-r> |
| |
| Display only the running jobs. |
| |
| =item B<-v> |
| |
| Display more verbose information. |
| |
| =back |
| |
| =head1 EXAMPLE |
| |
| The following is a basic request for status. |
| |
| > sjstat |
| |
| Scheduling pool data: |
| ------------------------------------------------------------ |
| Pool Memory Cpus Total Usable Free Other Traits |
| ------------------------------------------------------------ |
| pdebug 15000Mb 8 32 32 24 (null) |
| pbatch* 15000Mb 8 1072 1070 174 (null) |
| |
| |
| Running job data: |
| ------------------------------------------------------------------- |
| JobID User Nodes Pool Status Used Master/Other |
| ------------------------------------------------------------------- |
| 395 mary 1000 pbatch PD 0:00 (JobHeld) |
| 396 mary 1000 pbatch PD 0:00 (JobHeld) |
| 375 sam 1000 pbatch CG 0:00 (JobHeld) |
| 388 fred 32 pbatch R 25:27 atlas89 |
| 361 harry 512 pbatch R 1:01:12 atlas618 |
| 1077742 sally 8 pdebug R 20:16 atlas18 |
| |
| |
| The Scheduling data contains information pertaining to the: |
| |
| Pool a set of nodes |
| Memory the amount of memory on each node |
| Cpus the number of cpus on each node |
| Total the total number of nodes in the pool |
| Usable total usable nodes in the pool |
| Free total nodes that are currently free |
| |
| The Running job data contains information pertaining to the: |
| |
| JobID the SLURM job id |
| User owner of the job |
| Nodes nodes required, or in use by the job |
| (Note: On cpu scheduled machines, this field |
| will be labeled "Procs" and show the number of processors |
| the job is using.) |
| Pool the Pool required or in use by the job |
| Status current status of the job |
| Used Wallclock time used by the job |
| Master/Other Either the Master (head) node used by the job, or may |
| indicate further status of a pending, or completing job. |
| |
| The common status values are: |
| |
| R The job is running |
| PD The job is Pending |
| CG The job is Completing |
| |
| These are states reported by SLURM and more elaborate documentation |
| can be found in the squeue/sinfo man pages. |
| |
| |
| An example of the -v option. |
| |
| Scheduling pool data: |
| ----------------------------------------------------------------------------- |
| Total Usable Free Node Time Other |
| Pool Memory Cpus Nodes Nodes Nodes Limit Limit Traits |
| ----------------------------------------------------------------------------- |
| pdebug 15000Mb 8 32 32 24 16 30 (null) |
| pbatch* 15000Mb 8 1072 1070 174 UNLIM UNLIM (null) |
| |
| Running job data: |
| --------------------------------------------------------------------------------------------------- |
| Time Time Time |
| JobID User Nodes Pool Status Used Limit Started Master/Other |
| --------------------------------------------------------------------------------------------------- |
| 38562 tom 4 pbatch PD 0:00 1:00:00 01-14T18:11:22 (JobHeld) |
| |
| The added fields to the "Scheduling pool data" are: |
| |
| Node Limit SLURM imposed node limit. |
| Time Limit SLURM imposed time limit, value in minutes. |
| |
| The added fields to the "Running job data" are: |
| |
| Limit Time limit of job. |
| Started Start time of job. |
| |
| =head1 REPORTING BUGS |
| |
| Report bugs to <eckert2@llnl.gov> |
| |
| =cut |