#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test sacct functionality and accuracy.
############################################################################
# Copyright (C) 2005 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
#
# File names used by the test
#
set file_in   "$test_dir/input"
set file_out  "$test_dir/output"
set file_err  "$test_dir/error"
set file_prog "$test_name.prog"
set file_stem $test_name

set job_id    0
set step_id   0
set matches   0

# job parameters
set mem_size          204800; # 200 MiB
set file_size         10485760
set sleep_time        32
set ret_code          42
set num_tasks         3
set max_rss_tolerance 10240; # 10 MiB
set ave_rss_tolerance 20480; # 20 MiB
set max_time_error    10

# The memory values above are multiples of 1024 of the MiB figures in their
# comments, so dividing by 1024 yields the limit in those MiB units.
# The expression is braced so that it is parsed once and substituted by expr
# itself rather than by the Tcl parser first.
set job_mem_limit [expr {($mem_size + $max_rss_tolerance + $ave_rss_tolerance) / 1024}]

# Expected values in sacct
set expected_state    "FAILED"
set expected_ret_code $ret_code
if {[get_config_param "KillOnBadExit"] == 1} {
	set expected_state    "CANCELLED"
	set expected_ret_code "0:15"
}

#
# Check requirements
#
set accounting_storage_enforce [get_config_param "AccountingStorageEnforce"]
if {[param_contains $accounting_storage_enforce "nosteps"] || [param_contains $accounting_storage_enforce "nojobs"]} {
	skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)"
}

# Look the gather plugin up once and compare it as a string
set job_acct_gather_type [get_config_param JobAcctGatherType]
if {$job_acct_gather_type ni {jobacct_gather/linux jobacct_gather/cgroup}} {
	skip "Job accounting information not gathered on this system"
}

if {[get_config_param AccountingStorageType] eq "accounting_storage/none"} {
	skip "Job accounting information not stored on this system"
}

# Make sure some node can satisfy the task count, memory and time request
set nodes [get_nodes_by_request "-n$num_tasks --mem=$job_mem_limit -t2"]
if {![llength $nodes]} {
	skip "Unable to test with current node configuration"
}
# Remove the files left behind by this test: the compiled helper program and
# the "<stem>.read" / "<stem>.write" data files.
proc cleanup {} {
	global file_prog file_stem

	set leftovers [list $file_prog $file_stem.read $file_stem.write]
	file delete {*}$leftovers
}
# Check the memory accounting (MaxRSS, MaxRSSTask, AveRSS) of the job step.
#
# Runs $prog (the callers in this file pass $sstat and $sacct) against step
# $job_id.$step_id and compares the reported values with the memory size the
# test program was asked to use.
#
# Arguments:
#	prog	Accounting command to execute
#
# Returns:
#	A list of dicts with the keys cond, desc and diag, one per check.
#	If the command output cannot be parsed, only the output format check
#	is returned.
proc _get_mem {prog} {
	global number mem_size job_id step_id max_rss_tolerance ave_rss_tolerance num_tasks

	set result [list]
	set max_rss -1
	set mem_task -1
	set ave_rss -1
	set prog_base [file rootname [file tail $prog]]

	set output [run_command_output -fail "$prog --noheader -p --job=$job_id.$step_id --format maxrss,maxrsstask,averss --noconvert"]
	dict set subtest cond [regexp "($number)\\D($number)\\D($number)" $output - max_rss mem_task ave_rss]
	dict set subtest desc "$prog_base should provide the right output format"
	dict set subtest diag "$output"
	lappend result $subtest

	# Nothing else can be evaluated without parsed values
	if {![dict get $subtest cond]} {
		return $result
	}

	set max_rss [scale_to_ks $max_rss ""]
	# Since we will be multiplying ave_rss by num_tasks below and expecting it to be greater than
	# max_rss, we must add 1 to ave_rss to compensate for the small potential truncation loss that
	# can occur in the slurm code when calculating AveRSS (effectively int(TotRSS / tasks)).
	set ave_rss [scale_to_ks [expr {$ave_rss + 1}] ""]

	dict set subtest cond [tolerance $mem_size $max_rss $max_rss_tolerance]
	dict set subtest desc "MaxRSS is within expected tolerance for $prog_base"
	dict set subtest diag "Observed MaxRSS ($max_rss) is not within tolerance $max_rss_tolerance of expected value ($mem_size)"
	lappend result $subtest

	# We expect ave_rss to be greater than or equal to max_rss / num_tasks.
	# We are assuming that the non-memory tasks will not climb above a total of
	# ave_rss_tolerance (20 MiB as configured at the top of this file).
	dict set subtest cond [tolerance $max_rss [expr {$ave_rss * $num_tasks}] "+$ave_rss_tolerance"]
	dict set subtest desc "AveRSS is within expected tolerance for $prog_base"
	dict set subtest diag "Observed AveRSS ($ave_rss) is not within tolerance +[expr {$ave_rss_tolerance / $num_tasks}] of expected value ([expr {$max_rss / $num_tasks}])"
	lappend result $subtest

	return $result
}
# Check the disk I/O accounting (written and read sizes) of the job step.
#
# Runs $prog (the callers in this file pass $sstat and $sacct) against step
# $job_id.$step_id and inspects MaxDiskWrite, AveDiskWrite, MaxDiskRead,
# AveDiskRead, MaxDiskWriteTask and MaxDiskReadTask.
#
# Arguments:
#	prog	Accounting command to execute
#
# Returns:
#	A list of dicts with the keys cond, desc and diag, one per check.
#	If the command output cannot be parsed, only the output format check
#	is returned.
proc _get_file_size {prog} {
	global number float job_id step_id file_size

	set checks [list]
	set prog_base [file rootname [file tail $prog]]

	# Four "<value><optional unit>" columns followed by two task ids, each
	# separated by a single non-digit character
	set sized "($float)(\[MGT\]*)"
	set pattern "$sized\\D$sized\\D$sized\\D$sized\\D($number)\\D($number)"

	set output [run_command_output -fail "$prog --noheader -p --job=$job_id.$step_id --format MaxDiskWrite,AveDiskWrite,MaxDiskRead,AveDiskRead,MaxDiskWriteTask,MaxDiskReadTask --noconvert"]
	set parsed [regexp $pattern $output - \
		max_wr max_wr_unit ave_wr ave_wr_unit \
		max_rd max_rd_unit ave_rd ave_rd_unit \
		w_task r_task]
	lappend checks [dict create \
		cond $parsed \
		desc "$prog_base should provide the right output format" \
		diag "$output"]
	if {!$parsed} {
		return $checks
	}

	set max_wr [scale_to_megs $max_wr $max_wr_unit]
	set ave_wr [scale_to_megs $ave_wr $ave_wr_unit]
	set max_rd [scale_to_megs $max_rd $max_rd_unit]
	set ave_rd [scale_to_megs $ave_rd $ave_rd_unit]

	# Task 1 is expected to be the top writer and task 2 the top reader
	foreach {field observed wanted} [list MaxDiskWriteTask $w_task 1 MaxDiskReadTask $r_task 2] {
		lappend checks [dict create \
			cond [expr {$observed == $wanted}] \
			desc "$prog_base should provide $field equal to $wanted" \
			diag "$observed != $wanted"]
	}

	# Written and read amounts must agree, for both the max and the average
	foreach {kind wr rd} [list Max $max_wr $max_rd Ave $ave_wr $ave_rd] {
		lappend checks [dict create \
			cond [tolerance $wr $rd "0.3"] \
			desc "$prog_base should provide ${kind}DiskWrite close to ${kind}DiskRead with 0.3 tolerance" \
			diag "$wr is too different from $rd"]
	}

	return $checks
}
# Repeatedly run a check procedure until all of its checks pass or the
# wait_for timeout (15) is reached, then report each check of the last run
# through subtest.
#
# Arguments:
#	test	Name of a procedure that takes $prog and returns a list of
#		dicts with the keys cond, desc and diag
#	prog	Argument handed to $test
proc wait_and_subtest {test prog} {
	# Defined up front so that the reporting loop below never reads an
	# unset variable, whatever wait_for does with its body.
	set result [list]
	set all_passed false

	wait_for -timeout 15 {$all_passed} {
		set result [$test $prog]
		set all_passed true
		foreach subtest $result {
			if {![dict get $subtest cond]} {
				set all_passed false
				# One failure is enough to require another attempt
				break
			}
		}
	}

	foreach subtest $result {
		subtest [dict get $subtest cond] [dict get $subtest desc] [dict get $subtest diag]
	}
}
#
# Rebuild the test program from its C source.
# The compiler is invoked without optimization so that the memset call in
# the program is not skipped.
#
run_command -fail "$bin_cc -o $file_prog ${file_prog}.c"
run_command -fail "$bin_chmod 700 $file_prog"

# Batch script that launches the test program through srun
make_bash_script $file_in "
$srun ./$file_prog $ret_code $sleep_time $mem_size $file_size $file_stem
"

# Create a file to read
# NOTE(review): "binary format n" emits one 32-bit integer per iteration, so
# the resulting file holds 4 * file_size bytes - confirm this is intended.
set read_fd [open "${file_stem}.read" "wb"]
set word 0
while {$word < $file_size} {
	puts -nonewline $read_fd [binary format n $word]
	incr word
}
close $read_fd
#
# Run a simple job
# Usage: test12.2.prog <exit_code> <sleep_secs> <mem_kb>
# <file_size> <file_stem>
#
set config_prob 0
# Braced so that expr parses the expression once and performs the variable
# substitution itself.
set timeout [expr {$max_job_delay + $sleep_time}]
set job_id [submit_job -fail "-n$num_tasks --mem-per-cpu=$job_mem_limit --output=$file_out --error=$file_err -t2 $file_in"]

#
# Wait for job to run, then check the live accounting data with sstat
#
wait_for_job -fail $job_id "RUNNING"
wait_and_subtest _get_mem $sstat
wait_and_subtest _get_file_size $sstat

#
# Wait for job to complete, then check the stored accounting data with sacct
#
wait_for_job -fail $job_id "DONE"
wait_and_subtest _get_mem $sacct
wait_and_subtest _get_file_size $sacct
#
# Report basic sacct info
# Keep polling for as long as the step is still reported as COMPLETING.
#
set output "COMPLETING"
wait_for -fail {![regexp "COMPLETING" $output]} {
	set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format jobid,jobname,state,exitcode --starttime=00:00"]
}
subtest {[regexp "$job_id\\.$step_id.$file_prog.$expected_state.$expected_ret_code" $output]} "sacct should report $job_id\\.$step_id.$file_prog.$expected_state.$expected_ret_code" "$output"

#
# Report the sacct accounting info: Elapsed
# The elapsed time may exceed the sleep time by at most max_time_error.
#
set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format elapsed --starttime=00:00"]
set value [convert_time_str $output "secs"]
subtest {[tolerance $sleep_time $value +$max_time_error]} "Elapsed time reported by sacct should be close to $sleep_time" "$value too different from $sleep_time"

#
# Report the sacct accounting info: TotalCPU, then MinCPU
# Neither CPU time may exceed the sleep time.
#
foreach {field label} {totalcpu TotalCPU mincpu MinCPU} {
	set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format $field --starttime=00:00"]
	set value [convert_time_str $output "secs"]
	subtest {$value <= $sleep_time} "$label reported by sacct should be less than $sleep_time secs" "$value > $sleep_time"
}