| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test of Slurm functionality |
| # Test sacct functionality and accuracy. |
| ############################################################################ |
| # Copyright (C) 2005 The Regents of the University of California. |
| # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| # Written by Morris Jette <jette1@llnl.gov> |
| # CODE-OCEC-09-009. All rights reserved. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| source ./globals |
| |
| # Paths used by the test. The batch script and the job's stdout/stderr go |
| # under $test_dir; the helper program and its data files are named after |
| # $test_name. |
| set file_in "$test_dir/input" |
| set file_out "$test_dir/output" |
| set file_err "$test_dir/error" |
| set file_prog "$test_name.prog" |
| set file_stem $test_name |
| set job_id 0 |
| set step_id 0 |
| set matches 0 |
| |
| # job parameters |
| set mem_size 204800; # 200 MiB |
| set file_size 10485760 |
| set sleep_time 32 |
| set ret_code 42 |
| set num_tasks 3 |
| set max_rss_tolerance 10240; # 10 MiB |
| set ave_rss_tolerance 20480; # 20 MiB |
| set max_time_error 10 |
| # Memory requested for the job: the program's allocation plus both RSS |
| # tolerances, converted from KiB to MiB. The expression is braced so that |
| # it is substituted and parsed exactly once. |
| set job_mem_limit [expr {($mem_size + $max_rss_tolerance + $ave_rss_tolerance) / 1024}] |
| |
| # Expected values in sacct |
| # With KillOnBadExit set, the step is expected to be reported as CANCELLED |
| # with exit code 0:15 instead of FAILED with the program's own exit code. |
| set expected_state "FAILED" |
| set expected_ret_code $ret_code |
| if {[get_config_param "KillOnBadExit"] == 1} { |
|     set expected_state "CANCELLED" |
|     set expected_ret_code "0:15" |
| } |
| |
| # |
| # Check requirements |
| # |
| set accounting_storage_enforce [get_config_param "AccountingStorageEnforce"] |
| if {[param_contains $accounting_storage_enforce "nosteps"] || [param_contains $accounting_storage_enforce "nojobs"]} { |
|     skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)" |
| } |
| # Query the gather plugin once and compare the cached value against both |
| # supported plugins. |
| set job_acct_gather_type [get_config_param "JobAcctGatherType"] |
| if {$job_acct_gather_type != "jobacct_gather/linux" && $job_acct_gather_type != "jobacct_gather/cgroup"} { |
|     skip "Job accounting information not gathered on this system" |
| } |
| if {[get_config_param AccountingStorageType] == "accounting_storage/none"} { |
|     skip "Job accounting information not stored on this system" |
| } |
| |
| # Skip when no node can satisfy the task count, memory and time request. |
| set nodes [get_nodes_by_request "-n$num_tasks --mem=$job_mem_limit -t2"] |
| if {![llength $nodes]} { |
|     skip "Unable to test with current node configuration" |
| } |
| |
| # Remove the files this test leaves outside of $test_dir: the helper program |
| # and the .read/.write files derived from $file_stem. |
| proc cleanup {} { |
|     global file_stem file_prog |
| |
|     file delete $file_prog "${file_stem}.read" "${file_stem}.write" |
| } |
| |
| # Query MaxRSS, MaxRSSTask and AveRSS of step $job_id.$step_id with the given |
| # accounting command and check them against the expected memory usage. |
| # |
| # Arguments: |
| #   prog - accounting command to run (this file passes $sstat and $sacct) |
| # |
| # Returns a list of dicts, one per check, each with the keys: |
| #   cond - boolean result of the check |
| #   desc - description of the check |
| #   diag - diagnostic text to show when the check fails |
| # If the command output cannot be parsed, only the output-format check is |
| # returned. |
| proc _get_mem {prog} { |
|     global number mem_size job_id step_id max_rss_tolerance ave_rss_tolerance num_tasks |
| |
|     set result [list] |
|     set max_rss -1 |
|     set mem_task -1 |
|     set ave_rss -1 |
|     set prog_base [file rootname [file tail $prog]] |
| |
|     set output [run_command_output -fail "$prog --noheader -p --job=$job_id.$step_id --format maxrss,maxrsstask,averss --noconvert"] |
| |
|     dict set subtest cond [regexp "($number)\\D($number)\\D($number)" $output - max_rss mem_task ave_rss] |
|     dict set subtest desc "$prog_base should provide the right output format" |
|     dict set subtest diag "$output" |
|     lappend result $subtest |
|     if {![dict get $subtest cond]} { |
|         return $result |
|     } |
| |
|     set max_rss [scale_to_ks $max_rss ""] |
|     # Since we will be multiplying ave_rss by num_tasks below and expecting it to be greater than |
|     # max_rss, we must add 1 to ave_rss to compensate for the small potential truncation loss that |
|     # can occur in the slurm code when calculating AveRSS (effectively int(TotRSS / tasks)). |
|     set ave_rss [scale_to_ks [expr {$ave_rss + 1}] ""] |
| |
|     dict set subtest cond [tolerance $mem_size $max_rss $max_rss_tolerance] |
|     dict set subtest desc "MaxRSS is within expected tolerance for $prog_base" |
|     dict set subtest diag "Observed MaxRSS ($max_rss) is not within tolerance $max_rss_tolerance of expected value ($mem_size)" |
|     lappend result $subtest |
| |
|     # We expect ave_rss to be greater than or equal to max_rss / num_tasks. |
|     # We are assuming that the tasks which do not allocate the memory will not |
|     # climb above a total of ave_rss_tolerance. |
|     dict set subtest cond [tolerance $max_rss [expr {$ave_rss * $num_tasks}] "+$ave_rss_tolerance"] |
|     dict set subtest desc "AveRSS is within expected tolerance for $prog_base" |
|     dict set subtest diag "Observed AveRSS ($ave_rss) is not within tolerance +[expr {$ave_rss_tolerance / $num_tasks}] of expected value ([expr {$max_rss / $num_tasks}])" |
|     lappend result $subtest |
| |
|     return $result |
| } |
| |
| # Check the disk write/read statistics reported for step $job_id.$step_id. |
| # |
| # prog is the accounting command to run. The return value is a list of dicts |
| # (keys: cond, desc, diag), one per check. If the command output cannot be |
| # parsed, only the output-format check is returned. |
| proc _get_file_size {prog} { |
|     global number float job_id step_id file_size |
| |
|     set tool [file rootname [file tail $prog]] |
|     set checks [list] |
| |
|     set raw [run_command_output -fail "$prog --noheader -p --job=$job_id.$step_id --format MaxDiskWrite,AveDiskWrite,MaxDiskRead,AveDiskRead,MaxDiskWriteTask,MaxDiskReadTask --noconvert"] |
| |
|     # Four "<value><zero or more of M, G, T>" fields followed by two task |
|     # numbers, each separated from the next by one non-digit character. |
|     set scaled "($float)(\[MGT\]*)" |
|     set task "($number)" |
|     set pattern [join [list $scaled $scaled $scaled $scaled $task $task] "\\D"] |
|     set parsed [regexp $pattern $raw - wr_max wr_max_unit wr_ave wr_ave_unit rd_max rd_max_unit rd_ave rd_ave_unit w_task r_task] |
| |
|     lappend checks [dict create cond $parsed desc "$tool should provide the right output format" diag "$raw"] |
|     if {!$parsed} { |
|         return $checks |
|     } |
| |
|     # Bring all four sizes to the same unit before comparing them. |
|     set wr_max [scale_to_megs $wr_max $wr_max_unit] |
|     set wr_ave [scale_to_megs $wr_ave $wr_ave_unit] |
|     set rd_max [scale_to_megs $rd_max $rd_max_unit] |
|     set rd_ave [scale_to_megs $rd_ave $rd_ave_unit] |
| |
|     lappend checks [dict create cond [expr {$w_task == 1}] desc "$tool should provide MaxDiskWriteTask equal to 1" diag "$w_task != 1"] |
| |
|     lappend checks [dict create cond [expr {$r_task == 2}] desc "$tool should provide MaxDiskReadTask equal to 2" diag "$r_task != 2"] |
| |
|     lappend checks [dict create cond [tolerance $wr_max $rd_max "0.3"] desc "$tool should provide MaxDiskWrite close to MaxDiskRead with 0.3 tolerance" diag "$wr_max is too different from $rd_max"] |
| |
|     lappend checks [dict create cond [tolerance $wr_ave $rd_ave "0.3"] desc "$tool should provide AveDiskWrite close to AveDiskRead with 0.3 tolerance" diag "$wr_ave is too different from $rd_ave"] |
| |
|     return $checks |
| } |
| |
| # Run "$test $prog" under wait_for (timeout 15) until a run yields no failed |
| # check, then report every check of the last run through subtest. |
| # |
| # test - name of a proc returning a list of dicts with cond/desc/diag keys |
| # prog - argument handed to that proc |
| proc wait_and_subtest {test prog} { |
|     set ok false |
|     wait_for -timeout 15 {$ok} { |
|         set outcomes [$test $prog] |
| |
|         # The run passes only if none of its checks failed. |
|         set ok true |
|         foreach entry $outcomes { |
|             if {![dict get $entry cond]} { |
|                 set ok false |
|                 break |
|             } |
|         } |
|     } |
|     foreach entry $outcomes { |
|         subtest [dict get $entry cond] [dict get $entry desc] [dict get $entry diag] |
|     } |
| } |
| |
| # |
| # Build the test program. |
| # Compilation is not optimized so that the memset is not skipped. |
| # |
| run_command -fail "$bin_cc -o $file_prog ${file_prog}.c" |
| run_command -fail "$bin_chmod 700 $file_prog" |
| |
| make_bash_script $file_in " |
| $srun ./$file_prog $ret_code $sleep_time $mem_size $file_size $file_stem |
| " |
| |
| # Create a file to read |
| # NOTE(review): "binary format n" emits a 32-bit value per iteration, so this |
| # writes 4 * file_size bytes; confirm whether the program needs more than |
| # file_size bytes. |
| set fd [open ${file_stem}.read "wb"] |
| for {set i 0} {$i < $file_size} {incr i} { |
|     puts -nonewline $fd [binary format n $i] |
| } |
| close $fd |
| |
| # |
| # Run a simple job |
| # Usage: test12.2.prog <exit_code> <sleep_secs> <mem_kb> |
| # <file_size> <file_stem> |
| # |
| # NOTE(review): the node pre-check earlier in this file requests --mem while |
| # the job is submitted with --mem-per-cpu; confirm this is intended. |
| # |
| set config_prob 0 |
| # Braced so the expression is substituted and parsed exactly once. |
| set timeout [expr {$max_job_delay + $sleep_time}] |
| set job_id [submit_job -fail "-n$num_tasks --mem-per-cpu=$job_mem_limit --output=$file_out --error=$file_err -t2 $file_in"] |
| |
| # |
| # Wait for job to run |
| # |
| wait_for_job -fail $job_id "RUNNING" |
| |
| # Check the live statistics of the running step with sstat. |
| wait_and_subtest _get_mem $sstat |
| wait_and_subtest _get_file_size $sstat |
| |
| # |
| # Wait for job to complete |
| # |
| wait_for_job -fail $job_id "DONE" |
| |
| # Repeat the same checks against the stored accounting records with sacct. |
| wait_and_subtest _get_mem $sacct |
| wait_and_subtest _get_file_size $sacct |
| |
| # |
| # Report basic sacct info |
| # |
| # Poll sacct until the step record no longer contains COMPLETING. |
| set output "COMPLETING" |
| wait_for -fail {![regexp "COMPLETING" $output]} { |
|     set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format jobid,jobname,state,exitcode --starttime=00:00"] |
| } |
| # The pattern escapes the dot between job and step id; the description shows |
| # the plain expected text without the regex escape. |
| subtest {[regexp "$job_id\\.$step_id.$file_prog.$expected_state.$expected_ret_code" $output]} "sacct should report $job_id.$step_id.$file_prog.$expected_state.$expected_ret_code" "$output" |
| |
| # |
| # Report the sacct accounting info: Elapsed |
| # |
| set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format elapsed --starttime=00:00"] |
| set value [convert_time_str $output "secs"] |
| subtest {[tolerance $sleep_time $value +$max_time_error]} "Elapsed time reported by sacct should be close to $sleep_time" "$value too different from $sleep_time" |
| |
| # |
| # Report the sacct accounting info: TotalCPU |
| # |
| set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format totalcpu --starttime=00:00"] |
| set value [convert_time_str $output "secs"] |
| subtest {$value <= $sleep_time} "TotalCPU reported by sacct should be less than $sleep_time secs" "$value > $sleep_time" |
| |
| # |
| # Report the sacct accounting info: MinCPU |
| # |
| set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format mincpu --starttime=00:00"] |
| set value [convert_time_str $output "secs"] |
| subtest {$value <= $sleep_time} "MinCPU reported by sacct should be less than $sleep_time secs" "$value > $sleep_time" |