| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test for accounting records of specific job names with their ID |
| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| |
| |
| # |
| # Supplemental function to test21.21 that tests a job with |
| # resources within the allowed limit in the association |
| # |
proc inc21_21_good { test_type limit } {
	# Submit one srun job whose request sits at the association limit and
	# check that it launches.
	#
	# test_type - limit being exercised; "maxcpus"/"maxcpumins" trigger the
	#             exclusive-allocation skip check, and "maxnode" selects
	#             --exclusive instead of pinning the job to $test_node
	# limit     - list whose elements 0 and 1 are concatenated into a single
	#             srun argument (presumably option prefix and value --
	#             confirm against the caller)
	#
	# Sets the global is_skip to 1 and returns early when the configuration
	# cannot run the test.
	global number bin_id ta srun test_node selectparam nthreads is_skip re_word_str

	set job_id 0
	set add ""

	# Wait for old jobs to clean up
	sleep 2

	log_info "====== Test $test_type ======"

	if {($test_type eq "maxcpus" || $test_type eq "maxcpumins") && ([default_part_exclusive] != 0 || [check_config_select "linear"])} {
		log_warn "Unable to perform test with exclusive node allocations"
		set is_skip 1
		return
	}
	set select_type_param [get_select_type_params]
	if {[string first "CR_SOCKET" $select_type_param] != -1} {
		log_warn "This test can't be run SelectTypeParameters=CR_SOCKET"
		set is_skip 1
		return
	}
	if {$test_type eq "maxnode"} {
		set add "--exclusive"
	} else {
		set add "-w$test_node"
	}

	set matches 0
	spawn $srun -v -t1 $add [lindex $limit 0][lindex $limit 1] \
	    --account=$ta $bin_id
	expect {
		-re "launching StepId=($number)\\.$re_word_str" {
			# Remember the job id from srun's verbose launch line
			set job_id $expect_out(1,string)
			incr matches
			exp_continue
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}

	# Only wait when a job was actually launched; job_id is still 0 when
	# no launch line was seen, and there is nothing to wait for then.
	if {$job_id != 0} {
		wait_for_job -fail $job_id "DONE"
	}

	subtest -fail { $matches == 1 } "Job launches with correct limit"
}
| |
| # |
| # Supplemental function to test21.21 that tests a job with |
| # resources larger than the allowed limit in the association |
| # |
proc inc21_21_bad { test_type limit } {
	# Submit one srun job that asks for one unit more than the association
	# limit and fail the test if that job is launched.
	#
	# test_type - limit being exercised; "maxnode" selects --exclusive,
	#             anything else pins the job to $test_node
	# limit     - list: element 0 is prepended to the over-limit value to
	#             form a single srun argument, element 1 is the limit
	global number bin_id ta srun test_node nthreads selectparam re_word_str

	# One past the configured limit
	set over_lim [expr {[lindex $limit 1] + 1}]
	set job_id 0

	log_info "====== Test $test_type ======"

	# Default to pinning the node; node-count tests need exclusivity instead
	set add "-w$test_node"
	if {$test_type eq "maxnode"} {
		set add "--exclusive"
	}

	# -I makes srun give up immediately instead of queueing the job
	spawn $srun -v $add -t1 [lindex $limit 0]$over_lim --account=$ta \
	    -I $bin_id
	expect {
		-re "Job violates accounting/QOS policy" {
			log_info "This error is expected, not a problem"
			exp_continue
		}
		-re "launching StepId=($number)\\.$re_word_str" {
			set job_id $expect_out(1,string)
			fail "Job ($job_id) should not have run"
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}

	# A non-zero id means a job slipped through; let it finish
	if {$job_id != 0} {
		wait_for_job -fail $job_id "DONE"
	}
}
| |
proc inc21_21_grp_test { test_type limit } {
	# Fill an association's group limit with running batch jobs, submit one
	# more, and check that exactly that extra job is pending.
	#
	# test_type - limit being exercised ("grpcpus", "grpcpumins" and
	#             "grpcpurunmins" are treated as CPU tests; any other
	#             value makes the jobs exclusive)
	# limit     - list: element 0 is the sbatch option prefix (a "1" is
	#             appended per job), element 1 is the limit value
	#
	# Sets the global is_skip to 1 and returns early when the configuration
	# cannot run the test.
	global number bin_id ta srun sbatch test_node selectparam nthreads is_skip
	global file_in squeue scancel bin_bash bin_chmod job_list

	log_info "===== Test $test_type ====="

	if {$test_type eq "grpcpumins"} {
		if {![param_contains [get_config_param "AccountingStorageEnforce"] "safe"]} {
			log_warn "This test can't be run without AccountingStorageEnforce having \"safe\" in it"
			set is_skip 1
			return
		}
	}

	if {[default_part_exclusive] != 0 || [check_config_select "linear"]} {
		log_warn "This test can't be run Exclusive node allocations"
		set is_skip 1
		return
	}

	if {[string first "CR_SOCKET" [get_select_type_params]] != -1} {
		log_warn "This test can't be run SelectTypeParameters=CR_SOCKET"
		set is_skip 1
		return
	}

	# Number of jobs that fit under the limit. CPU tests divide by the
	# thread count when selectparam is set; other tests run exclusive.
	set val [lindex $limit 1]
	set exclusive ""
	if {$test_type in {grpcpus grpcpumins grpcpurunmins}} {
		if {$selectparam} {
			set val [expr {$val / $nthreads}]
		}
	} else {
		set exclusive "#SBATCH --exclusive"
	}

	make_bash_script $file_in "
$exclusive
sleep 10"

	#
	# Submit the jobs that fit under the association limit first and wait
	# for them to start before submitting the over-limit job. Submitting
	# everything at once can let a later job start before an earlier one,
	# for example while an epilog is still running on the assigned nodes.
	#
	set under_limit_jobs [list]
	for {set n 0} {$n < $val} {incr n} {
		set jid [submit_job -fail "-t1 [lindex $limit 0]1 --account=$ta --output=/dev/null --error=/dev/null $file_in"]
		lappend under_limit_jobs $jid
		lappend job_list $jid
	}

	# Every under-limit job must reach RUNNING
	foreach jid $under_limit_jobs {
		wait_for -fail -timeout 30 -pollinterval .2 {$state eq "RUNNING"} {
			set state [get_job_param $jid "JobState"]
		}
	}

	#
	# One more job, which should pend because it exceeds the association
	# limit. It gets a longer time limit so that it does not start early
	# once _decay_thread() has decayed the usage of the running jobs.
	#
	set extra_job [submit_job -fail "-t$val [lindex $limit 0]1 --account=$ta --output=/dev/null --error=/dev/null $file_in"]
	lappend job_list $extra_job

	# Count pending and running jobs of the test account
	set pending 0
	set running 0
	spawn $squeue -A $ta -h -o "\%t \%r"
	expect {
		-re "PD ." {
			incr pending
			exp_continue
		}
		-re "R ." {
			incr running
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	subtest -fail { $pending == 1 && $running == $val } "$test_type limit" "Found $pending jobs pending and $running jobs running while expecting 1 and $val"

	# Cancel every job of the test account
	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}
}
| |
| # |
| # Supplemental function to test21.21 that tests the max/grp |
| # submit and job limits |
| # |
proc inc21_21_submit_test { limit } {
	# Check that the job-count and submit-count limits of an association
	# are enforced, first with plain batch jobs and then with job arrays.
	#
	# limit - "maxjobsub" tests maxjob/maxsubmit; any other value tests
	#         grpjob/grpsubmit. $acct_mod_assoc_vals($limit) supplies the
	#         job limit (element 0) and the submit limit (element 1).
	#
	# NOTE(review): the squeue checks below hard-code job indexes 0-3 and
	# an expected count of 4, so they appear to assume a job limit of 2
	# and a submit limit of 4 -- confirm against the caller.
	global file_in srun sbatch squeue scancel bin_id number bin_sleep is_skip re_word_str
	global bin_rm ta maxjob_lim maxsub_lim
	global acct_mod_desc acct_mod_acct_vals acct_mod_assoc_vals
	global acct_mod_assoc_test_vals job_list

	if {$limit eq "grpjobsub" && [default_part_exclusive] != 0} {
		log_warn "Unable to perform test with exclusive node allocations"
		set is_skip 1
		return
	}

	# Names of the two association fields driven by this test
	if {$limit eq "maxjobsub"} {
		lassign {maxjob maxsubmit} limit_job limit_sub
	} else {
		lassign {grpjob grpsubmit} limit_job limit_sub
	}

	# Apply both limits to the test account
	set acct_mod_assoc_test_vals($limit_job) [lindex $acct_mod_assoc_vals($limit) 0]
	set acct_mod_assoc_test_vals($limit_sub) [lindex $acct_mod_assoc_vals($limit) 1]
	if {[mod_acct $ta [array get acct_mod_desc] \
		 [array get acct_mod_assoc_test_vals] \
		 [array get acct_mod_acct_vals]]} {
		fail "Unable to modify account ($ta)"
	}
	set job_cap $acct_mod_assoc_test_vals($limit_job)
	set submit_cap $acct_mod_assoc_test_vals($limit_sub)

	make_bash_script $file_in "
$bin_sleep 120
"

	#
	# Part 1: plain batch jobs
	#
	log_info "==== Test $limit ===="

	# Submit as many jobs as the submit limit allows
	for {set i 0} {$i < $submit_cap} {incr i} {
		set jobs($i) [submit_job -fail "-N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
		if {!$jobs($i)} {
			fail "sbatch didn't return jobid"
		}
		lappend job_list $jobs($i)
	}

	# Only the jobs below the job limit are expected to run
	for {set i 0} {$i < $job_cap && $i < $submit_cap} {incr i} {
		wait_for_job -fail -timeout 10 -pollinterval .1 $jobs($i) "RUNNING"
	}

	# One submission past the limit has to be rejected
	set result [run_command "$sbatch -N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
	subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
	subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in execess of $limit limit should display policy violation error"

	# Jobs 0 and 1 should be running, jobs 2 and 3 pending on the limit
	set matches 0
	spawn $squeue -A$ta -h -o "\%i \%t \%r"
	expect {
		-re "($jobs(2)|$jobs(3)).PD.AssocMaxJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($jobs(2)|$jobs(3)).PD.AssocGrpJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($jobs(0)|$jobs(1)).R.$re_word_str" {
			incr matches
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}

	if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
		log_warn "Only started $matches of 4 possible jobs"
	} elseif {$matches != 4} {
		fail "Jobs are not in the expected state (expected $matches != 4)"
	}

	#
	# Part 2: the same checks with single-task job arrays
	#
	log_info "==== Test $limit with job arrays ===="

	for {set i 0} {$i < $submit_cap} {incr i} {
		set jobs($i) [submit_job -fail "-N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
		if {!$jobs($i)} {
			fail "sbatch didn't return jobid"
		}
		lappend job_list $jobs($i)

		# Wait for the job to be scheduled, if it should be
		if {$i < $job_cap} {
			wait_for_job -fail -timeout 10 -pollinterval .1 $jobs($i) "RUNNING"
		}
	}

	# One submission past the limit has to be rejected
	set result [run_command "$sbatch -N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
	subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
	subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in execess of $limit limit should display policy violation error"

	# Running array tasks show up as <id>_0, pending ones as <id>_[0]
	set matches 0
	spawn $squeue -A$ta -h -o "\%i \%t \%r"
	expect {
		-re "($jobs(2)|$jobs(3))_\\\[0\\\].PD.AssocMaxJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($jobs(2)|$jobs(3))_\\\[0\\\].PD.AssocGrpJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($jobs(0)|$jobs(1))_0.R.$re_word_str" {
			incr matches
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}

	if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
		log_warn "Only started $matches of 4 possible jobs"
	} elseif {$matches != 4} {
		fail "Jobs are not in the expected state (expected $matches != 4)"
	}

	# Clear the limits in the shared array; mod_acct is not called here
	set acct_mod_assoc_test_vals($limit_job) "-1"
	set acct_mod_assoc_test_vals($limit_sub) "-1"
}
| |
| # |
| # Function that tests an association's grpwall limit |
| # |
proc inc21_21_grpwall { test_type limit } {
	# Run enough jobs to exceed the association's GrpWall limit, then check
	# that one more job pends with reason AssocGrpWallLimit.
	#
	# test_type - label used only in the log banner
	# limit     - list whose element 1 is the GrpWall limit (logged below
	#             as minutes)
	global bin_sleep ta test_qos job_list

	# Five jobs that together take 110% of the limit; per-job values are
	# rounded up to whole seconds (sleep) and whole minutes (time limit).
	set jobs 5.0
	set grpwall_num [lindex $limit 1]
	set grpwall_per_job [expr {$grpwall_num * 1.1 / $jobs}]
	set sleep_time [expr {int(ceil($grpwall_per_job * 60))}]
	set job_time [expr {int(ceil($grpwall_per_job))}]
	set timeout 120

	# Identical options for the filler jobs and for the final job
	set submit_args "--account=$ta -N1 -t$job_time --wrap '$bin_sleep $sleep_time' -o /dev/null -e /dev/null"
	set local_job_list [list]

	log_info "====== Test $test_type ======"

	# Wait for old jobs to clean up
	sleep 2

	# Wall usage decays over time, so reset it to start from a known value
	reset_qos_usage "" $test_qos

	log_debug "Running $jobs jobs of $sleep_time seconds of duration to ensure that we reach the Grpwall limit of $grpwall_num minutes"
	for {set n 0} {$n < $jobs} {incr n} {
		set job_id [submit_job -fail $submit_args]
		lappend local_job_list $job_id
		lappend job_list $job_id
	}

	# All filler jobs have to complete before the limit is checked
	foreach job_id $local_job_list {
		if {[wait_job_reason $job_id COMPLETED] != $::RETURN_SUCCESS} {
			fail "Job ($job_id) did not complete"
		}
	}

	log_debug "Submitting the final job and check that it is set Pending with Reason AssocGrpWallLimit"
	set job_id [submit_job -fail $submit_args]
	lappend local_job_list $job_id
	lappend job_list $job_id

	# Subtest of the limit: the final job has to stay pending
	if {[wait_job_reason $job_id PENDING AssocGrpWallLimit] != $::RETURN_SUCCESS} {
		cancel_job $local_job_list
		fail "Job should not have run"
	}

	# Cancel jobs
	cancel_job $local_job_list
}