|  | #!/usr/bin/env expect | 
|  | ############################################################################ | 
# Purpose: Supplemental functions for test21.21 (association limit tests)
|  | ############################################################################ | 
|  | # Copyright (C) SchedMD LLC. | 
|  | # | 
|  | # This file is part of Slurm, a resource management program. | 
|  | # For details, see <https://slurm.schedmd.com/>. | 
|  | # Please also read the included file: DISCLAIMER. | 
|  | # | 
|  | # Slurm is free software; you can redistribute it and/or modify it under | 
|  | # the terms of the GNU General Public License as published by the Free | 
|  | # Software Foundation; either version 2 of the License, or (at your option) | 
|  | # any later version. | 
|  | # | 
|  | # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | # FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | # details. | 
|  | # | 
|  | # You should have received a copy of the GNU General Public License along | 
|  | # with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | ############################################################################ | 
|  |  | 
|  |  | 
#
# Supplemental function to test21.21 that tests a job requesting
# resources within the allowed limit in the association
#
# test_type - name of the limit under test ("maxcpus", "maxcpumins",
#             "maxnode", ...); selects the skip conditions and the extra
#             srun option
# limit     - two-element list whose elements are concatenated into a
#             single srun argument (option prefix followed by its value)
#
# Sets the global is_skip and returns early when the configuration does
# not allow the test. Otherwise spawns srun and records a subtest that
# fails unless exactly one step launch message was seen.
#
proc inc21_21_good { test_type limit } {
	global number bin_id ta srun test_node selectparam nthreads is_skip re_word_str

	set job_id 0
	set add ""

	# Wait for old jobs to clean up
	sleep 2

	log_info "====== Test $test_type ======"

	# CPU based limits are skipped when nodes are allocated exclusively
	if {($test_type eq "maxcpus" || $test_type eq "maxcpumins") && ([default_part_exclusive] != 0 || [check_config_select "linear"])} {
		log_warn "Unable to perform test with exclusive node allocations"
		set is_skip 1
		return
	}
	set select_type_param [get_select_type_params]
	if {[string first "CR_SOCKET" $select_type_param] != -1} {
		log_warn "This test can't be run SelectTypeParameters=CR_SOCKET"
		set is_skip 1
		return
	}

	# The node limit test asks for whole nodes, every other test is
	# pinned to the test node
	if {$test_type eq "maxnode"} {
		set add "--exclusive"
	} else {
		set add "-w$test_node"
	}

	set matches 0
	spawn $srun -v -t1 $add [lindex $limit 0][lindex $limit 1] \
	    --account=$ta $bin_id
	expect {
		-re "launching StepId=($number)\\.$re_word_str" {
			set job_id $expect_out(1,string)
			incr matches
			exp_continue
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}

	# job_id is only non-zero when srun reported a launched step; only
	# then is there a job to wait for (same guard as in inc21_21_bad)
	if {$job_id != 0} {
		wait_for_job -fail $job_id "DONE"
	}

	subtest -fail { $matches == 1 } "Job launches with correct limit"
}
|  |  | 
#
# Supplemental function to test21.21 that tests a job requesting
# resources larger than the allowed limit in the association
#
# test_type - name of the limit under test; "maxnode" selects --exclusive,
#             anything else pins the job to the test node
# limit     - two-element list: srun option prefix and the limit value.
#             The job is submitted with the limit value plus one.
#
# Fails the test if srun reports that a step was launched.
#
proc inc21_21_bad { test_type limit } {
	global number bin_id ta srun test_node nthreads selectparam re_word_str

	set job_id 0
	# Request one unit more than the association allows
	set exceeded [expr {[lindex $limit 1] + 1}]
	set node_opt ""

	log_info "====== Test $test_type ======"

	switch -exact -- $test_type {
		maxnode {
			set node_opt "--exclusive"
		}
		default {
			set node_opt "-w$test_node"
		}
	}

	# Option prefix and over-limit value form one srun argument
	set limit_arg "[lindex $limit 0]$exceeded"

	# Not read anywhere in this procedure
	set matches 0
	spawn $srun -v $node_opt -t1 $limit_arg --account=$ta \
	    -I $bin_id
	expect {
		-re "Job violates accounting/QOS policy" {
			log_info "This error is expected, not a problem"
			exp_continue
		}
		-re "launching StepId=($number)\\.$re_word_str" {
			set job_id $expect_out(1,string)
			fail "Job ($job_id) should not have run"
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}

	# Only reached with a job id if the job unexpectedly launched
	if {$job_id != 0} {
		wait_for_job -fail $job_id "DONE"
	}
}
|  |  | 
#
# Supplemental function to test21.21 that tests a group limit of the
# association: a number of jobs derived from the limit value is submitted
# and expected to run, then one more job is submitted and expected to pend.
#
# test_type - name of the limit under test; "grpcpus", "grpcpumins" and
#             "grpcpurunmins" are handled as CPU tests, everything else
#             submits exclusive jobs
# limit     - two-element list: sbatch option prefix and the limit value
#
# Sets the global is_skip and returns early when the configuration does
# not allow the test. Every submitted job id is appended to the global
# job_list. All jobs of the test account are cancelled at the end.
#
proc inc21_21_grp_test { test_type limit } {
	global number bin_id ta srun sbatch test_node selectparam nthreads is_skip
	global file_in squeue scancel bin_bash bin_chmod job_list

	set allowed 0
	set sbatch_directive ""

	log_info "===== Test $test_type ====="

	if {$test_type eq "grpcpumins"} {
		set enforce [get_config_param "AccountingStorageEnforce"]
		if {![param_contains $enforce "safe"]} {
			log_warn "This test can't be run without AccountingStorageEnforce having \"safe\" in it"
			set is_skip 1
			return
		}
	}

	if {[default_part_exclusive] != 0 || [check_config_select "linear"]} {
		log_warn "This test can't be run Exclusive node allocations"
		set is_skip 1
		return
	}

	if {[string first "CR_SOCKET" [get_select_type_params]] != -1} {
		log_warn "This test can't be run SelectTypeParameters=CR_SOCKET"
		set is_skip 1
		return
	}

	# Number of jobs expected to run. CPU tests divide the limit by the
	# thread count when selectparam is set; all other tests request
	# exclusive nodes instead.
	set allowed [lindex $limit 1]
	if {$test_type in {grpcpus grpcpumins grpcpurunmins}} {
		if {$selectparam} {
			set allowed [expr {$allowed / $nthreads}]
		}
	} else {
		set sbatch_directive "#SBATCH --exclusive"
	}

	make_bash_script $file_in "
$sbatch_directive
sleep 10"

	#
	# Submit the jobs that fit below the association limit and wait for
	# them to start before submitting the over-limit job. Submitting
	# everything at once occasionally lets a later job start before an
	# earlier one, e.g. while an epilog is still in progress on the
	# assigned nodes.
	#
	set opt_prefix [lindex $limit 0]
	set started_jobs [list]
	for {set n 0} {$n < $allowed} {incr n} {
		set jid [submit_job -fail "-t1 ${opt_prefix}1 --account=$ta --output=/dev/null --error=/dev/null $file_in"]
		lappend started_jobs $jid
		lappend job_list $jid
	}

	# Every job submitted so far has to reach the RUNNING state
	foreach jid $started_jobs {
		wait_for -fail -timeout 30 -pollinterval .2 {$state eq "RUNNING"} {
			set state [get_job_param $jid "JobState"]
		}
	}

	#
	# One more job goes past the association limit and should pend. It
	# gets a longer time limit so that it does not start prematurely
	# after _decay_thread() runs and decays the values of the other
	# running jobs.
	#
	set extra_job [submit_job -fail "-t$allowed ${opt_prefix}1 --account=$ta --output=/dev/null --error=/dev/null $file_in"]
	lappend job_list $extra_job

	# Count the pending and running jobs of the test account
	set pending 0
	set running 0
	spawn $squeue -A $ta -h -o "\%t \%r"
	expect {
		-re "PD ." {
			incr pending
			exp_continue
		}
		-re "R ." {
			incr running
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	subtest -fail { $pending == 1 && $running == $allowed } "$test_type limit" "Found $pending jobs pending and $running jobs running while expecting 1 and $allowed"

	#
	# Cancel test jobs
	#
	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}
}
|  |  | 
#
# Supplemental function to test21.21 that tests the max/grp
# submit and jobs limits of an association
#
# limit - "maxjobsub" selects the maxjob/maxsubmit limit pair, any other
#         value selects grpjob/grpsubmit. It is also the key into
#         acct_mod_assoc_vals, whose entry is a two-element list: index 0
#         is applied as the job limit and index 1 as the submit limit.
#
# The limits are applied with mod_acct, then the same scenario is run
# twice: once with plain batch jobs and once with job arrays (-a0). Each
# run submits jobs up to the submit limit, checks that one more submission
# is rejected, and checks the job states reported by squeue. Every
# submitted job id is appended to the global job_list.
#
# NOTE(review): the squeue patterns hard-code job_id(0)..job_id(3) and the
# checks expect exactly 4 matches, so this presumably assumes a job limit
# of 2 and a submit limit of 4 -- confirm against the caller's values.
#
proc inc21_21_submit_test { limit } {
	global file_in srun sbatch squeue scancel bin_id number bin_sleep is_skip re_word_str
	global bin_rm ta maxjob_lim maxsub_lim
	global acct_mod_desc acct_mod_acct_vals acct_mod_assoc_vals
	global acct_mod_assoc_test_vals job_list

	set limit_job ""
	set limit_sub ""

	if {$limit eq "grpjobsub" && [default_part_exclusive] != 0} {
		log_warn "Unable to perform test with exclusive node allocations"
		set is_skip 1
		return
	}

	if {$limit eq "maxjobsub"} {
		set limit_job "maxjob"
		set limit_sub "maxsubmit"
	} else {
		set limit_job "grpjob"
		set limit_sub "grpsubmit"
	}

	# Apply the job and submit limits to the test account
	set acct_mod_assoc_test_vals($limit_job) \
	    [lindex $acct_mod_assoc_vals($limit) 0]
	set acct_mod_assoc_test_vals($limit_sub) \
	    [lindex $acct_mod_assoc_vals($limit) 1]
	if {[mod_acct $ta [array get acct_mod_desc] \
	    [array get acct_mod_assoc_test_vals] \
	    [array get acct_mod_acct_vals]]} {
		fail "Unable to modify account ($ta)"
	}

	make_bash_script $file_in "
$bin_sleep 120
"

	# Test to make sure that the grpsubmit and maxsubmit
	# are enforced with jobs
	log_info "==== Test $limit ===="

	# Submit jobs to test the limit set in the association
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
		set job_id($inx) [submit_job -fail "-N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
		if { !$job_id($inx) } {
			fail "sbatch didn't return jobid"
		}
		lappend job_list $job_id($inx)
	}

	# Wait for the allowed jobs to start running; jobs beyond the job
	# limit are not waited for
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
		if {$inx < $acct_mod_assoc_test_vals($limit_job)} {
			wait_for_job -fail -timeout 10 -pollinterval .1 $job_id($inx) "RUNNING"
		}
	}

	# Then submit one more over the limit and it should fail
	# (a non-zero exit code makes the first subtest condition true)
	set result [run_command "$sbatch -N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
	subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
	subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in excess of $limit limit should display policy violation error"

	# Count jobs in the expected state: the first two running, the last
	# two pending with the job-limit reason
	set matches 0
	spawn $squeue -A$ta -h -o "\%i \%t \%r"
	expect {
		-re "($job_id(2)|$job_id(3)).PD.AssocMaxJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(2)|$job_id(3)).PD.AssocGrpJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(0)|$job_id(1)).R.$re_word_str" {
			incr matches
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}

	# With exclusive allocations fewer maxjobsub jobs may have started;
	# that case only warns
	if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
		log_warn "Only started $matches of 4 possible jobs"
	} elseif { $matches != 4 } {
		fail "Jobs are not in the expected state ($matches != 4)"
	}

	# Test to make sure that the grpsubmit and maxsubmit
	# are enforced with job arrays

	log_info "==== Test $limit with job arrays ===="

	# Submit jobs to test the limit set in the association
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
		set job_id($inx) [submit_job -fail "-N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
		if { !$job_id($inx) } {
			fail "sbatch didn't return jobid"
		}
		lappend job_list $job_id($inx)

		# Wait for the job to be scheduled, if it should be
		if {$inx < $acct_mod_assoc_test_vals($limit_job)} {
			wait_for_job -fail -timeout 10 -pollinterval .1 $job_id($inx) "RUNNING"
		}
	}

	# Then submit one more over the limit and it should fail
	set result [run_command "$sbatch -N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
	subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
	subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in excess of $limit limit should display policy violation error"

	# Same state check as above, using the array task id formats that
	# the patterns expect: "<id>_[0]" when pending, "<id>_0" when running
	set matches 0
	spawn $squeue -A$ta -h -o "\%i \%t \%r"
	expect {
		-re "($job_id(2)|$job_id(3))_\\\[0\\\].PD.AssocMaxJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(2)|$job_id(3))_\\\[0\\\].PD.AssocGrpJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(0)|$job_id(1))_0.R.$re_word_str" {
			incr matches
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}

	if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
		log_warn "Only started $matches of 4 possible jobs"
	} elseif { $matches != 4 } {
		fail "Jobs are not in the expected state ($matches != 4)"
	}

	# Clear the limits. Only the array entries are reset here; mod_acct
	# is not called again in this procedure, so presumably the caller
	# applies them with its next account modification.
	set acct_mod_assoc_test_vals($limit_job) "-1"
	set acct_mod_assoc_test_vals($limit_sub) "-1"
}
|  |  | 
#
# Function that tests an association's grpwall limit
#
# test_type - name of the limit under test, used for logging only
# limit     - list whose second element is the grpwall limit in minutes
#
# Five jobs are run to completion, each lasting 1.1 / 5 of the limit, so
# that together they exceed it. A final job is then submitted and must
# stay pending with reason AssocGrpWallLimit. Every submitted job id is
# appended to the global job_list.
#
proc inc21_21_grpwall { test_type limit } {
	global bin_sleep ta test_qos job_list

	set submitted    [list]
	set job_count    5.0
	set wall_limit   [lindex $limit 1]
	# Minutes of wall time each job has to consume (10% over an even share)
	set wall_share   [expr {$wall_limit * 1.1 / $job_count}]
	set sleep_secs   [expr {int(ceil($wall_share * 60))}]
	set time_limit   [expr {int(ceil($wall_share))}]
	# NOTE(review): nothing in this procedure reads this variable --
	# confirm whether it is still needed
	set timeout      120

	# All jobs of this test, including the final one, use the same options
	set sbatch_args "--account=$ta -N1 -t$time_limit --wrap '$bin_sleep $sleep_secs' -o /dev/null -e /dev/null"

	log_info "====== Test $test_type ======"

	# Wait for old jobs to clean up
	sleep 2

	# Wall is a decayed value, so reset it to make sure the test gets
	# exactly what we would expect.
	reset_qos_usage "" $test_qos

	log_debug "Running $job_count jobs of $sleep_secs seconds of duration to ensure that we reach the Grpwall limit of $wall_limit minutes"
	for {set n 0} {$n < $job_count} {incr n} {
		set jid [submit_job -fail $sbatch_args]
		lappend submitted $jid
		lappend job_list $jid
	}

	# All of the jobs above have to finish before the limit is checked
	foreach jid $submitted {
		set rc [wait_job_reason $jid COMPLETED]
		if {$rc != $::RETURN_SUCCESS} {
			fail "Job ($jid) did not complete"
		}
	}

	log_debug "Submitting the final job and check that it is set Pending with Reason AssocGrpWallLimit"
	set final_job [submit_job -fail $sbatch_args]
	lappend submitted $final_job
	lappend job_list $final_job

	# Subtest of the limit
	set rc [wait_job_reason $final_job PENDING AssocGrpWallLimit]
	if {$rc != $::RETURN_SUCCESS} {
		cancel_job $submitted
		fail "Job should not have run"
	}

	# Cancel jobs
	cancel_job $submitted
}