#!/usr/bin/env expect
############################################################################
# Purpose: Test for accounting records of specific job names with their ID
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
#
# Supplemental function to test21.21 that tests a job with
# resources within the allowed limit of the association
#
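# NOTE: The "limit" argument of the procs below is indexed with lindex, so it
# is assumed to be a two-element list of an option prefix and a value (for
# example, a hypothetical {-N 1} or {-n 2}); the actual values are supplied
# by test21.21, not defined in this file.
#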
proc inc21_21_good { test_type limit } {
    global number bin_id ta srun test_node selectparam nthreads is_skip re_word_str

    set job_id 0
    set val 0
    set add ""

    # Wait for old jobs to clean up
    sleep 2

    log_info "====== Test $test_type ======"

    if {($test_type eq "maxcpus" || $test_type eq "maxcpumins") && ([default_part_exclusive] != 0 || [check_config_select "linear"])} {
        log_warn "Unable to perform test with exclusive node allocations"
        set is_skip 1
        return
    }

    set select_type_param [get_select_type_params]
    if { [string first "CR_SOCKET" $select_type_param] != -1} {
        log_warn "This test can't be run with SelectTypeParameters=CR_SOCKET"
        set is_skip 1
        return
    }

    if {$test_type eq "maxnode"} {
        set add "--exclusive"
    } else {
        set add "-w$test_node"
    }

    set matches 0
    spawn $srun -v -t1 $add [lindex $limit 0][lindex $limit 1] \
        --account=$ta $bin_id
    expect {
        -re "launching StepId=($number)\\.$re_word_str" {
            set job_id $expect_out(1,string)
            incr matches
            exp_continue
        }
        timeout {
            fail "srun not responding"
        }
        eof {
            wait
        }
    }

    if {$job_id != 0} {
        wait_for_job -fail $job_id "DONE"
    }

    subtest -fail { $matches == 1 } "Job launches with correct limit"
}
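#
# Usage sketch (illustrative values only; test21.21 supplies the real limit
# lists when it calls these procs):
#
#     inc21_21_good "maxnode" {-N 1}
#     inc21_21_bad "maxnode" {-N 1}
#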
#
# Supplemental function to test21.21 that tests a job with
# resources larger than the allowed limit of the association
#
proc inc21_21_bad { test_type limit } {
    global number bin_id ta srun test_node nthreads selectparam re_word_str

    set job_id 0
    set over_lim [expr [lindex $limit 1] + 1]
    set add ""

    log_info "====== Test $test_type ======"

    if {$test_type eq "maxnode"} {
        set add "--exclusive"
    } else {
        set add "-w$test_node"
    }

    set matches 0
    spawn $srun -v $add -t1 [lindex $limit 0]$over_lim --account=$ta \
        -I $bin_id
    expect {
        -re "Job violates accounting/QOS policy" {
            log_info "This error is expected, not a problem"
            exp_continue
        }
        -re "launching StepId=($number)\\.$re_word_str" {
            set job_id $expect_out(1,string)
            fail "Job ($job_id) should not have run"
        }
        timeout {
            fail "srun not responding"
        }
        eof {
            wait
        }
    }

    if {$job_id != 0} {
        wait_for_job -fail $job_id "DONE"
    }
}
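#
# Supplemental function to test21.21 that tests a group (Grp*) limit by
# running enough jobs to reach the limit and checking that one more job
# is left pending
#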
proc inc21_21_grp_test { test_type limit } {
    global number bin_id ta srun sbatch test_node selectparam nthreads is_skip
    global file_in squeue scancel bin_bash bin_chmod job_list

    set val 0
    set exclusive ""

    log_info "===== Test $test_type ====="

    if { $test_type eq "grpcpumins" &&
         ![param_contains [get_config_param "AccountingStorageEnforce"] "safe"] } {
        log_warn "This test can't be run without AccountingStorageEnforce having \"safe\" in it"
        set is_skip 1
        return
    }

    if { [default_part_exclusive] != 0 || [check_config_select "linear"]} {
        log_warn "This test can't be run with exclusive node allocations"
        set is_skip 1
        return
    }

    set select_type_param [get_select_type_params]
    if { [string first "CR_SOCKET" $select_type_param] != -1} {
        log_warn "This test can't be run with SelectTypeParameters=CR_SOCKET"
        set is_skip 1
        return
    }

    # Check and see if it is a CPU test
    if {$test_type eq "grpcpus" || $test_type eq "grpcpumins" || $test_type eq "grpcpurunmins"} {
        if {$selectparam} {
            set val [expr [lindex $limit 1] / $nthreads]
        } else {
            set val [lindex $limit 1]
        }
    } else {
        set exclusive "#SBATCH --exclusive"
        set val [lindex $limit 1]
    }

    make_bash_script $file_in "
$exclusive
sleep 10"

    #
    # First we will submit n jobs that should be below the association limit
    # and should run. We wait for these to start before submitting the
    # over-limit job. If we were to submit them all at once, the earlier
    # submitted jobs could occasionally take longer to start than later
    # submitted jobs, such as when an epilog is still in progress on the
    # assigned nodes.
    #
    for {set inx 0} {$inx < $val} {incr inx} {
        set job_id($inx) [submit_job -fail "-t1 [lindex $limit 0]1 --account=$ta --output=/dev/null --error=/dev/null $file_in"]
        lappend job_list $job_id($inx)
    }

    # Wait for the expected jobs to start running
    for {set inx 0} {$inx < $val} {incr inx} {
        wait_for -fail -timeout 30 -pollinterval .2 {$state eq "RUNNING"} {
            set state [get_job_param $job_id($inx) "JobState"]
        }
    }

    #
    # Submit an additional job. This job should pend since it will be past the
    # association limit. This job gets a longer time limit to avoid having it
    # prematurely start after _decay_thread() runs and decays the values of the
    # other running jobs.
    #
    set job_id($inx) [submit_job -fail "-t$val [lindex $limit 0]1 --account=$ta --output=/dev/null --error=/dev/null $file_in"]
    lappend job_list $job_id($inx)

    set pending 0
    set running 0
    spawn $squeue -A $ta -h -o "\%t \%r"
    expect {
        -re "PD ." {
            incr pending
            exp_continue
        }
        -re "R ." {
            incr running
            exp_continue
        }
        timeout {
            fail "squeue not responding"
        }
        eof {
            wait
        }
    }

    subtest -fail { $pending == 1 && $running == $val } "$test_type limit" "Found $pending jobs pending and $running jobs running while expecting 1 and $val"

    #
    # Cancel test jobs
    #
    spawn $scancel --quiet --account=$ta
    expect {
        eof {
            wait
        }
    }
}
#
# Supplemental function to test21.21 that tests the max/grp
# submit and job limits
#
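# NOTE: acct_mod_assoc_vals($limit) is assumed (based on the lindex calls
# below) to be a two-element list holding the job limit and the submit limit;
# the actual values are defined in test21.21, not in this file.
#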
proc inc21_21_submit_test { limit } {
    global file_in srun sbatch squeue scancel bin_id number bin_sleep is_skip re_word_str
    global bin_rm ta maxjob_lim maxsub_lim
    global acct_mod_desc acct_mod_acct_vals acct_mod_assoc_vals
    global acct_mod_assoc_test_vals job_list

    set limit_job ""
    set limit_sub ""

    if {$limit eq "grpjobsub" && [default_part_exclusive] != 0} {
        log_warn "Unable to perform test with exclusive node allocations"
        set is_skip 1
        return
    }

    if {$limit eq "maxjobsub"} {
        set limit_job "maxjob"
        set limit_sub "maxsubmit"
    } else {
        set limit_job "grpjob"
        set limit_sub "grpsubmit"
    }

    set acct_mod_assoc_test_vals($limit_job) \
        [lindex $acct_mod_assoc_vals($limit) 0]
    set acct_mod_assoc_test_vals($limit_sub) \
        [lindex $acct_mod_assoc_vals($limit) 1]
    if [mod_acct $ta [array get acct_mod_desc] \
            [array get acct_mod_assoc_test_vals] \
            [array get acct_mod_acct_vals]] {
        fail "Unable to modify account ($ta)"
    }

    make_bash_script $file_in "
$bin_sleep 120
"

    # Test to make sure that the grpsubmit and maxsubmit
    # limits are enforced with jobs
    log_info "==== Test $limit ===="

    # Submit jobs to test the limit set in the association
    for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
        set job_id($inx) [submit_job -fail "-N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
        if { !$job_id($inx) } {
            fail "sbatch didn't return jobid"
        }
        lappend job_list $job_id($inx)
    }

    # Wait for the allowed jobs to start running
    for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
        if {$inx < $acct_mod_assoc_test_vals($limit_job)} {
            wait_for_job -fail -timeout 10 -pollinterval .1 $job_id($inx) "RUNNING"
        }
    }

    # Then submit one more over the limit and it should fail
    set result [run_command "$sbatch -N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
    subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
    subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in excess of $limit limit should display policy violation error"

    set matches 0
    spawn $squeue -A$ta -h -o "\%i \%t \%r"
    expect {
        -re "($job_id(2)|$job_id(3)).PD.AssocMaxJobsLimit" {
            incr matches
            exp_continue
        }
        -re "($job_id(2)|$job_id(3)).PD.AssocGrpJobsLimit" {
            incr matches
            exp_continue
        }
        -re "($job_id(0)|$job_id(1)).R.$re_word_str" {
            incr matches
            exp_continue
        }
        timeout {
            fail "squeue not responding"
        }
        eof {
            wait
        }
    }

    spawn $scancel --quiet --account=$ta
    expect {
        eof {
            wait
        }
    }

    if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
        log_warn "Only started $matches of 4 possible jobs"
    } elseif { $matches != 4 } {
        fail "Jobs are not in the expected state (expected 4 matches, found $matches)"
    }

    # Test to make sure that the grpsubmit and maxsubmit
    # limits are enforced with job arrays
    log_info "==== Test $limit with job arrays ===="

    # Submit jobs to test the limit set in the association
    for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
        set job_id($inx) [submit_job -fail "-N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
        if { !$job_id($inx) } {
            fail "sbatch didn't return jobid"
        }
        lappend job_list $job_id($inx)

        # Wait for the job to be scheduled, if it should be
        if {$inx < $acct_mod_assoc_test_vals($limit_job)} {
            wait_for_job -fail -timeout 10 -pollinterval .1 $job_id($inx) "RUNNING"
        }
    }

    # Then submit one more over the limit and it should fail
    set result [run_command "$sbatch -N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
    subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
    subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in excess of $limit limit should display policy violation error"

    set matches 0
    spawn $squeue -A$ta -h -o "\%i \%t \%r"
    expect {
        -re "($job_id(2)|$job_id(3))_\\\[0\\\].PD.AssocMaxJobsLimit" {
            incr matches
            exp_continue
        }
        -re "($job_id(2)|$job_id(3))_\\\[0\\\].PD.AssocGrpJobsLimit" {
            incr matches
            exp_continue
        }
        -re "($job_id(0)|$job_id(1))_0.R.$re_word_str" {
            incr matches
            exp_continue
        }
        timeout {
            fail "squeue not responding"
        }
        eof {
            wait
        }
    }

    spawn $scancel --quiet --account=$ta
    expect {
        eof {
            wait
        }
    }

    if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
        log_warn "Only started $matches of 4 possible jobs"
    } elseif { $matches != 4 } {
        fail "Jobs are not in the expected state (expected 4 matches, found $matches)"
    }

    # Clear the limits
    set acct_mod_assoc_test_vals($limit_job) "-1"
    set acct_mod_assoc_test_vals($limit_sub) "-1"
}
#
# Function that tests an association's grpwall limit
#
proc inc21_21_grpwall { test_type limit } {
    global bin_sleep ta test_qos job_list

    set local_job_list [list]
    set jobs 5.0
    set grpwall_num [lindex $limit 1]
    set grpwall_per_job [expr $grpwall_num * 1.1 / $jobs]
    set sleep_time [expr int(ceil($grpwall_per_job * 60))]
    set job_time [expr int(ceil($grpwall_per_job))]
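    # Worked example with a hypothetical GrpWall limit of 2 minutes:
    # grpwall_per_job = 2 * 1.1 / 5 = 0.44 minutes, so each job sleeps
    # ceil(0.44 * 60) = 27 seconds with a ceil(0.44) = 1 minute time limit.
    # Five such jobs consume roughly 2.25 minutes of wall time, which exceeds
    # the 2-minute limit, so the final job submitted below must pend.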
    set timeout 120

    log_info "====== Test $test_type ======"

    # Wait for old jobs to clean up
    sleep 2

    # Since wall time is a decayed value, reset the QOS usage to make sure
    # the test gets exactly what we expect.
    reset_qos_usage "" $test_qos

    log_debug "Running $jobs jobs of $sleep_time seconds each to ensure that we reach the GrpWall limit of $grpwall_num minutes"
    for {set i 0} {$i < $jobs} {incr i} {
        set job_id [submit_job -fail "--account=$ta -N1 -t$job_time --wrap '$bin_sleep $sleep_time' -o /dev/null -e /dev/null"]
        lappend local_job_list $job_id
        lappend job_list $job_id
    }

    foreach job_id $local_job_list {
        if {[wait_job_reason $job_id COMPLETED] != $::RETURN_SUCCESS} {
            fail "Job ($job_id) did not complete"
        }
    }

    log_debug "Submitting the final job and checking that it is left pending with reason AssocGrpWallLimit"
    set job_id [submit_job -fail "--account=$ta -N1 -t$job_time --wrap '$bin_sleep $sleep_time' -o /dev/null -e /dev/null"]
    lappend local_job_list $job_id
    lappend job_list $job_id

    # Subtest of the limit
    if {[wait_job_reason $job_id PENDING AssocGrpWallLimit] != $::RETURN_SUCCESS} {
        cancel_job $local_job_list
        fail "Job should not have run"
    }

    # Cancel jobs
    cancel_job $local_job_list
}