blob: b9d327a5a6fb61e625f0907a26b313367046b06d [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Federated job cancellations
# Reqs: 1. Using slurmdbd accounting storage type and is up
# 2. fed_slurm_base is defined in globals.local - set to directory that
# has access to each federation configuration (fedc1, fedc2, fedc3).
# Eg.
# fedr/slurm/ (src)
# fedr/fed1/bin
# fedr/fed1/sbin
# fedr/fed1/etc
# fedr/fed1/...
# fedr/fed2/...
# fedr/fed3/...
# 3. controllers are up and running.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./globals_federation
# Name of the federation this test creates and deletes.
set fed_name "feda"
# Filled in with the invoking user's name once the federation checks pass.
set user_name ""
set srun_job_cnt 0

# Client commands taken from the fedc1 installation under fed_slurm_base.
set my_scontrol "${fed_slurm_base}/$fedc1/bin/scontrol"
set my_sbatch "${fed_slurm_base}/$fedc1/bin/sbatch"
set my_srun "${fed_slurm_base}/$fedc1/bin/srun"
set my_scancel "${fed_slurm_base}/$fedc1/bin/scancel"
set my_squeue "${fed_slurm_base}/$fedc1/bin/squeue"

# Pull the numeric part out of the configured MinJobAge and pad it by 65.
regexp "($number)" [get_config_param "MinJobAge"] {} min_job_age
set min_job_age [expr {$min_job_age + 65}]

# Batch script submitted by every sbatch call in this test.
set file_in "$test_dir/input"

#
# Check accounting config and bail if not found.
#
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test can't be run without a usable AccountingStorageType"
}
if {[get_admin_level] ne "Administrator"} {
	skip "This test can't be run without being an Accounting administrator. Use: sacctmgr mod user \$USER set admin=admin"
}
# mod_regex - strip sub-patterns from an expected-output regex.
#
# olds: list of sub-patterns to remove. Each element is expected WITHOUT its
#       leading backslash and WITHOUT the trailing $eol; both are re-added
#       here before the removal.
# str:  the regex string to edit.
#
# Returns the edited regex string.
proc mod_regex { olds str } {
	global eol

	foreach piece $olds {
		# Remove every literal occurrence of "\<piece><eol>".
		set str [string map [list "\\$piece$eol" {}] $str]

		# The first alternative of a "(a|b){2}" group was removed: drop the
		# now-dangling "(|" opener and then the "){2}" closer.
		if {[string first "(|" $str] != -1} {
			set str [string map {"){2}" ""} [string map {"(|" ""} $str]]
		}

		# The second alternative was removed: drop the "|){2}" closer and
		# then every remaining "(" in the string.
		if {[string first "|)" $str] != -1} {
			set str [string map {"(" ""} [string map {"|){2}" ""} $str]]
		}
	}

	return $str
}
# find_reg - select the elements of the global list "regs" that contain sub.
#
# sub: substring to look for (callers pass a job id).
#
# Returns the matching elements joined by single spaces, with trailing
# spaces trimmed; an empty string when nothing matches.
proc find_reg { sub } {
	global regs

	set hits {}
	foreach candidate $regs {
		if {[string first $sub $candidate] != -1} {
			lappend hits $candidate
		}
	}

	return [string trimright [join $hits " "] " "]
}
# lremove - return a copy of list without the elements equal to discard.
#
# list:    input list.
# discard: value to drop; compared as an exact string.
proc lremove { list discard } {
	set kept {}
	foreach item $list {
		if {$item ne $discard} {
			lappend kept $item
		}
	}
	return $kept
}
# sbatch - submit the test batch script through the fedc1 sbatch binary.
#
# args: extra sbatch options. They are appended to the command line as a
#       string and re-split on whitespace by the spawn call.
#
# Every submission requests node_count nodes exclusively, discards stdout and
# stderr, sets -t300 and allows requeue. Calls fail when sbatch times out or
# does not print exactly one "Submitted batch job <id>" line.
#
# Returns the submitted job id.
proc sbatch { args } {
	global number node_count my_sbatch file_in

	set matches 0
	set job_id 0

	set command "$my_sbatch -N$node_count --exclusive --output=/dev/null --error=/dev/null -t300 --requeue "
	append command $args
	append command " $file_in"

	set regex "Submitted batch job ($number)"

	spawn {*}$command
	expect {
		-re "$regex" {
			incr matches
			set job_id $expect_out(1,string)
			# Keep reading so the eof branch runs and the child is reaped.
			exp_continue
		}
		timeout {
			fail "sbatch not responding"
		}
		eof {
			wait
		}
	}

	if {$matches != 1} {
		fail "batch submit failure"
	}

	return $job_id
}
# squeue - run the fedc1 squeue binary and check its output against regex.
#
# options: extra squeue options, appended after "-a" and split on whitespace
#          by the spawn call.
# regex:   pattern the output must match.
#
# Calls fail unless the pattern was matched exactly once.
proc squeue { options regex } {
	global my_squeue

	set found 0
	set cmdline "$my_squeue -a $options"

	spawn {*}$cmdline
	expect {
		-re "$regex" {
			incr found
		}
		eof {
			wait
		}
	}

	if {$found == 1} {
		return
	}
	fail "Unexpected error in squeue (expected $regex)"
}
# scancel - run the scancel binary currently named by my_scancel.
#
# options:  extra scancel options (may be an empty string).
# argument: what to cancel; callers pass a job id.
#
# The command output is drained until eof and the child is reaped, matching
# cancel_federation_jobs. A timeout is not treated as a failure. The proc
# then sleeps 2 seconds before returning.
proc scancel { options argument } {
	global my_scancel

	set command "$my_scancel "
	append command $options
	append command " $argument"

	spawn {*}$command
	expect {
		eof {
			wait
		}
	}

	sleep 2
}
# cancel_federation_jobs - cancel all of user_name's jobs on fedc1, fedc2 and
# fedc3 with a single scancel call, then sleep 5 seconds.
#
# Uses the global variable "scancel" (the command path), which is distinct
# from the scancel proc defined in this file.
proc cancel_federation_jobs { } {
	global scancel user_name fedc1 fedc2 fedc3

	set clusters "$fedc1,$fedc2,$fedc3"

	spawn $scancel -M$clusters --user $user_name
	expect eof { wait }

	sleep 5
}
# cleanup - cancel the user's federated jobs, remove this test's *.out files
# and delete the test federation.
#
# Returns whatever delete_federations returns; the caller compares it to 0.
proc cleanup { } {
	global fed_name bin_bash bin_rm test_name

	cancel_federation_jobs

	set stale_outputs "$test_name*.out"
	exec $bin_bash -c "$bin_rm -f $stale_outputs"

	return [delete_federations $fed_name]
}
# Start test
# Preflight: skip unless the federation helpers report that the federation is
# configured and that its clusters are up.
if {![check_federation_setup]} {
skip "This test can't be run without fed_slurm_base, fedc1, fedc2, fedc3 setup in globals.local"
}
if {![check_federation_up]} {
skip "This test can't be run without all clusters up"
}
# user_name must be set before cleanup runs: cancel_federation_jobs passes it
# to scancel --user.
set user_name [get_my_user_name]
# Remove existing setup
# cleanup returns the result of delete_federations; any non-zero value is
# treated as a failure here.
if {[cleanup] != 0} {
fail "failed to cleanup"
}
# Add clusters to federation
# A true (non-zero) result from setup_federation is treated as a failure.
if {[setup_federation $fed_name]} {
fail "failed to setup federation"
}
# Get number of nodes per cluster
# Counts the nodes returned by get_nodes_by_state for states idle,alloc,comp
# with the default partition and --local. The count is used as -N for every
# sbatch call, presumably so that each exclusive job occupies a whole
# cluster -- TODO confirm.
set node_count [llength [get_nodes_by_state idle,alloc,comp "[default_partition] --local"]]
log_info "################################################################"
log_info "Setup cluster features"
log_info "################################################################"

# Assign one feature per cluster: fedc1 -> fa, fedc2 -> fb, fedc3 -> fc.
# For each cluster sacctmgr must print all four expected fragments (the
# "Setting" header, the Feature line, the "Modified cluster..." header and the
# cluster name); anything else is a failure.
foreach {feat_cluster feat_name} [list $fedc1 fa $fedc2 fb $fedc3 fc] {
	set matches 0
	spawn $sacctmgr -i modify cluster $feat_cluster set features=$feat_name
	expect {
		-re "Setting$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s*Feature\\s*=\\s*$feat_name" {
			incr matches
			exp_continue
		}
		-re "Modified cluster...$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s*$feat_cluster$eol" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr mod not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 4} {
		fail "Unexpected error (got $matches)"
	}
}
# Create the batch script used by every sbatch call in this test.
make_bash_script $file_in "sleep 900"
log_info "################################################################"
log_info "Test scancel within federated clusters"
log_info "################################################################"
# Scenario 1: cancel jobs with scancel and no -M option.
#
# Submit three jobs and wait for each to run. The value returned by
# wait_for_fed_job is used as the jid() key, and jid($fedcN) is read below, so
# it is expected to be the name of the cluster running the job.
set ji0 [sbatch]
set jid([wait_for_fed_job $ji0 RUNNING $fedc1,$fedc2,$fedc3]) $ji0
set ji1 [sbatch]
set jid([wait_for_fed_job $ji1 RUNNING $fedc1,$fedc2,$fedc3]) $ji1
set ji2 [sbatch]
set jid([wait_for_fed_job $ji2 RUNNING $fedc1,$fedc2,$fedc3]) $ji2
# A fourth job is submitted and expected to stay pending.
set jid(PD) [sbatch]
wait_for_fed_job $jid(PD) PENDING $fedc1,$fedc2,$fedc3
# One squeue-line pattern per expected row: the pending job (PD), the job
# running on fedc1 (R), the fedc2/fedc3 jobs as listed by fedc1 (RV), and the
# fedc2/fedc3 jobs as listed by their own cluster (R).
set regpd "\\s*$jid(PD).+PD.+$eol"
set reg1a "\\s*$jid($fedc1).+R.+$eol"
set reg1b "\\s*$jid($fedc2).+RV.+$eol"
set reg1c "\\s*$jid($fedc3).+RV.+$eol"
set reg2 "\\s*$jid($fedc2).+R.+$eol"
set reg3 "\\s*$jid($fedc3).+R.+$eol"
# Space-separated list of the row patterns; find_reg searches it by job id.
set regs "$regpd $reg1a $reg1b $reg1c $reg2 $reg3"
# Expected output per cluster section. On fedc1 the two RV rows may appear in
# either order, hence the (reg1b|reg1c){2} group.
set regf1 "CLUSTER: $fedc1\\s*JOBID.*?$regpd$reg1a\\s*($reg1b|$reg1c){2}$eol"
set regf2 "CLUSTER: $fedc2\\s*JOBID.*?$regpd$reg2$eol"
set regf3 "CLUSTER: $fedc3\\s*JOBID.*?$regpd$reg3"
set regex "$regf1$regf2$regf3"
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
# Cancel the jobs one at a time. After each cancel, strip that job's row
# patterns from the expected regex and check squeue again.
scancel "" $jid(PD)
set rm [find_reg $jid(PD)]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
scancel "" $jid($fedc1)
set rm [find_reg $jid($fedc1)]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
scancel "" $jid($fedc2)
set rm [find_reg $jid($fedc2)]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
scancel "" $jid($fedc3)
set rm [find_reg $jid($fedc3)]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
# Scenario 2: cancel each running job with -M naming the cluster recorded for
# it in cid(), and the pending job with -M naming fedc3.
#
# cid(<job id>) records the value returned by wait_for_fed_job (used as a
# cluster name below); jid(<cluster>) is the reverse mapping.
set ji0 [sbatch]
set cid($ji0) [wait_for_fed_job $ji0 RUNNING $fedc1,$fedc2,$fedc3]
set jid($cid($ji0)) $ji0
set ji1 [sbatch]
set cid($ji1) [wait_for_fed_job $ji1 RUNNING $fedc1,$fedc2,$fedc3]
set jid($cid($ji1)) $ji1
set ji2 [sbatch]
set cid($ji2) [wait_for_fed_job $ji2 RUNNING $fedc1,$fedc2,$fedc3]
set jid($cid($ji2)) $ji2
# Fourth job is expected to stay pending.
set ji3 [sbatch]
set jid(PD) $ji3
wait_for_fed_job $ji3 PENDING $fedc1,$fedc2,$fedc3
# Rebuild the row patterns and the per-cluster expected output for the new
# job ids.
set regpd "\\s*$jid(PD).+PD.+$eol"
set reg1a "\\s*$jid($fedc1).+R.+$eol"
set reg1b "\\s*$jid($fedc2).+RV.+$eol"
set reg1c "\\s*$jid($fedc3).+RV.+$eol"
set reg2 "\\s*$jid($fedc2).+R.+$eol"
set reg3 "\\s*$jid($fedc3).+R.+$eol"
set regs "$regpd $reg1a $reg1b $reg1c $reg2 $reg3"
set regf1 "CLUSTER: $fedc1\\s*JOBID.*?$regpd$reg1a\\s*($reg1b|$reg1c){2}$eol"
set regf2 "CLUSTER: $fedc2\\s*JOBID.*?$regpd$reg2$eol"
set regf3 "CLUSTER: $fedc3\\s*JOBID.*?$regpd$reg3"
set regex "$regf1$regf2$regf3"
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
# Pending job: cancelled through fedc3.
scancel "-M$fedc3" $ji3
set rm [find_reg $ji3]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
# Running jobs: each cancelled through the cluster recorded in cid().
scancel "-M$cid($ji0)" $ji0
set rm [find_reg $ji0]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
scancel "-M$cid($ji1)" $ji1
set rm [find_reg $ji1]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
scancel "-M$cid($ji2)" $ji2
set rm [find_reg $ji2]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
# Scenario 3: cancel each running job with -M naming a cluster other than the
# one recorded for it. No pending job is submitted in this scenario.
set ji0 [sbatch]
set cid($ji0) [wait_for_fed_job $ji0 RUNNING $fedc1,$fedc2,$fedc3]
set jid($cid($ji0)) $ji0
set ji1 [sbatch]
set cid($ji1) [wait_for_fed_job $ji1 RUNNING $fedc1,$fedc2,$fedc3]
set jid($cid($ji1)) $ji1
set ji2 [sbatch]
set cid($ji2) [wait_for_fed_job $ji2 RUNNING $fedc1,$fedc2,$fedc3]
set jid($cid($ji2)) $ji2
# Row patterns and expected output; there is no PD row this time.
set reg1a "\\s*$jid($fedc1).+R.+$eol"
set reg1b "\\s*$jid($fedc2).+RV.+$eol"
set reg1c "\\s*$jid($fedc3).+RV.+$eol"
set reg2 "\\s*$jid($fedc2).+R.+$eol"
set reg3 "\\s*$jid($fedc3).+R.+$eol"
set regs "$reg1a $reg1b $reg1c $reg2 $reg3"
set regf1 "CLUSTER: $fedc1\\s*JOBID.*?$reg1a\\s*($reg1b|$reg1c){2}$eol"
set regf2 "CLUSTER: $fedc2\\s*JOBID.*?$reg2$eol"
set regf3 "CLUSTER: $fedc3\\s*JOBID.*?$reg3"
set regex "$regf1$regf2$regf3"
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
# The fedc2 job is cancelled through fedc3.
scancel "-M$fedc3" $jid($fedc2)
set rm [find_reg $jid($fedc2)]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
# The fedc3 job is cancelled through fedc2.
scancel "-M$fedc2" $jid($fedc3)
set rm [find_reg $jid($fedc3)]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
# The fedc1 job is cancelled through fedc2.
scancel "-M$fedc2" $jid($fedc1)
set rm [find_reg $jid($fedc1)]
set regex [mod_regex $rm $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex
# Scenario 4: cancel all of the user's jobs at once with "scancel -u", run
# from the fedc3 installation.
#
# cid(<job id>) records the value returned by wait_for_fed_job (used as a
# cluster name below); jid(<cluster>) is the reverse mapping.
set ji0 [sbatch]
set cid($ji0) [wait_for_fed_job $ji0 RUNNING $fedc1,$fedc2,$fedc3]
set jid($cid($ji0)) $ji0
set ji1 [sbatch]
set cid($ji1) [wait_for_fed_job $ji1 RUNNING $fedc1,$fedc2,$fedc3]
set jid($cid($ji1)) $ji1
set ji2 [sbatch]
set cid($ji2) [wait_for_fed_job $ji2 RUNNING $fedc1,$fedc2,$fedc3]
set jid($cid($ji2)) $ji2

# Fourth job is expected to stay pending.
set ji3 [sbatch]
set jid(PD) $ji3
wait_for_fed_job $ji3 PENDING $fedc1,$fedc2,$fedc3

# Row patterns and the per-cluster expected squeue output for the new job ids.
set regpd "\\s*$jid(PD).+PD.+$eol"
set reg1a "\\s*$jid($fedc1).+R.+$eol"
set reg1b "\\s*$jid($fedc2).+RV.+$eol"
set reg1c "\\s*$jid($fedc3).+RV.+$eol"
set reg2 "\\s*$jid($fedc2).+R.+$eol"
set reg3 "\\s*$jid($fedc3).+R.+$eol"
set regs "$regpd $reg1a $reg1b $reg1c $reg2 $reg3"
set regf1 "CLUSTER: $fedc1\\s*JOBID.*?$regpd$reg1a\\s*($reg1b|$reg1c){2}$eol"
set regf2 "CLUSTER: $fedc2\\s*JOBID.*?$regpd$reg2$eol"
set regf3 "CLUSTER: $fedc3\\s*JOBID.*?$regpd$reg3"
set regex "$regf1$regf2$regf3"
squeue " -M$fedc1,$fedc2,$fedc3 " $regex

# Switch to the fedc3 scancel binary and cancel everything owned by the user.
# The output is drained until eof and the child is reaped, as in
# cancel_federation_jobs; a timeout is not treated as a failure.
set my_scancel "${fed_slurm_base}/$fedc3/bin/scancel"
spawn $my_scancel -u $user_name
expect {
	eof {
		wait
	}
}

# Each running job must finish on the cluster recorded for it; the pending job
# is checked on fedc1.
wait_for_fed_job $ji0 DONE $cid($ji0)
wait_for_fed_job $ji1 DONE $cid($ji1)
wait_for_fed_job $ji2 DONE $cid($ji2)
wait_for_fed_job $ji3 DONE $fedc1

# Strip every row pattern from the expected output and check squeue once more.
set regex [mod_regex $regs $regex]
squeue " -M$fedc1,$fedc2,$fedc3 " $regex