blob: 3b7f84e331726c7151ed91aa898644261dd51f49 [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Test Federated Cluster States
#
# Reqs: 1. Using slurmdbd accounting storage type and is up
# 2. fed_slurm_base is defined in globals.local - set to the directory that
# holds the installation of each federation cluster (fedc1, fedc2, fedc3).
# Eg.
# fedr/slurm/ (src)
# fedr/fed1/bin
# fedr/fed1/sbin
# fedr/fed1/etc
# fedr/fed1/...
# fedr/fed2/...
# fedr/fed3/...
# 3. controllers are up and running.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./globals_federation
# Federation under test and per-run bookkeeping.
set fed_name     "feda"
set user_name    ""
set srun_job_cnt 0

# Every client command is taken from the first cluster's install tree,
# yielding my_scontrol, my_sacctmgr, my_sbatch, my_srun, my_salloc, my_squeue.
foreach fed_cmd {scontrol sacctmgr sbatch srun salloc squeue} {
	set my_$fed_cmd "${fed_slurm_base}/$fedc1/bin/$fed_cmd"
}
unset fed_cmd

# Seconds to sleep while waiting for a drain (or drain+remove) to complete.
set drain_time 10
#
# Check accounting config and bail if not found.
#
# The test requires slurmdbd-backed accounting storage; skip otherwise.
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test can't be run without a usable AccountingStorageType"
}
# The test modifies federation/cluster records with sacctmgr, so the caller
# must be an accounting administrator.
if {[get_admin_level] ne "Administrator"} {
	skip "This test can't be run without being an Accounting administrator. Use: sacctmgr mod user \$USER set admin=admin"
}
#
# Check the federation listing reported by sacctmgr.
#
# Runs "sacctmgr show fed $fed_name --noheader" and fails the test unless
# the output matches the given pattern.
#
# regex - regular expression the sacctmgr output has to match
#
proc sacctmgr_show { regex } {
	global my_sacctmgr fed_name

	set matches 0
	spawn $my_sacctmgr show fed $fed_name --noheader
	expect {
		-re "$regex" {
			incr matches
		}
		timeout {
			# The spawned tool is sacctmgr, so report it as such.
			fail "sacctmgr show not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		fail "Didn't match regex ($regex: $matches)"
	}
}
#
# Set the federation state of the first cluster ($fedc1) through sacctmgr.
#
# Runs "sacctmgr mod cluster $fedc1 set fedstate=<state> -i" and fails the
# test unless all four expected output lines are seen: "Setting", the
# "FedState = <state>" line, "Modified cluster..." and the cluster name.
#
# state - federation state to set (e.g. ACTIVE, INACTIVE, DRAIN)
#
proc sacctmgr_mod { state } {
	global my_sacctmgr eol fedc1

	# Quote regex metacharacters so that a state such as DRAIN+REMOVE is
	# matched literally in the echoed "FedState = ..." line.
	regsub -all {\W} $state {\\&} state_re

	set matches 0
	spawn $my_sacctmgr mod cluster $fedc1 set fedstate=$state -i
	expect {
		-re "Setting$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+FedState\\s+=\\s+$state_re" {
			incr matches
			exp_continue
		}
		-re "Modified cluster...$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+$fedc1$eol" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr mod not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 4} {
		fail "Unexpected error (got $matches)"
	}
}
#
# Check the output of "scontrol show <category>" on the first cluster.
#
# category - entity passed to "scontrol show" (this test uses fed and jobs)
# regex    - regular expression the output has to match
#
# The test fails when scontrol times out or the pattern is not seen.
#
proc scontrol { category regex } {
	global my_scontrol

	set matches 0
	spawn $my_scontrol show $category
	expect {
		-re "$regex" { incr matches }
		timeout      { fail "scontrol not responding" }
		eof          { wait }
	}

	# Exactly one match is the only accepted outcome.
	if {$matches == 1} {
		return
	}
	fail "Didn't match regex ($regex: $matches)"
}
#
# Submit an exclusive, requeueable "sleep 10" batch job that asks for
# $node_count nodes, using the first cluster's sbatch.
#
# mfed  - when true, add -M$fedc1 to the submission
# error - when true, the submission is expected to be rejected with
#         "System submissions disabled" instead of being accepted
#
# Returns the job id reported by sbatch, or 0 when a rejection was expected.
# The test fails if the expected output is not seen.
#
proc sbatch { mfed error } {
	global number node_count my_sbatch fedc1

	# Build the command as a real Tcl list so every argument reaches sbatch
	# intact, rather than relying on a string being re-parsed as a list.
	set command [list $my_sbatch -N$node_count --exclusive \
			 --output=/dev/null --error=/dev/null -t300 --requeue \
			 --wrap "sleep 10"]
	if {$mfed} {
		lappend command -M$fedc1
	}

	if {$error} {
		set regex "sbatch: error: Batch job submission failed: System submissions disabled"
		set collect 0
	} else {
		set regex "Submitted batch job ($number)"
		set collect 1
	}

	set matches 0
	set job_id 0
	spawn {*}$command
	expect {
		-re "$regex" {
			incr matches
			if {$collect} {
				set job_id $expect_out(1,string)
			}
		}
		timeout {
			fail "sbatch not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		fail "Batch submit failure"
	}

	# Pause before returning so that later squeue checks see the new job.
	sleep 3
	return $job_id
}
#
# Check the compact job states reported by the first cluster's squeue.
#
# Runs "squeue --noheader -Ostatecompact:.5" and fails the test unless the
# output matches the given pattern.
#
# regex - regular expression the squeue output has to match
#
proc squeue { regex } {
	# Use the shared file-level command path, like the other tool wrappers.
	global my_squeue

	set matches 0
	spawn $my_squeue --noheader -Ostatecompact:.5
	expect {
		-re "$regex" {
			incr matches
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		fail "Unexpected error in squeue (expected $regex)"
	}
}
#
# Cancel every job owned by $user_name on all three federation clusters,
# then pause for five seconds.
#
proc cancel_federation_jobs { } {
	global scancel user_name fedc1 fedc2 fedc3

	set clusters "$fedc1,$fedc2,$fedc3"
	spawn $scancel -M$clusters --user $user_name
	expect eof {
		wait
	}
	sleep 5
}
#
# Return the clusters to a clean state: cancel the user's federation jobs,
# delete the test federation and remove this test's *.out files.
#
proc cleanup { } {
	global fed_name bin_bash bin_rm test_name

	cancel_federation_jobs
	delete_federations $fed_name

	# The wildcard has to be expanded by a shell, hence bash -c.
	set rm_cmd "$bin_rm -f $test_name*.out"
	exec $bin_bash -c $rm_cmd
}
# Start test
# Skip when the federation variables (fed_slurm_base, fedc1, fedc2, fedc3)
# are not configured in globals.local.
if {![check_federation_setup]} {
skip "This test can't be run without fed_slurm_base, fedc1, fedc2, fedc3 setup in globals.local"
}
# Skip when not all of the federation clusters are up.
if {![check_federation_up]} {
skip "This test can't be run without all clusters up"
}
# cancel_federation_jobs passes this name to "scancel --user".
set user_name [get_my_user_name]
# Remove existing setup: cancel leftover jobs, delete the federation and
# remove old output files.
cleanup
# Get number of nodes per cluster: the sbatch helper requests this many nodes
# (-N$node_count --exclusive) for every job it submits.
# NOTE(review): presumably all clusters have the same node count - confirm.
set node_count [llength [get_nodes_by_state idle,alloc,comp]]
log_info "################################################################"
log_info "Setup cluster features"
log_info "################################################################"

set job_id 0

# Create the federation from the first two clusters; both must show ACTIVE.
exec $my_sacctmgr add Federation $fed_name clusters=$fedc1,$fedc2 -i

sacctmgr_show ".+ACTIVE.+ACTIVE.+"
scontrol fed ".+ACTIVE.+ACTIVE.+"

# Submit two exclusive jobs and remember the id of the second one.
sbatch 1 0
set job_id [sbatch 1 0]

# With the first cluster INACTIVE, squeue must show one pending and one
# running job, and ten seconds later still a pending job.
# NOTE(review): presumably the running job has finished by then and the
# pending one is held back by the INACTIVE state - confirm.
sacctmgr_mod INACTIVE
squeue "PD\\s+R\\s+"
sleep 10
squeue "PD\\s+"

# New submissions of every kind must be rejected while INACTIVE.
sbatch 1 1

set matches 0
spawn $my_srun hostname
expect {
	-re "srun: error: Unable to allocate resources: System submissions disabled" {
		incr matches
		exp_continue
	}
	timeout {
		fail "srun not responding"
	}
	eof {
		wait
	}
}
if {$matches != 1} {
	fail "Unexpected error (got $matches)"
}

set matches 0
spawn $my_salloc hostname
expect {
	-re "salloc: error: Job submit/allocate failed: System submissions disabled" {
		incr matches
		exp_continue
	}
	timeout {
		fail "salloc not responding"
	}
	eof {
		wait
	}
}
if {$matches != 1} {
	fail "Unexpected error (got $matches)"
}
# Re-activate the first cluster and wait for the job that was pending during
# the INACTIVE phase to start running.
sacctmgr_mod ACTIVE
wait_for_fed_job $job_id RUNNING $fedc1,$fedc2
squeue "R\\s+"
# Submissions are accepted again while the cluster is ACTIVE.
sbatch 1 0
squeue "R\\s+"
# Queue two more jobs before draining.
sbatch 1 0
sbatch 1 0
# In DRAIN state new submissions must be rejected.
sacctmgr_mod DRAIN
sbatch 1 1
# Both sacctmgr and scontrol must report DRAIN for the first cluster and
# ACTIVE for the second.
sacctmgr_show "\\s+DRAIN.+ACTIVE\\s+"
scontrol fed ".+DRAIN.+ACTIVE.+"
# A pending and a running job must be visible now, and a running job ten
# seconds later.
# NOTE(review): presumably the already queued jobs keep running while the
# cluster drains - confirm.
squeue "PD\\s+R\\s+"
sleep 10
squeue "R\\s+"
sleep 10
# NOTE(review): an empty pattern matches immediately, so this call does not
# actually assert that the queue is empty - confirm intent.
squeue ""
log_info "Sleep $drain_time seconds until system is drained"
sleep $drain_time
# After the wait the first cluster must be reported as DRAINED.
scontrol fed ".+DRAINED.+ACTIVE.+"
sacctmgr_show "\\s+DRAINED.+ACTIVE\\s+"
# Back to ACTIVE: a job submission must succeed and the job must run.
sacctmgr_mod ACTIVE
sbatch 1 0
squeue "R\\s+"
# Put the first cluster into DRAIN+REMOVE. The pattern is written out here
# because the "+" has to be escaped in the FedState regular expression.
set matches 0
spawn $my_sacctmgr mod cluster $fedc1 set fedstate=DRAIN+REMOVE -i
expect {
	-re "Setting$eol" {
		incr matches
		exp_continue
	}
	-re "^\\s+FedState\\s+=\\s+DRAIN\\+REMOVE" {
		incr matches
		exp_continue
	}
	-re "Modified cluster...$eol" {
		incr matches
		exp_continue
	}
	-re "^\\s+$fedc1$eol" {
		incr matches
		exp_continue
	}
	timeout {
		fail "sacctmgr mod not responding"
	}
	eof {
		wait
	}
}
if {$matches != 4} {
	fail "Unexpected error (got $matches)"
}

# Submissions must be rejected while the cluster drains.
sbatch 1 1
# NOTE(review): an empty pattern matches immediately, so the squeue and
# "scontrol fed" calls with "" only check that the tools respond - confirm.
squeue ""
log_info "Sleep $drain_time seconds until system is removed"
sleep $drain_time
scontrol fed ""

# Only one ACTIVE cluster should be left in the federation listing.
sacctmgr_show "\\s+ACTIVE\\s+"

# Once the system is drained, the controller will mark all jobs as non-requeueable.
scontrol jobs "Requeue=0.+"

# Make sure we can submit to removed cluster.
set job_id [sbatch 1 0]
# Any bits set at position 26 or above mean the id is still a federated job
# id (see the failure message below), which a removed cluster must not issue.
if {$job_id >> 26} {
	fail "Removed cluster still giving federated job id"
}
wait_for_fed_job $job_id DONE $fedc1
# Add the first cluster back to the federation and check all four lines of
# sacctmgr's confirmation output.
set matches 0
spawn $my_sacctmgr mod fed $fed_name set clusters+=$fedc1 -i
expect {
	-re "Setting$eol"                      { incr matches; exp_continue }
	-re "^\\s+Cluster\\s+\\+=\\s+$fedc1"   { incr matches; exp_continue }
	-re "Modified federation...$eol"       { incr matches; exp_continue }
	-re "^\\s+$fed_name$eol"               { incr matches; exp_continue }
	timeout                                { fail "sacctmgr mod not responding" }
	eof                                    { wait }
}
if {$matches != 4} {
	fail "Unexpected error (got $matches)"
}

# Both clusters must be listed as ACTIVE again.
sacctmgr_show "\\s+ACTIVE.+ACTIVE\\s+"
# Drain the second cluster. This is done inline because sacctmgr_mod always
# targets the first cluster.
set matches 0
spawn $my_sacctmgr mod cluster $fedc2 set fedstate=DRAIN -i
expect {
	-re "Setting$eol"                  { incr matches; exp_continue }
	-re "^\\s+FedState\\s+=\\s+DRAIN"  { incr matches; exp_continue }
	-re "Modified cluster...$eol"      { incr matches; exp_continue }
	-re "^\\s+$fedc2$eol"              { incr matches; exp_continue }
	timeout                            { fail "sacctmgr mod not responding" }
	eof                                { wait }
}
if {$matches != 4} {
	fail "Unexpected error (got $matches)"
}

# The first cluster stays ACTIVE while the second reports DRAIN.
sacctmgr_show "\\s+ACTIVE.+DRAIN\\s+"
scontrol fed ".+ACTIVE.+DRAIN.+"

# Jobs submitted without -M must show up on the first cluster: the first one
# running, the second one pending behind it.
sbatch 0 0
squeue "R\\s+"
sbatch 0 0
squeue "PD\\s+R\\s+"