#!/usr/bin/env expect
############################################################################
# Purpose: Establish global state information for Slurm federation tests
#
# To define site-specific state information, set the values in a file
# named 'globals.local'. Those values will override any specified here.
# For example:
#
# $ cat globals.local
# set slurm_dir "/usr/local"
# set mpicc "/usr/local/bin/mpicc"
#
############################################################################
# Copyright (C) SchedMD LLC.
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the supplied file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
# Set if testing federations
cset fed_slurm_base ""
cset fedc1 ""
cset fedc2 ""
cset fedc3 ""
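#
# For federation tests, globals.local must also define the three test
# clusters and the per-cluster install base used above. A hypothetical
# example (paths and cluster names are illustrative only):
#
# $ cat globals.local
# set fed_slurm_base "/home/slurm/fed"
# set fedc1 "fed1"
# set fedc2 "fed2"
# set fedc3 "fed3"
#
# check_federation_setup (below) returns false if any of these are unset.
#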
proc check_federation_setup { } {
global fed_slurm_base fedc1 fedc2 fedc3
set rc true
if {$fed_slurm_base eq "" || $fedc1 eq "" || $fedc2 eq "" || $fedc3 eq ""} {
set rc false
}
return $rc
}
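#
# Create a federation with the given name and add the three test
# clusters ($fedc1, $fedc2 and $fedc3) to it with sacctmgr.
# IN: fed_name - name of the federation to create.
# RET: RETURN_SUCCESS, or non-zero on error.
#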
proc setup_federation { fed_name } {
global sacctmgr fedc1 fedc2 fedc3 eol
set rc $::RETURN_SUCCESS
spawn $sacctmgr -i add federation $fed_name
set matches 0
expect {
-re "Adding Federation\\(s\\)$eol" {
incr matches
exp_continue
}
-re "$fed_name$eol" {
incr matches
exp_continue
}
timeout {
fail "sacctmgr add not responding"
}
eof {
wait
}
}
if {!$rc && $matches != 2} {
log_error "Failed to create federation"
return $::RETURN_ERROR
}
set count 0
foreach cluster [list $fedc1 $fedc2 $fedc3] {
incr count
spawn $sacctmgr -i mod cluster $cluster set federation=$fed_name features=
set matches 0
expect {
-re "Setting$eol" {
incr matches
exp_continue
}
-re "^\\s+Feature\\s+=\\s+$eol" {
incr matches
exp_continue
}
-re "^\\s+Federation\\s+=\\s+$fed_name$eol" {
incr matches
exp_continue
}
-re "Modified cluster...$eol" {
incr matches
exp_continue
}
-re "^\\s+$cluster$eol" {
incr matches
exp_continue
}
timeout {
fail "sacctmgr add not responding"
}
eof {
wait
}
}
if {!$rc && $matches != 5} {
log_error "Failed to add $cluster to federation"
set rc $::RETURN_ERROR
break
}
if {$count > 1} {
sleep 5
}
}
return $rc
}
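#
# Check that the controller of the given cluster responds to
# "scontrol show config", using the per-cluster scontrol under
# $fed_slurm_base.
# IN: cname - name of the cluster to check.
# RET: true if the cluster responds, false otherwise.
#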
proc check_cluster_up { cname } {
set rc true
set matches 0
set timeout 2
global fed_slurm_base fedc1 fedc2 fedc3
set my_scontrol "${fed_slurm_base}/$cname/bin/scontrol"
log_user 0
set my_pid [spawn $my_scontrol show config]
expect {
"Configuration data as of" {
incr matches
}
timeout {
log_warn "$cname not responding"
slow_kill $my_pid
set rc false
}
eof {
wait
}
}
if {$matches != 1} {
log_error "$cname not responding"
set rc false
}
log_user 1
return $rc
}
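#
# Check that all three federation test clusters ($fedc1, $fedc2 and
# $fedc3) are up and responding.
# RET: true if all clusters respond, false otherwise.
#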
proc check_federation_up {} {
set rc true
global fedc1 fedc2 fedc3
if {![check_cluster_up $fedc1] ||
![check_cluster_up $fedc2] ||
![check_cluster_up $fedc3]} {
log_warn "This test can't be run if any clusters--$fedc1,\
$fedc2, or $fedc3--are down"
set rc false
}
return $rc
}
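#
# Delete the named federation(s) with sacctmgr.
# IN: names - name(s) of the federation(s) to delete.
# RET: RETURN_SUCCESS, or non-zero on error.
#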
proc delete_federations { names } {
global sacctmgr
set matches 0
set rc $::RETURN_SUCCESS
set object "federation"
spawn $sacctmgr -i delete $object $names
expect {
-re "privilege to perform this action" {
log_error "Don't have privileges"
set rc $::RETURN_ERROR
}
-re "(There was a problem|Unknown condition|Bad format on|Bad MaxWall|Unknown option)" {
log_error "There was a problem with the sacctmgr command"
set rc $::RETURN_ERROR
}
-re "Problem getting" {
log_error "There was a problem getting information from the database"
set rc $::RETURN_ERROR
}
-re "Problem adding" {
log_error "There was an unknown problem"
set rc $::RETURN_ERROR
}
-re "No associations" {
log_error "Your command didn't return anything"
set rc $::RETURN_ERROR
}
-re "Deleting $object" {
incr matches
exp_continue
}
-re " Nothing deleted" {
incr matches
exp_continue
}
timeout {
fail "sacctmgr delete not responding"
}
eof {
wait
}
}
if {!$rc && $matches != 1} {
log_error "sacctmgr had a problem deleting $object. Got $matches"
}
return $rc
}
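#
# Query sacctmgr for the clusters that belong to the given federation.
# IN: fed_name - name of the federation to query.
# RET: flattened array (from [array get]) mapping each cluster name to a
#      dict with keys id, host, port, features and state.
#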
proc get_clusterfed_info { fed_name } {
global sacctmgr eol
set matches 0
array set clusters {}
spawn $sacctmgr show cluster federation=$fed_name \
format="cluster%20,federation%20,id,controlhost,controlport,features,fedstate"
expect {
-re "Cluster\\s+Federation\\s+ID\\s+ControlHost\\s+ControlPort\\s+Features\\s+FedState $eol" {
incr matches
exp_continue
}
-re "\\s+(\\S+)\\s+$fed_name\\s+(\\d+)\\s+(\\S+)\\s+(\\d+)\\s+(\\S*)\\s+(\\S*) $eol" {
set clusters($expect_out(1,string)) [dict create id $expect_out(2,string) \
host $expect_out(3,string) \
port $expect_out(4,string) \
features $expect_out(5,string) \
state $expect_out(6,string)]
incr matches
exp_continue
}
timeout {
fail "sacctmgr add not responding"
}
eof {
wait
}
}
if {$matches < 2} {
fail "Didn't match enough clusters for federation ($fed_name) ($matches < 2)"
}
return [array get clusters]
}
#
# Add a single cluster to the given federation.
# IN: cname - name of cluster to add to federation.
# IN: fed_name - name of federation to add cluster to.
# RET: RETURN_SUCCESS, or non-zero on error.
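# EX: a typical call, assuming a federation named "feda" (hypothetical):
#     if {[add_cluster_to_fed $fedc1 "feda"] != $::RETURN_SUCCESS} {
#         fail "Unable to add $fedc1 to federation feda"
#     }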
#
proc add_cluster_to_fed {cname fed_name} {
global sacctmgr eol
set rc $::RETURN_SUCCESS
set matches 0
spawn $sacctmgr -i modify federation $fed_name set clusters+=$cname
expect {
-re "Setting$eol" {
incr matches
exp_continue
}
-re "Cluster\\s+ \\+= $cname$eol" {
incr matches
exp_continue
}
-re "^\\s+Modified federation...$eol" {
incr matches
exp_continue
}
-re "\\s+$fed_name$eol" {
incr matches
exp_continue
}
timeout {
fail "sacctmgr add not responding"
}
eof {
wait
}
}
if {$rc || $matches != 4} {
log_error "Failed to add $cname to $fed_name ($matches != 4)"
set rc $::RETURN_ERROR
}
return $rc
}
#
# Remove a single cluster from the given federation.
# IN: cname - name of cluster to remove from the federation.
# IN: fed_name - name of federation to remove cluster from.
# RET: RETURN_SUCCESS, or non-zero on error.
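# EX: a typical call, assuming a federation named "feda" (hypothetical):
#     if {[remove_cluster_from_fed $fedc1 "feda"] != $::RETURN_SUCCESS} {
#         fail "Unable to remove $fedc1 from federation feda"
#     }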
#
proc remove_cluster_from_fed {cname fed_name} {
global sacctmgr eol
set rc $::RETURN_SUCCESS
set matches 0
spawn $sacctmgr -i modify federation $fed_name set clusters-=$cname
expect {
-re "Setting$eol" {
incr matches
exp_continue
}
-re "Cluster\\s+ -= $cname$eol" {
incr matches
exp_continue
}
-re "^\\s+Modified federation...$eol" {
incr matches
exp_continue
}
-re "\\s+$fed_name$eol" {
incr matches
exp_continue
}
timeout {
fail "sacctmgr add not responding"
}
eof {
wait
}
}
if {$rc || $matches != 4} {
log_error "Failed to remove $cname from $fed_name"
set rc $::RETURN_ERROR
}
return $rc
}
################################################################
#
# NAME
# wait_for_fed_job - waits for a job to reach the desired state
#
# SYNOPSIS
# wait_for_fed_job ?options? job_id desired_state clusters
#
# DESCRIPTION
# Wait for a previously submitted Slurm job to reach the desired state.
#
# OPTIONS
# -timeout <integer_number>
# time in seconds to wait for the job to be in the desired state
# before timing out (default is 360)
# -pollinterval <integer_number>
# time in seconds between each job state check (default is 1)
#
# ARGUMENTS
# job_id
# The Slurm job id of a job we want to wait for.
# desired_state
# The state you want the job to attain before
# returning. Currently supports:
# DONE          any terminated state
# PENDING       job is pending
# REVOKED       job is revoked
# RUNNING       job is running
# SPECIAL_EXIT  job is in the special exit state
# SUSPENDED     job is suspended
# clusters
# The comma-separated list of clusters to wait on. If empty (""), the
# default clusters $fedc1, $fedc2 and $fedc3 are used.
#
# RETURN VALUE
# The name of the first cluster on which the job is found
# in the desired state. Empty string indicates a failure.
#
# NOTE
# We sleep for two seconds before reporting that a job is
# DONE to give its stdout/stderr files time to be written.
#
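# EXAMPLE
#    A sketch of a typical call; the timeout and cluster list are
#    illustrative only:
#
#        set cluster [wait_for_fed_job -timeout 60 $job_id RUNNING "$fedc1,$fedc2"]
#        if {$cluster eq ""} {
#            fail "Job $job_id did not start on $fedc1 or $fedc2"
#        }
#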
################################################################
proc wait_for_fed_job args {
global scontrol fedc1 fedc2 fedc3
set timeout 360
set poll_interval 1
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - poll_interval]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count != 3} {
fail "Invalid number of arguments ($argument_count): $args"
} else {
lassign $args job_id desired_state clusters
}
# First verify that desired_state is supported
switch $desired_state {
"DONE" {}
"PENDING" {}
"REVOKED" {}
"RUNNING" {}
"SUSPENDED" {}
"SPECIAL_EXIT" {}
default {
log_error "Invalid desired state: $desired_state"
return ""
}
}
if {$job_id == 0} {
log_error "Invalid job ID: $job_id"
return ""
}
set my_delay 0
set spec_clusters [list $fedc1 $fedc2 $fedc3]
if {$clusters ne ""} {
set spec_clusters [split $clusters ","]
}
log_debug "Checking for job '$job_id' in state '$desired_state' on [join $spec_clusters ,]"
while 1 {
foreach cluster $spec_clusters {
log_debug "Checking $cluster"
set fd [open "|$scontrol -M$cluster --local -a -o show job $job_id"]
gets $fd line
catch {close $fd}
if {[regexp {JobState\s*=\s*(\w+)} $line foo state] != 1} {
log_error "$desired_state not found on cluster $cluster"
continue
}
switch $state {
"NOT_FOUND" -
"CANCELLED" -
"DEADLINE" -
"FAILED" -
"TIMEOUT" -
"NODE_FAIL" -
"PREEMPTED" -
"COMPLETED" {
if {$desired_state eq "DONE"} {
log_debug "Job $job_id is DONE ($state) on $cluster"
sleep 2
return $cluster
}
if {$desired_state eq "RUNNING"} {
log_warn "Job $job_id is $state but we wanted RUNNING"
}
if {$desired_state eq "SUSPENDED"} {
log_warn "Job $job_id is $state but we wanted SUSPENDED"
}
return ""
}
"PENDING" {
if {$desired_state eq "PENDING"} {
log_debug "Job $job_id is PENDING on $cluster"
return $cluster
}
log_warn "Job $job_id is in state $state, but desired state $desired_state"
}
"REVOKED" {
if {$desired_state eq "REVOKED"} {
log_debug "Job $job_id is REVOKED on $cluster"
return $cluster
}
log_warn "Job $job_id is in state $state, but desired state $desired_state"
}
"RUNNING" {
if {$desired_state eq "RUNNING"} {
log_debug "Job $job_id is RUNNING on $cluster"
return $cluster
}
log_warn "Job $job_id is in state $state, but desired state $desired_state"
}
"SPECIAL_EXIT" {
if {$desired_state eq "SPECIAL_EXIT"} {
log_debug "Job $job_id is SPECIAL_EXIT on $cluster"
return $cluster
}
log_warn "Job $job_id is in state $state, but desired state $desired_state"
}
"SUSPENDED" {
if {$desired_state eq "SUSPENDED"} {
log_debug "Job $job_id is SUSPENDED on $cluster"
return $cluster
}
log_warn "Job $job_id is in state $state, but desired state $desired_state"
}
default {
log_debug "Job $job_id is in state $state. Desired state was $desired_state"
}
}
}
if { $my_delay > $timeout } {
log_error "Timeout waiting for job state $desired_state"
return ""
}
exec sleep $poll_interval
set my_delay [expr {$my_delay + $poll_interval}]
}
}