blob: f09762cb01558f2d1f48acb15da83f995797415d [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Establish global state information for Slurm test suite
#
# To define site-specific state information, set the values in a file
# named 'globals.local'. Those values will override any specified here.
# for example:
#
# $ cat globals.local
# set slurm_dir "/usr/local"
# set build_dir "/home/mine/SLURM/build_smd"
# set src_dir "/home/mine/SLURM/slurm.git"
# set mpicc "/usr/local/bin/mpicc"
#
# If you want to have more than one test going at the same time for multiple
# installs you can have multiple globals.local files and set the
# SLURM_LOCAL_GLOBALS_FILE env var, and have that set to the correct
# globals.local file for your various installs. The file can be named anything,
# not just globals.local.
#
############################################################################
# Copyright (C) 2002-2007 The Regents of the University of California.
# Copyright (C) 2008-2010 Lawrence Livermore National Security.
# Copyright (C) SchedMD LLC.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# Additions by Joseph Donaghy <donaghy1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the supplied file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
# Avoid sourcing this file multiple times
if {[info procs exit] eq "exit"} {
return
}
# Read in unified testsuite properties from testsuite/testsuite.conf
set unified_testsuite_dir [file dirname [file normalize "[info script]/.."]]
set unified_testsuite_conf_file "$unified_testsuite_dir/testsuite.conf"
if {[info exists env(SLURM_TESTSUITE_CONF)]} {
set unified_testsuite_conf_file $env(SLURM_TESTSUITE_CONF)
}
set unified_testsuite_config_dict [dict create]
if [file readable "$unified_testsuite_conf_file"] {
set fp [open "$unified_testsuite_conf_file" r]
set file_data [read $fp]
close $fp
foreach line [split $file_data "\n"] {
if [regexp {^\s*(\S+)\s*=\s*(\S+)\s*$} $line {} parameter_name parameter_value] {
dict set unified_testsuite_config_dict [string tolower $parameter_name] $parameter_value
}
}
} else {
puts stderr "Warning: Unable to open the unified testsuite configuration file ($unified_testsuite_conf_file) for reading. This file can be created from a copy of the autogenerated sample found in BUILDDIR/testsuite/testsuite.conf.sample. By default, this file is expected to be found in SRCDIR/testsuite ($unified_testsuite_dir). If placed elsewhere, set the SLURM_TESTSUITE_CONF environment variable to the full path of your testsuite.conf file."
flush stderr
}
if [dict exists $unified_testsuite_config_dict "slurmsourcedir"] {
set src_dir [dict get $unified_testsuite_config_dict "slurmsourcedir"]
}
if [dict exists $unified_testsuite_config_dict "slurmbuilddir"] {
set build_dir [dict get $unified_testsuite_config_dict "slurmbuilddir"]
}
if [dict exists $unified_testsuite_config_dict "slurminstalldir"] {
set slurm_dir [dict get $unified_testsuite_config_dict "slurminstalldir"]
}
global sacctmgr sacct salloc sattach sbatch sbcast scancel scontrol sinfo
global smd squeue sreport srun sstat strigger
################################################################
#
# NAME
# cset - conditional set
#
# SYNOPSIS
# cset name value
#
# DESCRIPTION
# Conditional set. Only set variable if variable does not yet exist.
#
# Input: name -- name of the variable to set
# value -- value to set to 'name'
#
################################################################
proc cset {name value} {
if {![uplevel 1 info exists $name]} {
upvar $name tmp
set tmp $value
}
}
#
# Defining colors here to be able to use them in globals.local.
# By default, these colors are bold
#
set COLOR_RED "\033\[1;31m"
set COLOR_RED_NORMAL "\033\[31m"
set COLOR_ORANGE "\033\[1;38;5;208m"
set COLOR_YELLOW "\033\[1;33m"
set COLOR_GREEN "\033\[1;32m"
set COLOR_BLUE "\033\[1;34m"
set COLOR_MAGENTA "\033\[1;35m"
set COLOR_CYAN "\033\[1;36m"
set COLOR_NONE "\033\[0m"
cset local_globals_file "./globals.local"
# Log level "enum"
# Define log levels here so they are available in globals.local
set LOG_LEVEL_QUIET 0
set LOG_LEVEL_FATAL 1
set LOG_LEVEL_ERROR 2
set LOG_LEVEL_WARNING 3
set LOG_LEVEL_INFO 4
set LOG_LEVEL_PASS 4
set LOG_LEVEL_COMMAND 4
set LOG_LEVEL_DEBUG 5
set LOG_LEVEL_TRACE 6
if {[info exists env(SLURM_LOCAL_GLOBALS_FILE)]} {
set local_globals_file $env(SLURM_LOCAL_GLOBALS_FILE)
}
if [file exists $local_globals_file] {
source $local_globals_file
}
#
# Specify the slurm install directory.
# Used to locate binaries, libraries, and header files.
#
cset slurm_dir "/usr"
cset build_dir "../../"
cset src_dir "../../"
cset config_h "${build_dir}/config.h"
cset sacctmgr "${slurm_dir}/bin/sacctmgr"
cset sacct "${slurm_dir}/bin/sacct"
cset salloc "${slurm_dir}/bin/salloc"
cset sattach "${slurm_dir}/bin/sattach"
cset sbatch "${slurm_dir}/bin/sbatch"
cset sbcast "${slurm_dir}/bin/sbcast"
cset scancel "${slurm_dir}/bin/scancel"
cset scontrol "${slurm_dir}/bin/scontrol"
cset sdiag "${slurm_dir}/bin/sdiag"
cset sgather "${slurm_dir}/bin/sgather"
cset sh5util "${slurm_dir}/bin/sh5util"
cset sinfo "${slurm_dir}/bin/sinfo"
cset smd "${slurm_dir}/bin/smd"
cset sprio "${slurm_dir}/bin/sprio"
cset squeue "${slurm_dir}/bin/squeue"
cset srun "${slurm_dir}/bin/srun"
cset sreport "${slurm_dir}/bin/sreport"
cset sshare "${slurm_dir}/bin/sshare"
cset sstat "${slurm_dir}/bin/sstat"
cset strigger "${slurm_dir}/bin/strigger"
cset slurmd "${slurm_dir}/sbin/slurmd"
cset slurmrestd "${slurm_dir}/sbin/slurmrestd"
cset pbsnodes "${slurm_dir}/bin/pbsnodes"
cset qdel "${slurm_dir}/bin/qdel"
cset qstat "${slurm_dir}/bin/qstat"
cset qsub "${slurm_dir}/bin/qsub"
cset qalter "${slurm_dir}/bin/qalter"
cset qrerun "${slurm_dir}/bin/qrerun"
cset seff "${slurm_dir}/bin/seff"
cset lsid "${slurm_dir}/bin/lsid"
cset bjobs "${slurm_dir}/bin/bjobs"
cset bkill "${slurm_dir}/bin/bkill"
cset bsub "${slurm_dir}/bin/bsub"
cset influx "/usr/bin/influx"
# If using MPICH-2 or other version of MPI requiring pmi libary, use this
#cset mpicc "/home/jette/mpich2-install/bin/mpicc"
#cset use_pmi 1
# OR for other versions of MPICH, use this
cset mpicc "/usr/local/bin/mpicc"
cset nvcc "/usr/bin/nvcc"
cset use_pmi 0
#cset upcc "/usr/local/bin/upcc"
cset upcc "/usr/bin/xlupc"
cset oshcc "/usr/local/bin/oshcc"
cset mpirun "mpirun"
cset totalviewcli "/usr/local/bin/totalviewcli"
# Set if using "--enable-memory-leak-debug" configuration option
cset enable_memory_leak_debug 0
# test_prompt: to be used as prompt for interactive shells (see proc reset_bash_prompt)
set test_prompt "TEST_PROMPT: "
#
# Specify locations of other executable files used
# Only the shell names (e.g. bin_bash) must be full pathnames
#
cset bin_awk "awk"
cset bin_bash [exec which bash | tail -n 1]
cset bin_cat "cat"
cset bin_cc "gcc"
cset bin_chmod "chmod"
cset bin_cmp "cmp"
cset bin_cp "cp"
cset bin_date "date"
cset bin_diff "diff"
cset bin_echo "echo"
cset bin_env "env"
cset bin_file "file"
cset bin_id "id"
cset bin_ip "ip"
cset bin_jq "jq"
cset bin_grep "grep"
cset bin_head "head"
cset bin_ln "ln"
cset bin_perldoc "/usr/bin/perldoc"
cset bin_py3 "python3"
cset bin_oasgen "openapi-generator-cli"
# Don't user $bin_hostname, use $bin_printenv SLURMD_NODENAME
cset bin_hostname "hostname"
cset bin_kill "kill"
cset bin_lscpu "lscpu"
cset bin_make "make"
cset bin_mv "mv"
cset bin_od "od"
cset bin_pkill "pkill"
cset bin_printenv "printenv"
cset bin_ps "ps"
cset bin_pwd "pwd"
cset bin_rm "rm"
cset bin_sed "sed"
cset bin_sleep "sleep"
cset bin_sort "sort"
cset bin_socat "socat"
cset bin_sum "sum"
cset bin_sudo "sudo"
cset bin_systemd_detect_virt "systemd-detect-virt"
cset bin_touch "touch"
cset bin_true "true"
cset bin_uname "uname"
cset bin_uniq "uniq"
cset bin_unshare "unshare"
cset bin_virtualenv "virtualenv"
cset bin_wc "wc"
#
# Let the commands complete without expect timing out waiting for a
# response. Single node jobs submitted to the default partition should
# be initiated within this number of seconds.
# for interactive slurm jobs: cset timeout $max_job_delay
#
cset max_job_delay 120
#
# Specify the maximum number of tasks to use in the stress tests.
#
cset max_stress_tasks 4
#
# The error message that the "sleep" command prints when we run "sleep aaa".
#
cset sleep_error_message "(invalid time interval)|(bad character in argument)|(usage: sleep seconds)"
# Force LANG, as the expect tests aren't localized
set ::env(LANG) "en_US.UTF-8"
# Testsuite level variables
cset testsuite_dir "[file dirname [file normalize [info script]]]"
cset testsuite_shared_dir "[$bin_pwd]"
# Testsuite non-privileged user (set it in globals.local)
cset testsuite_user ""
# Testsuite log variables
cset testsuite_log_level $LOG_LEVEL_DEBUG
cset testsuite_log_format "\[%{timestamp}s.%{msecs}03d] %{loglevel}-7s %{message}s \(%{backtrace}s)"
cset testsuite_time_format "%Y-%m-%d %H:%M:%S"
# Default to using color if writing to a terminal and not if writing to a file
cset testsuite_colorize [dict exists [fconfigure stdout] -mode]
cset testsuite_color_fatal $COLOR_RED
cset testsuite_color_error $COLOR_RED_NORMAL
cset testsuite_color_warn $COLOR_ORANGE
cset testsuite_color_info $COLOR_YELLOW
cset testsuite_color_pass $COLOR_GREEN
cset testsuite_color_command $COLOR_CYAN
cset testsuite_color_debug $COLOR_BLUE
cset testsuite_color_trace $COLOR_MAGENTA
cset testsuite_color_header $COLOR_NONE
cset testsuite_color_success $COLOR_GREEN
cset testsuite_color_failure $COLOR_RED
cset testsuite_color_skipped $COLOR_ORANGE
# Set to true to cause the first subtest failure to immediately end the test
cset testsuite_subtest_fatal false
# Set to all, fail_skip, fail or none print datails of subtests and testprocs
cset testsuite_subtest_details fail
cset testsuite_testproc_details fail
cset testsuite_testproc_log_calls ignore_skips
# To automatically call cleanup or not when ending the test
cset testsuite_cleanup_on_failure true
if {[info exists env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE)]} {
set testsuite_cleanup_on_failure $env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE)
}
# To avoid potential infinite loops due calls to fail/pass/skip inside
# custom cleanup procs (_test_fini should be called only once)
set _test_fini_called false
# Testproc internal variables
set _testproc_included [list]
set _testproc_excluded [list]
set _testproc_pass_list [list]
set _testproc_skip_list [list]
set _testproc_fail_list [list]
set _testproc_messages [dict create]
set _testproc_skip_next false
set _testproc_skip_reason ""
set _incomplete_reason ""
set _subtest_messages [dict create]
set _subtest_pass_count 0
set _subtest_skip_count 0
set _subtest_fail_count 0
set STATUS_PASS 0
set STATUS_FAIL 1
set STATUS_SKIP -1
# Final test status
set test_status $STATUS_PASS
# Other common variables
set re_word_str "\\S+"
set digit "\\d"
set eol "\r?\n"
set float "\\d+\\.?\\d*"
set number "\\d+"
set format_time "\\d+\\:\\d+\\:\\d+"
set number_with_suffix "\\d+\[KM\]*"
set slash "/"
set whitespace "\\s+"
set controlmachine_regex "\\S+"
# Any characters except ( , : newline
set no_delim "\[^(,:\r\n\]"
set no_delim_slash "\[^(,:/\r\n\]"
# The first group matches GRES name
# The second **optional** group matches GRES type.
# The third group matches GRES count.
# Test out the regex here: https://regex101.com/r/FlNYKM/7
set gres_regex "($no_delim_slash*):($no_delim*)?:?($no_delim*)?:?($no_delim*)"
#basic #defines in slurm.h
set NO_VAL 0xfffffffe
set INFINITE 0xffffffff
set SLURM_MAX_NORMAL_STEP_ID 0xfffffff0
set SLURM_EXTERN_CONT 0xfffffffc
set SLURM_BATCH_SCRIPT 0xfffffffb
#
# Global variable used in multiple functions in "globals" file
#
set gpu_sock_list {}
#
# Procedure return values
#
set RETURN_SUCCESS 0
set RETURN_ERROR 1
set RETURN_TIMEOUT 110 ; # ETIMEDOUT
################################################################
#
# NAME
# fail - fails a test
#
# SYNOPSIS
# fail message
#
# DESCRIPTION
# To be used when an error is fatal for the test. This routine
# prints the specified error message, optionally cleans up, prints
# a final test failure message, and exits the test with exit code 1.
#
# ENVIRONMENT
# Whether or not the cleanup procedure is called depends on the setting
# of the $testsuite_cleanup_on_failure set in the globals.local file or
# overridden with the SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment
# variable.
#
# NOTE
# DO NOT call this within your local cleanup procedure.
#
################################################################
proc fail { message } {
global _incomplete_reason STATUS_FAIL
# Avoid recursive calls from within cleanup
if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} {
log_error "Local cleanup shouldn't call pass, fail or skip"
return
}
log_fatal $message
set _incomplete_reason $message
# _test_fini will handle cleanup and print the failure message.
_test_fini $STATUS_FAIL
}
################################################################
#
# NAME
# skip - skips a test
#
# SYNOPSIS
# skip message
#
# DESCRIPTION
# To be used when a precondition for the test fails and the test
# should be skipped. This routine prints the specified warning message,
# calls the cleanup procedure if defined, prints a final test skipped
# message, and exits the test with exit code -1 (aka 255).
#
# NOTE
# DO NOT call this within your local cleanup procedure.
#
################################################################
proc skip { message } {
global _incomplete_reason STATUS_SKIP
# Avoid recursive calls from within cleanup
if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} {
log_error "Local cleanup shouldn't call pass, fail or skip"
return
}
log_warn $message
set _incomplete_reason $message
# _test_fini will handle cleanup and print the skipped message.
_test_fini $STATUS_SKIP
}
################################################################
#
# NAME
# pass - passes a test
#
# SYNOPSIS
# pass
#
# DESCRIPTION
# To be used when a test passes and should complete with success.
# This routine calls the cleanup procedure if defined, prints a final
# test success message, and exits with exit code 0.
#
# NOTE
# DO NOT call this within your local cleanup procedure.
#
################################################################
proc pass { } {
global STATUS_PASS
# Avoid recursive calls from within cleanup
if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} {
log_error "Local cleanup shouldn't call pass, fail or skip"
return
}
# _test_fini will handle cleanup and print the success message.
_test_fini $STATUS_PASS
}
################################################################
#
# NAME
# subpass - registers a passing subtest result
#
# SYNOPSIS
# subpass ?description?
#
# DESCRIPTION
# Increments the subtest pass count and logs a passing subtest message
#
# ARGUMENTS
# description
# A single-line string describing the subtest being verified
#
################################################################
proc subpass args {
global _subtest_fail_count _subtest_pass_count _subtest_skip_count
global _subtest_messages
set description ""
set argument_count [llength $args]
if {$argument_count == 1} { set args [lassign $args description] }
if {$argument_count > 1} {
fail "Too many arguments ($argument_count): $args"
}
set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1]
incr _subtest_pass_count
set message [format "Subtest %2d passed" $subtest_count]
if {$description ne ""} { append message " : $description" }
log_pass $message
dict set _subtest_messages $subtest_count [list pass $message]
}
################################################################
#
# NAME
# subfail - registers a failing subtest result
#
# SYNOPSIS
# subfail ?options? ?description? ?diagnostics?
#
# DESCRIPTION
# Increments the subtest failure count and logs a failing subtest message
#
# OPTIONS
# -fatal
# Causes this subtest failure to be fatal, ending the test
# ARGUMENTS
# description
# A single-line string describing the condition being verified
# diagnostics
# A string providing additional diagnostic information that will
# be included with the log message
#
# ENVIRONMENT
# testsuite_subtest_fatal
# Specifies whether first failing subtest aborts the test
#
################################################################
proc subfail args {
global _subtest_fail_count _subtest_pass_count _subtest_skip_count
global testsuite_subtest_fatal
global _subtest_messages
set description ""
set fatal false
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fatal {set fatal true; set args [lrange $args 1 end]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count >= 1} { set args [lassign $args description] }
set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1]
incr _subtest_fail_count
set message [format "Subtest %2d failed" $subtest_count]
if {$description ne ""} { append message " : $description" }
if [llength $args] { append message " (" [join $args ", "] ")" }
if {$fatal || $testsuite_subtest_fatal} {
fail $message
} else {
log_error $message
}
dict set _subtest_messages $subtest_count [list fail $message]
}
################################################################
#
# NAME
# subskip - registers a skipped subtest result
#
# SYNOPSIS
# subskip ?options? ?description?
#
# DESCRIPTION
# Increments the subtest skip count and logs a skipped subtest message
#
# OPTIONS
# -count NUMBER
# When used with -skip, indicates the number of subtests that
# were skipped
# ARGUMENTS
# description
# A single-line string describing the reason the subtest is
# being skipped
#
################################################################
proc subskip args {
global _subtest_fail_count _subtest_pass_count _subtest_skip_count
global _subtest_messages
set description ""
set count 1
set nolog 0
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-nolog {
set nolog 1
# Shift args down one
set args [lassign $args -]
}
-count {set args [lassign $args - count]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count == 1} { set args [lassign $args description] }
if {$argument_count > 1} {
fail "Too many arguments ($argument_count): $args"
}
set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1]
incr _subtest_skip_count $count
if {$count > 1} {
set message "Subtest $subtest_count-[expr $subtest_count+$count-1] skipped"
} else {
set message [format "Subtest %2d skipped" $subtest_count]
}
if {$description ne ""} { append message " : $description" }
if {$nolog != 1} {
log_warn $message
}
dict set _subtest_messages [expr $subtest_count] [list skip $message]
}
################################################################
#
# NAME
# get_subtest_fail_count - returns the current subtest failure count
#
# SYNOPSIS
# get_subtest_fail_count
#
################################################################
proc get_subtest_fail_count {} {
global _subtest_fail_count
return $_subtest_fail_count
}
################################################################
#
# NAME
# reset_bash_prompt - used in expect body to reset bash prompt
#
# SYNOPSIS
# reset_bash_prompt
#
# DESCRIPTION
# The procedure must be called within an expect body to
# reset the bash prompt to the $test_prompt.
#
# EXAMPLE
# spawn $salloc $bin_bash
# expect {
# -re "Granted job allocation ($number)" {
# set job_id $expect_out(1,string)
# reset_bash_prompt
# exp_continue
# }
# -re $test_prompt {}; # Expect initial test prompt from bash
# timeout {
# fail "salloc not responding or prompt not recognized"
# }
# }
#
################################################################
proc reset_bash_prompt {} {
global testsuite_dir
upvar spawn_id sid
send -i $sid "source $testsuite_dir/reset_bash_prompt.sh\n"
}
################################################################
#
# NAME
# print_time - prints the current date and time
#
# SYNOPSIS
# print_time
#
################################################################
proc print_time { } {
global bin_date
spawn $bin_date
expect {
eof {
wait
}
}
return
}
################################################################
#
# NAME
# dict_getdef - 'dict get' with ability to specify the default value
#
# SYNOPSIS
# dict_getdef dictionary_value key default_value
#
# DESCRIPTION
# Tcl < 8.7 lacks a built in 'dict get' with ability to specify the
# default value. Tcl 8.7 adds a dict getdef.
# This proc returns the value from the dictionary corresponding to the
# keys if it exists, or the default value otherwise.
#
# EXAMPLE
# dict_getdef $option_dict action "warn"
#
# SOURCE
# https://core.tcl-lang.org/tips/doc/trunk/tip/342.md
# https://core.tcl-lang.org/tcl/tktview/2370575
#
################################################################
proc dict_getdef {D args} {
if {[dict exists $D {*}[lrange $args 0 end-1]]} then {
dict get $D {*}[lrange $args 0 end-1]
} else {
lindex $args end
}
}
################################################################
#
# NAME
# _line_trace - returns an abbreviated call stack trace with line numbers
#
# SYNOPSIS
# _line_trace
#
################################################################
proc _line_trace {} {
set line_trace ""
set first_entry true
for {set f [expr [info frame] - 3]} {$f >= 1} {incr f -1} {
set frame_dict [info frame $f]
if [dict exists $frame_dict file] {
if [regexp uplevel [dict get $frame_dict cmd]] {
continue
}
if {$first_entry} {
set first_entry false
} else {
append line_trace ","
}
if [dict exists $frame_dict proc] {
set proc [namespace tail [dict get $frame_dict proc]]
if {$proc ne ""} {
append line_trace "$proc\@"
}
}
append line_trace [file tail [dict get $frame_dict file]]
if [dict exists $frame_dict line] {
append line_trace ":[dict get $frame_dict line]"
}
}
}
return $line_trace
}
################################################################
#
# NAME
# tolerance - determines whether a value is within a specified tolerance
#
# SYNOPSIS
# tolerance expected observed tolerance_expression
#
# ARGUMENTS
# expected
# the expected (numeric) value
# observed
# the observed (numeric) value
# tolerance_expression
# a string of the form: [~][+|-]<tolerance>[%]
#
# DESCRIPTION
# tolerance
# A numeric tolerance
# symmetry
# By default the permitted range of values is symetric:
# [expected - tolerance, expected + tolerance]
# If the + sign is specified, the tolerance is limited to the
# the higher side only:
# [expected, expected + tolerance]
# If the - sign is specified, the tolerance is limited to the
# the lower side only:
# [expected - tolerance, expected]
# percent
# By default the permitted range is computed as absolute values:
# [expected - tolerance, expected + tolerance]
# If % is specified, the permitted range is computed as a
# percentage of the expected value:
# [expected*(1-tolerance/100), expected*(1+tolerance/100)]
# exclusivity
# By default the permitted range of values is inclusive, ie
# the min and max tolerated values are included in the range:
# [expected - tolerance, expected + tolerance]
# If ~ (exclusive) is specified, the tolerance limits are
# exclusive, ie the min and max tolerated values are excluded:
# (expected - tolerance, expected + tolerance)
# expression
# any combination of symetry, percent and exclusivity is allowed
#
# RETURN VALUE
# Returns true if the observed value is within the specified tolerance
# range of the expected value, otherwise false
#
# EXAMPLES
# The indicated tolerance_expression is true if:
# "5" expected - 5 <= observed <= expected + 5
# "-5" expected - 5 <= observed <= expected
# "+5" expected <= observed <= expected + 5
# "5%" expected - 5% <= observed <= expected + 5%
# "~5" expected - 5 < observed < expected + 5
# "~+5%" expected <= observed < expected + 5%
#
################################################################
proc tolerance { expected observed tolerance_expression } {
if {![regexp {^(~?)([-+]?)([0-9\.]+)(%?)$} $tolerance_expression {} exclusive sign tolerance percent]} {
fail "Invalid tolerance expression ($tolerance_expression)"
}
set lower_bound_expression $observed
if {$sign eq "+" || $exclusive ne "~"} {
append lower_bound_expression " >="
} else {
append lower_bound_expression " >"
}
append lower_bound_expression " $expected"
if {$sign eq "-" || $sign eq ""} {
if {$percent eq "%"} {
append lower_bound_expression " - $tolerance * $expected / 100"
} else {
append lower_bound_expression " - $tolerance"
}
}
set upper_bound_expression $observed
if {$sign eq "-" || $exclusive ne "~"} {
append upper_bound_expression " <="
} else {
append upper_bound_expression " <"
}
append upper_bound_expression " $expected"
if {$sign eq "+" || $sign eq ""} {
if {$percent eq "%"} {
append upper_bound_expression " + $tolerance * $expected / 100"
} else {
append upper_bound_expression " + $tolerance"
}
}
if {[expr $lower_bound_expression] && [expr $upper_bound_expression]} {
log_debug "$observed is within tolerance $tolerance_expression of $expected"
return true
} else {
log_warn "$observed is not within tolerance $tolerance_expression of $expected"
return false
}
}
################################################################
#
# NAME
# check_run_as_user - check if the caller may run_command as the supplied user
#
# SYNOPSIS
# check_run_as_user user
#
# DESCRIPTION
# Note that a proper sudo config needs to be set in orther to pass this
# check. Calling user should be permitted to run_commands as the
# supplied user using sudo without password.
# See the -user option of run_command.
# This proc also log_warn a message if user already exists in the DB
# because most probably this user is testsuite_user and that user is
# expected NOT to be in the DB and could potentially be removed from it by
# the test.
#
# RETURN VALUE
# Returns a boolean value indicating whether the calling user may
# run_command as user.
#
################################################################
proc check_run_as_user user {
global bin_id
if {$user eq ""} {
return false
}
if {[run_command_status -none -user $user "$bin_id -un"]} {
return false
}
if {[get_admin_level $user] != ""} {
log_warn "User $user already exists in DB, but it's probable that it's going to be removed by the test cleanup"
}
return true
}
################################################################
#
# NAME
# check_user_id - check if user exists in the system
#
# SYNOPSIS
# check_user_id user
#
# DESCRIPTION
# This proc also log_warn a message if user already exists in the DB
# because most probably this user is testsuite_user and that user is
# expected NOT to be in the DB and could potentially be removed from
# it by the test.
#
# RETURN VALUE
# Returns true if the user passed exists in the system, false otherwise.
#
################################################################
proc check_user_id user {
global bin_id
if {$user eq ""} {
return false
}
if {[run_command_status -none "$bin_id -un $user"]} {
return false
}
if {[get_admin_level $user] != ""} {
log_warn "User $user already exists in DB, but it's probable that it's going to be removed by the test cleanup"
}
return true
}
################################################################
#
# NAME
# run_command - executes a command and returns a dictionary result
#
# SYNOPSIS
# run_command ?options? command
#
# DESCRIPTION
# Executes a command and returns a dictionary that includes the output,
# exit code, etc. An action can be taken (fail, warn, subtest, none) if
# the command's exit code is unexpected. By default, the action
# will be applied if the command fails. If the -xfail option is
# specified, the behavior will be reversed to apply the action if the
# command ran successfully.
# A timeout is always treated as unexpected, so log_error will be shown
# by default, or fail/subfail will be called if -fail/-subtest are used.
#
# OPTIONS
# -fail
# If the exit code is unexpected, the action that will
# be taken is to fail the test
# -subtest
# If the exit code is unexpected, the action that will
# be taken is to subfail, otherwise subpass will be called
# -warn
# If the exit code is unexpected, the action that will
# be taken is to log a warning (this is the default)
# -none
# If the exit code is unexpected, no action will be taken
# -xfail
# If the command exits with zero the action will be applied.
# Without this option, the action will be applied if the
# command exits with a non-zero exit code.
# -timeout <float_number>
# Time in seconds to wait for the command to complete before
# timing out (default is 60.0)
# -nolog
# Logging for this command will occur at trace threshold only
# -stdin
# Provide standard in to be piped into command
# -user <user>
# Attempt to execute command as <user>. Note that sudo must be
# properly configured to permit the caller to execute as <user>.
# See check_run_as_user.
#
# ARGUMENTS
# command
# a string containing the command and arguments to execute
#
# RETURN VALUE
# A dictionary containing the following elements:
# command - The command that was invoked
# exit_code - Exit code
# output - The combined standard output and standard error
# start_time - The time (with ms) the command was executed
# duration - The duration (seconds and milliseconds) the
# command took to run
#
################################################################
proc run_command args {
global bin_bash bin_sudo
set alt_user ""
set exit_status 0
set timedout false
set output ""
set action "warn"
set timeout 60
set expect_failure false
set log_at_trace_level false
set stdin ""
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fail {set action "fail"; set args [lrange $args 1 end]}
-subtest {set action "subtest"; set args [lrange $args 1 end]}
-none {set action "none"; set args [lrange $args 1 end]}
-timeout {set args [lassign $args - timeout]}
-warn {set action "warn"; set args [lrange $args 1 end]}
-xfail {set expect_failure true; set args [lrange $args 1 end]}
-nolog {set log_at_trace_level true; set args [lrange $args 1 end]}
-stdin {set stdin [lindex $args 1]; set args [lassign $args - stdin]}
-user {set args [lassign $args - alt_user]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
if {[llength $args] == 1} {
lassign $args command
} else {
fail "Invalid number of arguments [llength $args]: $args"
}
if {$action eq "subtest"} {
if {$expect_failure} {
set test_description "Command \"$command\" should fail"
} else {
set test_description "Command \"$command\" should succeed"
}
}
if {$log_at_trace_level} {
interp alias {} log_run {} log_trace
interp alias {} log_details {} log_trace
} else {
interp alias {} log_run {} log_command
interp alias {} log_details {} log_debug
}
set orig_log_user [log_user -info]
log_user 0
if {$alt_user ne ""} {
log_run "Run Command as user $alt_user: $command"
} else {
log_run "Run Command: $command"
}
set start_clock_ms [clock milliseconds]
set stty_init raw ; # Prevent the terminal from inserting \r
if {$alt_user ne ""} {
set expect_pid [spawn -noecho $bin_sudo -nu $alt_user $bin_bash -c "$command"]
} else {
set expect_pid [spawn -noecho $bin_bash -c "$command"]
}
if { $stdin != "" } {
exp_send "$stdin"
set command "$command <<< $stdin"
}
expect {
-re "(.+)" {
append output $expect_out(1,string)
exp_continue
}
timeout {
slow_kill $expect_pid
set exit_status $::RETURN_TIMEOUT
set timedout true
}
eof {
lassign [wait] pid spawnid os_error_flag errno
set exit_status [expr $errno > 128 ? $errno - 256 : $errno]
}
}
set start_time [format "%.3f" [expr $start_clock_ms / 1000.000]]
set end_time [format "%.3f" [expr [clock milliseconds] / 1000.000]]
set duration [format "%.3f" [expr $end_time - $start_time]]
log_details "Command Results:"
log_details " Duration: $duration"
log_details " Exit Code: $exit_status"
if {[info exists output]} {
log_details " Output: $output"
}
if {$timedout} {
set message "Command \"$command\" timed out after $timeout seconds"
if {$action eq "fail"} {
fail $message
} elseif {$action eq "subtest"} {
subfail $test_description $message
} else {
log_error $message
}
} elseif {! $expect_failure && $exit_status != 0} {
set message "Command \"$command\" failed with rc=$exit_status"
if {[info exists output] && $output != ""} {
append message ": [string trimright $output]"
}
if {$action eq "warn"} {
log_warn $message
} elseif {$action eq "subtest"} {
subfail $test_description $message
} elseif {$action eq "fail"} {
fail $message
}
} elseif {$expect_failure && $exit_status == 0} {
set message "Command \"$command\" was expected to fail but succeeeded"
if {$action eq "warn"} {
log_warn $message
} elseif {$action eq "subtest"} {
subfail $test_description $message
} elseif {$action eq "fail"} {
fail $message
}
} elseif {$action eq "subtest"} {
subpass $test_description
}
log_user $orig_log_user
dict set result command $command
dict set result exit_code $exit_status
dict set result output $output
dict set result start_time $start_time
dict set result duration $duration
return $result
}
################################################################
#
# NAME
# run_command_output - executes a command and returns the output
#
# SYNOPSIS
# run_command_output ?options? command
#
# DESCRIPTION
# Executes a command and returns a dictionary that includes the output,
# exit code, etc. An action can be taken (fail, warn, none) if the
# command's exit code or timeout is unexpected. By default, the action
# will be applied if the command fails. If the -xfail option is
# specified, the behavior will be reversed to apply the action if the
# command ran successfully.
#
# OPTIONS
# -fail
# if the exit code or timeout is unexpected, the action that will
# be taken is to fail the test
# -warn
# if the exit code or timeout is unexpected, the action that will
# be taken is to log a warning (this is the default)
# -none
# if the exit code or timeout is unexpected, no action will be
# taken
# -xfail
# if the command exits with zero and does not time out, the
# action will be applied. Without this option, the action will
# be applied if the command exits with a non-zero exit code or
# times out.
# -timeout <float_number>
# time in seconds to wait for the command to complete before
# timing out (default is 60.0)
# -stdin
# Provide standard in to be piped into command
#
# ARGUMENTS
# command
# a string containing the command and arguments to execute
#
# RETURN VALUE
# A string containing the combined standard output and standard error
#
################################################################
proc run_command_output args {
set result [run_command {*}$args]
if [dict exists $result output] {
return [dict get $result output]
} else {
return ""
}
}
################################################################
#
# NAME
# run_command_status - executes a command and returns the exit code
#
# SYNOPSIS
# run_command_status ?options? command
#
# DESCRIPTION
# Executes a command and returns a dictionary that includes the output,
# exit code, etc. An action can be taken (fail, warn, none) if the
# command's exit code or timeout is unexpected. By default, the action
# will be applied if the command fails. If the -xfail option is
# specified, the behavior will be reversed to apply the action if the
# command ran successfully.
#
# OPTIONS
# -fail
# if the exit code or timeout is unexpected, the action that will
# be taken is to fail the test
# -warn
# if the exit code or timeout is unexpected, the action that will
# be taken is to log a warning (this is the default)
# -none
# if the exit code or timeout is unexpected, no action will be
# taken
# -xfail
# if the command exits with zero and does not time out, the
# action will be applied. Without this option, the action will
# be applied if the command exits with a non-zero exit code or
# times out.
# -timeout <float_number>
# time in seconds to wait for the command to complete before
# timing out (default is 60.0)
# -stdin
# Provide standard in to be piped into command
#
# ARGUMENTS
# command
# a string containing the command and arguments to execute
#
# RETURN VALUE
# The exit code for the invoked command
#
################################################################
proc run_command_status args {
set result [run_command {*}$args]
return [dict get $result exit_code]
}
################################################################
#
# NAME
# cancel_job - cancels the specified job list
#
# SYNOPSIS
# cancel_job job_id_list ?het_job?
#
# DESCRIPTION
# Cancels one or more jobs. A job_id of 0 will be silently ignored.
#
# OPTIONS
# -fail
# if scancel fails with exit code or timeout, or the job doesn't
# end, test will will fail
#
# ARGUMENTS
# job_id_list
# The list of Slurm job ids that we want to cancel
# het_job
# 1 if jobs are hetjobs and we want to confirm each
# component has completed
#
# RETURN VALUE
# RETURN_SUCCESS if jobs are cancelled, or non-zero value otherwise.
#
################################################################
proc cancel_job args {
global scancel
set fatal false
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fail {set fatal true; set args [lrange $args 1 end]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count < 1} {
fail "Too few arguments ($argument_count): $args"
} elseif {$argument_count > 2} {
fail "Too many arguments ($argument_count): $args"
}
lassign $args job_id_list
if {$argument_count == 2} {
set het_job [lindex $args 1]
} else {
set het_job 0
}
set job_list_clean [list]
foreach job_id $job_id_list {
if {$job_id != 0} {
lappend job_list_clean $job_id
}
}
if {![llength $job_list_clean]} {
return $::RETURN_SUCCESS
}
log_debug "Cancelling $job_list_clean"
set result [run_command "$scancel -Q $job_list_clean"]
if {[dict get $result exit_code]} {
set message "scancel command returned an error ([dict get $result output])"
if {$fatal} {
fail $message
} else {
log_warn $message
return $::RETURN_ERROR
}
}
foreach job_id $job_list_clean {
if {[wait_for_job $job_id "DONE" $het_job]} {
set message "Job ($job_id) did not end when cancelled"
if {$fatal} {
fail $message
} else {
log_warn $message
return $::RETURN_ERROR
}
}
}
return $::RETURN_SUCCESS
}
################################################################
#
# NAME
# get_line_cnt - returns the size of the specified file
#
# SYNOPSIS
# get_line_cnt file_name
#
# RETURN VALUE
# Number of lines in the specified file.
#
################################################################
proc get_line_cnt { file_name } {
global bin_wc number
set lines 0
spawn $bin_wc -l $file_name
expect {
-re "($number) " {
set lines $expect_out(1,string)
exp_continue
}
eof {
wait
}
}
return $lines
}
################################################################
#
# NAME
# convert_time_str - Converts a string like [HH:]MM:SS[.sss] to secs
#
# SYNOPSIS
# Convert the string to a number.
#
# DESCRIPTION
# Units can be hours, mins, secs.
#
# RETURN VALUE
# A non-zero return code indicates a failure.
#
################################################################
proc convert_time_str { time_str units } {
subtest {[regexp {(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?} $time_str - hours mins secs msecs]} "Time should be in the format \[HH:\]MM:SS\[.sss\]" "$time_str"
if { $hours eq "" } {
set hours 0
}
# Use scan to avoid TCL octals
set value [expr ([scan $hours "%d"] * 3600) + ([scan $mins "%d"] * 60) + [scan $secs "%d"]]
if { $units eq "hours" } {
set value [expr $value / 3600]
} elseif { $units eq "mins" } {
set value [expr $value / 60]
}
return $value
}
################################################################
#
# NAME
# slow_kill - kills a process slowly
#
# SYNOPSIS
# slow_kill pid
#
# DESCRIPTION
# Kill a process slowly, first trying SIGINT, pausing for
# a second, then sending SIGKILL.
#
# RETURN VALUE
# A non-zero return code indicates a failure.
#
################################################################
proc slow_kill { pid } {
global bin_kill
catch {exec $bin_kill -INT $pid}
catch {exec $bin_kill -INT $pid}
sleep 1
catch {exec $bin_kill -KILL $pid}
return 0
}
################################################################
#
# NAME
# get_my_id - gets the id from the running user
#
# SYNOPSIS
# get_my_id
#
# RETURN VALUE
# output of id
#
################################################################
proc get_my_id {} {
global bin_id number
set login_info -1
log_user 0
spawn $bin_id
expect {
-re "(uid=.*\n)" {
set login_info $expect_out(1,string)
exp_continue
}
eof {
wait
}
}
log_user 1
if {$login_info == -1} {
fail "Unable to get user info"
}
return $login_info
}
################################################################
#
# NAME
# get_my_user_name - gets the name from the running user
#
# SYNOPSIS
# get_my_user_name
#
# RETURN VALUE
# A non-zero return code indicates a failure.
#
################################################################
proc get_my_user_name { } {
global bin_id re_word_str
set user_name -1
log_user 0
spawn $bin_id -nu
expect {
-re "($re_word_str)" {
set user_name $expect_out(1,string)
exp_continue
}
eof {
wait
}
}
log_user 1
if {$user_name == -1} {
fail "Unable to get user name"
}
return $user_name
}
################################################################
#
# NAME
# get_my_uid - gets the uid from the running user
#
# SYNOPSIS
# get_my_uid
#
# RETURN VALUE
# The uid of the current user, or fails.
#
################################################################
proc get_my_uid { } {
global bin_id number
set out [run_command_output -nolog -fail "$bin_id -u"]
if {![regexp "($number)" $out - uid]} {
fail "Unable to get UID with $bin_id ($out)"
}
return $uid
}
################################################################
#
# NAME
# get_my_gid - gets the gid from the running user
#
# SYNOPSIS
# get_my_gid
#
# RETURN VALUE
# A non-zero return code indicates a failure.
#
################################################################
proc get_my_gid { } {
global bin_id number
set gid -1
log_user 0
spawn $bin_id -g
expect {
-re "($number)" {
set gid $expect_out(1,string)
exp_continue
}
eof {
wait
}
}
log_user 1
return $gid
}
################################################################
#
# NAME
# kill_salloc - kills all salloc commands associated with this user
#
# SYNOPSIS
# kill_salloc
#
# DESCRIPTION
# Kill all salloc commands associated with this user.
# Issue two SIGINT, sleep 1 and a SIGKILL
#
# RETURN VALUE
# A non-zero return code indicates a failure.
#
# NOTE
# Use slow_kill instead of kill_salloc if you can capture
# the process id
#
################################################################
proc kill_salloc { } {
global bin_id bin_pkill bin_sleep number
set uid [get_my_uid]
catch {exec $bin_pkill -INT -u $uid salloc}
catch {exec $bin_pkill -INT -u $uid salloc}
sleep 1
catch {exec $bin_pkill -KILL -u $uid salloc}
return 0
}
################################################################
#
# NAME
# kill_srun - kills all srun commands associated with this user
#
# SYNOPSIS
# kill_srun
#
# DESCRIPTION
# Kill all srun commands associated with this user.
# Issue two SIGINT, sleep 1 and a SIGKILL
#
# RETURN VALUE
# A non-zero return code indicates a failure.
#
# NOTE
# Use slow_kill instead of kill_srun if you can capture
# the process id
#
################################################################
proc kill_srun { } {
global bin_id bin_pkill bin_sleep number
set uid [get_my_uid]
catch {exec $bin_pkill -INT -u $uid srun}
catch {exec $bin_pkill -INT -u $uid srun}
sleep 1
catch {exec $bin_pkill -KILL -u $uid srun}
return 0
}
################################################################
#
# NAME
# wait_for - generic wait utility
#
# SYNOPSIS
# wait_for ?options? condition body
#
# DESCRIPTION
# Generic wait utility allowing you to repeatedly execute a generic block
# of code until a specified boolean expression is met. The code block and
# condition check occur every poll interval until a timeout is reached.
#
# OPTIONS
# -fail
# fail the test if the condition is not met within the timeout
# -subtest
# if the condition is met within the timeout call subpass, otherwise
# call subfail
# -timeout <float_number>
# time in seconds to wait for the condition to be met before
# timing out (default is 60.0)
# -pollinterval <float_number>
# time in seconds between each loop execution and condition check
# (default is 1.0)
#
# ARGUMENTS
# condition
# The boolean expression to test
# body
# A block of code to evaluate in the invoking stack frame
#
# RETURN VALUE
# RETURN_SUCCESS if the condition is met before the timeout occurs,
# RETURN_TIMEOUT if the timeout occurs before the condition is met
#
################################################################
proc wait_for args {
set action "none"
set subtest false
set timeout 60
set poll_interval 1
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fail {set action "fail"; set args [lrange $args 1 end]}
-subtest {set action "subtest"; set args [lrange $args 1 end]}
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - poll_interval]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
if {[llength $args] == 2} {
lassign $args condition body
} else {
fail "Invalid number of arguments [llength $args]: $args"
}
set start_time [format "%.3f" [expr [clock milliseconds] / 1000.000]]
log_debug "Waiting for $condition"
while {1} {
# Evaluate code block
log_trace "Evaluating code block ([string trim $body])"
uplevel $body
# Check condition
if {[uplevel expr [format "{%s}" $condition]]} {
set now [format "%.3f" [expr [clock milliseconds] / 1000.000]]
log_debug "Condition ($condition) was met"
if {$action eq "subtest"} {
subpass "Condition ($condition) should be met (within $timeout seconds)"
}
return $::RETURN_SUCCESS
} else {
log_trace "Condition ($condition) was not met"
}
# Sleep poll interval
log_trace "Sleeping for $poll_interval seconds"
after [expr {int($poll_interval * 1000)}]
# Check if we have surpassed our timeout
set now [format "%.3f" [expr [clock milliseconds] / 1000.000]]
log_trace "Checking whether the current time ([clock format [expr int($now)] -format %Y-%m-%dT%X].[lindex [split $now '.'] 1]) is greater than the start time plus the timeout ([clock format [expr int($start_time + $timeout)] -format %Y-%m-%dT%X].[lindex [split [expr $start_time + $timeout] '.'] 1])"
if {$now > $start_time + $timeout} {
set message "Condition ($condition) was not met before timeout ($timeout seconds)"
if {$action eq "fail"} {
fail $message
} else {
log_warn $message
if {$action eq "subtest"} {
subfail "Condition ($condition) should be met (within $timeout seconds)" "Timed out"
}
return $::RETURN_TIMEOUT
}
}
}
}
################################################################
#
# NAME
# wait_for_command - repeat a command until it is successful or meets a specified condition
#
# SYNOPSIS
# wait_for_command ?options? command ?condition?
#
# DESCRIPTION
# A command is repeated until it meets a condition or a timeout is reached.
# If a condition is not specified, the command will be repeated until it
# is successful (the exit code is zero).
#
# OPTIONS
# -fail
# fail the testif the condition is not met within the timeout
# -subtest
# if the condition is met within the timeout call subpass, otherwise
# call subfail
# -timeout <float_number>
# time in seconds to wait for the condition to be met before
# timing out (default is 60.0)
# -pollinterval <float_number>
# time in seconds between each loop execution and condition
# check (default is 1.0)
#
# ARGUMENTS
# command
# a string containing the command and arguments to execute
# condition
# The boolean expression to test. For each command invocation,
# the result variable will be set to the dictionary returned
# from run_command.
# The condition expression will normally involve a comparison
# with one or more values of this dictionary. If a condition is
# not specified, this condition will be used:
# { [dict get $result exit_code] == 0 }
#
# RETURN VALUE
# RETURN_SUCCESS if the condition is met before the timeout occurs,
# RETURN_TIMEOUT if the timeout occurs before the condition is met
#
################################################################
proc wait_for_command args {
set action "none"
set timeout 60
set poll_interval 1
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fail {set action "fail"; set args [lrange $args 1 end]}
-subtest {set action "subtest"; set args [lrange $args 1 end]}
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - poll_interval]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count < 1} {
fail "Too few arguments ($argument_count): $args"
} elseif {$argument_count > 2} {
fail "Too many arguments ($argument_count): $args"
}
lassign $args command
if {$argument_count == 2} {
set condition [lindex $args 1]
} else {
set condition { [dict get $result exit_code] == 0 }
}
set start_time [format "%.3f" [expr [clock milliseconds] / 1000.000]]
log_debug "Waiting for $condition"
while {1} {
# Run command
set result [run_command $command]
# Check condition
if {[eval expr [format "{%s}" $condition]]} {
set now [format "%.3f" [expr [clock milliseconds] / 1000.000]]
log_debug "Condition ($condition) was met"
if {$action eq "subtest"} {
subpass "Condition ($condition) should be met (within $timeout seconds)"
}
return $::RETURN_SUCCESS
} else {
log_trace "Condition ($condition) was not met"
}
# Sleep poll interval
log_trace "Sleeping for $poll_interval seconds"
after [expr {int($poll_interval * 1000)}]
# Check whether we have surpassed our timeout
set now [format "%.3f" [expr [clock milliseconds] / 1000.000]]
log_trace "Checking whether the current time ([clock format [expr int($now)] -format %Y-%m-%dT%X].[lindex [split $now '.'] 1]) is greater than the start time plus the timeout ([clock format [expr int($start_time + $timeout)] -format %Y-%m-%dT%X].[lindex [split [expr $start_time + $timeout] '.'] 1])"
if {$now > $start_time + $timeout} {
set message "Condition ($condition) was not met before timeout ($timeout seconds)"
if {$action eq "fail"} {
fail $message
} else {
log_warn $message
if {$action eq "subtest"} {
subfail "Condition ($condition) should be met (within $timeout seconds)" "Timed out"
}
return $::RETURN_TIMEOUT
}
}
}
}
################################################################
#
# NAME
# wait_for_command_match - repeat a command until its output matches the specified pattern
#
# SYNOPSIS
# wait_for_command_match ?options? command pattern
#
# DESCRIPTION
# A command is repeated until its output matches the specified pattern
#
# OPTIONS
# -negate
# negates the match. The command is repeated until its output does not
# match the specified pattern
# -fail
# fail the test if the output does not match the pattern within the
# timeout
# -subfail
# if the condition is met within the timeout call subpass, otherwise
# call subfail
# -timeout <float_number>
# time in seconds to wait for the pattern to be matched before
# timing out (default is 60.0)
# -pollinterval <float_number>
# time in seconds between each loop execution and match check
# (default is 1.0)
#
# ARGUMENTS
# command
# a string containing the command and arguments to execute
# pattern
# The regular expression to match against the command output
#
# RETURN VALUE
# RETURN_SUCCESS if the pattern is matched before the timeout occurs,
# RETURN_TIMEOUT if the timeout occurs before the pattern is matched
#
################################################################
proc wait_for_command_match args {
set pattern [lindex $args end]
set args [lrange $args 0 end-1]
set match 1
set idx [lsearch $args "-negate"]
if {$idx != -1} {
set match 0
set args [lreplace $args $idx $idx]; # Remove the -negate argument
}
return [wait_for_command {*}$args "\[regexp -- {$pattern} \[dict get \$result output\]\] == $match"]
}
################################################################
#
# NAME
# wait_for_file - waits for a file to exist with non-zero size
#
# SYNOPSIS
# wait_for_file ?options? file_name
#
# OPTIONS
# -fail
# If an error occurs or the file does not become present
# by the timeout, fail the test rather than returning an error
# -timeout <integer_number>
# time in seconds to wait for the file to exist before
# timing out (default is 90)
# -pollinterval <integer_number>
# time in seconds between each file existence test (default is 1)
#
# DESCRIPTION
# Wait for the specified file to exist and have a non-zero size.
# Note that if JobFileAppend=0 is configured, a file can exist and
# be purged then be re-created.
#
# RETURN VALUE
# RETURN_SUCCESS if the file becomes present within the timeout, or
# non-zero value otherwise.
#
################################################################
proc wait_for_file args {
global bin_sleep
set fatal false
set timeout 90
set poll_interval 1
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fatal -
-fail {set fatal true; set args [lrange $args 1 end]}
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - poll_interval]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count != 1} {
fail "Invalid number of arguments ($argument_count): $args"
} else {
lassign $args file_name
}
for {set my_delay 0} {$my_delay <= $timeout} \
{set my_delay [expr $my_delay + $poll_interval]} {
if {[file exists $file_name]} {
# Add small delay for I/O buffering
exec $bin_sleep 1
return $::RETURN_SUCCESS
}
exec $bin_sleep $poll_interval
# Expect may fail to load current NFS info.
# Use the ls command to load current info.
set slash_pos [string last $file_name "/"]
if {$slash_pos < 1} {
set dir_name "."
} else {
decr slash_pos
set dir_name [string $file_name 0 $slash_pos]
}
exec /bin/ls $dir_name
}
set message "Timeout waiting for file ($file_name)"
if {$fatal} {
fail $message
}
log_warn $message
return $::RETURN_TIMEOUT
}
################################################################
#
# NAME
# _wait_for_single_job - waits for a job to reach the desired state
#
# SYNOPSIS
# _wait_for_single_job ?options? job_id desired_state
#
# DESCRIPTION
# Wait for a previously submitted Slurm job to reach the desired state.
#
# OPTIONS
# -fail
# If an error occurs or the job does not reach the desired state
# by the timeout, fail the test rather than returning an error
# -timeout <integer_number>
# time in seconds to wait for the job to be in the desired state
# before timing out (default is 360)
# -pollinterval <integer_number>
# time in seconds between each job state check (default is 1)
#
# ARGUMENTS
# job_id
# The Slurm job id of a job we want to wait for.
# desired_state
# The state you want the job to attain before
# returning. Currently supports:
# CANCELLED - job is cancelled
# DONE - any terminated state (includes cancelled)
# PENDING - job is pending
# RUNNING - job is running
# SPECIAL_EXIT
# SUSPENDED - job is suspended
#
# RETURN VALUE
# RETURN_SUCCESS, or non-zero on error.
#
################################################################
proc _wait_for_single_job args {
global scontrol
set fatal false
set timeout 360
set poll_interval 1
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fatal -
-fail {set fatal true; set args [lrange $args 1 end]}
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - poll_interval]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count != 2} {
fail "Invalid number of arguments ($argument_count): $args"
} else {
lassign $args job_id desired_state
}
# First verify that desired_state is supported
switch $desired_state {
"CANCELLED" {}
"DONE" {}
"PENDING" {}
"RUNNING" {}
"SPECIAL_EXIT" {}
"SUSPENDED" {}
default {
set message "Invalid desired state: $desired_state"
if {$fatal} {
fail $message
}
log_warn $message
return $::RETURN_ERROR
}
}
if {$job_id == 0} {
set message "Invalid job ID ($job_id)"
if {$fatal} {
fail $message
}
log_warn $message
return $::RETURN_ERROR
}
set my_delay 0
while 1 {
set fd [open "|$scontrol -o show job $job_id"]
gets $fd line
catch {close $fd}
if {[regexp {JobState\s*=\s*(\w+)} $line foo state] != 1} {
set state "NOT_FOUND"
}
switch $state {
"CANCELLED" {
if {$desired_state eq "CANCELLED"} {
log_debug "Job $job_id is CANCELLED"
return $::RETURN_SUCCESS
}
if {$desired_state eq "DONE"} {
log_debug "Job $job_id is DONE ($state)"
return $::RETURN_SUCCESS
}
set message "Job ($job_id) is $state, but we wanted $desired_state"
if {$fatal} {
fail $message
}
log_debug $message
return $::RETURN_ERROR
}
"NOT_FOUND" -
"BOOT_FAIL" -
"COMPLETED" -
"DEADLINE" -
"FAILED" -
"NODE_FAIL" -
"OUT_OF_MEMORY" -
"PREEMPTED" -
"TIMEOUT" {
if {$desired_state eq "DONE"} {
log_debug "Job $job_id is DONE ($state)"
return $::RETURN_SUCCESS
}
set message "Job ($job_id) is $state, but we wanted $desired_state"
if {$fatal} {
fail $message
}
log_debug $message
return $::RETURN_ERROR
}
"PENDING" {
if {$desired_state eq "PENDING"} {
log_debug "Job $job_id is PENDING"
return $::RETURN_SUCCESS
}
log_debug "Job $job_id is in state $state, desire $desired_state"
}
"RUNNING" {
if {$desired_state eq "RUNNING"} {
log_debug "Job $job_id is RUNNING"
return $::RETURN_SUCCESS
}
log_debug "Job $job_id is in state $state, desire $desired_state"
}
"SPECIAL_EXIT" {
if {$desired_state eq "SPECIAL_EXIT"} {
log_debug "Job $job_id is SPECIAL_EXIT"
return $::RETURN_SUCCESS
}
log_debug "Job $job_id is in state $state, desire $desired_state"
}
"SUSPENDED" {
if {$desired_state eq "SUSPENDED"} {
log_debug "Job $job_id is SUSPENDED"
return $::RETURN_SUCCESS
}
log_debug "Job $job_id is in state $state, desire $desired_state"
}
default {
log_debug "Job $job_id is in state $state, desire $desired_state"
}
}
if { $my_delay > $timeout } {
set message "Timeout waiting for job state $desired_state"
if {$fatal} {
fail $message
}
log_warn "Timeout waiting for job state $desired_state"
return $::RETURN_TIMEOUT
}
exec sleep $poll_interval
set my_delay [expr $my_delay + $poll_interval]
}
}
################################################################
#
# NAME
# wait_for_job - waits for job to be in desired state
#
# SYNOPSIS
# wait_for_job ?options? job_id desired_state ?het_job?
#
# DESCRIPTION
# Wait for job to be in desired state. Can handle het job components.
#
# OPTIONS
# -fail
# If an error occurs or the job does not reach the desired state
# by the timeout, fail the test rather than returning an error
# -timeout <integer_number>
# time in seconds to wait for the job to be in the desired state
# before timing out (default is 90)
# -pollinterval <integer_number>
# time in seconds between each job state check (default is 1)
#
# ARGUMENTS
# job_id
# The Slurm job id of a job we want to wait for.
# desired_state
# The state you want the job to attain before returning.
# Currently supports:
# CANCELLED - job is cancelled
# DONE - any terminated state (including cancelled)
# PENDING - job is pending
# RUNNING - job is running
# SPECIAL_EXIT
# SUSPENDED - job is suspended
# het_job
# If set, checks the state of each component job if the job
# is a het one.
#
# RETURN VALUE
# RETURN_SUCCESS if job reaches the desired state, or non-zero value
# otherwise.
#
# SEE ALSO
# _wait_for_single_job
#
################################################################
proc wait_for_job args {
set options [list]
set het_job 0
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fatal -
-fail {
lappend options [lindex $args 0]
set args [lrange $args 1 end]
}
-time* -
-poll* {
lappend options {*}[lrange $args 0 1]
set args [lrange $args 2 end]
}
default break
}
}
set argument_count [llength $args]
if {$argument_count < 2} {
fail "Too few arguments ($argument_count): $args"
} elseif {$argument_count > 3} {
fail "Too many arguments ($argument_count): $args"
} else {
lassign $args job_id desired_state
}
if {$argument_count == 3} { set hetjob [lindex $args 2] }
if { $het_job } {
# get component job ids
set jid_list [get_het_job_ids $job_id 1]
}
set rc 0
set jid_list ""
if { $jid_list == "" } {
# non-het job
set jid_list $job_id
}
foreach jid $jid_list {
set rc [_wait_for_single_job {*}$options $jid $desired_state]
if { $rc } {
# bail out on first failure
break
}
}
return $rc
}
################################################################
#
# NAME
# wait_for_job_acct - waits for the job accounting record to be updated
#
# SYNOPSIS
# wait_for_job_acct ?options? job_id ?field?
#
# DESCRIPTION
# Wait for evidence that a specific job stage has been recorded in the
# accounting database by verifying that a field has been populated.
#
# OPTIONS
# -fail
# If an error occurs or the job does has not populated the field
# by the timeout, fail the test rather than returning an error
# -timeout <integer_number>
# time in seconds to wait for the job's field to be populated
# before timing out (default is 90)
# -pollinterval <integer_number>
# time in seconds between each field check (default is 1)
#
# ARGUMENTS
# job_id
# The Slurm job id of a job we want to wait for.
# field
# The field that we are waiting to be populated, not empty and
# not "Unknown". (Default: End)
#
# RETURN VALUE
# RETURN_SUCCESS if the desired field has been populated for the job
# in the accounting database, or non-zero value otherwise.
#
################################################################
proc wait_for_job_acct args {
global sacct
set options [list]
set timeout 10
set pollinterval .2
set field End
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fail {
lappend options [lindex $args 0]
set args [lrange $args 1 end]
}
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - pollinterval]}
default break
}
}
lappend options -timeout $timeout
lappend options -pollinterval $pollinterval
set argument_count [llength $args]
if {$argument_count < 1} {
fail "Too few arguments ($argument_count): $args"
} elseif {$argument_count > 2} {
fail "Too many arguments ($argument_count): $args"
}
lassign $args job_id
if {$argument_count == 2} {set field [lindex $args 1]}
# Wait for sacct to populate the specified field
# We want to see the field be populated with anything but "Unknown"
# We can't use a negation match here because empty would match
return [wait_for_command_match {*}$options "$sacct -j $job_id --allocation --format $field --parsable2 --noheader" (?n)^(?!.*Unknown$).]
}
################################################################
#
# NAME
# wait_for_account_done - cancels and waits on jobs in specified accounts
#
# SYNOPSIS
# wait_for_account_done ?options? accounts
#
# DESCRIPTION
# Cancel jobs on and wait for them to be finished in account(s) given.
#
# OPTIONS
# -timeout <integer_number>
# time in seconds to wait for the jobs to be finished before
# timing out (default is 360)
# -pollinterval <integer_number>
# time in seconds between each job state check (default is 1)
#
# ARGUMENTS
# accounts
# Comma-delimited list of accounts
#
# RETURN VALUE
# RETURN_SUCCESS if all jobs of the account are finished, or non-zero
# otherwise.
#
# NOTE
# We sleep for two seconds before replying that a job is
# done to give time for I/O completion (stdout/stderr files)
#
################################################################
proc wait_for_account_done args {
global scancel squeue re_word_str
set timeout 360
set poll_interval 1
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - poll_interval]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count != 1} {
fail "Invalid number of arguments ($argument_count): $args"
} else {
lassign $args accounts
}
if { $accounts == "" } {
fail "An account must be specified"
}
log_user 0
set account_list [split $accounts ","]
foreach item $account_list {
spawn $scancel -A $item
expect {
timeout {
log_warn "No response from scancel"
}
eof {
wait
}
}
}
set my_delay 0
while 1 {
set found 0
spawn $squeue -o Account=%a -h -A$accounts
expect {
-re "Account=($re_word_str)" {
set found 1
exp_continue
}
eof {
wait
}
}
if { !$found } {
log_debug "Account(s) $accounts is/are empty"
break
}
if { $my_delay > $timeout } {
log_error "Timeout waiting for account(s) ($accounts) to be finished"
log_user 1
return $::RETURN_TIMEOUT
}
exec sleep $poll_interval
set my_delay [expr $my_delay + $poll_interval]
}
log_user 1
return $::RETURN_SUCCESS
}
################################################################
#
# NAME
# wait_for_part_done - cancels and waits on jobs in specified partition
#
# SYNOPSIS
# wait_for_part_done ?options? partition
#
# DESCRIPTION
# Cancel jobs on and wait for them to be finished in partition given.
#
# OPTIONS
# -timeout <integer_number>
# time in seconds to wait for the jobs to be finished before
# timing out (default is 360)
# -pollinterval <integer_number>
# time in seconds between each job state check (default is 1)
#
# ARGUMENTS
# partition
# partition name
#
# RETURN VALUE
# RETURN_SUCCESS if all jobs of the partition are finished, or non-zero
# otherwise.
#
# NOTE
# We sleep for two seconds before replying that a job is
# done to give time for I/O completion (stdout/stderr files)
#
################################################################
proc wait_for_part_done args {
global scancel squeue re_word_str
set timeout 360
set poll_interval 1
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - poll_interval]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count != 1} {
fail "Invalid number of arguments ($argument_count): $args"
} else {
lassign $args partition
}
if { $partition == "" } {
fail "A partition must be specified"
}
run_command -fail -nolog "$scancel -p $partition"
set my_delay 0
while 1 {
set found 0
spawn $squeue -o Part=%P -h -p$partition
expect {
-re "Part=($re_word_str)" {
set found 1
exp_continue
}
eof {
wait
}
}
if { !$found } {
log_debug "Partition $partition is empty"
break
}
if { $my_delay > $timeout } {
log_error "Timeout waiting for partition ($partition) to be finished"
return $::RETURN_TIMEOUT
}
exec sleep $poll_interval
set my_delay [expr $my_delay + $poll_interval]
}
return $::RETURN_SUCCESS
}
################################################################
#
# NAME
# wait_for_step - waits for a job step to be found
#
# SYNOPSIS
# wait_for_step ?options? step_id
#
# DESCRIPTION
# Wait for a job step to be found.
#
# OPTIONS
# -timeout <integer_number>
# time in seconds to wait for the job step to be found before
# timing out (default is 30)
# -pollinterval <integer_number>
# time in seconds between each step existence check (default is 1)
#
# ARGUMENTS
# step_id
# job step id
#
# RETURN VALUE
# RETURN_SUCCESS if step_id is found, or non-zero otherwise.
#
################################################################
proc wait_for_step args {
global scontrol
set timeout 30
set poll_interval 1
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - poll_interval]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count != 1} {
fail "Invalid number of arguments ($argument_count): $args"
} else {
lassign $args step_id
}
set my_delay 0
while 1 {
set fd [open "|$scontrol -o show step $step_id"]
gets $fd line
catch {close $fd}
if {[regexp {Nodes=} $line foo] == 1} {
return $::RETURN_SUCCESS
}
if {[regexp {MidplaneList=} $line foo] == 1} {
return $::RETURN_SUCCESS
}
if { $my_delay > $timeout } {
log_warn "Timeout waiting for job step"
return $::RETURN_TIMEOUT
}
log_debug "Step $step_id not done yet. Waiting for $poll_interval seconds"
exec sleep $poll_interval
set my_delay [expr $my_delay + $poll_interval]
}
}
################################################################
#
# NAME
# wait_job_reason - waits for a desired job state and reason
#
# SYNOPSIS
# wait_job_reason ?options? job_id ?desired_state? ?desired_reason_list?
#
# DESCRIPTION
# Wait until the job is in desired state and reason is one
# of the desired ones or until the timeout.
#
# OPTIONS
# -timeout <integer_number>
# time in seconds to wait for the job state and reason before
# timing out (default is 360)
# -pollinterval <integer_number>
# time in seconds between each job state check (default is 1)
#
# ARGUMENTS
# job_id
# The job to wait for
# desired_state
# Desired state.
# desired_reason_list
# List of desired reasons. Empty list means that any reason
# is ok.
#
# RETURN VALUE
# RETURN_SUCCESS when job is in the desired state and reason is one
# of the desired ones, or non-zero otherwise.
#
################################################################
proc wait_job_reason args {
global scontrol re_word_str
set final_state "COMPLETED CANCELLED FAILED TIMEOUT DEADLINE
OUT_OF_MEMORY"
set timeout 360
set poll_interval 1
set desired_state "PENDING"
set desired_reason_list ""
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - poll_interval]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count < 1} {
fail "Too few arguments ($argument_count): $args"
} else {
lassign $args job_id
}
if {$argument_count >= 2} { set desired_state [lindex $args 1] }
if {$argument_count == 3} { set desired_reason_list [lindex $args 2] }
if {$argument_count > 3} {
fail "Too many arguments ($argument_count): $args"
}
set log_user_prev [log_user -info]
log_user 0
set my_delay 0
set rc $::RETURN_ERROR
while true {
set pending 0
set has_reason 1
spawn $scontrol show job $job_id
expect {
-re "JobState=($re_word_str) Reason=(\\S+)" {
set job_state $expect_out(1,string)
set job_reason $expect_out(2,string)
}
timeout {
log_error "No response from scontrol show job"
set rc $::RETURN_TIMEOUT
break
}
}
# Check if both state and reason are the desired ones
if {$job_state == $desired_state} {
set found 0
set reason_msg ""
if {$desired_reason_list == ""} {
set found 1
}
foreach desired_reason $desired_reason_list {
if {$job_reason == $desired_reason } {
set reason_msg " with reason $job_reason"
set found 1
}
}
if {$found} {
log_debug "Job $job_id found $job_state$reason_msg"
set rc $::RETURN_SUCCESS
break
}
} elseif {[lsearch -exact final_state $job_state] >= 0} {
# Job is in final step no need to wait longer
log_error [format "Job in final state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \
$job_state $job_reason \
$desired_state $desired_reason_list]
set rc $::RETURN_ERROR
break
}
# Check if this was the last poll
if {$my_delay > $timeout} {
log_error "Timeout"
set rc $::RETURN_TIMEOUT
break
}
set remamining_sec [expr $timeout - $my_delay]
log_debug [format "Job in state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \
$job_state $job_reason \
$desired_state $desired_reason_list]
log_debug [format "Polling again in %ss, %ss to timeout." \
$poll_interval $remamining_sec]
sleep $poll_interval
set my_delay [expr $my_delay + $poll_interval]
}
log_user $log_user_prev
return $rc
}
################################################################
#
# NAME
# get_config - returns a dictionary of slurm configuration parameters
#
# SYNOPSIS
# get_config ?options?
#
# OPTIONS
# -dbd
# uses `sacctmgr show config` to return slurmdbd configuration
# parameters
# -slurm
# uses `scontrol show config` to return slurm configuration
# parameters (this is the default)
#
# RETURN VALUE
# Returns a dictionary of parameter values
#
################################################################
proc get_config args {
global sacctmgr scontrol
set config_dict [list]
set command "$scontrol"
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-slurm {set command "$scontrol"; set args [lrange $args 1 end]}
-dbd {set command "$sacctmgr"; set args [lrange $args 1 end]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
if {[llength $args] > 0} {
fail "[lindex [info level 0] 0]: No arguments allowed: $args"
}
set output [run_command_output -fail -nolog "$command show config"]
foreach line [split $output "\n"] {
if {[regexp {^(\S+) += (.*)$} $line {} param_name param_value] == 1} {
dict set config_dict $param_name $param_value
}
}
return $config_dict
}
################################################################
#
# NAME
# get_config_param - returns a slurm configuration parameter value
#
# SYNOPSIS
# get_config_param ?options? parameter_name
#
# OPTIONS
# -dbd
# uses `sacctmgr show config` to return the specified slurmdbd
# configuration parameter value
# -slurm
# uses `scontrol show config` to return the specified slurm
# configuration parameter value (this is the default)
#
# ARGUMENTS
# parameter_name
# the parameter to return the value for
#
# DESCRIPTION
# Returns a specific configuration parameter value.
#
# RETURN VALUE
# Returns the value of the specified parameter or MISSING if it does not
# exist.
#
################################################################
proc get_config_param args {
set options [list]
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-* {
lappend options [lindex $args 0]
set args [lrange $args 1 end]
}
default break
}
}
if {[llength $args] == 1} {
lassign $args parameter_name
} else {
fail "[lindex [info level 0] 0]: Invalid number of arguments ([llength $args]): $args"
}
set config_dict [get_config {*}$options]
if [dict exists $config_dict $parameter_name] {
return [dict get $config_dict $parameter_name]
} else {
return "MISSING"
}
}
################################################################
#
# NAME
# param_contains - test whether a comma-separated-list contains a specified value
#
# SYNOPSIS
# param_contains haystack needle
#
# DESCRIPTION
# Searches for the specified value (needle) in the comma-separated-list
# string (haystack). Needle can be a glob-style pattern.
#
# RETURN VALUE
# Returns a boolean value indicating whether the value (needle) was found
# in the comma-separated-list string (haystack)
#
################################################################
proc param_contains { haystack needle } {
if {[lsearch [split $haystack ","] $needle] != -1} {
return true
} else {
return false
}
}
################################################################
#
# NAME
# param_value - returns the value of a parameter in a comma-separated-list
#
# SYNOPSIS
# param_value params_list param ?default?
#
# DESCRIPTION
# Searches for the specified param in the comma-separated-list
# string (params_list) and returns its value.
#
# RETURN VALUE
# Returns the value found or the optional default value if not found.
# If the param is found without a value, returns true (ie like
# param_contains).
#
################################################################
proc param_value {params_list param {default false}} {
global re_word_str
foreach pair [split $params_list ","] {
if {[regexp "$param" $pair] == 1} {
if {[regexp "$param=($re_word_str)" $pair - value] == 1} {
return $value
} else {
return true
}
}
}
return $default
}
################################################################
#
# NAME
# get_affinity_types - gets the task plugins running with task/ stripped
#
# SYNOPSIS
# get_affinity_types
#
# RETURN VALUE
# Returns comma separated list of task plugins running without the task/
#
################################################################
proc get_affinity_types { } {
global scontrol re_word_str
log_user 0
set affinity ""
spawn $scontrol show config
expect {
-re "TaskPlugin *= ($re_word_str)" {
set parts [split $expect_out(1,string) ",/"]
while 1 {
set task_found [lsearch $parts "task"]
if { $task_found == -1 } break
set parts [lreplace $parts $task_found $task_found]
}
set affinity [join $parts ","]
exp_continue
}
eof {
wait
}
}
log_user 1
return $affinity
}
################################################################
#
# NAME
# get_mps_count_by_index - gets the count of a specific gres/mps device
#
# SYNOPSIS
# get_mps_count_by_index index hostname
#
# RETURN VALUE
# Returns the Count of a specific gres/mps device
#
################################################################
proc get_mps_count_by_index { index hostname } {
global slurmd number re_word_str
log_user 0
set count 0
spawn $slurmd -G -N $hostname
expect {
-re "Gres Name=mps Type=$re_word_str Count=($number) Index=$index" {
set count $expect_out(1,string)
exp_continue
}
eof {
wait
}
}
log_user 1
return $count
}
################################################################
#
# NAME
# check_mpi - determines if mpicc and the desired --mpi option are available
#
# SYNOPSIS
# check_mpi desired_mpi
#
# DESCRIPTION
# Runs srun --mpi=list to determine if the desired mpi is available, as
# well as if mpicc set in globals.local is too.
#
# RETURN VALUE
# Returns true if the desired mpi and mpicc are available, false otherwise.
#
################################################################
proc check_mpi {desired_mpi} {
global srun eol mpicc
set output [run_command_output -fail "$srun --mpi=list"]
return [regexp "${desired_mpi}${eol}" $output] && [file executable $mpicc]
}
################################################################
#
# NAME
# check_influxdb_access - determines if user can access to the desired influxdb
#
# SYNOPSIS
# check_influxdb_access host port database
#
# DESCRIPTION
# It uses the global $influx CLI command to try to connect and use
# the desired database.
#
# RETURN VALUE
# Returns true if we can connect to host:port and use the database,
# false otherwise
#
################################################################
proc check_influxdb_access {host port database} {
global influx
set connected 0
set access 0
if {![file executable $influx]} {
log_warn "Cannot execute influx command: $influx"
return false
}
set log_user_save [log_user -info]
log_user 0
spawn $influx -host $host -port $port
expect {
-re "Connected to" {
set connected 1
send "use $database\r"
exp_continue
}
-re "Using database" {
set access 1
send "quit\r"
}
-re "unable to parse authentication credentials" {
send "quit\r"
}
-re "authorization failed" {
send "quit\r"
}
timeout {
fail "InfluxDB instance not responding"
}
eof {
wait
}
}
log_user $log_user_save
if {!$connected} {
log_warn "Cannot connect to $host:$port"
return false
}
if {!$access} {
log_warn "Connected to $host:$port, but cannot use $database"
return false
}
return true
}
################################################################
#
# NAME
# check_bb_emulate - determines if Cray burst buffers API is emulated
#
# SYNOPSIS
# check_bb_emulate
#
# RETURN VALUE
# Returns true if Cray burst buffers API is emulated, false otherwise
#
################################################################
proc check_bb_emulate { } {
global scontrol
log_user 0
set bb_emulate false
spawn $scontrol show burst
expect {
-re "EmulateCray" {
set bb_emulate true
exp_continue
}
eof {
wait
}
}
log_user 1
return $bb_emulate
}
################################################################
#
# NAME
# check_bb_persistent - determines if persistent burst buffers can be created by users
#
# SYNOPSIS
# check_bb_persistent
#
# RETURN VALUE
# Returns true if Cray burst buffers can be created by users,
# false otherwise
#
################################################################
proc check_bb_persistent { } {
global scontrol
log_user 0
set bb_persistent false
spawn $scontrol show burst
expect {
-re "EnablePersistent" {
set bb_persistent true
exp_continue
}
eof {
wait
}
}
log_user 1
return $bb_persistent
}
################################################################
#
# NAME
# get_default_acct - gets user's default account
#
# SYNOPSIS
# get_default_acct user
#
# RETURN VALUE
# Returns name of default account if exists, NULL otherwise
#
################################################################
proc get_default_acct { user } {
global sacctmgr re_word_str bin_id
log_user 0
set def_acct ""
if { !$user } {
set user [get_my_user_name]
}
spawn $sacctmgr -n list -P user $user format="DefaultAccount"
expect {
-re "($re_word_str)" {
set def_acct $expect_out(1,string)
exp_continue
}
eof {
wait
}
}
log_user 1
return $def_acct
}
################################################################
#
# NAME
# get_cycle_count - get desired iteration count
#
# SYNOPSIS
# get_cycle_count
#
# DESCRIPTION
# For tests with iteration counts (e.g. test9.1, test9.2)
# return the desired iteration count
#
# RETURN VALUE
# Returns desired iteration count
#
################################################################
proc get_cycle_count { } {
global enable_memory_leak_debug
if {$enable_memory_leak_debug != 0} {
return 2
}
return 100
}
################################################################
#
# NAME
# get_select_type_params - determines SelectTypeParameters being used for a given partition
#
# SYNOPSIS
# get_select_type_params ?partition?
#
# DESCRIPTION
# Determine SelectTypeParameters being used for a given partition.
# If the partition is not specified, the default partition will be used.
#
# RETURN VALUE
# Returns a string containing SelectTypeParameters
#
################################################################
proc get_select_type_params { {partition ""} } {
global scontrol bin_bash bin_grep re_word_str
log_user 0
set params ""
if {[string length $partition] == 0} {
set partition [default_partition]
}
if {$partition ne ""} {
spawn -noecho $bin_bash -c "exec $scontrol show part $partition | $bin_grep SelectTypeParameters"
expect {
-re "SelectTypeParameters *= *NONE" {
exp_continue
}
-re "SelectTypeParameters *= *($re_word_str)" {
set params $expect_out(1,string)
exp_continue
}
eof {
wait
}
}
}
if {$params eq ""} {
spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SelectTypeParameters"
expect {
-re "SelectTypeParameters *= *($re_word_str)" {
set params $expect_out(1,string)
exp_continue
}
eof {
wait
}
}
}
log_user 1
return $params
}
################################################################
#
# NAME
# check_config_select - checks if effectively using the select type
#
# SYNOPSIS
# check_config_select type
#
# DESCRIPTION
# Determine if SelectType is equivalent to the passed type.
#
# ARGUMENTS
# type
# the desired SelectType to check (e.g. cons_tres)
#
# RETURN VALUE
# Returns true if configured, false otherwise
#
################################################################
proc check_config_select { type } {
set select_type [get_config_param "SelectType"]
set select_type_parameters [get_config_param "SelectTypeParameters"]
if {$select_type eq "select/$type"} {
return true
}
return false
}
################################################################
#
# NAME
# get_total_cpus - gets the total available CPUs on the default partition
#
# SYNOPSIS
# get_total_cpus
#
# RETURN VALUE
# The total available CPUs on the default partition.
#
# NOTE
# CoreSpecCount are not part of the total.
#
################################################################
proc get_total_cpus {} {
set total_cpu_count 0
# Obtain the list of available nodes in the default partition
set node_list [get_nodes_by_state]
# Tally the cpus on these nodes
set nodes_dict [get_nodes]
foreach node_name $node_list {
set node_dict [dict get $nodes_dict $node_name]
set node_cpu_count [dict get $node_dict "CPUTot"]
# Subtract out any spec cores
if {[dict exists $node_dict "CoreSpecCount"] && [dict exists $node_dict "ThreadsPerCore"]} {
set spec_cpu_count [expr [dict get $node_dict "CPUTot"] * [dict get $node_dict "ThreadsPerCore"]]
incr node_cpu_count -$spec_cpu_count
}
incr total_cpu_count $node_cpu_count
}
return $total_cpu_count
}
################################################################
#
# NAME
# is_running_in_container
#
# SYNOPSIS
# is_running_in_container
#
# DESCRIPTION
# Determine if test script is running inside of a container.
#
# RETURN VALUE
# true if container detected or systemd-detect-virt is not found,
# false otherwise
#
################################################################
proc is_running_in_container {} {
global bin_systemd_detect_virt
if {[run_command_status -nolog -none "$bin_systemd_detect_virt --version"]} {
log_warn "$bin_systemd_detect_virt not found, assuming container"
return true
}
set result [run_command -nolog -none "$bin_systemd_detect_virt -c"]
set output [string trimright [dict get $result output] "\r\n"]
if { $output != "none"} {
log_debug "Detected container type: $output"
}
if {[dict get $result exit_code]} {
return false
}
return true
}
################################################################
#
# NAME
# is_super_user - determines if user is root or SlurmUser
#
# SYNOPSIS
# is_super_user ?user?
#
# DESCRIPTION
# Determine if user is a Slurm super user (i.e. user
# root or configured SlurmUser)
#
# RETURN VALUE
# true is user is root or SlurmUser, false otherwise
#
################################################################
proc is_super_user {{user ""}} {
global number
if {$user == ""} {
set user [get_my_user_name]
}
# Check if user is root
if {$user eq "root"} {
return true
}
# Check if user is SlurmUser
set slurm_user [get_config_param "SlurmUser"]
if {[regexp "${user}\\($number\\)" $slurm_user match]} {
return true
}
return false
}
################################################################
#
# NAME
# dec2hex - creates a 32 bit hex number from a signed decimal number
#
# SYNOPSIS
# dec2hex value
#
# DESCRIPTION
# Create a 32 bit hex number from a signed decimal number
#
# RETURN VALUE
# 32 bit hex version of input 'value'
#
# SOURCE
# Courtesy of Chris Cornish
# http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982
#
################################################################
# Replace all non-decimal characters
proc dec2hex {value} {
regsub -all {[^0-x\.-]} $value {} newtemp
set value [string trim $newtemp]
if {$value < 2147483647 && $value > -2147483648} {
set tempvalue [format "%#010X" [expr $value]]
return [string range $tempvalue 2 9]
} elseif {$value < -2147483647} {
return "80000000"
} else {
return "7FFFFFFF"
}
}
################################################################
#
# NAME
# uint2hex - creates a 32 bit hex number from an unsigned decimal
#
# SYNOPSIS
# uint2hex value
#
# DESCRIPTION
# Create a 32 bit hex number from an unsigned decimal number.
#
# ARGUMENTS
# value
# unsigneddecimal number to convert
#
# RETURN VALUE
# 32 bit hex version of input 'value'
#
# SOURCE
# Courtesy of Chris Cornish
# http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982
#
################################################################
# Replace all non-decimal characters
proc uint2hex {value} {
regsub -all {[^0-x\.-]} $value {} newtemp
set value [string trim $newtemp]
if {$value <= 4294967295 && $value >= 0} {
set tempvalue [format "%#010X" [expr $value]]
return [string range $tempvalue 2 9]
} else {
return "FFFFFFFF"
}
}
################################################################
#
# NAME
# partition_oversubscribe - determines the oversubscribe configuration of the specified partition
#
# SYNOPSIS
# partition_oversubscribe ?partition?
#
# DESCRIPTION
# Determine the oversubscribe configuration of the specified partition.
# If the partition is not specified, the default partition will be used.
#
# RETURN VALUE
# Return the oversubscribe configuration of the specified partition.
#
################################################################
proc partition_oversubscribe { {partition ""} } {
global sinfo
if {[string length $partition] == 0} {
set partition [default_partition]
}
set oversubscribe "NO"
log_debug "$sinfo --noheader --partition $partition --format %h"
set fd [open "|$sinfo --noheader --partition $partition --format %h"]
gets $fd line
catch {close $fd}
regexp {[a-zA-Z]+} $line oversubscribe
return $oversubscribe
}
################################################################
#
# NAME
# default_partition - determines the name of the default partition
#
# SYNOPSIS
# default_partition
#
# DESCRIPTION
# Use scontrol to determine the name of the default partition
#
# RETURN VALUE
# Name of the current default partition, or fail if not found.
#
################################################################
proc default_partition {} {
global scontrol
set name ""
set fd [open "|$scontrol --all --oneliner show partition"]
while {[gets $fd line] != -1} {
if {[regexp {^PartitionName=([^ ]*).*Default=YES} $line frag name]
== 1} {
break
}
}
catch {close $fd}
if {[string length $name] == 0} {
fail "Could not identify the default partition"
}
return $name
}
################################################################
#
# NAME
# default_part_exclusive - determines if the default partition allocates whole nodes to jobs
#
# SYNOPSIS
# default_part_exclusive
#
# DESCRIPTION
# Use scontrol to determine if the default partition
# allocates whole nodes to jobs
#
# RETURN VALUE
# Name of the current default partition
#
################################################################
proc default_part_exclusive {} {
set def_part [default_partition]
set oversubscribe [partition_oversubscribe $def_part]
if {$oversubscribe eq "EXCLUSIVE"} {
return 1
} else {
return 0
}
}
################################################################
#
# NAME
# make_bash_script - creates a bash script
#
# SYNOPSIS
# make_bash_script script_name script_contents
#
# DESCRIPTION
# Create a bash script of name "script_name", and
# make the body of the script "script_contents".
# make_bash_script removes the file if it already exists,
# then generates the #! line, and then dumps "script_contents"
# to the file. Finally, it makes certain that the script
# is executable.
#
# ARGUMENTS
# script_name
# file name for the bash script
# script_contents
# body of the script, not including the initial #! line.
#
# RETURN VALUE
# Nothing.
#
################################################################
proc make_bash_script { script_name script_contents } {
global bin_bash bin_chmod
file delete $script_name
set fd [open $script_name "w"]
puts $fd "#!$bin_bash"
puts $fd $script_contents
close $fd
exec $bin_chmod 777 $script_name
}
################################################################
#
# NAME
# check_acct_associations - checks associations
#
# SYNOPSIS
# check_acct_associations
#
# DESCRIPTION
# Use sacctmgr to check associations
#
# RETURN VALUE
# true if no error is found, false otherwise
#
################################################################
proc check_acct_associations { } {
global sacctmgr number re_word_str
# 2 versions after 23.11 we can remove the below function
# until then we should check older clusters lft/rgt
set rc [check_acct_associations_lft]
if { !$rc } {
return $rc;
}
log_user 0
log_debug "Sanity-Checking Associations"
#
# Use sacctmgr to check associations
#
spawn $sacctmgr -n -p list assoc wopi wopl withd format=lineage,cluster
expect {
-re "($re_word_str)\\|($re_word_str)\\|" {
# Here we are checking if we have duplicates and
# setting up an array to check for holes later
set lineage $expect_out(1,string)
set cluster $expect_out(2,string)
set first [info exists found($cluster,$lineage)]
if { $first } {
log_error "$cluster found lineage $lineage again"
set rc false
} else {
set found($cluster,$lineage) 1
}
exp_continue
}
timeout {
fail "sacctmgr add not responding"
}
eof {
wait
}
}
log_user 1
return $rc
}
proc check_acct_associations_lft { } {
global sacctmgr number re_word_str
set rc true
log_user 0
log_debug "Sanity-Checking Associations"
set clusters ""
spawn $sacctmgr show cluster format=cluster,rpc -p
expect {
-re "($re_word_str)\\|($number)\\|" {
# 9984 == 23.02, the last version where lft/rgt matter
if { $expect_out(2,string) > 9984 } {
exp_continue
}
set clusters [ concat $clusters "," $expect_out(1,string) ]
exp_continue
}
}
#
# Use sacctmgr to check associations
#
spawn $sacctmgr -n -p list assoc wopi wopl withd format=lft,rgt,cluster clusters="$clusters"
expect {
-re "($number)\\|($number)\\|($re_word_str)\\|" {
# Here we are checking if we have duplicates and
# setting up an array to check for holes later
set cluster $expect_out(3,string)
if { ![info exists c_min($cluster)] } {
set c_min($cluster) -1
set c_max($cluster) -1
}
set num1 $expect_out(1,string)
set num2 $expect_out(2,string)
set first [info exists found($cluster,$num1)]
set sec [info exists found($cluster,$num2)]
#log_debug "$first=$num1 $sec=$num2"
if { $first } {
log_error "$cluster found lft $num1 again"
set rc false
} elseif { $sec } {
log_error "$cluster found rgt $num2 again"
set rc false
} else {
set found($cluster,$num1) 1
set found($cluster,$num2) 1
if { $c_min($cluster) == -1
|| $c_min($cluster) > $num1 } {
set c_min($cluster) $num1
}
if { $c_max($cluster) == -1
|| $c_max($cluster) < $num2 } {
set c_max($cluster) $num2
}
}
exp_continue
}
timeout {
fail "sacctmgr add not responding"
}
eof {
wait
}
}
foreach cluster [array names c_min] {
# Here we are checking for holes in the list from above
for {set inx $c_min($cluster)} {$inx < $c_max($cluster)} {incr inx} {
if { ![info exists found($cluster,$inx)] } {
log_error "$cluster No index at $inx"
set rc false
}
}
}
log_user 1
return $rc
}
################################################################
#
# NAME
# get_job_acct_freq - gets the value of the job account gather frequency
#
# SYNOPSIS
# get_job_acct_freq
#
# RETURN VALUE
# job account gather frequency
#
################################################################
proc get_job_acct_freq { } {
global scontrol number
log_user 0
set freq_val 0
spawn $scontrol show config
expect {
-re "JobAcctGatherFrequency *= ($number)" {
set freq_val $expect_out(1,string)
if {$freq_val == 0} {
set freq_val 0
}
}
-re "JobAcctGatherFrequency *= task=($number)" {
set freq_val $expect_out(1,string)
if {$freq_val == 0} {
set freq_val 0
}
}
eof {
wait
}
}
log_user 1
return $freq_val
}
################################################################
#
# NAME
# get_admin_level - gets the AdminLevel of the user
#
# SYNOPSIS
# get_admin_level ?user?
#
# RETURN VALUE
# AdminLevel for the current user
#
################################################################
proc get_admin_level {{user_name ""}} {
global sacctmgr re_word_str re_word_str bin_id
set admin_level ""
if {$user_name == ""} {
set user_name [get_my_user_name]
if { ![string length $user_name] } {
log_error "No name returned from id"
return ""
}
}
if {[is_super_user $user_name]} {
return "Administrator"
}
#
# Use sacctmgr to check admin_level
#
log_user 0
spawn $sacctmgr -n -P list user $user_name format=admin
expect {
-re "($re_word_str)" {
set admin_level $expect_out(1,string)
exp_continue
}
timeout {
fail "sacctmgr add not responding"
}
eof {
wait
}
}
log_user 1
return $admin_level
}
#################################################
#
# NAME
# scale_to_megs - scales the value by the factor T|G|M to megabytes
#
# SYNOPSIS
# scale_to_megs value factor
#
# DESCRIPTION
# scale the value by the factor T|G|M to megabytes
#
# RETURN VALUE
# the scaled variable
#
#################################################
proc scale_to_megs { value factor } {
if {$factor == "T"} {
set value [expr $value * 1024 * 1024]
} elseif {$factor == "G"} {
set value [expr $value * 1024]
} elseif {$factor == "M"} {
set value [expr $value * 1]
} elseif {$factor == "K"} {
set value [expr $value / 1024]
set value [expr {round($value)}]
} else {
set value [expr $value / (1024 * 1024)]
set value [expr {round($value)}]
}
return $value
}
#################################################
#
# NAME
# scale_to_ks - scales the value by the factor G|M|K to kilobytes
#
# SYNOPSIS
# scale_to_ks value factor
#
# DESCRIPTION
# scale the value by the factor G|M|K to kilobytes
#
# RETURN VALUE
# the scaled variable
#
#################################################
proc scale_to_ks { value factor } {
if {$factor == "G"} {
set value [expr $value * 1024 * 1024]
} elseif {$factor == "M"} {
set value [expr $value * 1024]
} elseif {$factor == "K"} {
set value [expr $value * 1]
} else {
set value [expr $value / 1024]
set value [expr {round($value)}]
}
return $value
}
############################################################
#
# NAME
# check_config_node_mem - checks that the nodes have memory configured
#
# SYNOPSIS
# check_config_node_mem
#
# RETURN VALUE
# true if all nodes have memory, false otherwise
#
############################################################
proc check_config_node_mem { } {
set nodes_dict [get_nodes]
dict for {node_name node_dict} $nodes_dict {
if [dict exists $node_dict "RealMemory"] {
if {[dict get $node_dict "RealMemory"] == 1} {
return false
}
} else {
log_warn "Parameter RealMemory not found on node $node_name"
return false
}
}
return true
}
################################################################
#
# NAME
# wait_for_node - waits for nodes in a partition to reach a certain state
#
# SYNOPSIS
# wait_for_node ?options? state num_nodes ?partition?
#
# DESCRIPTION
# Wait for a certain number of nodes in a partition to reach a certain
# state.
#
# OPTIONS
# -timeout <integer_number>
# time in seconds to wait for the node state before
# timing out (default is 3)
# -pollinterval <integer_number>
# time in seconds between each node state check (default is 1)
#
# ARGUMENTS
# state
# The node state to wait for
# num_nodes
# The number of nodes we want to be in the specified state
# partition
# Partition name (the default partition is used if not specified)
#
# RETURN VALUE
# RETURN_SUCCESS, or non-zero on failure
#
################################################################
proc wait_for_node args {
global sinfo number
set partition ""
set timeout 3
set poll_interval 1
set desired_state "PENDING"
set desired_reason_list ""
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-time* {set args [lassign $args - timeout]}
-poll* {set args [lassign $args - poll_interval]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count < 2} {
fail "Too few arguments ($argument_count): $args"
} else {
lassign $args state num_nodes
}
if {$argument_count == 3} { set partition [lindex $args 2] }
if {$argument_count > 3} {
fail "Too many arguments ($argument_count): $args"
}
set wait_time 0
set done 0
set cnt 0
set rt $::RETURN_SUCCESS
if {[string length $partition] == 0} {
set partition [default_partition]
}
while {$done != 1 && $wait_time < $timeout} {
set output [run_command_output -fail "$sinfo --noheader --partition $partition --state $state --format %D"]
regexp "$number" $output cnt
if {$num_nodes <= $cnt} {
set done 1
} else {
log_debug "Partition $partition has $cnt nodes idle and we want $num_nodes"
sleep $poll_interval
incr wait_time 1
}
}
if {$done != 1} {
set rt $::RETURN_ERROR
}
return $rt
}
#####################################################################
#
# NAME
# node_list_to_range - converts a TCL list into a Slurm hostlist using scontrol
#
# SYNOPSIS
# node_list_to_range nodes_list
#
# ARGUMENTS
#
# nodes_list
# a TCL list of node names
#
# RETURN VALUE
# the hostlist form returned by scrontrol show hostlist
#
#####################################################################
proc node_list_to_range {nodes_list} {
global scontrol
set comalist [join $nodes_list ,]
set hostlist [run_command_output -nolog -fail "$scontrol show hostlist $comalist"]
set hostlist [string trimright $hostlist "\r\n"]
}
################################################################
#
# NAME
# list_to_range - converts a list of integer numbers to a range expression
#
# SYNOPSIS
# list_to_range ?numeric_list?
#
# DESCRIPTION
# Collapse a numeric list to a range expression defined by the following
# EBNF:
# <expression> ::= <range> {, <range>}*
# <range> ::= <integer> - <integer> | <integer>
#
# RETURN VALUE
# a range expression representing the numeric elements on the given list.
#
################################################################
proc list_to_range {numeric_list} {
set node_range_expression [node_list_to_range $numeric_list]
set range_expression [regsub {^\[(.*)\]$} $node_range_expression {\1}]
return $range_expression
}
#####################################################################
#
# NAME
# node_range_to_list - converts a node range expression into a list of nodes using scontrol
#
# SYNOPSIS
# node_range_to_list node_range_expression
#
# ARGUMENTS
#
# node_range_expression
# a node range expression accepted by scontrol
#
# RETURN VALUE
# a list contining all node names expanded from the node range expression
#
#####################################################################
proc node_range_to_list {node_range_expression} {
global scontrol
set node_list [list]
set output [run_command_output -nolog -fail "$scontrol show hostnames $node_range_expression"]
foreach line [split $output "\n"] {
if {$line eq ""} {
break
}
lappend node_list $line
}
return $node_list
}
################################################################
#
# NAME
# range_to_list - converts a range expression into a list with the numbers of the range
#
# SYNOPSIS
# range_to_list ?range_expression?
#
# DESCRIPTION
# Expands a range expression defined by the following EBNF into a numeric list
# <expression> ::= <range> {, <range>}*
# <range> ::= <integer> - <integer> | <integer>
#
# RETURN VALUE
# Returns the list of integer numbers defined by the range_expression
#
################################################################
proc range_to_list {range_expression} {
set node_range_expression \[$range_expression\]
set range_list [node_range_to_list $node_range_expression]
return $range_list
}
#####################################################################
#
# NAME
# get_nodes_by_state - gets the list of node names in a given partition/states
#
# SYNOPSIS
# get_nodes_by_state partition states
#
# DESCRIPTION
# sinfo is used to list node names and states in the specified partition.
# This list of nodes is filtered to return only the nodes matching one of
# the requested states.
#
# ARGUMENTS
# partition
# partition to get nodes off
# states
# comma-separated list of allowed states
#
# RETURN VALUE
# list of nodes having the required state
#
#####################################################################
proc get_nodes_by_state {{states ""} {partition ""}} {
global sinfo
set node_list [list]
if {$partition eq ""} {
set partition [default_partition]
}
if {$states eq ""} {
set states "idle"
}
set output [run_command_output -fail "$sinfo -h -N -p $partition -o '%N' -e -t $states"]
foreach line [split $output "\n"] {
if {$line eq ""} { continue }
lappend node_list $line
}
return $node_list
}
#####################################################################
#
# NAME
# set_partition_maximum_time_limit - sets the maximum time limit in a given partition
#
# SYNOPSIS
# set_partition_maximum_time_limit partition limit
#
# RETURN VALUE
# RETURN_SUCCESS, or non-zero on error
#
#####################################################################
proc set_partition_maximum_time_limit {partition limit} {
global scontrol
if {[string length $partition] == 0} {
set partition [default_partition]
if { $partition == "" } {
return $::RETURN_ERROR
}
}
if { $limit < -1 } {
fail "Trying to set invalid partition time limit of $limit"
}
if { $limit == -1 } {
set expected_lim "UNLIMITED"
} else {
set expected_lim limit
}
run_command -fail "$scontrol update partitionname=$partition MaxTime=-1"
set maxtime [get_partition_maximum_time_limit $partition]
if { $maxtime != $limit } {
log_error "Unable to update partition MaxTime, got $maxtime, wanted $limit"
return $::RETURN_ERROR
}
return $::RETURN_SUCCESS
}
#####################################################################
#
# NAME
# get_partition_maximum_time_limit - gets the maximum time limit in a given partition
#
# SYNOPSIS
# get_partition_maximum_time_limit partition
#
# DESCRIPTION
# Get the maximum time limit in a given partition
#
# RETURN VALUE
# time limit in seconds, -1 if undefined or error
#
#####################################################################
proc get_partition_maximum_time_limit {partition} {
global sinfo number
if {[string length $partition] == 0} {
set partition [default_partition]
}
set secs 0
log_user 0
spawn -noecho $sinfo -h -p $partition -O time -e
expect {
-re "infinite" {
set secs -1
exp_continue
}
-re "n/a" {
set secs -1
exp_continue
}
-re "($number)-($number):($number):($number)" {
set days [expr $expect_out(1,string) * 24 * 60 * 60]
set hours [expr $expect_out(2,string) * 60 * 60]
set mins [expr $expect_out(3,string) * 60]
set secs [expr $days + $hours + $mins + $expect_out(4,string)]
exp_continue
}
-re "($number):($number):($number)" {
set hours [expr $expect_out(1,string) * 60 * 60]
set mins [expr $expect_out(2,string) * 60]
set secs [expr $hours + $mins + $expect_out(3,string)]
exp_continue
}
-re "($number):($number)" {
set mins [expr $expect_out(1,string) * 60]
set secs [expr $mins + $expect_out(2,string)]
exp_continue
}
-re "($number)" {
set secs [expr $expect_out(1,string) * 60]
exp_continue
}
timeout {
fail "sinfo not responding"
}
eof {
wait
}
}
log_user 1
return $secs
}
################################################################
#
# NAME
# get_partition_default_time_limit - gets the default time limit in a given partition
#
# SYNOPSIS
# get_partition_default_time_limit ?partition?
#
# DESCRIPTION
# Get the default time limit in a given partition.
# If the partition is not specified, the default partition will be used.
#
# RETURN VALUE
# Returns: time limit in seconds, -1 if undefined or error.
#
################################################################
proc get_partition_default_time_limit { {partition ""} } {
global sinfo number
if {[string length $partition] == 0} {
set partition [default_partition]
}
set secs 0
log_user 0
spawn -noecho $sinfo -h -p $partition -O defaulttime -e
expect {
-re "infinite" {
set secs -1
exp_continue
}
-re "n/a" {
set secs -1
exp_continue
}
-re "($number)-($number):($number):($number)" {
set days [expr $expect_out(1,string) * 24 * 60 * 60]
set hours [expr $expect_out(2,string) * 60 * 60]
set mins [expr $expect_out(3,string) * 60]
set secs [expr $days + $hours + $mins + $expect_out(4,string)]
exp_continue
}
-re "($number):($number):($number)" {
set hours [expr $expect_out(1,string) * 60 * 60]
set mins [expr $expect_out(2,string) * 60]
set secs [expr $hours + $mins + $expect_out(3,string)]
exp_continue
}
-re "($number):($number)" {
set mins [expr $expect_out(1,string) * 60]
set secs [expr $mins + $expect_out(2,string)]
exp_continue
}
-re "($number)" {
set secs [expr $expect_out(1,string) * 60]
exp_continue
}
timeout {
fail "sinfo not responding"
}
eof {
wait
}
}
log_user 1
return $secs
}
#####################################################################
#
# NAME
# get_node_cores - given a node, returns its total number of cores
#
# SYNOPSIS
# get_node_cores node
#
# DESCRIPTION
# Given a node, return its total number of cores
# (not the CoresPerSocket, but the total cores)
#
# RETURN VALUE
# node cores if retrieved, -1 otherwise
#
#####################################################################
proc get_node_cores {node} {
global sinfo number
set cores -1
set sockets_per_node 0
set cores_per_socket 0
if {[string length $node] == 0} {
return $cores
}
log_user 0
spawn -noecho $sinfo -o "%X %Y" -h -n $node
expect {
-re "($number)" {
if {$sockets_per_node == 0} {
set sockets_per_node $expect_out(1,string)
} else {
set cores_per_socket $expect_out(1,string)
}
exp_continue
}
timeout {
fail "sinfo not responding"
}
eof {
wait
}
}
log_user 1
set cores [expr $sockets_per_node * $cores_per_socket]
return $cores
}
#####################################################################
#
# NAME
# get_node_cpus - given a node, returns its total number of threads we account for
#
# SYNOPSIS
# get_node_cpus node
#
# DESCRIPTION
# Given a node, return its total number of threads we account for.
# (not always ThreadsPerCore, but how many threads are in use.
# i.e. CPUs=6 CoresPerSocket=6 ThreadsPerCore=2 Socket=1 would
# result in only 1 thread we care about instead of the 2 listed.)
#
# RETURN VALUE
# list of node [ tot_cpus threads ] if retrieved, [ -1 -1 ] otherwise
#
#####################################################################
proc get_node_cpus {node} {
global scontrol number
set nthreads -1
set nsockets 0
set ncores 0
set totcpus -1
if {[string length $node] == 0} {
return [list $totcpus $nthreads]
}
# Get the number of CPUs on a node
spawn $scontrol show node $node
expect {
-re "CoresPerSocket=($number)" {
set ncores $expect_out(1,string)
exp_continue
}
-re "CPUTot=($number)" {
set totcpus $expect_out(1,string)
exp_continue
}
-re "Sockets=($number)" {
set nsockets $expect_out(1,string)
exp_continue
}
-re "ThreadsPerCore=($number)" {
set nthreads $expect_out(1,string)
exp_continue
}
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
set core_cnt [expr $nsockets * $ncores]
set thread_cnt [expr $ncores * $nthreads]
if {$totcpus != $nthreads && $totcpus == $ncores} {
log_debug "Cores rather than threads are being allocated"
set nthreads 1
}
return [list $totcpus $nthreads]
}
#####################################################################
#
# NAME
# get_part_total_cores - given a partition and/or states, return its total cores
#
# SYNOPSIS
# get_part_total_cores partition states
#
# DESCRIPTION
# Given a partition and/or states, return its total cores
#
# ARGUMENTS
# partition
# partition to check cores
# states
# states to filter on partition cores
#
# RETURN VALUE
# partition cores
#
#####################################################################
proc get_part_total_cores {part states} {
global sinfo number
set cores 0
set tmp 0
set i 0
if {[string length $part] == 0} {
set part [default_partition]
}
log_user 0
if {[string length $states] == 0} {
spawn -noecho $sinfo -h -N -p $part -o "%X %Y"
} else {
spawn -noecho $sinfo -h -N -p $part -t $states -o "%X %Y"
}
expect {
-re "($number)" {
set is_even [expr {($i % 2) == 0}]
if {$is_even == 1} {
set tmp $expect_out(1,string)
} else {
set tmp [expr $tmp * $expect_out(1,string)]
set cores [expr $cores + $tmp]
}
incr i
exp_continue
}
timeout {
fail "sinfo not responding"
}
eof {
wait
}
}
log_user 1
return $cores
}
#####################################################################
#
# NAME
# check_hosts_contiguous - verify if all hosts belong to the partition and are contiguous
#
# SYNOPSIS
# check_hosts_contiguous check_hosts_list partition
#
# DESCRIPTION
# Given a partition and a list of hosts, verify if all
# hosts belong to the partition and are contiguous.
# If the partition argument is empty, the default partition
# will be used.
#
# RETURN VALUE
# Returns: true if hosts are contiguous, false otherwise.
#
#####################################################################
proc check_hosts_contiguous { check_hosts_list {partition ""} } {
global sinfo re_word_str
if {[string length $partition] == 0} {
set partition [default_partition]
}
set part_hosts_list {}
log_user 0
spawn $sinfo --noheader -p $partition -N -o %N
expect {
-re "($re_word_str)" {
lappend part_hosts_list $expect_out(1,string)
exp_continue
}
-re "Unable to contact" {
fail "Slurm appears to be down"
}
timeout {
fail "sinfo not responding"
}
eof {
wait
}
}
log_user 1
foreach host $check_hosts_list {
set idx_cur [lsearch $part_hosts_list $host]
if {$idx_cur == -1} {
fail "Host ($host) not found in list of hosts from partition $partition"
}
if {[info exists idx_old]} {
if {$idx_cur != [expr $idx_old + 1]} {
log_error "Node sequence number not contiguous"
return false
}
}
set idx_old $idx_cur
}
return true
}
################################################################
#
# NAME
# get_het_job_ids - gets list of component job ids for a het job
#
# SYNOPSIS
# get_het_job_ids job_id ?use_offset?
#
# DESCRIPTION
# Gets list of component job ids for a het job.
#
# ARGUMENTS
# job_id
# Slurm job id
# use_offset
# If zero, returns list of integer job ids, else returns ids in
# the form of X+Y where X is het job master id and Y is the
# offset.
#
# RETURN VALUE
# A list of ids for a hetjob or an empty list if jobid
# is not a het one.
#
################################################################
proc get_het_job_ids { jobid {use_offset 0}} {
global scontrol number
set id_list ""
set log_user_save [log_user -info]
log_user 0
spawn $scontrol show job $jobid
expect {
-re "JobId=($number) HetJobId=($number) HetJobOffset=($number)" {
if { $use_offset } {
lappend id_list "$expect_out(2,string)+$expect_out(3,string)"
} else {
lappend id_list $expect_out(1,string)
}
exp_continue
}
timeout {
fail "scontrol not responding"
}
eof {
wait
}
}
log_user $log_user_save
return $id_list
}
################################################################
#
# NAME
# reconfigure - calls scontrol reconfigure
#
# SYNOPSIS
# reconfigure ?options? ?cluster?
#
# DESCRIPTION
# Calls scontrol reconfigure. This routine takes the same options as
# run_command, passing them to the underlying run_command invocation.
# This command waits an additional 5 seconds before returning.
#
# OPTIONS
# See OPTIONS of run_command proc.
#
# ARGUMENTS
# cluster
# The cluster to reconfigure
#
# RETURN VALUE
# RETURN_SUCCESS on success, otherwise RETURN_ERROR
#
################################################################
proc reconfigure args {
global scontrol
set options [list]
set cluster ""
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-* {
lappend options {*}[lrange $args 0 1]
set args [lrange $args 2 end]
}
default break
}
}
set argument_count [llength $args]
if {$argument_count > 1} {
fail "Too many arguments ($argument_count): $args"
} elseif {$argument_count == 1} {
lassign $args cluster
}
set command $scontrol
if {$cluster ne ""} {
append command " -M$cluster"
}
append command " reconfigure"
set rc [run_command_status {*}$options "$command"]
#
# Wait 5 seconds for reconfigure to complete, then return.
#
sleep 5
return $rc
}
#####################################################################
#
# NAME
# log_fatal - prints a fatal message
#
# SYNOPSIS
# log_fatal message
#
# SEE ALSO
# _log_format for options governing the message format and colorization
#
#####################################################################
proc log_fatal {message} {
global testsuite_log_level LOG_LEVEL_FATAL
if {$testsuite_log_level >= $LOG_LEVEL_FATAL} {
_log_format "fatal" "$message"
}
}
#####################################################################
#
# NAME
# log_error - prints an error message
#
# SYNOPSIS
# log_error message
#
# SEE ALSO
# _log_format for options governing the message format and colorization
#
#####################################################################
proc log_error {message} {
global testsuite_log_level LOG_LEVEL_ERROR
if {$testsuite_log_level >= $LOG_LEVEL_ERROR} {
_log_format "error" "$message"
}
}
#####################################################################
#
# NAME
# log_warn - prints a warning message
#
# SYNOPSIS
# log_warn message
#
# SEE ALSO
# _log_format for options governing the message format and colorization
#
#####################################################################
proc log_warn {message} {
global testsuite_log_level LOG_LEVEL_WARNING
if {$testsuite_log_level >= $LOG_LEVEL_WARNING} {
_log_format "warning" "$message"
}
}
#####################################################################
#
# NAME
# log_info - prints an information message
#
# SYNOPSIS
# log_info message
#
# SEE ALSO
# _log_format for options governing the message format and colorization
#
#####################################################################
proc log_info {message} {
global testsuite_log_level LOG_LEVEL_INFO
if {$testsuite_log_level >= $LOG_LEVEL_INFO} {
_log_format "info" "$message"
}
}
#####################################################################
#
# NAME
# log_pass - prints a pass level message
#
# SYNOPSIS
# log_pass message
#
# SEE ALSO
# _log_format for options governing the message format and colorization
#
#####################################################################
proc log_pass {message} {
global testsuite_log_level LOG_LEVEL_PASS
if {$testsuite_log_level >= $LOG_LEVEL_PASS} {
_log_format "pass" "$message"
}
}
#####################################################################
#
# NAME
# log_command - prints a command level message
#
# SYNOPSIS
# log_command message
#
# SEE ALSO
# _log_format for options governing the message format and colorization
#
#####################################################################
proc log_command {message} {
global testsuite_log_level LOG_LEVEL_COMMAND
if {$testsuite_log_level >= $LOG_LEVEL_COMMAND} {
_log_format "command" "$message"
}
}
#####################################################################
#
# NAME
# log_debug - prints a debug level message
#
# SYNOPSIS
# log_debug message
#
# SEE ALSO
# _log_format for options governing the message format and colorization
#
#####################################################################
proc log_debug {message} {
global testsuite_log_level LOG_LEVEL_DEBUG
if {$testsuite_log_level >= $LOG_LEVEL_DEBUG} {
_log_format "debug" "$message"
}
}
#####################################################################
#
# NAME
# log_trace - prints a trace level message
#
# SYNOPSIS
# log_trace message
#
# SEE ALSO
# _log_format for options governing the message format and colorization
#
#####################################################################
proc log_trace {message} {
global testsuite_log_level LOG_LEVEL_TRACE
if {$testsuite_log_level >= $LOG_LEVEL_TRACE} {
_log_format "trace" "$message"
}
}
################################################################
#
# NAME
# in_fed - checks whether this cluster is in a federation
#
# SYNOPSIS
# in_fed
#
# RETURN VALUE
# Returns true if this cluster is in a federation, false otherwise
#
################################################################
proc in_fed {} {
global scontrol
set output [run_command_output -fail -nolog "$scontrol show fed"]
if {[regexp "Federation" $output]} {
return true
}
return false
}
################################################################
#
# NAME
# check_job_state - checks if the state of a job is the expected one
#
# SYNOPSIS
# check_job_state job state ?het_job?
#
# DESCRIPTION
# Checks if the state of a job is the expected one.
#
# ARGUMENTS
# job
# Job ID to check
# state
# Desired state of the job to match
# het_job
# If set, checks state of each component job if the
# job is a hetjob.
#
# RETURN VALUE
# true if job was on the desired state, or the number of job components
# on that state if it's a hetjob and het_job option enabled, false
# otherwise.
#
################################################################
proc check_job_state { job state {het_job 0}} {
global scontrol
set jid_list ""
if { $het_job } {
set jid_list [get_het_job_ids $job 1]
}
if { $jid_list == "" } {
# non-het job
set jid_list $job
}
foreach jid $jid_list {
set state_match 0
spawn $scontrol show job $jid
expect {
-re "JobState=($state)" {
incr state_match
}
timeout {
fail "scontrol not responding"
}
eof {
wait
}
}
if {$state_match != 1} {
log_error "job $jid should be in $state state, but is not"
return false
}
}
return true
}
################################################################
#
# NAME
# get_gres_count - returns a dict of nodes and GRES counts
#
# SYNOPSIS
# get_gres_count gres_name ?node_list?
#
# DESCRIPTION
# Returns a dict of node names and the count of a specifed
# GRES aggregating all its types on each node.
#
# RETURN VALUE
# If the node_list is not specified node name is specified,
# this function will return a dict with the GRES count for all
# the nodes of the default partition.
# If specified, a dict only with the nodes of the node_list.
#
################################################################
proc get_gres_count { gres_name {node_list ""} } {
set nodes_dict [get_nodes $node_list]
set nodes_gres_dict [dict create]
dict for {node_name node_dict} $nodes_dict {
if [dict exists $node_dict "Gres"] {
set gres_param [dict get $node_dict "Gres"]
set gres_dict [count_gres $gres_param]
}
if [dict exists $gres_dict $gres_name] {
set gres_count [dict get $gres_dict $gres_name]
dict set nodes_gres_dict $node_name $gres_count
}
}
return $nodes_gres_dict
}
################################################################
#
# NAME
# count_gres - returns a dict of GRES names and their total counts
#
# SYNOPSIS
# count_gres gres_param
#
# DESCRIPTION
# Parses a GRES parameter string typically obtained from nodes or
# jobs info, and returns a dict of GRES names and their count
# aggregating all the types of each GRES.
#
# ARGUMENTS
# gres_param
# The usual coma-separated list of Gres (e.g gpu:2,craynetwork:1).
# only_consumable
# If true, the no_consume gres are ignored.
#
# RETURN VALUE
# A dict of GRES names and their count aggregating all types of
# each GRES.
#
################################################################
proc count_gres {gres_param {only_consumable false}} {
global gres_regex
set gres_dict [dict create]
foreach gres [split $gres_param ","] {
if {[regexp $gres_regex $gres {} name type nc count] == 1} {
if {$only_consumable && ($type eq "no_consume" || $nc eq "no_consume")} {
continue
} elseif {$nc eq ""} {
set count $type
} elseif {$count eq ""} {
set count $nc
}
if {[dict exists $gres_dict $name]} {
dict set gres_dict $name [expr [dict get $gres_dict $name] + $count]
} else {
dict set gres_dict $name $count
}
}
}
return $gres_dict
}
################################################################
#
# NAME
# get_highest_gres_count - returns highest number of GRES per node on node_count nodes
#
# SYNOPSIS
# get_highest_gres_count node_count gres_name
#
# DESCRIPTION
# For a given number of nodes, returns the highest GRES count per
# node available on at least that number of nodes.
#
# EXAMPLE
# For example: node1 has 1 GPU, node2 has 2 GPUs and node3 has 3 GPUs
# [get_highest_gres_count 1 "gpu"] returns 3 (i.e. 1 node 3 GPUs)
# [get_highest_gres_count 2 "gpu"] returns 2 (i.e. 2 nodes have at least 2 GPUs each)
# [get_highest_gres_count 3 "gpu"] returns 1 (i.e. 3 nodes have at least 1 GPU each)
#
################################################################
proc get_highest_gres_count { node_count gres_name } {
set available_nodes [node_list_to_range [get_nodes_by_state]]
set gres_dict [get_gres_count $gres_name $available_nodes]
set gres_count [list]
dict for {node gres} $gres_dict {
lappend gres_count $gres
}
set count [lindex [lsort -decreasing -integer $gres_count] [expr $node_count - 1]]
return $count
}
################################################################
#
# NAME
# _set_gpu_socket_inx - adds a socket index to the gpu_sock_list if not already on it
#
# SYNOPSIS
# _set_gpu_socket_inx sock_inx
#
# DESCRIPTION
# Add a socket index to the array gpu_sock_list if not already
# on the list. Subroutine used by get_gpu_socket_count
#
################################################################
proc _set_gpu_socket_inx { sock_inx } {
global gpu_sock_list
if {$sock_inx == -1} {
set gpu_sock_list [lreplace $gpu_sock_list 0 99]
return
}
set sock_cnt [llength $gpu_sock_list]
for {set i 0} {$i < $sock_cnt} {incr i} {
if {[lindex $gpu_sock_list $i] == $sock_inx} {
return
}
}
lappend gpu_sock_list $sock_inx
}
################################################################
# Subroutine used by get_gpu_socket_count
# Add a socket index to the array gpu_sock_list if not already
# on the list.
################################################################
proc _set_gpu_socket_range { sock_first_inx sock_last_inx } {
global gpu_sock_list
set sock_cnt [llength $gpu_sock_list]
for {set s $sock_first_inx} {$s <= $sock_last_inx} {incr s} {
set found 0
for {set i 0} {$i < $sock_cnt} {incr i} {
if {[lindex $gpu_sock_list $i] == $s} {
set found 1
break
}
}
if {$found == 0} {
lappend gpu_sock_list $s
}
}
}
################################################################
#
# NAME
# get_gpu_socket_count - returns the number of sockets with GPUS on a node with the given per-node GPU count
#
# SYNOPSIS
# get_gpu_socket_count gpu_cnt sockets_per_node
#
# DESCRIPTION
# Given a per-node GPU count, return the number of sockets with
# GPUs on a node with the given per-node GPU count.
# If the sockets_per_node has a value of 1 then just return 1
# rather than determine the count (for performance reasons).
#
################################################################
proc get_gpu_socket_count { gpu_cnt sockets_per_node } {
global test_dir re_word_str bin_rm number scontrol srun gpu_sock_list
set sockets_with_gpus 1
set file_in "$test_dir/test_get_gpu_socket_count"
if {$sockets_per_node == 1} {
return 1
}
log_user 0
_set_gpu_socket_inx -1
make_bash_script $file_in "$scontrol show node \$SLURMD_NODENAME"
spawn $srun -N1 --gres=gpu:$gpu_cnt $file_in
expect {
-re "gpu:${number}.S:($number)-($number)" {
_set_gpu_socket_range $expect_out(1,string) $expect_out(2,string)
exp_continue
}
-re "gpu:${re_word_str}:${number}.S:($number),($number),($number),($number)" {
_set_gpu_socket_inx $expect_out(1,string)
_set_gpu_socket_inx $expect_out(2,string)
_set_gpu_socket_inx $expect_out(3,string)
_set_gpu_socket_inx $expect_out(4,string)
exp_continue
}
-re "gpu:${re_word_str}:${number}.S:($number),($number),($number)" {
_set_gpu_socket_inx $expect_out(1,string)
_set_gpu_socket_inx $expect_out(2,string)
_set_gpu_socket_inx $expect_out(3,string)
exp_continue
}
-re "gpu:${re_word_str}:${number}.S:($number),($number)" {
_set_gpu_socket_inx $expect_out(1,string)
_set_gpu_socket_inx $expect_out(2,string)
exp_continue
}
-re "gpu:${re_word_str}:${number}.S:($number)" {
_set_gpu_socket_inx $expect_out(1,string)
exp_continue
}
-re "gpu:${number}.S:($number),($number),($number),($number)" {
_set_gpu_socket_inx $expect_out(1,string)
_set_gpu_socket_inx $expect_out(2,string)
_set_gpu_socket_inx $expect_out(3,string)
_set_gpu_socket_inx $expect_out(4,string)
exp_continue
}
-re "gpu:${number}.S:($number),($number),($number)" {
_set_gpu_socket_inx $expect_out(1,string)
_set_gpu_socket_inx $expect_out(2,string)
_set_gpu_socket_inx $expect_out(3,string)
exp_continue
}
-re "gpu:${number}.S:($number),($number)" {
_set_gpu_socket_inx $expect_out(1,string)
_set_gpu_socket_inx $expect_out(2,string)
exp_continue
}
-re "gpu:${number}.S:($number)" {
_set_gpu_socket_inx $expect_out(1,string)
exp_continue
}
eof {
wait
}
}
log_user 1
set sock_cnt [llength $gpu_sock_list]
if {$sock_cnt > 1} {
set sockets_with_gpus $sock_cnt
}
return $sockets_with_gpus
}
################################################################
#
# NAME
# get_highest_mps_count - get_highest_gres_count nodes mps, but for "mps per GPU"
#
# SYNOPSIS
# get_highest_mps_count node_count
#
# DESCRIPTION
# For a given number of nodes, returns the higest number of MPS per GPU
# available at least on those number of nodes.
#
################################################################
proc get_highest_mps_count { node_count } {
# We cannot use get_highest_gres_count because we need "per gpu",
# so we get all the mps per node and all gpus per node, to create
# a mps_per_gpu list to sort and get the count.
set available_nodes [node_list_to_range [get_nodes_by_state]]
set mps_dict [get_gres_count "mps" $available_nodes]
set gpu_dict [get_gres_count "gpu" $available_nodes]
set mps_per_gpu [list]
dict for {node mps} $mps_dict {
if { $mps > 0 } {
if [dict exists $gpu_dict $node] {
set gpu [dict get $gpu_dict $node]
if { $gpu > 0 } {
lappend mps_per_gpu [expr $mps / $gpu]
} else {
fail "All nodes with MPS should have a GPU"
}
} else {
fail "All nodes with MPS should have a GPU"
}
}
}
set count [lindex [lsort -decreasing -integer $mps_per_gpu] [expr $node_count - 1]]
return $count
}
################################################################
#
# NAME
# get_mps_node_count - gets the number of nodes with a positive number of GRES MPS
#
# SYNOPSIS
# get_mps_node_count
#
# RETURN VALUE
# Return the count of nodes with a non-zero count of GRES MPS
#
################################################################
proc get_mps_node_count { } {
global number sinfo re_word_str
set fini 0
set node_inx 0
set def_part [default_partition]
log_user 0
spawn $sinfo -N -p$def_part -oGRES=%G -h
expect {
-re "GRES=($re_word_str)" {
set mps_count 0
set parts [split $expect_out(1,string) ",/"]
while 1 {
set mps_found [lsearch $parts "mps*"]
if { $mps_found == -1 } break
set parts2 [split [lindex $parts $mps_found] ":(/"]
set col [lsearch -regexp $parts2 ^$number$]
if { $col == -1 } {
incr mps_count
} else {
set mps_count [expr $mps_count + [lindex $parts2 $col]]
}
set parts [lreplace $parts $mps_found $mps_found]
}
if {$mps_count > 0} {
incr node_inx
}
exp_continue
}
eof {
wait
}
}
log_user 1
return $node_inx
}
################################################################
#
# NAME
# cuda_count - determines the count of allocated GPUs
#
# SYNOPSIS
# cuda_count cuda_string
#
# ARGUMENTS
# cuda_string
# Contents of a CUDA_VISIBLE_DEVICES environment variable
#
# RETURN VALUE
# Return the number of GPUs or -1 on error
#
################################################################
proc cuda_count { cuda_string } {
set cuda_count 0
set has_number 0
set len [string length $cuda_string]
for {set char_inx 0} {$char_inx < $len} {incr char_inx} {
set cuda_char [string index $cuda_string $char_inx]
if {[string match , $cuda_char]} {
if {$has_number > 0} {
incr cuda_count
set has_number 0
} else {
log_error "Invalid input ($cuda_string)"
return -1
}
} elseif {[string is digit $cuda_char]} {
set has_number 1
}
}
if {$has_number > 0} {
incr cuda_count
} else {
log_error "Invalid input ($cuda_string)"
return -1
}
return $cuda_count
}
################################################################
# NAME
# get_conf_path - gets the path to the slurm.conf file
#
# SYNOPSIS
# get_conf_path
#
# RETURN VALUE
# Returns the path to the slurm.conf file
#
################################################################
proc get_conf_path { } {
global scontrol re_word_str eol
if [regexp {(.*)/slurm.conf} [get_config_param "SLURM_CONF"] {} config_dir] {
return $config_dir
} else {
fail "Unable to determine config dir"
}
}
################################################################
#
# NAME
# save_conf - saves a backup of the specfied configuration file
#
# SYNOPSIS
# save_conf file_name
#
# DESCRIPTION
# If the specified file_name exists, a backup is made which will be
# restored when restore_conf is called.
# If the specified file_name does not exist, a special backup will be
# made that will cause the file to be removed when restore_conf is
# called.
# If a backup already exists, a warning is issued and no backup is made
# (honoring the existing backup).
#
# SEE ALSO
# restore_conf
#
################################################################
proc save_conf { file_name } {
global bin_chmod bin_cp bin_mv bin_touch test_name
log_debug "Saving backup of $file_name"
#
# Check for existing backup
# If a backup exists, issue a warning and return (honor existing backup)
#
set conf_dir [file dirname $file_name]
set dir_files [glob -nocomplain -directory $conf_dir *]
set preexisting_backup_file [lsearch -inline -regexp $dir_files "$file_name\\\.test\\d+\\\.\\d+\$"]
if {$preexisting_backup_file ne ""} {
log_warn "Backup file already exists: ($preexisting_backup_file)"
return
}
#
# Check if file to backup exists.
# If it doesn't exist, warn the user, touch an empty backup file with
# the sticky bit set and allow the test to continue.
# restore_conf will remove the file.
#
set new_backup_file "$file_name.$test_name"
if {![file exists $file_name]} {
log_warn "Backup of a nonexistent file requested: $file_name"
run_command -fail -nolog "$bin_touch $new_backup_file"
run_command -fail -nolog "$bin_chmod +t $new_backup_file"
return
}
run_command -fail -nolog "$bin_mv $file_name $new_backup_file"
run_command -fail -nolog "$bin_cp $new_backup_file $file_name"
}
################################################################
#
# NAME
# restore_conf - restores the original confiration file from backup
#
# SYNOPSIS
# restore_conf file_name
#
# DESCRIPTION
# If a backup exists for the specified file_name, it is restored.
# If the specified file_name did not exist when originally backed up,
# it will be removed.
# If no backup exists, a warning is issued.
#
# SEE ALSO
# save_conf
#
################################################################
proc restore_conf { file_name } {
global bin_mv bin_rm
log_debug "Restoring backup of $file_name"
set conf_dir [file dirname $file_name]
set dir_files [glob -nocomplain -directory $conf_dir *]
set backup_file [lsearch -inline -regexp $dir_files "$file_name\\\.test\\d+\\\.\\d+\$"]
if {$backup_file ne ""} {
file stat $backup_file stat
# If the sticky bit is set and the file is empty, remove both
if {! $stat(size) && [expr $stat(mode) & 512]} {
log_debug "Removing file used for the test: $file_name"
run_command -fail -nolog "$bin_rm -f $backup_file $file_name"
# Else replace the original with the backup
} else {
run_command -fail -nolog "$bin_mv $backup_file $file_name"
}
} else {
#
# If backup file doesn't exist, it has probably already been
# restored by a previous call to restore_conf
#
log_warn "Backup file does not exist for $file_name. It has probably already been restored"
return
}
}
################################################################
#
# NAME
# have_nvml - checks if HAVE_NVML is set in config.h
#
# SYNOPSIS
# have_nvml
#
# RETURN VALUE
# Returns true if HAVE_NVML is set in config.h. Else, returns false
#
################################################################
proc have_nvml { } {
global bin_grep number config_h
return [expr [run_command_status -none -nolog "$bin_grep \"HAVE_NVML 1\" $config_h"] == 0]
}
################################################################
#
# NAME
# delete_part - deletes partition on system
#
# SYNOPSIS
# delete_part partition
#
################################################################
proc delete_part { part_name } {
global scontrol
# Remove part
spawn $scontrol delete partition=$part_name
expect {
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
}
################################################################
#
# NAME
# have_lua - checks if HAVE_LUA is set in config.h
#
# SYNOPSIS
# have_lua
#
# RETURN VALUE
# Returns true if HAVE_LUA is set in config.h. Else, returns false
#
################################################################
proc have_lua { } {
global bin_grep config_h
return [expr [run_command_status -none -nolog "$bin_grep HAVE_LUA $config_h"] == 0]
}
################################################################
#
# NAME
# get_reservations - returns a dictionary of dictionaries of reservation parameters
#
# SYNOPSIS
# get_reservations ?resv_name?
#
# RETURN VALUE
# Uses `scontrol show reservation` to return a dictionary of dictionaries
# of job parameters. Specifying an invalid resv_name result in a failure.
#
################################################################
proc get_reservations { {resv_name ""} } {
global scontrol
set command "$scontrol show reservation -o"
if {$resv_name ne ""} {
append command " $resv_name"
}
set output [run_command_output -fail "$command"]
# Iterate over each reservation's parameter list
foreach line [split $output "\n"] {
if {$line eq ""} { continue }
# Peel off the resv parameters one at a time
# The first quantifier sets the greediness for the whole RE
while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
# Remove the consumed parameter from the line
set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]
# Add it to the temporary job dictionary
dict set resv_dict $param_name $param_value
}
set resv_name_dict [dict get $resv_dict "ReservationName"]
# Add the resv dictionary to resvs dictionary
dict set resvs_dict $resv_name_dict $resv_dict
# Clear the resv dictionary for the next resv
set resv_dict {}
}
return $resvs_dict
}
################################################################
#
# NAME
# get_reservation_param - returns a specific parameter value for a specific reservation
#
# SYNOPSIS
# get_reservation_param resv_name parameter_name
#
# DESCRIPTION
# Returns a specific parameter value for a specified reservation if the
# parameter exists for the reservation, or MISSING if it does not exist.
# Specifying an invalid reservation name will result in a failure.
#
################################################################
proc get_reservation_param { resv_name parameter_name } {
set resvs_dict [get_reservations $resv_name]
if [dict exists $resvs_dict $resv_name $parameter_name] {
return [dict get $resvs_dict $resv_name $parameter_name]
} else {
return "MISSING"
}
}
################################################################
#
# NAME
# create_res - create new reservation in system
#
# SYNOPSIS
# create_res ?res_name? ?res_params?
#
# RETURN VALUE
# the exit code of the scontrol command run
#
################################################################
proc create_res { res_name res_params } {
global scontrol
set result [run_command "$scontrol create res ReservationName=$res_name $res_params"]
set output [dict get $result output]
set ret_code [dict get $result exit_code]
if { $ret_code } {
log_warn "[lindex [info level 0] 0]: error from scontrol: $output"
} else {
log_debug "[lindex [info level 0] 0]: success from scontrol: $output"
}
return $ret_code
}
################################################################
#
# NAME
# update_res - update exisiting reservation in system
#
# SYNOPSIS
# update_res ?res_name? ?res_params?
#
# RETURN VALUE
# the exit code of the scontrol command run
#
################################################################
proc update_res { res_name res_params } {
global scontrol
set result [run_command "$scontrol update ReservationName=$res_name $res_params"]
set output [dict get $result output]
set ret_code [dict get $result exit_code]
if { $ret_code } {
log_warn "Return code from scontrol: $ret_code. Output: $output"
}
return $ret_code
}
################################################################
#
# NAME
# delete_res - delete reservation from system
#
# SYNOPSIS
# delete_res ?res_name?
#
# RETURN VALUE
# the exit code of the scontrol command run
#
################################################################
proc delete_res { res_name } {
global scontrol
set result [run_command "$scontrol delete ReservationName=$res_name"]
set output [dict get $result output]
set ret_code [dict get $result exit_code]
if { $ret_code } {
log_warn "Return code from scontrol: $ret_code. Output: $output"
}
return $ret_code
}
################################################################
#
# NAME
# create_part - creates a partition
#
# SYNOPSIS
# create_part partition num_nodes
#
# ARGUMENTS
# partition
# Name of partition to create
# num_nodes
# Number of nodes of partition to create
#
# RETURN VALUE
# RETURN_SUCCESS, or non-zero on error
#
################################################################
proc create_part { part_name num_nodes_in } {
global scontrol srun bin_printenv number re_word_str
set nodes ""
set num_nodes_out 0
set found 0
spawn $scontrol show partitionname=$part_name
expect {
-re "PartitionName=$part_name" {
set found 1
exp_continue
}
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
if {$found == 1} {
log_error "There is already a partition $part_name"
return $::RETURN_ERROR
}
if {[string length [default_partition]] == 0} {
log_warn "create_part does not work without a default partition"
return $::RETURN_ERROR
}
if { $num_nodes_in } {
set num_nodes $num_nodes_in
} else {
set num_nodes [llength [get_nodes_by_state]]
}
log_user 0
# Get a list of nodes
spawn $srun -t1 -N1-$num_nodes $bin_printenv
expect {
-re "SLURM_JOB_NUM_NODES=($number)" {
set num_nodes_out $expect_out(1,string)
exp_continue
}
-re "SLURM_NODELIST=($re_word_str)" {
set nodes $expect_out(1,string)
exp_continue
}
timeout {
fail "srun is not responding getting number of nodes creating part"
}
eof {
wait
}
}
if {[string length $nodes] == 0} {
log_error "Did not get a valid node list"
return $::RETURN_ERROR
} elseif { $num_nodes_out != $num_nodes_in } {
log_error "Did not get enough nodes ($num_nodes_out != $num_nodes_in) to run test"
return $::RETURN_ERROR
}
spawn $scontrol create partitionname=$part_name nodes=$nodes
expect {
timeout {
fail "scontrol is not responding creating partition"
}
eof {
wait
}
}
set found 0
spawn $scontrol show partitionname=$part_name
expect {
-re "PartitionName=$part_name" {
set found 1
exp_continue
}
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
if { $found == 0 } {
log_error "scontrol did not create partition $part_name"
return $::RETURN_ERROR
}
log_user 1
log_debug "Created partition $part_name with $num_nodes_in nodes"
return $::RETURN_SUCCESS
}
################################################################
#
# NAME
# get_nodes - returns a dictionary of dictionaries of node parameters
#
# SYNOPSIS
# get_nodes ?hostlist_expression?
#
# DESCRIPTION
# Uses `scontrol show node` to query node parameters, returning a
# dictionary of dictionaries with the node names as keys of the first
# level dictionary and with the parameters as keys of the second level
# dictionary. Specifying an invalid node name will result in a failure.
#
# RETURN VALUE
# If the optional node expression argument is specified, the result will
# be constrained by the specified hostlist expression. Otherwise, the
# results for all nodes will be returned.
#
################################################################
proc get_nodes { {hostlist_expression ""} } {
global scontrol
set command "$scontrol show node -o"
if {$hostlist_expression ne ""} {
append command " $hostlist_expression"
}
set output [run_command_output -fail -nolog "$command"]
# Iterate over each node parameter line
foreach line [split $output "\n"] {
if {$line eq ""} { continue }
# Peel off the node parameters one at a time
# The first quantifier sets the greediness for the whole RE
while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
# Remove the consumed parameter from the line
set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]
# Add it to the temporary node dictionary
dict set node_dict $param_name $param_value
}
set node_name [dict get $node_dict "NodeName"]
# Add the node dictionary to nodes dictionary
dict set nodes_dict $node_name $node_dict
# Clear the node dictionary for use by the next node
set node_dict {}
}
return $nodes_dict
}
################################################################
#
# NAME
# get_node_param - returns a specific parameter value for a specific node
#
# SYNOPSIS
# get_node_param node_name parameter_name
#
# DESCRIPTION
# Returns a specific parameter value for a specified node if the
# parameter exists for the node, or MISSING if it does not exist.
# Specifying an invalid node name will result in a failure.
#
################################################################
proc get_node_param { node_name parameter_name } {
set nodes_dict [get_nodes $node_name]
if [dict exists $nodes_dict $node_name $parameter_name] {
return [dict get $nodes_dict $node_name $parameter_name]
} else {
return "MISSING"
}
}
################################################################
#
# NAME
# get_nodes_by_request - get a list of nodes satisfying requested resources
#
# SYNOPSIS
# get_nodes_by_request ?options? ?request_args?
#
# DESCRIPTION
# Using srun (optionally with the specified arguments), returns a list
# of nodes having the requested resources.
# If an error occurs, the invoking test will fail.
#
# OPTIONS
# -fail
# fail the test if the execution of srun results in an error or timeout
#
# ARGUMENTS
# request_args
# Desired resources of a node in form of srun arguments,
# e.g. "--gres=gpu:1 -n1 -t1"
#
# RETURN VALUE
# A list of nodes with at least the requested resources, or an empty
# list otherwise.
#
################################################################
proc get_nodes_by_request args {
global srun
set options [list]
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fail {
lappend options [lrange $args 0 0]
set args [lrange $args 1 end]
}
default break
}
}
if {[llength $args] == 1} {
lassign $args request_args
} elseif {[llength $args] == 0} {
set request_args "-n1 -t1"
} else {
fail "[lindex [info level 0] 0]: Invalid number of arguments ([llength $args]): $args"
}
log_debug "Getting nodes that can be allocated with request: $request_args"
set command "$srun -Q $request_args printenv SLURMD_NODENAME"
set result [run_command {*}$options $command]
if [dict get $result exit_code] {
return {}
}
set output [dict get $result output]
foreach line [split $output "\n"] {
if {$line eq ""} { continue }
dict incr allocated_nodes $line
}
return [lsort [dict keys $allocated_nodes]]
}
################################################################
#
# NAME
# get_partitions - returns a dictionary of dictionaries of partition parameters
#
# SYNOPSIS
# get_partitions ?partition_name?
#
# DESCRIPTION
# Uses `scontrol show partitions` to query partition parameters,
# returning a dictionary of dictionaries with the partition names
# as keys of the first level dictionary and with the parameters as
# keys of the second level dictionary. Specifying an invalid partition
# name will result in a failure.
#
# RETURN VALUE
# If the optional partition_name argument is specified, the result will
# contain only the one patition. Otherwise, the results for all
# partitions will be returned.
#
################################################################
proc get_partitions { {partition_name ""} } {
global scontrol
set command "$scontrol show partition -o"
if {$partition_name ne ""} {
append command " $partition_name"
}
set output [run_command_output -fail -nolog "$command"]
# Iterate over each partition parameter line
foreach line [split $output "\n"] {
if {$line eq ""} { continue }
# Peel off the partition parameters one at a time
# The first quantifier sets the greediness for the whole RE
while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
# Remove the consumed parameter from the line
set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]
# Add it to the temporary node dictionary
dict set part_dict $param_name $param_value
}
set part_name [dict get $part_dict "PartitionName"]
# Add the node dictionary to nodes dictionary
dict set parts_dict $part_name $part_dict
# Clear the node dictionary for use by the next node
set part_dict {}
}
return $parts_dict
}
################################################################
#
# NAME
# get_partition_param - returns a specific parameter value for a specific partition
#
# SYNOPSIS
# get_partition_param partitoin_name parameter_name
#
# DESCRIPTION
# Returns a specific parameter value for a specified partition if the
# parameter exists for the partition, or MISSING if it does not exist.
# Specifying an invalid partition name will result in a failure.
#
################################################################
proc get_partition_param { partition_name parameter_name } {
set partitions_dict [get_partitions $partition_name]
if [dict exists $partitions_dict $partition_name $parameter_name] {
return [dict get $partitions_dict $partition_name $parameter_name]
} else {
return "MISSING"
}
}
################################################################
#
# NAME
# get_jobs - returns a dictionary of dictionaries of job parameters
#
# SYNOPSIS
# get_jobs ?job_id?
#
# DESCRIPTION
# Uses `scontrol show job` to return a dictionary of dictionaries of job
# parameters. Specifying an invalid job id will result in a failure.
#
################################################################
proc get_jobs { {job_id_in ""} } {
global scontrol
set command "$scontrol show job -d -o"
if {$job_id_in ne ""} {
append command " $job_id_in"
}
set output [run_command_output -fail "$command"]
# Iterate over each job's parameter list
foreach line [split $output "\n"] {
if {$line eq ""} { continue }
# Peel off the job parameters one at a time
# The first quantifier sets the greediness for the whole RE
while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
# Remove the consumed parameter from the line
set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]
# Add it to the temporary job dictionary
dict set job_dict $param_name $param_value
}
set job_id [dict get $job_dict "JobId"]
# Add the job dictionary to jobs dictionary
dict set jobs_dict $job_id $job_dict
# Clear the job dictionary for the next job
set job_dict {}
}
return $jobs_dict
}
################################################################
#
# NAME
# get_job_param - returns a specific parameter value for a specific job
#
# SYNOPSIS
# get_job_param job_id parameter_name
#
# DESCRIPTION
# Returns a specific parameter value for a specified job if the
# parameter exists for the job, or MISSING if it does not exist.
# Specifying an invalid job id will result in a failure.
#
################################################################
proc get_job_param { job_id parameter_name } {
set jobs_dict [get_jobs $job_id]
if [dict exists $jobs_dict $job_id $parameter_name] {
return [dict get $jobs_dict $job_id $parameter_name]
} else {
return "MISSING"
}
}
proc check_reason { job_id reason } {
global squeue
set found 0
spawn $squeue -j $job_id --noheader -o "%r"
expect {
-re "$reason" {
set found 1
exp_continue
}
timeout {
log_error "squeue not responding"
}
eof {
wait
}
}
if {$found == 0} {
log_error "Job $job_id should have a wait reason of $reason"
}
return $found
}
################################################################
#
# NAME
# submit_job - submits a job with sbatch and returns its job id
#
# SYNOPSIS
# submit_job ?options? job_args
#
# DESCRIPTION
# Submits a job with sbatch and returns its jobid, or 0 if error.
# It accepts all the options of run_command.
#
# OPTIONS
# It accepts and passes all the options of/to run_command and also:
# -env env
# Prepend $env to the actual sbatch command to set environment
# variables. For example "-env 'SLURM_NTASKS_PER_GPU=2'".
#
# ARGUMENTS
# job_args
# a string containing all the arguments to pass to sbatch
#
# RETURN VALUE
# the job id, or 0 if an error happen
#
################################################################
proc submit_job args {
global sbatch
set env ""
set job_id 0
set idx [lsearch $args -env]
if {$idx >= 0} {
set env [lindex $args [expr $idx+1]]
set args [lreplace $args $idx [expr $idx+1]]
}
if {[llength $args] < 1} {
fail "Wrong number of parameters, should be >=1"
}
set job_options [lindex $args [expr [llength $args] - 1 ]]
set run_options ""
if {[llength $args] > 1} {
set run_options [lrange $args 0 [expr [llength $args] - 2 ]]
}
set command ""
if {$env ne ""} {
set command "$env "
}
set command "$command$sbatch $job_options"
set output [run_command_output {*}$run_options "$command"]
regexp {Submitted \S+ job (\d+)} $output - job_id
return $job_id
}
###############################################################################
#
# NAME
# check_exclusive_gres - Given a job_id and node_name of an exclusive
# allocation make sure we allocated all GRES.
#
################################################################################
proc check_exclusive_gres { job_id node_name } {
# Check all consumable GRES of the node were allocated on the job
set gres_dict_job [count_gres [get_job_param $job_id "JOB_GRES"]]
set gres_dict_node [count_gres [get_node_param $node_name "Gres"] true]
dict for {gres_name gres_count} $gres_dict_node {
# We don't want mps to be allocated with --exclusive
if {$gres_name eq "mps" || $gres_name eq "shard"} {
if {[dict exists $gres_dict_job $gres_name]} {
subfail "Gres ($gres_name) on node ($node_name) allocated on job ($job_id) with --exclusive but it shouldn't be"
} else {
subpass "Gres ($gres_name) on node ($node_name) was not allocated on job ($job_id) with --exclusive, this is expected"
}
} elseif {![dict exists $gres_dict_job $gres_name]} {
subfail "Gres ($gres_name) on node ($node_name) not allocated on job ($job_id) with --exclusive"
} else {
subpass "Gres ($gres_name) on node ($node_name) was allocated on job ($job_id) with --exclusive"
set gres_count_job [dict get $gres_dict_job $gres_name]
subtest { $gres_count_job == $gres_count } "Gres ($gres_name) on node ($node_name) fully allocated on job ($job_id) with --exclusive" "$gres_count_job != $gres_count"
}
}
}
################################################################
#
# NAME
# compile_against_libslurm - compiles a test program against either libslurm.so or libslurmfull.so
#
# SYNOPSIS
# compile_against_libslurm ?options? test_prog ?build_args?
#
# DESCRIPTION
# Compile a test program against either libslurm.so or libslurmfull.so.
#
# OPTIONS
# -full
# use libslurmfull.so instead of libslurm.so
# -shared
# produces a shared library (adds the -shared compiler option
# and adds a .so suffix to the output file name)
#
# ARGUMENTS
# test_prog
# The name of the test program (and .c file)
# build_args
# Additional string to be appended to the build command.
# E.g. "-DUSING_VALGRIND -lm ${build_dir}/src/slurmctld/locks.o"
# (initial space will be added automatically).
#
################################################################################
proc compile_against_libslurm args {
global slurm_dir bin_cc src_dir build_dir bin_chmod
set use_full false
set shared false
set build_args ""
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-full {set use_full true; set args [lrange $args 1 end]}
-shared {set shared true; set args [lrange $args 1 end]}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count < 1} {
fail "Too few arguments ($argument_count): $args"
} else {
lassign $args test_prog
}
if {$argument_count == 2} { set build_args [lindex $args 1] }
if {$argument_count > 2} {
fail "Too many arguments ($argument_count): $args"
}
if {$use_full} {
set libfile "libslurmfull.so"
} else {
set libfile "libslurm.so"
}
if [file exists $slurm_dir/lib64/slurm/$libfile] {
set libdir "lib64"
} else {
set libdir "lib"
}
if {$use_full} {
set libline "$slurm_dir/$libdir/slurm"
set libfile "slurmfull"
} else {
set libline "$slurm_dir/$libdir"
set libfile "slurm"
}
set build_cmd "$bin_cc ${test_prog}.c -g -pthread"
if {$shared} {
set out "${test_prog}.so"
append build_cmd " -fPIC -shared"
} else {
set out "${test_prog}"
}
append build_cmd " -o $out"
append build_cmd " -I$src_dir -I$build_dir -I$slurm_dir/include -Wl,-rpath=$libline -L$libline -l$libfile -lresolv"
# Add additional arguments to the build command
if {$build_args != ""} {
append build_cmd " $build_args"
}
log_debug "Build command: $build_cmd"
catch {exec {*}$build_cmd} out_str out_dict
if {[dict get $out_dict -code]} {
log_error $out_str
return $::RETURN_ERROR
}
exec $bin_chmod 700 $out
return $::RETURN_SUCCESS
}
################################################################
#
# NAME
# subtest - tests a boolean condition and updates subtest tallies
#
# SYNOPSIS
# subtest ?options? condition description ?diagnostics?
#
# DESCRIPTION
# Based on the results of testing a boolean expression, increments the
# relevant subtest count (pass, fail or skip) and logs a message.
#
# OPTIONS
# -fatal
# If the subtest fails, causes a fatal error ending the test
#
# ARGUMENTS
# condition
# The boolean expression to test
# description
# A single-line string describing what is being tested. This is
# a subtest "name" that is displayed with the log message
# whether the subtest passes or fails
# diagnostics
# A string providing additional diagnostic information that is
# only included in the log message on failure
#
# RETURN VALUE
# the boolean condition evaluated
#
# ENVIRONMENT
# testsuite_subtest_fatal
# Specifies whether first failing subtest aborts the test
#
################################################################
proc subtest args {
set options [list]
while {[llength $args]} {
switch -glob -- [lindex $args 0] {
-fatal -
-fail {
lappend options -fatal
set args [lrange $args 1 end]
}
-* {fail "Unknown option: [lindex $args 0]"}
default break
}
}
set argument_count [llength $args]
if {$argument_count < 2} {
fail "Too few arguments ($argument_count): $args"
} else {
set args [lassign $args condition description]
}
if [uplevel 1 expr [format "{%s}" $condition]] {
subpass $description
return true
} else {
subfail {*}$options $description {*}$args
return false
}
}
################################################################
#
# NAME
# _is_testproc_included - returns if testproc_id was included or not excluded in argv
#
# SYNOPSIS
# _is_testproc_included testproc_id testproc_alias
#
# DESCRIPTION
# From command line the test runner can use -i and -e to include or
# exclude some test functions by their number. This function must be
# used to check if the test runner included or excluded the given
# testproc_id.
#
# RETURN VALUE
# true is testproc_id or testproc_alias was included with -i or not
# excluded -e, false otherwise
#
################################################################
proc _is_testproc_included {testproc_id testproc_alias} {
global _testproc_included _testproc_excluded
if {[llength $_testproc_included]} {
if {[lsearch $_testproc_included $testproc_id] >= 0 || \
[lsearch $_testproc_included $testproc_alias] >= 0} {
return true
}
return false
}
if {[llength $_testproc_excluded]} {
if {[lsearch $_testproc_excluded $testproc_id] >= 0 || \
[lsearch $_testproc_excluded $testproc_alias] >= 0} {
return false
}
}
return true
}
################################################################
#
# NAME
# skip_following_testprocs - the following testproc calls will be skipped
#
# SYNOPSIS
# skip_following_testprocs reason
#
# ARGUMENTS
# reason
# The string with the reason message to add on the skip message
# on each skipped testproc.
#
# DESCRIPTION
# This function disables normal execution of testproc calls.
# It is meant to be used when some testprocs cannot be run due config
# limitations, but still call testproc to register what testprocs
# are skipped for a given reason.
# Use run_following_testprocs to reenable the norma execution of testprocs.
################################################################
proc skip_following_testprocs {reason} {
global _testproc_skip_next _testproc_skip_reason
set _testproc_skip_next true
set _testproc_skip_reason $reason
}
################################################################
#
# NAME
# run_following_testprocs - the following testproc call will be run (if not excluded from command line)
#
# SYNOPSIS
# run_following_testprocs
#
# DESCRIPTION
# This function reenables the normal execution of testproc calls.
# It is meant to be used when skip_following_testprocs was called to skip
# previous testproc calls, and we want to normally run the following ones.
# Note that it does NOT overwrite what -i and -e included are passed
# from command line.
#
################################################################
proc run_following_testprocs {} {
global _testproc_skip_next _testproc_skip_reason
set _testproc_skip_next false
set _testproc_skip_reason ""
}
################################################################
#
# NAME
# testproc - launcher to run or skip a testproc_call
#
# SYNOPSIS
# testproc testproc_call
#
# ARGUMENTS
# A testproc_call is any normal call to a proc with any arguments that
# could be done normally without the testproc launcher.
# For example, we could normally do:
#
# test_my_feature $some_args $expected_out
#
# Or use the launcher like:
#
# testproc test_my_feature $some_args $expected_out
#
# DESCRIPTION
# Using the testproc launcher has the following main benefits:
# a) Handles the -i and -e terminal options to include or exclude some
# testprocs numbers.
# b) Runs or skips based on the last call of testproc_{skip,run}_following.
# c) Creates extra sections in the status summary
# (see testsuite_testproc_details).
#
# RETURN VALUE
# The rc of the testproc_call if it has been run, or $::RETURN_SUCCESS
# otherwise. Using it is not recommended, though.
#
################################################################
proc testproc args {
return [testproc_alias "" {*}$args]
}
################################################################
#
# NAME
# testproc_alias - launcher to run or skip a testproc_call, with an alias
#
# SYNOPSIS
# testproc alias testproc_call
#
# DESCRIPTION
# See testproc
#
################################################################
proc testproc_alias {alias args} {
global _testproc_pass_list _testproc_skip_list _testproc_fail_list
global _testproc_skip_next _testproc_skip_reason
global _testproc_messages testsuite_testproc_log_calls
global _subtest_pass_count
global _subtest_skip_count
global _subtest_fail_count
# Avoid integer alias to avoid confusions with testproc_id
if {[llength $alias] && [string is integer -strict $alias]} {
fail "testproc_alias doesn't support integer alias, use alphanumeric"
}
# Save previous subtest counts and next subtest num
set prev_pass $_subtest_pass_count
set prev_skip $_subtest_skip_count
set prev_fail $_subtest_fail_count
set prev_subtest [expr $_subtest_pass_count + \
$_subtest_skip_count + \
$_subtest_fail_count + 1]
# Get the testproc number
set testproc_id [expr [llength $_testproc_pass_list] + \
[llength $_testproc_skip_list] + \
[llength $_testproc_fail_list] + 1]
# Run or skip the testproc
set rc $::RETURN_SUCCESS
set reason ""
if {![_is_testproc_included $testproc_id $alias]} {
set reason "(Excluded from command line)"
if {$testsuite_testproc_log_calls == yes} {
subskip -nolog "Skipping testproc $testproc_id: {$args} $reason"
} else {
subskip -nolog "Skipping testproc $testproc_id: $reason"
}
} else {
if {$_testproc_skip_next} {
set reason "($_testproc_skip_reason)"
if {$testsuite_testproc_log_calls == yes} {
subskip "Skipping testproc $testproc_id: {$args} $reason"
} else {
subskip "Skipping testproc $testproc_id: $reason"
}
} else {
if {$testsuite_testproc_log_calls != no} {
log_info "Running testproc $testproc_id: $args"
} else {
log_info "Running testproc $testproc_id"
}
set rc [{*}$args]
}
}
# Get current subtest counts
set curr_subtest [expr $_subtest_pass_count + \
$_subtest_skip_count + \
$_subtest_fail_count]
# Register the testproc as fail, skip or pass (based on subtests)
if {[llength $alias]} {
set alias "${alias}: "
}
if {$_subtest_fail_count > $prev_fail} {
set reason "(Subtests: $prev_subtest to $curr_subtest)"
lappend _testproc_fail_list $testproc_id
if {$testsuite_testproc_log_calls != no} {
dict set _testproc_messages $testproc_id [list failed "$alias{$args} $reason"]
} else {
dict set _testproc_messages $testproc_id [list failed "$alias$reason"]
}
} elseif {$_subtest_skip_count > $prev_skip} {
lappend _testproc_skip_list $testproc_id
if {$testsuite_testproc_log_calls == yes} {
dict set _testproc_messages $testproc_id [list skipped "$alias{$args} $reason"]
} else {
dict set _testproc_messages $testproc_id [list skipped "$alias$reason"]
}
} else {
set reason "(Subtests: $prev_subtest to $curr_subtest)"
lappend _testproc_pass_list $testproc_id
if {$testsuite_testproc_log_calls == yes} {
dict set _testproc_messages $testproc_id [list passed "$alias{$args} $reason"]
} else {
dict set _testproc_messages $testproc_id [list passed "$alias$reason"]
}
}
return $rc
}
################################################################
#
# NAME
# _log_format - prints a log message with colorization and formatting
#
# SYNOPSIS
# _log_format log_level message
#
# DESCRIPTION
# This procedure is called by the log_<level> procedures and
# derives the relevant log level from the caller's procedure name.
#
# ARGUMENTS
# log_level
# The logging threshold that triggered the log statement
# message
# The message to print with colorization and formatting
#
# ENVIRONMENT
# testsuite_log_format
# Used as the template for the fields to be output.
# Fields must be expressed in the form:
# %{<field_name>}<format_conversion_specifier>
# Supported fields include the following:
# message
# The log message
# filename
# The file name where the log_<log_level>
# procedure was called from
# lineno
# The line number where the log_<log_level>
# procedure was called from
# timestamp
# The date and time when the log_<log_level>
# procedure was called at
# msecs
# The milliseconds when the log_<log_level>
# procedure was called at
# loglevel
# The log level that triggers the log_<log_level>
# procedure to be called
# backtrace
# An abbreviated call stack trace with line
# numbers
# testsuite_time_format
# Used as a template for the timestamp. See the format groups
# for the tcl clock format command.
# testsuite_colorize
# Boolean that turns colorization on or off
# testsuite_color_<log_level>
# Can be set to define the color used for each log level
#
################################################################
proc _log_format { log_level message } {
global testsuite_colorize testsuite_log_format testsuite_time_format
global COLOR_NONE
global testsuite_color_fatal testsuite_color_error testsuite_color_warn
global testsuite_color_info testsuite_color_pass testsuite_color_command
global testsuite_color_debug testsuite_color_trace
set format_string $testsuite_log_format
set milliseconds_since_epoch [clock milliseconds]
set date_time [clock format [expr {$milliseconds_since_epoch / 1000}] -format "$testsuite_time_format"]
set milliseconds [expr {$milliseconds_since_epoch % 1000}]
set frame_level -2
while { [dict get [info frame $frame_level] type] != "source" } {
incr frame_level -1
}
set format_args {}
while {[regexp "%{\[a-z]+}" $format_string format_field]} {
if {$format_field eq "%{message}"} {
lappend format_args $message
} elseif {$format_field eq "%{filename}"} {
lappend format_args [file tail [dict get [info frame $frame_level] file]]
} elseif {$format_field eq "%{lineno}"} {
lappend format_args [dict get [info frame $frame_level] line]
} elseif {$format_field eq "%{timestamp}"} {
lappend format_args $date_time
} elseif {$format_field eq "%{msecs}" || $format_field eq "%{milliseconds}"} {
lappend format_args $milliseconds
} elseif {$format_field eq "%{loglevel}" || $format_field eq "%{levelname}"} {
lappend format_args [string totitle $log_level]
} elseif {$format_field eq "%{backtrace}"} {
lappend format_args [_line_trace]
} else {
fail "Invalid field ($format_field) specified in testsuite_log_format"
}
regsub $format_field $format_string "%" format_string
}
if ($testsuite_colorize) {
switch $log_level {
fatal { append output $testsuite_color_fatal }
error { append output $testsuite_color_error }
warning { append output $testsuite_color_warn }
info { append output $testsuite_color_info }
pass { append output $testsuite_color_pass }
command { append output $testsuite_color_command }
debug { append output $testsuite_color_debug }
trace { append output $testsuite_color_trace }
}
}
append output [format $format_string {*}$format_args]
if ($testsuite_colorize) {
append output $COLOR_NONE
}
puts $output
}
################################################################
#
# NAME
# _print_header - prints a test header
#
# SYNOPSIS
# _print_header
#
# ENVIRONMENT
# testsuite_colorize
# Boolean that turns colorization on or off
# testsuite_color_header
# Can be set to define the color used for the header
#
################################################################
proc _print_header { } {
global test_name testsuite_color_header testsuite_colorize COLOR_NONE
if ($testsuite_colorize) {
append output $testsuite_color_header
}
append output [string repeat = 78]\n
append output [format "%-9s" "TEST:"]${test_name}\n
append output [string repeat = 78]
if ($testsuite_colorize) {
append output $COLOR_NONE
}
puts $output
}
################################################################
#
# NAME
# _print_summary - prints the final status summary
#
# SYNOPSIS
# _print_summary status completed
#
# ARGUMENTS
# status
# The final status of the test.
# When status is zero, we print SUCCESS.
# When status is negative, we print SKIPPED.
# When status is positive, we print FAILURE.
#
# completed
# A boolean value that is true if the test completed and false
# if aborted (ended early with exit status != 0)
#
# ENVIRONMENT
# testsuite_colorize
# Boolean that turns colorization on or off
# testsuite_color_<test_status>
# Can be set to define the color used for each test status
#
################################################################
proc _print_summary {status completed} {
global test_name testsuite_colorize COLOR_NONE
global testsuite_color_success testsuite_color_skipped
global testsuite_color_failure
global _subtest_fail_count _subtest_pass_count _subtest_skip_count
global _testproc_pass_list _testproc_skip_list _testproc_fail_list
global _incomplete_reason
global _subtest_messages _testproc_messages
global testsuite_subtest_details testsuite_testproc_details
if {$status == 0} {
set color $testsuite_color_success
set header "SUCCESS"
} elseif {$status < 0} {
set color $testsuite_color_skipped
set header "SKIPPED"
} elseif {$status > 0} {
set color $testsuite_color_failure;
set header "FAILURE"
}
if ($testsuite_colorize) {
append output $color
}
append output [string repeat = 78]\n
# Get subtest and testproc counts
set testproc_fail [llength $_testproc_fail_list]
set testproc_skip [llength $_testproc_skip_list]
set testproc_pass [llength $_testproc_pass_list]
set testproc_count [expr $testproc_pass + \
$testproc_skip + \
$testproc_fail]
set subtest_count [expr $_subtest_fail_count + \
$_subtest_pass_count + \
$_subtest_skip_count]
# Initial summary
append output [format "%s : %s\n" $header $test_name]
if {$testproc_count > 0} {
append output [format " Testprocs failed : %3d (%3d%%)%s\n" $testproc_fail [expr $testproc_fail * 100 / $testproc_count] \
[expr {$testproc_fail ? " List: [join $_testproc_fail_list ,]" : ""}]]
append output [format " Testprocs skipped : %3d (%3d%%)%s\n" $testproc_skip [expr $testproc_skip * 100 / $testproc_count] \
[expr {$testproc_skip ? " List: [join $_testproc_skip_list ,]" : ""}]]
append output [format " Testprocs passed : %3d (%3d%%)\n" $testproc_pass [expr $testproc_pass * 100 / $testproc_count]]
append output [format " Testprocs total : %3d %s\n" $testproc_count [expr {$completed ? "COMPLETE" : "INCOMPLETE: $_incomplete_reason"}]]
}
if {$subtest_count > 0} {
if {$testproc_count > 0} {
append output \n
}
append output [format " Subtests failed : %3d (%3d%%)\n" $_subtest_fail_count [expr $_subtest_fail_count * 100 / $subtest_count]]
append output [format " Subtests skipped : %3d (%3d%%)\n" $_subtest_skip_count [expr $_subtest_skip_count * 100 / $subtest_count]]
append output [format " Subtests passed : %3d (%3d%%)\n" $_subtest_pass_count [expr $_subtest_pass_count * 100 / $subtest_count]]
append output [format " Subtests total : %3d %s\n" $subtest_count [expr {$completed ? "COMPLETE" : "INCOMPLETE: $_incomplete_reason"}]]
}
append output [string repeat = 78]\n
# Detailed information
if {$testsuite_subtest_details ne "none"} {
set show_details false
if {$testsuite_subtest_details eq "all" && $subtest_count > 0} {
set show_details true
} elseif {$testsuite_subtest_details eq "fail_skip" && $_subtest_skip_count > 0} {
set show_details true
} elseif {$_subtest_fail_count > 0} {
set show_details true
}
if {$show_details} {
append output [format "SUBTESTS DETAILS : %s\n" $test_name]
dict for {id result_msg} $_subtest_messages {
set result [lindex $result_msg 0]
if {$testsuite_subtest_details eq "all"} {
append output [format " %s\n" [lindex $result_msg 1]]
} elseif {$testsuite_subtest_details eq "fail_skip" && $result eq "skip"} {
append output [format " %s\n" [lindex $result_msg 1]]
} elseif {$result eq "fail"} {
append output [format " %s\n" [lindex $result_msg 1]]
}
}
append output [string repeat = 78]\n
}
}
if {$testsuite_testproc_details ne "none"} {
set show_details false
if {$testsuite_testproc_details eq "all" && $testproc_count > 0} {
set show_details true
} elseif {$testsuite_testproc_details eq "fail_skip" && $testproc_skip > 0} {
set show_details true
} elseif {$testproc_fail > 0} {
set show_details true
}
if {$show_details} {
append output [format "TESTPROCS DETAILS : %s\n" $test_name]
dict for {num result_msg} $_testproc_messages {
set result [lindex $result_msg 0]
if {$testsuite_testproc_details eq "all"} {
append output [format " Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]]
} elseif {$testsuite_testproc_details eq "fail_skip" && $result eq "skipped"} {
append output [format " Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]]
} elseif {$result eq "failed"} {
append output [format " Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]]
}
}
append output [string repeat = 78]\n
}
}
if ($testsuite_colorize) {
append output $COLOR_NONE
}
puts -nonewline $output
}
################################################################
#
# NAME
# _get_test_name - gets the name of the invoking source script
#
# SYNOPSIS
# _get_test_name
#
# RETURN VALUE
# The name of the originally called script
#
################################################################
proc _get_test_name { } {
set test_name unknown
set frame_level 1
while { $frame_level <= [info frame] } {
if { [dict get [info frame $frame_level] type] == "source" } {
set test_name [file tail [dict get [info frame $frame_level] file]]
break
}
incr frame_level
}
return $test_name
}
################################################################
#
# NAME
# _test_cleanup - performs the test cleanup
#
# SYNOPSIS
# _test_cleanup
#
# DESCRIPTION
# This procedure removes the temporary test_dir and calls the
# test-defined cleanup procedure.
#
# NOTES
# This function should be called only from _test_init and _test_fini.
#
################################################################
proc _test_cleanup {} {
global log_warn test_dir
set rc 0
# Call global cleanup procedure if it is defined by the test
if {[info procs cleanup] eq "cleanup"} {
if {[catch {cleanup} cleanup_error ]} {
log_error "Cleanup had errors: $cleanup_error"
set rc 1
}
}
# Remove the temporary test directory
exec rm -rf $test_dir
return $rc
}
################################################################
#
# NAME
# _test_init - performs test initialization
#
# SYNOPSIS
# _test_init
#
# DESCRIPTION
# This procedure is called automatically at the beginning of each test.
# It prints the header, creates the temporary test dir, etc.
#
################################################################
proc _test_init {} {
global test_dir test_id test_name testsuite_shared_dir
global _testproc_included _testproc_excluded argv bin_chmod scontrol
# parse argv to get and remove _testproc_included and _testproc_excluded params
set idx [expr {[info exists argv] ? [lsearch $argv -i] : -1}]
if {$idx >= 0} {
set _testproc_included [split [lindex $argv [expr $idx + 1]] ,]
set argv [lreplace $argv $idx [expr $idx + 1]]
}
set idx [expr {[info exists argv] ? [lsearch $argv -e] : -1}]
if {$idx >= 0} {
set _testproc_excluded [split [lindex $argv [expr $idx + 1]] ,]
set argv [lreplace $argv $idx [expr $idx + 1]]
}
# Set test name to name of originally invoked test script, e.g. test1.1
set test_name [_get_test_name]
# Set test id to suffix of the test script, e.g. 1.1
set test_id [string map {test ""} $test_name]
# Temporary test directory used to stash saved configs, output files...
set test_dir "$testsuite_shared_dir/${test_name}dir"
# Print test header
_print_header
# Cleanup in case test was not cleaned up on last execution
if { [_test_cleanup] } {
fail "Error in the initial cleanup"
}
# Create temporary shared test directory
exec mkdir -p $test_dir
exec $bin_chmod a+rwx $test_dir
# Default precondition checks
if {![regexp "is UP" [run_command_output -nolog -none "$scontrol ping"]]} {
skip "The expect testsuite requires Slurm to be running, but '$scontrol ping' failed."
}
set idle_nodes [get_nodes_by_state]
if {[llength $idle_nodes] < 1} {
skip "The expect testsuite requires there to be at least one idle node in the default partition."
}
# Ensure that idle nodes are not drained/down
# Checking one by one to be able to print a warning if some was drain/down
foreach idle_node $idle_nodes {
run_command -nolog -xfail "$scontrol update nodename=$idle_node state=resume"
}
}
################################################################
#
# NAME
# _test_fini - performs test finalization
#
# SYNOPSIS
# _test_fini ?status?
#
# DESCRIPTION
# This procedure is called automatically from the ending functions
# pass, skip and failure.
# It cleans up based on the status and the testsuite_cleanup_on_failure
# variable, prints the final test status/summary, and exits the test.
#
# We will always cleanup for SUCCESS or SKIPPED tests.
# Whether or not the cleanup procedure is called for FAILURE tests
# depends on the setting of the $testsuite_cleanup_on_failure variable
# which can be set in the globals.local file or overridden with the
# SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment variable.
#
################################################################
proc _test_fini { status } {
global testsuite_cleanup_on_failure _test_fini_called
global _subtest_fail_count _subtest_skip_count
global STATUS_FAIL STATUS_PASS STATUS_SKIP test_status
# Avoid potential infinite recursive calls.
# _test_fini should be called only once, but custom cleanup procs
# called from _test_cleanup can potentially call it (eg fail)
if {$_test_fini_called} {
log_debug "Recursive _test_fini call detected, most probably a fail on a cleanup function"
return
}
set _test_fini_called true
# Determine if test completed or was aborted
set completed [expr $status == $STATUS_PASS ? true : false]
# Override status with subtest status if available and necessary
if {$status != $STATUS_FAIL} {
if {$_subtest_fail_count > 0} {
set status $STATUS_FAIL
} elseif {$_subtest_skip_count > 0} {
set status $STATUS_SKIP
}
}
# Set final test status global variable so it can be used in cleanup
set test_status $status
# Only cleanup if test not failed or configured to do so
if {$status != $STATUS_FAIL || $testsuite_cleanup_on_failure} {
_test_cleanup
}
_print_summary $status $completed
__exit $status
}
################################################################
#
# Overload the exit routine to ensure that no one is explicitly
# calling it, and to enforce _test_fini if exit is called when
# the test reach its EOF.
#
# All tests should exit using pass, skip or fail.
#
################################################################
rename exit __exit
proc exit { {status 0} } {
global test_name
# To avoid failures when using "expect -c 'source globals'"
if {$test_name eq "globals"} {
pass
}
#
# Minor sanity check to detect if exit was explicitly called (not
# allowed) or automatically executed when the test ends
#
if {[info level] > 1} {
# exit was called from a function, and it shouldn't
fail "Exit should not be directly called, use pass, skip or fail instead"
} else {
set frame_level 1
while { $frame_level <= [info frame] } {
if { [dict get [info frame $frame_level] type] == "source" } {
if { [file tail [dict get [info frame $frame_level] file]] eq $test_name } {
# exit was called explicitly from the
# test, and it shouldn't
fail "Exit should not be directly called, use pass, skip or fail instead"
}
break
}
incr frame_level
}
}
if {$status != 0} {
fail "An Expect/TCL exception occurred"
}
# The exit was called implicitly when the test ends, allowed but
# _test_fini call enforced
pass
}
# Call _test_init at the beginning of each test
_test_init