| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Establish global state information for Slurm test suite |
| # |
| # To define site-specific state information, set the values in a file |
| # named 'globals.local'. Those values will override any specified here. |
| # for example: |
| # |
| # $ cat globals.local |
| # set slurm_dir "/usr/local" |
| # set build_dir "/home/mine/SLURM/build_smd" |
| # set src_dir "/home/mine/SLURM/slurm.git" |
| # set mpicc "/usr/local/bin/mpicc" |
| # |
| # If you want to have more than one test going at the same time for multiple |
| # installs you can have multiple globals.local files and set the |
| # SLURM_LOCAL_GLOBALS_FILE env var, and have that set to the correct |
| # globals.local file for your various installs. The file can be named anything, |
| # not just globals.local. |
| # |
| ############################################################################ |
| # Copyright (C) 2002-2007 The Regents of the University of California. |
| # Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| # Copyright (C) SchedMD LLC. |
| # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| # Written by Morris Jette <jette1@llnl.gov> |
| # Additions by Joseph Donaghy <donaghy1@llnl.gov> |
| # CODE-OCEC-09-009. All rights reserved. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the supplied file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| |
| # Avoid sourcing this file multiple times |
| if {[info procs exit] eq "exit"} { |
| return |
| } |
| |
| # Read in unified testsuite properties from testsuite/testsuite.conf |
| set unified_testsuite_dir [file dirname [file normalize "[info script]/.."]] |
| set unified_testsuite_conf_file "$unified_testsuite_dir/testsuite.conf" |
| if {[info exists env(SLURM_TESTSUITE_CONF)]} { |
| set unified_testsuite_conf_file $env(SLURM_TESTSUITE_CONF) |
| } |
| set unified_testsuite_config_dict [dict create] |
| if [file readable "$unified_testsuite_conf_file"] { |
| set fp [open "$unified_testsuite_conf_file" r] |
| set file_data [read $fp] |
| close $fp |
| foreach line [split $file_data "\n"] { |
| if [regexp {^\s*(\S+)\s*=\s*(\S+)\s*$} $line {} parameter_name parameter_value] { |
| dict set unified_testsuite_config_dict [string tolower $parameter_name] $parameter_value |
| } |
| } |
| } else { |
| puts stderr "Warning: Unable to open the unified testsuite configuration file ($unified_testsuite_conf_file) for reading. This file can be created from a copy of the autogenerated sample found in BUILDDIR/testsuite/testsuite.conf.sample. By default, this file is expected to be found in SRCDIR/testsuite ($unified_testsuite_dir). If placed elsewhere, set the SLURM_TESTSUITE_CONF environment variable to the full path of your testsuite.conf file." |
| flush stderr |
| } |
| if [dict exists $unified_testsuite_config_dict "slurmsourcedir"] { |
| set src_dir [dict get $unified_testsuite_config_dict "slurmsourcedir"] |
| } |
| if [dict exists $unified_testsuite_config_dict "slurmbuilddir"] { |
| set build_dir [dict get $unified_testsuite_config_dict "slurmbuilddir"] |
| } |
| if [dict exists $unified_testsuite_config_dict "slurminstalldir"] { |
| set slurm_dir [dict get $unified_testsuite_config_dict "slurminstalldir"] |
| } |
| |
| global sacctmgr sacct salloc sattach sbatch sbcast scancel scontrol sinfo |
| global smd squeue sreport srun sstat strigger |
| |
| ################################################################ |
| # |
| # NAME |
| # cset - conditional set |
| # |
| # SYNOPSIS |
| # cset name value |
| # |
| # DESCRIPTION |
| # Conditional set. Only set variable if variable does not yet exist. |
| # |
| # Input: name -- name of the variable to set |
| # value -- value to set to 'name' |
| # |
| ################################################################ |
| |
| proc cset {name value} { |
| if {![uplevel 1 info exists $name]} { |
| upvar $name tmp |
| set tmp $value |
| } |
| } |
| |
| |
| # |
| # Defining colors here to be able to use them in globals.local. |
| # By default, these colors are bold |
| # |
| set COLOR_RED "\033\[1;31m" |
| set COLOR_RED_NORMAL "\033\[31m" |
| set COLOR_ORANGE "\033\[1;38;5;208m" |
| set COLOR_YELLOW "\033\[1;33m" |
| set COLOR_GREEN "\033\[1;32m" |
| set COLOR_BLUE "\033\[1;34m" |
| set COLOR_MAGENTA "\033\[1;35m" |
| set COLOR_CYAN "\033\[1;36m" |
| set COLOR_NONE "\033\[0m" |
| |
| cset local_globals_file "./globals.local" |
| |
| # Log level "enum" |
| # Define log levels here so they are available in globals.local |
| set LOG_LEVEL_QUIET 0 |
| set LOG_LEVEL_FATAL 1 |
| set LOG_LEVEL_ERROR 2 |
| set LOG_LEVEL_WARNING 3 |
| set LOG_LEVEL_INFO 4 |
| set LOG_LEVEL_PASS 4 |
| set LOG_LEVEL_COMMAND 4 |
| set LOG_LEVEL_DEBUG 5 |
| set LOG_LEVEL_TRACE 6 |
| |
| if {[info exists env(SLURM_LOCAL_GLOBALS_FILE)]} { |
| set local_globals_file $env(SLURM_LOCAL_GLOBALS_FILE) |
| } |
| |
| if [file exists $local_globals_file] { |
| source $local_globals_file |
| } |
| |
| # |
| # Specify the slurm install directory. |
| # Used to locate binaries, libraries, and header files. |
| # |
| cset slurm_dir "/usr" |
| cset build_dir "../../" |
| cset src_dir "../../" |
| cset config_h "${build_dir}/config.h" |
| cset sacctmgr "${slurm_dir}/bin/sacctmgr" |
| cset sacct "${slurm_dir}/bin/sacct" |
| cset salloc "${slurm_dir}/bin/salloc" |
| cset sattach "${slurm_dir}/bin/sattach" |
| cset sbatch "${slurm_dir}/bin/sbatch" |
| cset sbcast "${slurm_dir}/bin/sbcast" |
| cset scancel "${slurm_dir}/bin/scancel" |
| cset scontrol "${slurm_dir}/bin/scontrol" |
| cset sdiag "${slurm_dir}/bin/sdiag" |
| cset sgather "${slurm_dir}/bin/sgather" |
| cset sh5util "${slurm_dir}/bin/sh5util" |
| cset sinfo "${slurm_dir}/bin/sinfo" |
| cset smd "${slurm_dir}/bin/smd" |
| cset sprio "${slurm_dir}/bin/sprio" |
| cset squeue "${slurm_dir}/bin/squeue" |
| cset srun "${slurm_dir}/bin/srun" |
| cset sreport "${slurm_dir}/bin/sreport" |
| cset sshare "${slurm_dir}/bin/sshare" |
| cset sstat "${slurm_dir}/bin/sstat" |
| cset strigger "${slurm_dir}/bin/strigger" |
| |
| cset slurmd "${slurm_dir}/sbin/slurmd" |
| cset slurmrestd "${slurm_dir}/sbin/slurmrestd" |
| |
| cset pbsnodes "${slurm_dir}/bin/pbsnodes" |
| cset qdel "${slurm_dir}/bin/qdel" |
| cset qstat "${slurm_dir}/bin/qstat" |
| cset qsub "${slurm_dir}/bin/qsub" |
| cset qalter "${slurm_dir}/bin/qalter" |
| cset qrerun "${slurm_dir}/bin/qrerun" |
| |
| cset seff "${slurm_dir}/bin/seff" |
| |
| cset lsid "${slurm_dir}/bin/lsid" |
| cset bjobs "${slurm_dir}/bin/bjobs" |
| cset bkill "${slurm_dir}/bin/bkill" |
| cset bsub "${slurm_dir}/bin/bsub" |
| |
| cset influx "/usr/bin/influx" |
| # If using MPICH-2 or other version of MPI requiring pmi libary, use this |
| #cset mpicc "/home/jette/mpich2-install/bin/mpicc" |
| #cset use_pmi 1 |
| # OR for other versions of MPICH, use this |
| cset mpicc "/usr/local/bin/mpicc" |
| cset nvcc "/usr/bin/nvcc" |
| cset use_pmi 0 |
| #cset upcc "/usr/local/bin/upcc" |
| cset upcc "/usr/bin/xlupc" |
| cset oshcc "/usr/local/bin/oshcc" |
| |
| cset mpirun "mpirun" |
| cset totalviewcli "/usr/local/bin/totalviewcli" |
| |
| # Set if using "--enable-memory-leak-debug" configuration option |
| cset enable_memory_leak_debug 0 |
| |
| # test_prompt: to be used as prompt for interactive shells (see proc reset_bash_prompt) |
| set test_prompt "TEST_PROMPT: " |
| |
| # |
| # Specify locations of other executable files used |
| # Only the shell names (e.g. bin_bash) must be full pathnames |
| # |
| cset bin_awk "awk" |
| cset bin_bash [exec which bash | tail -n 1] |
| cset bin_cat "cat" |
| cset bin_cc "gcc" |
| cset bin_chmod "chmod" |
| cset bin_cmp "cmp" |
| cset bin_cp "cp" |
| cset bin_date "date" |
| cset bin_diff "diff" |
| cset bin_echo "echo" |
| cset bin_env "env" |
| cset bin_file "file" |
| cset bin_id "id" |
| cset bin_ip "ip" |
| cset bin_jq "jq" |
| cset bin_grep "grep" |
| cset bin_head "head" |
| cset bin_ln "ln" |
| cset bin_perldoc "/usr/bin/perldoc" |
| cset bin_py3 "python3" |
| cset bin_oasgen "openapi-generator-cli" |
| |
| # Don't user $bin_hostname, use $bin_printenv SLURMD_NODENAME |
| cset bin_hostname "hostname" |
| |
| cset bin_kill "kill" |
| cset bin_lscpu "lscpu" |
| cset bin_make "make" |
| cset bin_mv "mv" |
| cset bin_od "od" |
| cset bin_pkill "pkill" |
| cset bin_printenv "printenv" |
| cset bin_ps "ps" |
| cset bin_pwd "pwd" |
| cset bin_rm "rm" |
| cset bin_sed "sed" |
| cset bin_sleep "sleep" |
| cset bin_sort "sort" |
| cset bin_socat "socat" |
| cset bin_sum "sum" |
| cset bin_sudo "sudo" |
| cset bin_systemd_detect_virt "systemd-detect-virt" |
| cset bin_touch "touch" |
| cset bin_true "true" |
| cset bin_uname "uname" |
| cset bin_uniq "uniq" |
| cset bin_unshare "unshare" |
| cset bin_virtualenv "virtualenv" |
| cset bin_wc "wc" |
| |
| # |
| # Let the commands complete without expect timing out waiting for a |
| # response. Single node jobs submitted to the default partition should |
| # be initiated within this number of seconds. |
| # for interactive slurm jobs: cset timeout $max_job_delay |
| # |
| cset max_job_delay 120 |
| |
| # |
| # Specify the maximum number of tasks to use in the stress tests. |
| # |
| cset max_stress_tasks 4 |
| |
| # |
| # The error message that the "sleep" command prints when we run "sleep aaa". |
| # |
| cset sleep_error_message "(invalid time interval)|(bad character in argument)|(usage: sleep seconds)" |
| |
| # Force LANG, as the expect tests aren't localized |
| set ::env(LANG) "en_US.UTF-8" |
| |
| # Testsuite level variables |
| cset testsuite_dir "[file dirname [file normalize [info script]]]" |
| cset testsuite_shared_dir "[$bin_pwd]" |
| |
| # Testsuite non-privileged user (set it in globals.local) |
| cset testsuite_user "" |
| |
| # Testsuite log variables |
| cset testsuite_log_level $LOG_LEVEL_DEBUG |
| cset testsuite_log_format "\[%{timestamp}s.%{msecs}03d] %{loglevel}-7s %{message}s \(%{backtrace}s)" |
| cset testsuite_time_format "%Y-%m-%d %H:%M:%S" |
| |
| # Default to using color if writing to a terminal and not if writing to a file |
| cset testsuite_colorize [dict exists [fconfigure stdout] -mode] |
| cset testsuite_color_fatal $COLOR_RED |
| cset testsuite_color_error $COLOR_RED_NORMAL |
| cset testsuite_color_warn $COLOR_ORANGE |
| cset testsuite_color_info $COLOR_YELLOW |
| cset testsuite_color_pass $COLOR_GREEN |
| cset testsuite_color_command $COLOR_CYAN |
| cset testsuite_color_debug $COLOR_BLUE |
| cset testsuite_color_trace $COLOR_MAGENTA |
| cset testsuite_color_header $COLOR_NONE |
| cset testsuite_color_success $COLOR_GREEN |
| cset testsuite_color_failure $COLOR_RED |
| cset testsuite_color_skipped $COLOR_ORANGE |
| |
| # Set to true to cause the first subtest failure to immediately end the test |
| cset testsuite_subtest_fatal false |
| # Set to all, fail_skip, fail or none print datails of subtests and testprocs |
| cset testsuite_subtest_details fail |
| cset testsuite_testproc_details fail |
| cset testsuite_testproc_log_calls ignore_skips |
| |
| # To automatically call cleanup or not when ending the test |
| cset testsuite_cleanup_on_failure true |
| if {[info exists env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE)]} { |
| set testsuite_cleanup_on_failure $env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE) |
| } |
| |
| # To avoid potential infinite loops due calls to fail/pass/skip inside |
| # custom cleanup procs (_test_fini should be called only once) |
| set _test_fini_called false |
| |
| # Testproc internal variables |
| set _testproc_included [list] |
| set _testproc_excluded [list] |
| set _testproc_pass_list [list] |
| set _testproc_skip_list [list] |
| set _testproc_fail_list [list] |
| set _testproc_messages [dict create] |
| set _testproc_skip_next false |
| set _testproc_skip_reason "" |
| set _incomplete_reason "" |
| set _subtest_messages [dict create] |
| |
| set _subtest_pass_count 0 |
| set _subtest_skip_count 0 |
| set _subtest_fail_count 0 |
| |
| set STATUS_PASS 0 |
| set STATUS_FAIL 1 |
| set STATUS_SKIP -1 |
| |
| # Final test status |
| set test_status $STATUS_PASS |
| |
| # Other common variables |
| set re_word_str "\\S+" |
| set digit "\\d" |
| set eol "\r?\n" |
| set float "\\d+\\.?\\d*" |
| set number "\\d+" |
| set format_time "\\d+\\:\\d+\\:\\d+" |
| set number_with_suffix "\\d+\[KM\]*" |
| set slash "/" |
| set whitespace "\\s+" |
| set controlmachine_regex "\\S+" |
| # Any characters except ( , : newline |
| set no_delim "\[^(,:\r\n\]" |
| set no_delim_slash "\[^(,:/\r\n\]" |
| # The first group matches GRES name |
| # The second **optional** group matches GRES type. |
| # The third group matches GRES count. |
| # Test out the regex here: https://regex101.com/r/FlNYKM/7 |
| set gres_regex "($no_delim_slash*):($no_delim*)?:?($no_delim*)?:?($no_delim*)" |
| |
| #basic #defines in slurm.h |
| set NO_VAL 0xfffffffe |
| set INFINITE 0xffffffff |
| set SLURM_MAX_NORMAL_STEP_ID 0xfffffff0 |
| set SLURM_EXTERN_CONT 0xfffffffc |
| set SLURM_BATCH_SCRIPT 0xfffffffb |
| |
| |
| # |
| # Global variable used in multiple functions in "globals" file |
| # |
| set gpu_sock_list {} |
| |
| # |
| # Procedure return values |
| # |
| set RETURN_SUCCESS 0 |
| set RETURN_ERROR 1 |
| set RETURN_TIMEOUT 110 ; # ETIMEDOUT |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # fail - fails a test |
| # |
| # SYNOPSIS |
| # fail message |
| # |
| # DESCRIPTION |
| # To be used when an error is fatal for the test. This routine |
| # prints the specified error message, optionally cleans up, prints |
| # a final test failure message, and exits the test with exit code 1. |
| # |
| # ENVIRONMENT |
| # Whether or not the cleanup procedure is called depends on the setting |
| # of the $testsuite_cleanup_on_failure set in the globals.local file or |
| # overridden with the SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment |
| # variable. |
| # |
| # NOTE |
| # DO NOT call this within your local cleanup procedure. |
| # |
| ################################################################ |
| |
| proc fail { message } { |
| global _incomplete_reason STATUS_FAIL |
| |
| # Avoid recursive calls from within cleanup |
| if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} { |
| log_error "Local cleanup shouldn't call pass, fail or skip" |
| return |
| } |
| |
| log_fatal $message |
| set _incomplete_reason $message |
| |
| # _test_fini will handle cleanup and print the failure message. |
| _test_fini $STATUS_FAIL |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # skip - skips a test |
| # |
| # SYNOPSIS |
| # skip message |
| # |
| # DESCRIPTION |
| # To be used when a precondition for the test fails and the test |
| # should be skipped. This routine prints the specified warning message, |
| # calls the cleanup procedure if defined, prints a final test skipped |
| # message, and exits the test with exit code -1 (aka 255). |
| # |
| # NOTE |
| # DO NOT call this within your local cleanup procedure. |
| # |
| ################################################################ |
| |
| proc skip { message } { |
| global _incomplete_reason STATUS_SKIP |
| |
| # Avoid recursive calls from within cleanup |
| if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} { |
| log_error "Local cleanup shouldn't call pass, fail or skip" |
| return |
| } |
| |
| log_warn $message |
| set _incomplete_reason $message |
| |
| # _test_fini will handle cleanup and print the skipped message. |
| _test_fini $STATUS_SKIP |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # pass - passes a test |
| # |
| # SYNOPSIS |
| # pass |
| # |
| # DESCRIPTION |
| # To be used when a test passes and should complete with success. |
| # This routine calls the cleanup procedure if defined, prints a final |
| # test success message, and exits with exit code 0. |
| # |
| # NOTE |
| # DO NOT call this within your local cleanup procedure. |
| # |
| ################################################################ |
| |
| proc pass { } { |
| global STATUS_PASS |
| |
| # Avoid recursive calls from within cleanup |
| if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} { |
| log_error "Local cleanup shouldn't call pass, fail or skip" |
| return |
| } |
| |
| # _test_fini will handle cleanup and print the success message. |
| _test_fini $STATUS_PASS |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # subpass - registers a passing subtest result |
| # |
| # SYNOPSIS |
| # subpass ?description? |
| # |
| # DESCRIPTION |
| # Increments the subtest pass count and logs a passing subtest message |
| # |
| # ARGUMENTS |
| # description |
| # A single-line string describing the subtest being verified |
| # |
| ################################################################ |
| |
| proc subpass args { |
| global _subtest_fail_count _subtest_pass_count _subtest_skip_count |
| global _subtest_messages |
| |
| set description "" |
| set argument_count [llength $args] |
| if {$argument_count == 1} { set args [lassign $args description] } |
| if {$argument_count > 1} { |
| fail "Too many arguments ($argument_count): $args" |
| } |
| |
| set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1] |
| incr _subtest_pass_count |
| set message [format "Subtest %2d passed" $subtest_count] |
| if {$description ne ""} { append message " : $description" } |
| log_pass $message |
| dict set _subtest_messages $subtest_count [list pass $message] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # subfail - registers a failing subtest result |
| # |
| # SYNOPSIS |
| # subfail ?options? ?description? ?diagnostics? |
| # |
| # DESCRIPTION |
| # Increments the subtest failure count and logs a failing subtest message |
| # |
| # OPTIONS |
| # -fatal |
| # Causes this subtest failure to be fatal, ending the test |
| # ARGUMENTS |
| # description |
| # A single-line string describing the condition being verified |
| # diagnostics |
| # A string providing additional diagnostic information that will |
| # be included with the log message |
| # |
| # ENVIRONMENT |
| # testsuite_subtest_fatal |
| # Specifies whether first failing subtest aborts the test |
| # |
| ################################################################ |
| |
| proc subfail args { |
| global _subtest_fail_count _subtest_pass_count _subtest_skip_count |
| global testsuite_subtest_fatal |
| global _subtest_messages |
| |
| set description "" |
| set fatal false |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fatal {set fatal true; set args [lrange $args 1 end]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count >= 1} { set args [lassign $args description] } |
| |
| set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1] |
| incr _subtest_fail_count |
| set message [format "Subtest %2d failed" $subtest_count] |
| if {$description ne ""} { append message " : $description" } |
| if [llength $args] { append message " (" [join $args ", "] ")" } |
| if {$fatal || $testsuite_subtest_fatal} { |
| fail $message |
| } else { |
| log_error $message |
| } |
| dict set _subtest_messages $subtest_count [list fail $message] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # subskip - registers a skipped subtest result |
| # |
| # SYNOPSIS |
| # subskip ?options? ?description? |
| # |
| # DESCRIPTION |
| # Increments the subtest skip count and logs a skipped subtest message |
| # |
| # OPTIONS |
| # -count NUMBER |
| # When used with -skip, indicates the number of subtests that |
| # were skipped |
| # ARGUMENTS |
| # description |
| # A single-line string describing the reason the subtest is |
| # being skipped |
| # |
| ################################################################ |
| |
| proc subskip args { |
| global _subtest_fail_count _subtest_pass_count _subtest_skip_count |
| global _subtest_messages |
| |
| set description "" |
| set count 1 |
| set nolog 0 |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -nolog { |
| set nolog 1 |
| # Shift args down one |
| set args [lassign $args -] |
| } |
| -count {set args [lassign $args - count]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count == 1} { set args [lassign $args description] } |
| if {$argument_count > 1} { |
| fail "Too many arguments ($argument_count): $args" |
| } |
| |
| set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1] |
| incr _subtest_skip_count $count |
| if {$count > 1} { |
| set message "Subtest $subtest_count-[expr $subtest_count+$count-1] skipped" |
| } else { |
| set message [format "Subtest %2d skipped" $subtest_count] |
| } |
| if {$description ne ""} { append message " : $description" } |
| if {$nolog != 1} { |
| log_warn $message |
| } |
| dict set _subtest_messages [expr $subtest_count] [list skip $message] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_subtest_fail_count - returns the current subtest failure count |
| # |
| # SYNOPSIS |
| # get_subtest_fail_count |
| # |
| ################################################################ |
| |
| proc get_subtest_fail_count {} { |
| global _subtest_fail_count |
| |
| return $_subtest_fail_count |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # reset_bash_prompt - used in expect body to reset bash prompt |
| # |
| # SYNOPSIS |
| # reset_bash_prompt |
| # |
| # DESCRIPTION |
| # The procedure must be called within an expect body to |
| # reset the bash prompt to the $test_prompt. |
| # |
| # EXAMPLE |
| # spawn $salloc $bin_bash |
| # expect { |
| # -re "Granted job allocation ($number)" { |
| # set job_id $expect_out(1,string) |
| # reset_bash_prompt |
| # exp_continue |
| # } |
| # -re $test_prompt {}; # Expect initial test prompt from bash |
| # timeout { |
| # fail "salloc not responding or prompt not recognized" |
| # } |
| # } |
| # |
| ################################################################ |
| |
| proc reset_bash_prompt {} { |
| global testsuite_dir |
| |
| upvar spawn_id sid |
| send -i $sid "source $testsuite_dir/reset_bash_prompt.sh\n" |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # print_time - prints the current date and time |
| # |
| # SYNOPSIS |
| # print_time |
| # |
| ################################################################ |
| |
| proc print_time { } { |
| global bin_date |
| |
| spawn $bin_date |
| expect { |
| eof { |
| wait |
| } |
| } |
| |
| return |
| } |
| |
| ################################################################ |
| # |
| # NAME |
| # dict_getdef - 'dict get' with ability to specify the default value |
| # |
| # SYNOPSIS |
| # dict_getdef dictionary_value key default_value |
| # |
| # DESCRIPTION |
| # Tcl < 8.7 lacks a built in 'dict get' with ability to specify the |
| # default value. Tcl 8.7 adds a dict getdef. |
| # This proc returns the value from the dictionary corresponding to the |
| # keys if it exists, or the default value otherwise. |
| # |
| # EXAMPLE |
| # dict_getdef $option_dict action "warn" |
| # |
| # SOURCE |
| # https://core.tcl-lang.org/tips/doc/trunk/tip/342.md |
| # https://core.tcl-lang.org/tcl/tktview/2370575 |
| # |
| ################################################################ |
| |
| proc dict_getdef {D args} { |
| if {[dict exists $D {*}[lrange $args 0 end-1]]} then { |
| dict get $D {*}[lrange $args 0 end-1] |
| } else { |
| lindex $args end |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _line_trace - returns an abbreviated call stack trace with line numbers |
| # |
| # SYNOPSIS |
| # _line_trace |
| # |
| ################################################################ |
| |
| proc _line_trace {} { |
| set line_trace "" |
| set first_entry true |
| for {set f [expr [info frame] - 3]} {$f >= 1} {incr f -1} { |
| set frame_dict [info frame $f] |
| if [dict exists $frame_dict file] { |
| if [regexp uplevel [dict get $frame_dict cmd]] { |
| continue |
| } |
| if {$first_entry} { |
| set first_entry false |
| } else { |
| append line_trace "," |
| } |
| if [dict exists $frame_dict proc] { |
| set proc [namespace tail [dict get $frame_dict proc]] |
| if {$proc ne ""} { |
| append line_trace "$proc\@" |
| } |
| } |
| append line_trace [file tail [dict get $frame_dict file]] |
| if [dict exists $frame_dict line] { |
| append line_trace ":[dict get $frame_dict line]" |
| } |
| } |
| } |
| return $line_trace |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # tolerance - determines whether a value is within a specified tolerance |
| # |
| # SYNOPSIS |
| # tolerance expected observed tolerance_expression |
| # |
| # ARGUMENTS |
| # expected |
| # the expected (numeric) value |
| # observed |
| # the observed (numeric) value |
| # tolerance_expression |
| # a string of the form: [~][+|-]<tolerance>[%] |
| # |
| # DESCRIPTION |
| # tolerance |
| # A numeric tolerance |
| # symmetry |
| # By default the permitted range of values is symetric: |
| # [expected - tolerance, expected + tolerance] |
| # If the + sign is specified, the tolerance is limited to the |
| # the higher side only: |
| # [expected, expected + tolerance] |
| # If the - sign is specified, the tolerance is limited to the |
| # the lower side only: |
| # [expected - tolerance, expected] |
| # percent |
| # By default the permitted range is computed as absolute values: |
| # [expected - tolerance, expected + tolerance] |
| # If % is specified, the permitted range is computed as a |
| # percentage of the expected value: |
| # [expected*(1-tolerance/100), expected*(1+tolerance/100)] |
| # exclusivity |
| # By default the permitted range of values is inclusive, ie |
| # the min and max tolerated values are included in the range: |
| # [expected - tolerance, expected + tolerance] |
| # If ~ (exclusive) is specified, the tolerance limits are |
| # exclusive, ie the min and max tolerated values are excluded: |
| # (expected - tolerance, expected + tolerance) |
| # expression |
| # any combination of symetry, percent and exclusivity is allowed |
| # |
| # RETURN VALUE |
| # Returns true if the observed value is within the specified tolerance |
| # range of the expected value, otherwise false |
| # |
| # EXAMPLES |
| # The indicated tolerance_expression is true if: |
| # "5" expected - 5 <= observed <= expected + 5 |
| # "-5" expected - 5 <= observed <= expected |
| # "+5" expected <= observed <= expected + 5 |
| # "5%" expected - 5% <= observed <= expected + 5% |
| # "~5" expected - 5 < observed < expected + 5 |
| # "~+5%" expected <= observed < expected + 5% |
| # |
| ################################################################ |
| |
| proc tolerance { expected observed tolerance_expression } { |
| if {![regexp {^(~?)([-+]?)([0-9\.]+)(%?)$} $tolerance_expression {} exclusive sign tolerance percent]} { |
| fail "Invalid tolerance expression ($tolerance_expression)" |
| } |
| |
| set lower_bound_expression $observed |
| if {$sign eq "+" || $exclusive ne "~"} { |
| append lower_bound_expression " >=" |
| } else { |
| append lower_bound_expression " >" |
| } |
| append lower_bound_expression " $expected" |
| if {$sign eq "-" || $sign eq ""} { |
| if {$percent eq "%"} { |
| append lower_bound_expression " - $tolerance * $expected / 100" |
| } else { |
| append lower_bound_expression " - $tolerance" |
| } |
| } |
| |
| set upper_bound_expression $observed |
| if {$sign eq "-" || $exclusive ne "~"} { |
| append upper_bound_expression " <=" |
| } else { |
| append upper_bound_expression " <" |
| } |
| append upper_bound_expression " $expected" |
| if {$sign eq "+" || $sign eq ""} { |
| if {$percent eq "%"} { |
| append upper_bound_expression " + $tolerance * $expected / 100" |
| } else { |
| append upper_bound_expression " + $tolerance" |
| } |
| } |
| |
| if {[expr $lower_bound_expression] && [expr $upper_bound_expression]} { |
| log_debug "$observed is within tolerance $tolerance_expression of $expected" |
| return true |
| } else { |
| log_warn "$observed is not within tolerance $tolerance_expression of $expected" |
| return false |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # check_run_as_user - check if the caller may run_command as the supplied user |
| # |
| # SYNOPSIS |
| # check_run_as_user user |
| # |
| # DESCRIPTION |
| # Note that a proper sudo config needs to be set in orther to pass this |
| # check. Calling user should be permitted to run_commands as the |
| # supplied user using sudo without password. |
| # See the -user option of run_command. |
| # This proc also log_warn a message if user already exists in the DB |
| # because most probably this user is testsuite_user and that user is |
| # expected NOT to be in the DB and could potentially be removed from it by |
| # the test. |
| # |
| # RETURN VALUE |
| # Returns a boolean value indicating whether the calling user may |
| # run_command as user. |
| # |
| ################################################################ |
| |
| proc check_run_as_user user { |
| global bin_id |
| |
| if {$user eq ""} { |
| return false |
| } |
| if {[run_command_status -none -user $user "$bin_id -un"]} { |
| return false |
| } |
| |
| if {[get_admin_level $user] != ""} { |
| log_warn "User $user already exists in DB, but it's probable that it's going to be removed by the test cleanup" |
| } |
| |
| return true |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # check_user_id - check if user exists in the system |
| # |
| # SYNOPSIS |
| # check_user_id user |
| # |
| # DESCRIPTION |
| # This proc also log_warn a message if user already exists in the DB |
| # because most probably this user is testsuite_user and that user is |
| # expected NOT to be in the DB and could potentially be removed from |
| # it by the test. |
| # |
| # RETURN VALUE |
| # Returns true if the user passed exists in the system, false otherwise. |
| # |
| ################################################################ |
| |
| proc check_user_id user { |
| global bin_id |
| |
| if {$user eq ""} { |
| return false |
| } |
| if {[run_command_status -none "$bin_id -un $user"]} { |
| return false |
| } |
| |
| if {[get_admin_level $user] != ""} { |
| log_warn "User $user already exists in DB, but it's probable that it's going to be removed by the test cleanup" |
| } |
| |
| return true |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # run_command - executes a command and returns a dictionary result |
| # |
| # SYNOPSIS |
| # run_command ?options? command |
| # |
| # DESCRIPTION |
| # Executes a command and returns a dictionary that includes the output, |
| # exit code, etc. An action can be taken (fail, warn, subtest, none) if |
| # the command's exit code is unexpected. By default, the action |
| # will be applied if the command fails. If the -xfail option is |
| # specified, the behavior will be reversed to apply the action if the |
| # command ran successfully. |
| # A timeout is always treated as unexpected, so log_error will be shown |
| # by default, or fail/subfail will be called if -fail/-subtest are used. |
| # |
| # OPTIONS |
| # -fail |
| # If the exit code is unexpected, the action that will |
| # be taken is to fail the test |
| # -subtest |
| # If the exit code is unexpected, the action that will |
| # be taken is to subfail, otherwise subpass will be called |
| # -warn |
| # If the exit code is unexpected, the action that will |
| # be taken is to log a warning (this is the default) |
| # -none |
| # If the exit code is unexpected, no action will be taken |
| # -xfail |
| # If the command exits with zero the action will be applied. |
| # Without this option, the action will be applied if the |
| # command exits with a non-zero exit code. |
| # -timeout <float_number> |
| # Time in seconds to wait for the command to complete before |
| # timing out (default is 60.0) |
| # -nolog |
| # Logging for this command will occur at trace threshold only |
| # -stdin |
| # Provide standard in to be piped into command |
| # -user <user> |
| # Attempt to execute command as <user>. Note that sudo must be |
| # properly configured to permit the caller to execute as <user>. |
| # See check_run_as_user. |
| # |
| # ARGUMENTS |
| # command |
| # a string containing the command and arguments to execute |
| # |
| # RETURN VALUE |
| # A dictionary containing the following elements: |
| # command - The command that was invoked |
| # exit_code - Exit code |
| # output - The combined standard output and standard error |
| # start_time - The time (with ms) the command was executed |
| # duration - The duration (seconds and milliseconds) the |
| # command took to run |
| # |
| ################################################################ |
| proc run_command args { |
| global bin_bash bin_sudo |
| |
| set alt_user "" |
| set exit_status 0 |
| set timedout false |
| set output "" |
| set action "warn" |
| set timeout 60 |
| set expect_failure false |
| set log_at_trace_level false |
| set stdin "" |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fail {set action "fail"; set args [lrange $args 1 end]} |
| -subtest {set action "subtest"; set args [lrange $args 1 end]} |
| -none {set action "none"; set args [lrange $args 1 end]} |
| -timeout {set args [lassign $args - timeout]} |
| -warn {set action "warn"; set args [lrange $args 1 end]} |
| -xfail {set expect_failure true; set args [lrange $args 1 end]} |
| -nolog {set log_at_trace_level true; set args [lrange $args 1 end]} |
| -stdin {set stdin [lindex $args 1]; set args [lassign $args - stdin]} |
| -user {set args [lassign $args - alt_user]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| if {[llength $args] == 1} { |
| lassign $args command |
| } else { |
| fail "Invalid number of arguments [llength $args]: $args" |
| } |
| |
| if {$action eq "subtest"} { |
| if {$expect_failure} { |
| set test_description "Command \"$command\" should fail" |
| } else { |
| set test_description "Command \"$command\" should succeed" |
| } |
| } |
| |
| if {$log_at_trace_level} { |
| interp alias {} log_run {} log_trace |
| interp alias {} log_details {} log_trace |
| } else { |
| interp alias {} log_run {} log_command |
| interp alias {} log_details {} log_debug |
| } |
| |
| set orig_log_user [log_user -info] |
| log_user 0 |
| |
| if {$alt_user ne ""} { |
| log_run "Run Command as user $alt_user: $command" |
| } else { |
| log_run "Run Command: $command" |
| } |
| set start_clock_ms [clock milliseconds] |
| set stty_init raw ; # Prevent the terminal from inserting \r |
| if {$alt_user ne ""} { |
| set expect_pid [spawn -noecho $bin_sudo -nu $alt_user $bin_bash -c "$command"] |
| } else { |
| set expect_pid [spawn -noecho $bin_bash -c "$command"] |
| } |
| if { $stdin != "" } { |
| exp_send "$stdin" |
| set command "$command <<< $stdin" |
| } |
| expect { |
| -re "(.+)" { |
| append output $expect_out(1,string) |
| exp_continue |
| } |
| timeout { |
| slow_kill $expect_pid |
| set exit_status $::RETURN_TIMEOUT |
| set timedout true |
| } |
| eof { |
| lassign [wait] pid spawnid os_error_flag errno |
| set exit_status [expr $errno > 128 ? $errno - 256 : $errno] |
| } |
| } |
| set start_time [format "%.3f" [expr $start_clock_ms / 1000.000]] |
| set end_time [format "%.3f" [expr [clock milliseconds] / 1000.000]] |
| set duration [format "%.3f" [expr $end_time - $start_time]] |
| |
| log_details "Command Results:" |
| log_details " Duration: $duration" |
| log_details " Exit Code: $exit_status" |
| if {[info exists output]} { |
| log_details " Output: $output" |
| } |
| |
| if {$timedout} { |
| set message "Command \"$command\" timed out after $timeout seconds" |
| if {$action eq "fail"} { |
| fail $message |
| } elseif {$action eq "subtest"} { |
| subfail $test_description $message |
| } else { |
| log_error $message |
| } |
| } elseif {! $expect_failure && $exit_status != 0} { |
| set message "Command \"$command\" failed with rc=$exit_status" |
| if {[info exists output] && $output != ""} { |
| append message ": [string trimright $output]" |
| } |
| if {$action eq "warn"} { |
| log_warn $message |
| } elseif {$action eq "subtest"} { |
| subfail $test_description $message |
| } elseif {$action eq "fail"} { |
| fail $message |
| } |
| } elseif {$expect_failure && $exit_status == 0} { |
| set message "Command \"$command\" was expected to fail but succeeeded" |
| if {$action eq "warn"} { |
| log_warn $message |
| } elseif {$action eq "subtest"} { |
| subfail $test_description $message |
| } elseif {$action eq "fail"} { |
| fail $message |
| } |
| } elseif {$action eq "subtest"} { |
| subpass $test_description |
| } |
| log_user $orig_log_user |
| |
| dict set result command $command |
| dict set result exit_code $exit_status |
| dict set result output $output |
| dict set result start_time $start_time |
| dict set result duration $duration |
| |
| return $result |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # run_command_output - executes a command and returns the output |
| # |
| # SYNOPSIS |
| # run_command_output ?options? command |
| # |
| # DESCRIPTION |
| # Executes a command and returns a dictionary that includes the output, |
| # exit code, etc. An action can be taken (fail, warn, none) if the |
| # command's exit code or timeout is unexpected. By default, the action |
| # will be applied if the command fails. If the -xfail option is |
| # specified, the behavior will be reversed to apply the action if the |
| # command ran successfully. |
| # |
| # OPTIONS |
| # -fail |
| # if the exit code or timeout is unexpected, the action that will |
| # be taken is to fail the test |
| # -warn |
| # if the exit code or timeout is unexpected, the action that will |
| # be taken is to log a warning (this is the default) |
| # -none |
| # if the exit code or timeout is unexpected, no action will be |
| # taken |
| # -xfail |
| # if the command exits with zero and does not time out, the |
| # action will be applied. Without this option, the action will |
| # be applied if the command exits with a non-zero exit code or |
| # times out. |
| # -timeout <float_number> |
| # time in seconds to wait for the command to complete before |
| # timing out (default is 60.0) |
| # -stdin |
| # Provide standard in to be piped into command |
| # |
| # ARGUMENTS |
| # command |
| # a string containing the command and arguments to execute |
| # |
| # RETURN VALUE |
| # A string containing the combined standard output and standard error |
| # |
| ################################################################ |
| |
| proc run_command_output args { |
| |
| set result [run_command {*}$args] |
| |
| if [dict exists $result output] { |
| return [dict get $result output] |
| } else { |
| return "" |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # run_command_status - executes a command and returns the exit code |
| # |
| # SYNOPSIS |
| # run_command_status ?options? command |
| # |
| # DESCRIPTION |
| # Executes a command and returns a dictionary that includes the output, |
| # exit code, etc. An action can be taken (fail, warn, none) if the |
| # command's exit code or timeout is unexpected. By default, the action |
| # will be applied if the command fails. If the -xfail option is |
| # specified, the behavior will be reversed to apply the action if the |
| # command ran successfully. |
| # |
| # OPTIONS |
| # -fail |
| # if the exit code or timeout is unexpected, the action that will |
| # be taken is to fail the test |
| # -warn |
| # if the exit code or timeout is unexpected, the action that will |
| # be taken is to log a warning (this is the default) |
| # -none |
| # if the exit code or timeout is unexpected, no action will be |
| # taken |
| # -xfail |
| # if the command exits with zero and does not time out, the |
| # action will be applied. Without this option, the action will |
| # be applied if the command exits with a non-zero exit code or |
| # times out. |
| # -timeout <float_number> |
| # time in seconds to wait for the command to complete before |
| # timing out (default is 60.0) |
| # -stdin |
| # Provide standard in to be piped into command |
| # |
| # ARGUMENTS |
| # command |
| # a string containing the command and arguments to execute |
| # |
| # RETURN VALUE |
| # The exit code for the invoked command |
| # |
| ################################################################ |
| |
| proc run_command_status args { |
| |
| set result [run_command {*}$args] |
| |
| return [dict get $result exit_code] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # cancel_job - cancels the specified job list |
| # |
| # SYNOPSIS |
| # cancel_job job_id_list ?het_job? |
| # |
| # DESCRIPTION |
| # Cancels one or more jobs. A job_id of 0 will be silently ignored. |
| # |
| # OPTIONS |
| # -fail |
| # if scancel fails with exit code or timeout, or the job doesn't |
| # end, test will will fail |
| # |
| # ARGUMENTS |
| # job_id_list |
| # The list of Slurm job ids that we want to cancel |
| # het_job |
| # 1 if jobs are hetjobs and we want to confirm each |
| # component has completed |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS if jobs are cancelled, or non-zero value otherwise. |
| # |
| ################################################################ |
| |
| proc cancel_job args { |
| global scancel |
| |
| set fatal false |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fail {set fatal true; set args [lrange $args 1 end]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| |
| set argument_count [llength $args] |
| if {$argument_count < 1} { |
| fail "Too few arguments ($argument_count): $args" |
| } elseif {$argument_count > 2} { |
| fail "Too many arguments ($argument_count): $args" |
| } |
| lassign $args job_id_list |
| if {$argument_count == 2} { |
| set het_job [lindex $args 1] |
| } else { |
| set het_job 0 |
| } |
| |
| set job_list_clean [list] |
| foreach job_id $job_id_list { |
| if {$job_id != 0} { |
| lappend job_list_clean $job_id |
| } |
| } |
| |
| if {![llength $job_list_clean]} { |
| return $::RETURN_SUCCESS |
| } |
| |
| log_debug "Cancelling $job_list_clean" |
| set result [run_command "$scancel -Q $job_list_clean"] |
| if {[dict get $result exit_code]} { |
| set message "scancel command returned an error ([dict get $result output])" |
| if {$fatal} { |
| fail $message |
| } else { |
| log_warn $message |
| return $::RETURN_ERROR |
| } |
| } |
| foreach job_id $job_list_clean { |
| if {[wait_for_job $job_id "DONE" $het_job]} { |
| set message "Job ($job_id) did not end when cancelled" |
| if {$fatal} { |
| fail $message |
| } else { |
| log_warn $message |
| return $::RETURN_ERROR |
| } |
| } |
| } |
| return $::RETURN_SUCCESS |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_line_cnt - returns the size of the specified file |
| # |
| # SYNOPSIS |
| # get_line_cnt file_name |
| # |
| # RETURN VALUE |
| # Number of lines in the specified file. |
| # |
| ################################################################ |
| |
| proc get_line_cnt { file_name } { |
| global bin_wc number |
| set lines 0 |
| spawn $bin_wc -l $file_name |
| expect { |
| -re "($number) " { |
| set lines $expect_out(1,string) |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| return $lines |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # convert_time_str - Converts a string like [HH:]MM:SS[.sss] to secs |
| # |
| # SYNOPSIS |
| # Convert the string to a number. |
| # |
| # DESCRIPTION |
| # Units can be hours, mins, secs. |
| # |
| # RETURN VALUE |
| # A non-zero return code indicates a failure. |
| # |
| ################################################################ |
| |
| proc convert_time_str { time_str units } { |
| subtest {[regexp {(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?} $time_str - hours mins secs msecs]} "Time should be in the format \[HH:\]MM:SS\[.sss\]" "$time_str" |
| if { $hours eq "" } { |
| set hours 0 |
| } |
| |
| # Use scan to avoid TCL octals |
| set value [expr ([scan $hours "%d"] * 3600) + ([scan $mins "%d"] * 60) + [scan $secs "%d"]] |
| |
| if { $units eq "hours" } { |
| set value [expr $value / 3600] |
| } elseif { $units eq "mins" } { |
| set value [expr $value / 60] |
| } |
| |
| return $value |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # slow_kill - kills a process slowly |
| # |
| # SYNOPSIS |
| # slow_kill pid |
| # |
| # DESCRIPTION |
| # Kill a process slowly, first trying SIGINT, pausing for |
| # a second, then sending SIGKILL. |
| # |
| # RETURN VALUE |
| # A non-zero return code indicates a failure. |
| # |
| ################################################################ |
| |
| proc slow_kill { pid } { |
| global bin_kill |
| |
| catch {exec $bin_kill -INT $pid} |
| catch {exec $bin_kill -INT $pid} |
| sleep 1 |
| catch {exec $bin_kill -KILL $pid} |
| |
| return 0 |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_my_id - gets the id from the running user |
| # |
| # SYNOPSIS |
| # get_my_id |
| # |
| # RETURN VALUE |
| # output of id |
| # |
| ################################################################ |
| |
| proc get_my_id {} { |
| |
| global bin_id number |
| set login_info -1 |
| |
| log_user 0 |
| |
| spawn $bin_id |
| expect { |
| -re "(uid=.*\n)" { |
| set login_info $expect_out(1,string) |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| |
| log_user 1 |
| |
| if {$login_info == -1} { |
| fail "Unable to get user info" |
| } |
| |
| return $login_info |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_my_user_name - gets the name from the running user |
| # |
| # SYNOPSIS |
| # get_my_user_name |
| # |
| # RETURN VALUE |
| # A non-zero return code indicates a failure. |
| # |
| ################################################################ |
| |
| proc get_my_user_name { } { |
| global bin_id re_word_str |
| |
| set user_name -1 |
| |
| log_user 0 |
| spawn $bin_id -nu |
| expect { |
| -re "($re_word_str)" { |
| set user_name $expect_out(1,string) |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| if {$user_name == -1} { |
| fail "Unable to get user name" |
| } |
| |
| return $user_name |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_my_uid - gets the uid from the running user |
| # |
| # SYNOPSIS |
| # get_my_uid |
| # |
| # RETURN VALUE |
| # The uid of the current user, or fails. |
| # |
| ################################################################ |
| |
| proc get_my_uid { } { |
| global bin_id number |
| |
| set out [run_command_output -nolog -fail "$bin_id -u"] |
| if {![regexp "($number)" $out - uid]} { |
| fail "Unable to get UID with $bin_id ($out)" |
| } |
| |
| return $uid |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_my_gid - gets the gid from the running user |
| # |
| # SYNOPSIS |
| # get_my_gid |
| # |
| # RETURN VALUE |
| # A non-zero return code indicates a failure. |
| # |
| ################################################################ |
| |
| proc get_my_gid { } { |
| global bin_id number |
| |
| set gid -1 |
| |
| log_user 0 |
| spawn $bin_id -g |
| expect { |
| -re "($number)" { |
| set gid $expect_out(1,string) |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| return $gid |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # kill_salloc - kills all salloc commands associated with this user |
| # |
| # SYNOPSIS |
| # kill_salloc |
| # |
| # DESCRIPTION |
| # Kill all salloc commands associated with this user. |
| # Issue two SIGINT, sleep 1 and a SIGKILL |
| # |
| # RETURN VALUE |
| # A non-zero return code indicates a failure. |
| # |
| # NOTE |
| # Use slow_kill instead of kill_salloc if you can capture |
| # the process id |
| # |
| ################################################################ |
| |
| proc kill_salloc { } { |
| global bin_id bin_pkill bin_sleep number |
| |
| set uid [get_my_uid] |
| catch {exec $bin_pkill -INT -u $uid salloc} |
| catch {exec $bin_pkill -INT -u $uid salloc} |
| sleep 1 |
| catch {exec $bin_pkill -KILL -u $uid salloc} |
| |
| return 0 |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # kill_srun - kills all srun commands associated with this user |
| # |
| # SYNOPSIS |
| # kill_srun |
| # |
| # DESCRIPTION |
| # Kill all srun commands associated with this user. |
| # Issue two SIGINT, sleep 1 and a SIGKILL |
| # |
| # RETURN VALUE |
| # A non-zero return code indicates a failure. |
| # |
| # NOTE |
| # Use slow_kill instead of kill_srun if you can capture |
| # the process id |
| # |
| ################################################################ |
| |
| proc kill_srun { } { |
| global bin_id bin_pkill bin_sleep number |
| |
| set uid [get_my_uid] |
| catch {exec $bin_pkill -INT -u $uid srun} |
| catch {exec $bin_pkill -INT -u $uid srun} |
| sleep 1 |
| catch {exec $bin_pkill -KILL -u $uid srun} |
| |
| return 0 |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_for - generic wait utility |
| # |
| # SYNOPSIS |
| # wait_for ?options? condition body |
| # |
| # DESCRIPTION |
| # Generic wait utility allowing you to repeatedly execute a generic block |
| # of code until a specified boolean expression is met. The code block and |
| # condition check occur every poll interval until a timeout is reached. |
| # |
| # OPTIONS |
| # -fail |
| # fail the test if the condition is not met within the timeout |
| # -subtest |
| # if the condition is met within the timeout call subpass, otherwise |
| # call subfail |
| # -timeout <float_number> |
| # time in seconds to wait for the condition to be met before |
| # timing out (default is 60.0) |
| # -pollinterval <float_number> |
| # time in seconds between each loop execution and condition check |
| # (default is 1.0) |
| # |
| # ARGUMENTS |
| # condition |
| # The boolean expression to test |
| # body |
| # A block of code to evaluate in the invoking stack frame |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS if the condition is met before the timeout occurs, |
| # RETURN_TIMEOUT if the timeout occurs before the condition is met |
| # |
| ################################################################ |
| |
| proc wait_for args { |
| set action "none" |
| set subtest false |
| set timeout 60 |
| set poll_interval 1 |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fail {set action "fail"; set args [lrange $args 1 end]} |
| -subtest {set action "subtest"; set args [lrange $args 1 end]} |
| -time* {set args [lassign $args - timeout]} |
| -poll* {set args [lassign $args - poll_interval]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| if {[llength $args] == 2} { |
| lassign $args condition body |
| } else { |
| fail "Invalid number of arguments [llength $args]: $args" |
| } |
| |
| set start_time [format "%.3f" [expr [clock milliseconds] / 1000.000]] |
| |
| log_debug "Waiting for $condition" |
| |
| while {1} { |
| # Evaluate code block |
| log_trace "Evaluating code block ([string trim $body])" |
| uplevel $body |
| |
| # Check condition |
| if {[uplevel expr [format "{%s}" $condition]]} { |
| set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] |
| log_debug "Condition ($condition) was met" |
| if {$action eq "subtest"} { |
| subpass "Condition ($condition) should be met (within $timeout seconds)" |
| } |
| return $::RETURN_SUCCESS |
| } else { |
| log_trace "Condition ($condition) was not met" |
| } |
| |
| # Sleep poll interval |
| log_trace "Sleeping for $poll_interval seconds" |
| after [expr {int($poll_interval * 1000)}] |
| |
| # Check if we have surpassed our timeout |
| set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] |
| log_trace "Checking whether the current time ([clock format [expr int($now)] -format %Y-%m-%dT%X].[lindex [split $now '.'] 1]) is greater than the start time plus the timeout ([clock format [expr int($start_time + $timeout)] -format %Y-%m-%dT%X].[lindex [split [expr $start_time + $timeout] '.'] 1])" |
| if {$now > $start_time + $timeout} { |
| set message "Condition ($condition) was not met before timeout ($timeout seconds)" |
| if {$action eq "fail"} { |
| fail $message |
| } else { |
| log_warn $message |
| if {$action eq "subtest"} { |
| subfail "Condition ($condition) should be met (within $timeout seconds)" "Timed out" |
| } |
| return $::RETURN_TIMEOUT |
| } |
| } |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_for_command - repeat a command until it is successful or meets a specified condition |
| # |
| # SYNOPSIS |
| # wait_for_command ?options? command ?condition? |
| # |
| # DESCRIPTION |
| # A command is repeated until it meets a condition or a timeout is reached. |
| # If a condition is not specified, the command will be repeated until it |
| # is successful (the exit code is zero). |
| # |
| # OPTIONS |
| # -fail |
| # fail the testif the condition is not met within the timeout |
| # -subtest |
| # if the condition is met within the timeout call subpass, otherwise |
| # call subfail |
| # -timeout <float_number> |
| # time in seconds to wait for the condition to be met before |
| # timing out (default is 60.0) |
| # -pollinterval <float_number> |
| # time in seconds between each loop execution and condition |
| # check (default is 1.0) |
| # |
| # ARGUMENTS |
| # command |
| # a string containing the command and arguments to execute |
| # condition |
| # The boolean expression to test. For each command invocation, |
| # the result variable will be set to the dictionary returned |
| # from run_command. |
| # The condition expression will normally involve a comparison |
| # with one or more values of this dictionary. If a condition is |
| # not specified, this condition will be used: |
| # { [dict get $result exit_code] == 0 } |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS if the condition is met before the timeout occurs, |
| # RETURN_TIMEOUT if the timeout occurs before the condition is met |
| # |
| ################################################################ |
| |
| proc wait_for_command args { |
| set action "none" |
| set timeout 60 |
| set poll_interval 1 |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fail {set action "fail"; set args [lrange $args 1 end]} |
| -subtest {set action "subtest"; set args [lrange $args 1 end]} |
| -time* {set args [lassign $args - timeout]} |
| -poll* {set args [lassign $args - poll_interval]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| |
| set argument_count [llength $args] |
| if {$argument_count < 1} { |
| fail "Too few arguments ($argument_count): $args" |
| } elseif {$argument_count > 2} { |
| fail "Too many arguments ($argument_count): $args" |
| } |
| lassign $args command |
| if {$argument_count == 2} { |
| set condition [lindex $args 1] |
| } else { |
| set condition { [dict get $result exit_code] == 0 } |
| } |
| |
| set start_time [format "%.3f" [expr [clock milliseconds] / 1000.000]] |
| |
| log_debug "Waiting for $condition" |
| |
| while {1} { |
| # Run command |
| set result [run_command $command] |
| |
| # Check condition |
| if {[eval expr [format "{%s}" $condition]]} { |
| set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] |
| log_debug "Condition ($condition) was met" |
| if {$action eq "subtest"} { |
| subpass "Condition ($condition) should be met (within $timeout seconds)" |
| } |
| return $::RETURN_SUCCESS |
| } else { |
| log_trace "Condition ($condition) was not met" |
| } |
| |
| # Sleep poll interval |
| log_trace "Sleeping for $poll_interval seconds" |
| after [expr {int($poll_interval * 1000)}] |
| |
| # Check whether we have surpassed our timeout |
| set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] |
| log_trace "Checking whether the current time ([clock format [expr int($now)] -format %Y-%m-%dT%X].[lindex [split $now '.'] 1]) is greater than the start time plus the timeout ([clock format [expr int($start_time + $timeout)] -format %Y-%m-%dT%X].[lindex [split [expr $start_time + $timeout] '.'] 1])" |
| if {$now > $start_time + $timeout} { |
| set message "Condition ($condition) was not met before timeout ($timeout seconds)" |
| if {$action eq "fail"} { |
| fail $message |
| } else { |
| log_warn $message |
| if {$action eq "subtest"} { |
| subfail "Condition ($condition) should be met (within $timeout seconds)" "Timed out" |
| } |
| return $::RETURN_TIMEOUT |
| } |
| } |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_for_command_match - repeat a command until its output matches the specified pattern |
| # |
| # SYNOPSIS |
| # wait_for_command_match ?options? command pattern |
| # |
| # DESCRIPTION |
| # A command is repeated until its output matches the specified pattern |
| # |
| # OPTIONS |
| # -negate |
| # negates the match. The command is repeated until its output does not |
| # match the specified pattern |
| # -fail |
| # fail the test if the output does not match the pattern within the |
| # timeout |
| # -subfail |
| # if the condition is met within the timeout call subpass, otherwise |
| # call subfail |
| # -timeout <float_number> |
| # time in seconds to wait for the pattern to be matched before |
| # timing out (default is 60.0) |
| # -pollinterval <float_number> |
| # time in seconds between each loop execution and match check |
| # (default is 1.0) |
| # |
| # ARGUMENTS |
| # command |
| # a string containing the command and arguments to execute |
| # pattern |
| # The regular expression to match against the command output |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS if the pattern is matched before the timeout occurs, |
| # RETURN_TIMEOUT if the timeout occurs before the pattern is matched |
| # |
| ################################################################ |
| |
| proc wait_for_command_match args { |
| |
| set pattern [lindex $args end] |
| set args [lrange $args 0 end-1] |
| set match 1 |
| set idx [lsearch $args "-negate"] |
| if {$idx != -1} { |
| set match 0 |
| set args [lreplace $args $idx $idx]; # Remove the -negate argument |
| } |
| |
| return [wait_for_command {*}$args "\[regexp -- {$pattern} \[dict get \$result output\]\] == $match"] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_for_file - waits for a file to exist with non-zero size |
| # |
| # SYNOPSIS |
| # wait_for_file ?options? file_name |
| # |
| # OPTIONS |
| # -fail |
| # If an error occurs or the file does not become present |
| # by the timeout, fail the test rather than returning an error |
| # -timeout <integer_number> |
| # time in seconds to wait for the file to exist before |
| # timing out (default is 90) |
| # -pollinterval <integer_number> |
| # time in seconds between each file existence test (default is 1) |
| # |
| # DESCRIPTION |
| # Wait for the specified file to exist and have a non-zero size. |
| # Note that if JobFileAppend=0 is configured, a file can exist and |
| # be purged then be re-created. |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS if the file becomes present within the timeout, or |
| # non-zero value otherwise. |
| # |
| ################################################################ |
| |
| proc wait_for_file args { |
| global bin_sleep |
| |
| set fatal false |
| set timeout 90 |
| set poll_interval 1 |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fatal - |
| -fail {set fatal true; set args [lrange $args 1 end]} |
| -time* {set args [lassign $args - timeout]} |
| -poll* {set args [lassign $args - poll_interval]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count != 1} { |
| fail "Invalid number of arguments ($argument_count): $args" |
| } else { |
| lassign $args file_name |
| } |
| |
| for {set my_delay 0} {$my_delay <= $timeout} \ |
| {set my_delay [expr $my_delay + $poll_interval]} { |
| if {[file exists $file_name]} { |
| # Add small delay for I/O buffering |
| exec $bin_sleep 1 |
| return $::RETURN_SUCCESS |
| } |
| exec $bin_sleep $poll_interval |
| |
| # Expect may fail to load current NFS info. |
| # Use the ls command to load current info. |
| set slash_pos [string last $file_name "/"] |
| if {$slash_pos < 1} { |
| set dir_name "." |
| } else { |
| decr slash_pos |
| set dir_name [string $file_name 0 $slash_pos] |
| } |
| exec /bin/ls $dir_name |
| } |
| set message "Timeout waiting for file ($file_name)" |
| if {$fatal} { |
| fail $message |
| } |
| |
| log_warn $message |
| return $::RETURN_TIMEOUT |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _wait_for_single_job - waits for a job to reach the desired state |
| # |
| # SYNOPSIS |
| # _wait_for_single_job ?options? job_id desired_state |
| # |
| # DESCRIPTION |
| # Wait for a previously submitted Slurm job to reach the desired state. |
| # |
| # OPTIONS |
| # -fail |
| # If an error occurs or the job does not reach the desired state |
| # by the timeout, fail the test rather than returning an error |
| # -timeout <integer_number> |
| # time in seconds to wait for the job to be in the desired state |
| # before timing out (default is 360) |
| # -pollinterval <integer_number> |
| # time in seconds between each job state check (default is 1) |
| # |
| # ARGUMENTS |
| # job_id |
| # The Slurm job id of a job we want to wait for. |
| # desired_state |
| # The state you want the job to attain before |
| # returning. Currently supports: |
| # CANCELLED - job is cancelled |
| # DONE - any terminated state (includes cancelled) |
| # PENDING - job is pending |
| # RUNNING - job is running |
| # SPECIAL_EXIT |
| # SUSPENDED - job is suspended |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS, or non-zero on error. |
| # |
| ################################################################ |
| |
| proc _wait_for_single_job args { |
| global scontrol |
| |
| set fatal false |
| set timeout 360 |
| set poll_interval 1 |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fatal - |
| -fail {set fatal true; set args [lrange $args 1 end]} |
| -time* {set args [lassign $args - timeout]} |
| -poll* {set args [lassign $args - poll_interval]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count != 2} { |
| fail "Invalid number of arguments ($argument_count): $args" |
| } else { |
| lassign $args job_id desired_state |
| } |
| |
| # First verify that desired_state is supported |
| switch $desired_state { |
| "CANCELLED" {} |
| "DONE" {} |
| "PENDING" {} |
| "RUNNING" {} |
| "SPECIAL_EXIT" {} |
| "SUSPENDED" {} |
| default { |
| set message "Invalid desired state: $desired_state" |
| if {$fatal} { |
| fail $message |
| } |
| log_warn $message |
| return $::RETURN_ERROR |
| } |
| } |
| |
| if {$job_id == 0} { |
| set message "Invalid job ID ($job_id)" |
| if {$fatal} { |
| fail $message |
| } |
| log_warn $message |
| return $::RETURN_ERROR |
| } |
| |
| set my_delay 0 |
| while 1 { |
| set fd [open "|$scontrol -o show job $job_id"] |
| gets $fd line |
| catch {close $fd} |
| if {[regexp {JobState\s*=\s*(\w+)} $line foo state] != 1} { |
| set state "NOT_FOUND" |
| } |
| |
| switch $state { |
| "CANCELLED" { |
| if {$desired_state eq "CANCELLED"} { |
| log_debug "Job $job_id is CANCELLED" |
| return $::RETURN_SUCCESS |
| } |
| if {$desired_state eq "DONE"} { |
| log_debug "Job $job_id is DONE ($state)" |
| return $::RETURN_SUCCESS |
| } |
| set message "Job ($job_id) is $state, but we wanted $desired_state" |
| if {$fatal} { |
| fail $message |
| } |
| log_debug $message |
| return $::RETURN_ERROR |
| } |
| "NOT_FOUND" - |
| "BOOT_FAIL" - |
| "COMPLETED" - |
| "DEADLINE" - |
| "FAILED" - |
| "NODE_FAIL" - |
| "OUT_OF_MEMORY" - |
| "PREEMPTED" - |
| "TIMEOUT" { |
| if {$desired_state eq "DONE"} { |
| log_debug "Job $job_id is DONE ($state)" |
| return $::RETURN_SUCCESS |
| } |
| set message "Job ($job_id) is $state, but we wanted $desired_state" |
| if {$fatal} { |
| fail $message |
| } |
| log_debug $message |
| return $::RETURN_ERROR |
| } |
| "PENDING" { |
| if {$desired_state eq "PENDING"} { |
| log_debug "Job $job_id is PENDING" |
| return $::RETURN_SUCCESS |
| } |
| log_debug "Job $job_id is in state $state, desire $desired_state" |
| } |
| "RUNNING" { |
| if {$desired_state eq "RUNNING"} { |
| log_debug "Job $job_id is RUNNING" |
| return $::RETURN_SUCCESS |
| } |
| log_debug "Job $job_id is in state $state, desire $desired_state" |
| } |
| "SPECIAL_EXIT" { |
| if {$desired_state eq "SPECIAL_EXIT"} { |
| log_debug "Job $job_id is SPECIAL_EXIT" |
| return $::RETURN_SUCCESS |
| } |
| log_debug "Job $job_id is in state $state, desire $desired_state" |
| } |
| "SUSPENDED" { |
| if {$desired_state eq "SUSPENDED"} { |
| log_debug "Job $job_id is SUSPENDED" |
| return $::RETURN_SUCCESS |
| } |
| log_debug "Job $job_id is in state $state, desire $desired_state" |
| } |
| default { |
| log_debug "Job $job_id is in state $state, desire $desired_state" |
| } |
| } |
| |
| if { $my_delay > $timeout } { |
| set message "Timeout waiting for job state $desired_state" |
| if {$fatal} { |
| fail $message |
| } |
| log_warn "Timeout waiting for job state $desired_state" |
| return $::RETURN_TIMEOUT |
| } |
| |
| exec sleep $poll_interval |
| set my_delay [expr $my_delay + $poll_interval] |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_for_job - waits for job to be in desired state |
| # |
| # SYNOPSIS |
| # wait_for_job ?options? job_id desired_state ?het_job? |
| # |
| # DESCRIPTION |
| # Wait for job to be in desired state. Can handle het job components. |
| # |
| # OPTIONS |
| # -fail |
| # If an error occurs or the job does not reach the desired state |
| # by the timeout, fail the test rather than returning an error |
| # -timeout <integer_number> |
| # time in seconds to wait for the job to be in the desired state |
| # before timing out (default is 90) |
| # -pollinterval <integer_number> |
| # time in seconds between each job state check (default is 1) |
| # |
| # ARGUMENTS |
| # job_id |
| # The Slurm job id of a job we want to wait for. |
| # desired_state |
| # The state you want the job to attain before returning. |
| # Currently supports: |
| # CANCELLED - job is cancelled |
| # DONE - any terminated state (including cancelled) |
| # PENDING - job is pending |
| # RUNNING - job is running |
| # SPECIAL_EXIT |
| # SUSPENDED - job is suspended |
| # het_job |
| # If set, checks the state of each component job if the job |
| # is a het one. |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS if job reaches the desired state, or non-zero value |
| # otherwise. |
| # |
| # SEE ALSO |
| # _wait_for_single_job |
| # |
| ################################################################ |
| |
| proc wait_for_job args { |
| |
| set options [list] |
| set het_job 0 |
| |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fatal - |
| -fail { |
| lappend options [lindex $args 0] |
| set args [lrange $args 1 end] |
| } |
| -time* - |
| -poll* { |
| lappend options {*}[lrange $args 0 1] |
| set args [lrange $args 2 end] |
| } |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count < 2} { |
| fail "Too few arguments ($argument_count): $args" |
| } elseif {$argument_count > 3} { |
| fail "Too many arguments ($argument_count): $args" |
| } else { |
| lassign $args job_id desired_state |
| } |
| if {$argument_count == 3} { set hetjob [lindex $args 2] } |
| |
| if { $het_job } { |
| # get component job ids |
| set jid_list [get_het_job_ids $job_id 1] |
| } |
| |
| set rc 0 |
| set jid_list "" |
| if { $jid_list == "" } { |
| # non-het job |
| set jid_list $job_id |
| } |
| |
| foreach jid $jid_list { |
| set rc [_wait_for_single_job {*}$options $jid $desired_state] |
| if { $rc } { |
| # bail out on first failure |
| break |
| } |
| } |
| return $rc |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_for_job_acct - waits for the job accounting record to be updated |
| # |
| # SYNOPSIS |
| # wait_for_job_acct ?options? job_id ?field? |
| # |
| # DESCRIPTION |
| # Wait for evidence that a specific job stage has been recorded in the |
| # accounting database by verifying that a field has been populated. |
| # |
| # OPTIONS |
| # -fail |
| # If an error occurs or the job does has not populated the field |
| # by the timeout, fail the test rather than returning an error |
| # -timeout <integer_number> |
| # time in seconds to wait for the job's field to be populated |
| # before timing out (default is 90) |
| # -pollinterval <integer_number> |
| # time in seconds between each field check (default is 1) |
| # |
| # ARGUMENTS |
| # job_id |
| # The Slurm job id of a job we want to wait for. |
| # field |
| # The field that we are waiting to be populated, not empty and |
| # not "Unknown". (Default: End) |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS if the desired field has been populated for the job |
| # in the accounting database, or non-zero value otherwise. |
| # |
| ################################################################ |
| |
| proc wait_for_job_acct args { |
| global sacct |
| |
| set options [list] |
| set timeout 10 |
| set pollinterval .2 |
| set field End |
| |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fail { |
| lappend options [lindex $args 0] |
| set args [lrange $args 1 end] |
| } |
| -time* {set args [lassign $args - timeout]} |
| -poll* {set args [lassign $args - pollinterval]} |
| default break |
| } |
| } |
| lappend options -timeout $timeout |
| lappend options -pollinterval $pollinterval |
| |
| set argument_count [llength $args] |
| if {$argument_count < 1} { |
| fail "Too few arguments ($argument_count): $args" |
| } elseif {$argument_count > 2} { |
| fail "Too many arguments ($argument_count): $args" |
| } |
| lassign $args job_id |
| if {$argument_count == 2} {set field [lindex $args 1]} |
| |
| # Wait for sacct to populate the specified field |
| # We want to see the field be populated with anything but "Unknown" |
| # We can't use a negation match here because empty would match |
| return [wait_for_command_match {*}$options "$sacct -j $job_id --allocation --format $field --parsable2 --noheader" (?n)^(?!.*Unknown$).] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_for_account_done - cancels and waits on jobs in specified accounts |
| # |
| # SYNOPSIS |
| # wait_for_account_done ?options? accounts |
| # |
| # DESCRIPTION |
| # Cancel jobs on and wait for them to be finished in account(s) given. |
| # |
| # OPTIONS |
| # -timeout <integer_number> |
| # time in seconds to wait for the jobs to be finished before |
| # timing out (default is 360) |
| # -pollinterval <integer_number> |
| # time in seconds between each job state check (default is 1) |
| # |
| # ARGUMENTS |
| # accounts |
| # Comma-delimited list of accounts |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS if all jobs of the account are finished, or non-zero |
| # otherwise. |
| # |
| # NOTE |
| # We sleep for two seconds before replying that a job is |
| # done to give time for I/O completion (stdout/stderr files) |
| # |
| ################################################################ |
| |
| proc wait_for_account_done args { |
| global scancel squeue re_word_str |
| |
| set timeout 360 |
| set poll_interval 1 |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -time* {set args [lassign $args - timeout]} |
| -poll* {set args [lassign $args - poll_interval]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count != 1} { |
| fail "Invalid number of arguments ($argument_count): $args" |
| } else { |
| lassign $args accounts |
| } |
| |
| if { $accounts == "" } { |
| fail "An account must be specified" |
| } |
| |
| log_user 0 |
| set account_list [split $accounts ","] |
| foreach item $account_list { |
| spawn $scancel -A $item |
| expect { |
| timeout { |
| log_warn "No response from scancel" |
| } |
| eof { |
| wait |
| } |
| } |
| } |
| |
| set my_delay 0 |
| while 1 { |
| set found 0 |
| spawn $squeue -o Account=%a -h -A$accounts |
| expect { |
| -re "Account=($re_word_str)" { |
| set found 1 |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| |
| if { !$found } { |
| log_debug "Account(s) $accounts is/are empty" |
| break |
| } |
| |
| if { $my_delay > $timeout } { |
| log_error "Timeout waiting for account(s) ($accounts) to be finished" |
| log_user 1 |
| return $::RETURN_TIMEOUT |
| } |
| |
| exec sleep $poll_interval |
| set my_delay [expr $my_delay + $poll_interval] |
| } |
| log_user 1 |
| return $::RETURN_SUCCESS |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_for_part_done - cancels and waits on jobs in specified partition |
| # |
| # SYNOPSIS |
| # wait_for_part_done ?options? partition |
| # |
| # DESCRIPTION |
| # Cancel jobs on and wait for them to be finished in partition given. |
| # |
| # OPTIONS |
| # -timeout <integer_number> |
| # time in seconds to wait for the jobs to be finished before |
| # timing out (default is 360) |
| # -pollinterval <integer_number> |
| # time in seconds between each job state check (default is 1) |
| # |
| # ARGUMENTS |
| # partition |
| # partition name |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS if all jobs of the partition are finished, or non-zero |
| # otherwise. |
| # |
| # NOTE |
| # We sleep for two seconds before replying that a job is |
| # done to give time for I/O completion (stdout/stderr files) |
| # |
| ################################################################ |
| |
| proc wait_for_part_done args { |
| global scancel squeue re_word_str |
| |
| set timeout 360 |
| set poll_interval 1 |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -time* {set args [lassign $args - timeout]} |
| -poll* {set args [lassign $args - poll_interval]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count != 1} { |
| fail "Invalid number of arguments ($argument_count): $args" |
| } else { |
| lassign $args partition |
| } |
| |
| if { $partition == "" } { |
| fail "A partition must be specified" |
| } |
| |
| run_command -fail -nolog "$scancel -p $partition" |
| |
| set my_delay 0 |
| while 1 { |
| set found 0 |
| spawn $squeue -o Part=%P -h -p$partition |
| expect { |
| -re "Part=($re_word_str)" { |
| set found 1 |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| |
| if { !$found } { |
| log_debug "Partition $partition is empty" |
| break |
| } |
| |
| if { $my_delay > $timeout } { |
| log_error "Timeout waiting for partition ($partition) to be finished" |
| return $::RETURN_TIMEOUT |
| } |
| |
| exec sleep $poll_interval |
| set my_delay [expr $my_delay + $poll_interval] |
| } |
| return $::RETURN_SUCCESS |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_for_step - waits for a job step to be found |
| # |
| # SYNOPSIS |
| # wait_for_step ?options? step_id |
| # |
| # DESCRIPTION |
| # Wait for a job step to be found. |
| # |
| # OPTIONS |
| # -timeout <integer_number> |
| # time in seconds to wait for the job step to be found before |
| # timing out (default is 30) |
| # -pollinterval <integer_number> |
| # time in seconds between each step existence check (default is 1) |
| # |
| # ARGUMENTS |
| # step_id |
| # job step id |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS if step_id is found, or non-zero otherwise. |
| # |
| ################################################################ |
| |
| proc wait_for_step args { |
| global scontrol |
| |
| set timeout 30 |
| set poll_interval 1 |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -time* {set args [lassign $args - timeout]} |
| -poll* {set args [lassign $args - poll_interval]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count != 1} { |
| fail "Invalid number of arguments ($argument_count): $args" |
| } else { |
| lassign $args step_id |
| } |
| |
| set my_delay 0 |
| while 1 { |
| set fd [open "|$scontrol -o show step $step_id"] |
| gets $fd line |
| catch {close $fd} |
| if {[regexp {Nodes=} $line foo] == 1} { |
| return $::RETURN_SUCCESS |
| } |
| if {[regexp {MidplaneList=} $line foo] == 1} { |
| return $::RETURN_SUCCESS |
| } |
| if { $my_delay > $timeout } { |
| log_warn "Timeout waiting for job step" |
| return $::RETURN_TIMEOUT |
| } |
| |
| log_debug "Step $step_id not done yet. Waiting for $poll_interval seconds" |
| exec sleep $poll_interval |
| set my_delay [expr $my_delay + $poll_interval] |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_job_reason - waits for a desired job state and reason |
| # |
| # SYNOPSIS |
| # wait_job_reason ?options? job_id ?desired_state? ?desired_reason_list? |
| # |
| # DESCRIPTION |
| # Wait until the job is in desired state and reason is one |
| # of the desired ones or until the timeout. |
| # |
| # OPTIONS |
| # -timeout <integer_number> |
| # time in seconds to wait for the job state and reason before |
| # timing out (default is 360) |
| # -pollinterval <integer_number> |
| # time in seconds between each job state check (default is 1) |
| # |
| # ARGUMENTS |
| # job_id |
| # The job to wait for |
| # desired_state |
| # Desired state. |
| # desired_reason_list |
| # List of desired reasons. Empty list means that any reason |
| # is ok. |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS when job is in the desired state and reason is one |
| # of the desired ones, or non-zero otherwise. |
| # |
| ################################################################ |
| |
| proc wait_job_reason args { |
| global scontrol re_word_str |
| |
| set final_state "COMPLETED CANCELLED FAILED TIMEOUT DEADLINE |
| OUT_OF_MEMORY" |
| |
| set timeout 360 |
| set poll_interval 1 |
| set desired_state "PENDING" |
| set desired_reason_list "" |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -time* {set args [lassign $args - timeout]} |
| -poll* {set args [lassign $args - poll_interval]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count < 1} { |
| fail "Too few arguments ($argument_count): $args" |
| } else { |
| lassign $args job_id |
| } |
| if {$argument_count >= 2} { set desired_state [lindex $args 1] } |
| if {$argument_count == 3} { set desired_reason_list [lindex $args 2] } |
| if {$argument_count > 3} { |
| fail "Too many arguments ($argument_count): $args" |
| } |
| |
| set log_user_prev [log_user -info] |
| log_user 0 |
| |
| set my_delay 0 |
| set rc $::RETURN_ERROR |
| while true { |
| set pending 0 |
| set has_reason 1 |
| spawn $scontrol show job $job_id |
| expect { |
| -re "JobState=($re_word_str) Reason=(\\S+)" { |
| set job_state $expect_out(1,string) |
| set job_reason $expect_out(2,string) |
| } |
| timeout { |
| log_error "No response from scontrol show job" |
| set rc $::RETURN_TIMEOUT |
| break |
| } |
| } |
| |
| # Check if both state and reason are the desired ones |
| if {$job_state == $desired_state} { |
| set found 0 |
| set reason_msg "" |
| if {$desired_reason_list == ""} { |
| set found 1 |
| } |
| foreach desired_reason $desired_reason_list { |
| if {$job_reason == $desired_reason } { |
| set reason_msg " with reason $job_reason" |
| set found 1 |
| } |
| } |
| if {$found} { |
| log_debug "Job $job_id found $job_state$reason_msg" |
| set rc $::RETURN_SUCCESS |
| break |
| } |
| } elseif {[lsearch -exact final_state $job_state] >= 0} { |
| # Job is in final step no need to wait longer |
| log_error [format "Job in final state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \ |
| $job_state $job_reason \ |
| $desired_state $desired_reason_list] |
| set rc $::RETURN_ERROR |
| break |
| } |
| |
| # Check if this was the last poll |
| if {$my_delay > $timeout} { |
| log_error "Timeout" |
| set rc $::RETURN_TIMEOUT |
| break |
| } |
| set remamining_sec [expr $timeout - $my_delay] |
| log_debug [format "Job in state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \ |
| $job_state $job_reason \ |
| $desired_state $desired_reason_list] |
| log_debug [format "Polling again in %ss, %ss to timeout." \ |
| $poll_interval $remamining_sec] |
| |
| sleep $poll_interval |
| set my_delay [expr $my_delay + $poll_interval] |
| } |
| |
| log_user $log_user_prev |
| return $rc |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_config - returns a dictionary of slurm configuration parameters |
| # |
| # SYNOPSIS |
| # get_config ?options? |
| # |
| # OPTIONS |
| # -dbd |
| # uses `sacctmgr show config` to return slurmdbd configuration |
| # parameters |
| # -slurm |
| # uses `scontrol show config` to return slurm configuration |
| # parameters (this is the default) |
| # |
| # RETURN VALUE |
| # Returns a dictionary of parameter values |
| # |
| ################################################################ |
| |
| proc get_config args { |
| global sacctmgr scontrol |
| |
| set config_dict [list] |
| set command "$scontrol" |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -slurm {set command "$scontrol"; set args [lrange $args 1 end]} |
| -dbd {set command "$sacctmgr"; set args [lrange $args 1 end]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| if {[llength $args] > 0} { |
| fail "[lindex [info level 0] 0]: No arguments allowed: $args" |
| } |
| |
| set output [run_command_output -fail -nolog "$command show config"] |
| |
| foreach line [split $output "\n"] { |
| if {[regexp {^(\S+) += (.*)$} $line {} param_name param_value] == 1} { |
| dict set config_dict $param_name $param_value |
| } |
| } |
| |
| return $config_dict |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_config_param - returns a slurm configuration parameter value |
| # |
| # SYNOPSIS |
| # get_config_param ?options? parameter_name |
| # |
| # OPTIONS |
| # -dbd |
| # uses `sacctmgr show config` to return the specified slurmdbd |
| # configuration parameter value |
| # -slurm |
| # uses `scontrol show config` to return the specified slurm |
| # configuration parameter value (this is the default) |
| # |
| # ARGUMENTS |
| # parameter_name |
| # the parameter to return the value for |
| # |
| # DESCRIPTION |
| # Returns a specific configuration parameter value. |
| # |
| # RETURN VALUE |
| # Returns the value of the specified parameter or MISSING if it does not |
| # exist. |
| # |
| ################################################################ |
| |
| proc get_config_param args { |
| |
| set options [list] |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -* { |
| lappend options [lindex $args 0] |
| set args [lrange $args 1 end] |
| } |
| default break |
| } |
| } |
| if {[llength $args] == 1} { |
| lassign $args parameter_name |
| } else { |
| fail "[lindex [info level 0] 0]: Invalid number of arguments ([llength $args]): $args" |
| } |
| |
| set config_dict [get_config {*}$options] |
| |
| if [dict exists $config_dict $parameter_name] { |
| return [dict get $config_dict $parameter_name] |
| } else { |
| return "MISSING" |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # param_contains - test whether a comma-separated-list contains a specified value |
| # |
| # SYNOPSIS |
| # param_contains haystack needle |
| # |
| # DESCRIPTION |
| # Searches for the specified value (needle) in the comma-separated-list |
| # string (haystack). Needle can be a glob-style pattern. |
| # |
| # RETURN VALUE |
| # Returns a boolean value indicating whether the value (needle) was found |
| # in the comma-separated-list string (haystack) |
| # |
| ################################################################ |
| |
| proc param_contains { haystack needle } { |
| if {[lsearch [split $haystack ","] $needle] != -1} { |
| return true |
| } else { |
| return false |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # param_value - returns the value of a parameter in a comma-separated-list |
| # |
| # SYNOPSIS |
| # param_value params_list param ?default? |
| # |
| # DESCRIPTION |
| # Searches for the specified param in the comma-separated-list |
| # string (params_list) and returns its value. |
| # |
| # RETURN VALUE |
| # Returns the value found or the optional default value if not found. |
| # If the param is found without a value, returns true (ie like |
| # param_contains). |
| # |
| ################################################################ |
| |
| proc param_value {params_list param {default false}} { |
| global re_word_str |
| foreach pair [split $params_list ","] { |
| if {[regexp "$param" $pair] == 1} { |
| if {[regexp "$param=($re_word_str)" $pair - value] == 1} { |
| return $value |
| } else { |
| return true |
| } |
| } |
| } |
| return $default |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_affinity_types - gets the task plugins running with task/ stripped |
| # |
| # SYNOPSIS |
| # get_affinity_types |
| # |
| # RETURN VALUE |
| # Returns comma separated list of task plugins running without the task/ |
| # |
| ################################################################ |
| |
| proc get_affinity_types { } { |
| global scontrol re_word_str |
| |
| log_user 0 |
| set affinity "" |
| spawn $scontrol show config |
| expect { |
| -re "TaskPlugin *= ($re_word_str)" { |
| set parts [split $expect_out(1,string) ",/"] |
| while 1 { |
| set task_found [lsearch $parts "task"] |
| if { $task_found == -1 } break |
| set parts [lreplace $parts $task_found $task_found] |
| } |
| set affinity [join $parts ","] |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| return $affinity |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_mps_count_by_index - gets the count of a specific gres/mps device |
| # |
| # SYNOPSIS |
| # get_mps_count_by_index index hostname |
| # |
| # RETURN VALUE |
| # Returns the Count of a specific gres/mps device |
| # |
| ################################################################ |
| |
| proc get_mps_count_by_index { index hostname } { |
| global slurmd number re_word_str |
| |
| log_user 0 |
| set count 0 |
| spawn $slurmd -G -N $hostname |
| expect { |
| -re "Gres Name=mps Type=$re_word_str Count=($number) Index=$index" { |
| set count $expect_out(1,string) |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| return $count |
| } |
| |
| ################################################################ |
| # |
| # NAME |
| # check_mpi - determines if mpicc and the desired --mpi option are available |
| # |
| # SYNOPSIS |
| # check_mpi desired_mpi |
| # |
| # DESCRIPTION |
| # Runs srun --mpi=list to determine if the desired mpi is available, as |
| # well as if mpicc set in globals.local is too. |
| # |
| # RETURN VALUE |
| # Returns true if the desired mpi and mpicc are available, false otherwise. |
| # |
| ################################################################ |
| |
| proc check_mpi {desired_mpi} { |
| global srun eol mpicc |
| set output [run_command_output -fail "$srun --mpi=list"] |
| |
| return [regexp "${desired_mpi}${eol}" $output] && [file executable $mpicc] |
| } |
| |
| ################################################################ |
| # |
| # NAME |
| # check_influxdb_access - determines if user can access to the desired influxdb |
| # |
| # SYNOPSIS |
| # check_influxdb_access host port database |
| # |
| # DESCRIPTION |
| # It uses the global $influx CLI command to try to connect and use |
| # the desired database. |
| # |
| # RETURN VALUE |
| # Returns true if we can connect to host:port and use the database, |
| # false otherwise |
| # |
| ################################################################ |
| |
| proc check_influxdb_access {host port database} { |
| global influx |
| set connected 0 |
| set access 0 |
| |
| if {![file executable $influx]} { |
| log_warn "Cannot execute influx command: $influx" |
| return false |
| } |
| |
| set log_user_save [log_user -info] |
| log_user 0 |
| spawn $influx -host $host -port $port |
| expect { |
| -re "Connected to" { |
| set connected 1 |
| send "use $database\r" |
| exp_continue |
| } |
| -re "Using database" { |
| set access 1 |
| send "quit\r" |
| } |
| -re "unable to parse authentication credentials" { |
| send "quit\r" |
| } |
| -re "authorization failed" { |
| send "quit\r" |
| } |
| timeout { |
| fail "InfluxDB instance not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| log_user $log_user_save |
| |
| if {!$connected} { |
| log_warn "Cannot connect to $host:$port" |
| return false |
| } |
| if {!$access} { |
| log_warn "Connected to $host:$port, but cannot use $database" |
| return false |
| } |
| return true |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # check_bb_emulate - determines if Cray burst buffers API is emulated |
| # |
| # SYNOPSIS |
| # check_bb_emulate |
| # |
| # RETURN VALUE |
| # Returns true if Cray burst buffers API is emulated, false otherwise |
| # |
| ################################################################ |
| |
| proc check_bb_emulate { } { |
| global scontrol |
| |
| log_user 0 |
| set bb_emulate false |
| spawn $scontrol show burst |
| expect { |
| -re "EmulateCray" { |
| set bb_emulate true |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| return $bb_emulate |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # check_bb_persistent - determines if persistent burst buffers can be created by users |
| # |
| # SYNOPSIS |
| # check_bb_persistent |
| # |
| # RETURN VALUE |
| # Returns true if Cray burst buffers can be created by users, |
| # false otherwise |
| # |
| ################################################################ |
| |
| proc check_bb_persistent { } { |
| global scontrol |
| |
| log_user 0 |
| set bb_persistent false |
| spawn $scontrol show burst |
| expect { |
| -re "EnablePersistent" { |
| set bb_persistent true |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| return $bb_persistent |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_default_acct - gets user's default account |
| # |
| # SYNOPSIS |
| # get_default_acct user |
| # |
| # RETURN VALUE |
| # Returns name of default account if exists, NULL otherwise |
| # |
| ################################################################ |
| |
| proc get_default_acct { user } { |
| global sacctmgr re_word_str bin_id |
| |
| log_user 0 |
| set def_acct "" |
| |
| if { !$user } { |
| set user [get_my_user_name] |
| } |
| |
| spawn $sacctmgr -n list -P user $user format="DefaultAccount" |
| expect { |
| -re "($re_word_str)" { |
| set def_acct $expect_out(1,string) |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| return $def_acct |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_cycle_count - get desired iteration count |
| # |
| # SYNOPSIS |
| # get_cycle_count |
| # |
| # DESCRIPTION |
| # For tests with iteration counts (e.g. test9.1, test9.2) |
| # return the desired iteration count |
| # |
| # RETURN VALUE |
| # Returns desired iteration count |
| # |
| ################################################################ |
| |
| proc get_cycle_count { } { |
| global enable_memory_leak_debug |
| |
| if {$enable_memory_leak_debug != 0} { |
| return 2 |
| } |
| return 100 |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_select_type_params - determines SelectTypeParameters being used for a given partition |
| # |
| # SYNOPSIS |
| # get_select_type_params ?partition? |
| # |
| # DESCRIPTION |
| # Determine SelectTypeParameters being used for a given partition. |
| # If the partition is not specified, the default partition will be used. |
| # |
| # RETURN VALUE |
| # Returns a string containing SelectTypeParameters |
| # |
| ################################################################ |
| |
| proc get_select_type_params { {partition ""} } { |
| global scontrol bin_bash bin_grep re_word_str |
| |
| log_user 0 |
| set params "" |
| |
| if {[string length $partition] == 0} { |
| set partition [default_partition] |
| } |
| |
| if {$partition ne ""} { |
| spawn -noecho $bin_bash -c "exec $scontrol show part $partition | $bin_grep SelectTypeParameters" |
| expect { |
| -re "SelectTypeParameters *= *NONE" { |
| exp_continue |
| } |
| -re "SelectTypeParameters *= *($re_word_str)" { |
| set params $expect_out(1,string) |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| } |
| if {$params eq ""} { |
| spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SelectTypeParameters" |
| expect { |
| -re "SelectTypeParameters *= *($re_word_str)" { |
| set params $expect_out(1,string) |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| } |
| log_user 1 |
| |
| return $params |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # check_config_select - checks if effectively using the select type |
| # |
| # SYNOPSIS |
| # check_config_select type |
| # |
| # DESCRIPTION |
| # Determine if SelectType is equivalent to the passed type. |
| # |
| # ARGUMENTS |
| # type |
| # the desired SelectType to check (e.g. cons_tres) |
| # |
| # RETURN VALUE |
| # Returns true if configured, false otherwise |
| # |
| ################################################################ |
| |
| proc check_config_select { type } { |
| set select_type [get_config_param "SelectType"] |
| set select_type_parameters [get_config_param "SelectTypeParameters"] |
| |
| if {$select_type eq "select/$type"} { |
| return true |
| } |
| |
| return false |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_total_cpus - gets the total available CPUs on the default partition |
| # |
| # SYNOPSIS |
| # get_total_cpus |
| # |
| # RETURN VALUE |
| # The total available CPUs on the default partition. |
| # |
| # NOTE |
| # CoreSpecCount are not part of the total. |
| # |
| ################################################################ |
| |
| proc get_total_cpus {} { |
| |
| set total_cpu_count 0 |
| |
| # Obtain the list of available nodes in the default partition |
| set node_list [get_nodes_by_state] |
| |
| # Tally the cpus on these nodes |
| set nodes_dict [get_nodes] |
| foreach node_name $node_list { |
| set node_dict [dict get $nodes_dict $node_name] |
| set node_cpu_count [dict get $node_dict "CPUTot"] |
| |
| # Subtract out any spec cores |
| if {[dict exists $node_dict "CoreSpecCount"] && [dict exists $node_dict "ThreadsPerCore"]} { |
| set spec_cpu_count [expr [dict get $node_dict "CPUTot"] * [dict get $node_dict "ThreadsPerCore"]] |
| incr node_cpu_count -$spec_cpu_count |
| } |
| |
| incr total_cpu_count $node_cpu_count |
| } |
| |
| return $total_cpu_count |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # is_running_in_container |
| # |
| # SYNOPSIS |
| # is_running_in_container |
| # |
| # DESCRIPTION |
| # Determine if test script is running inside of a container. |
| # |
| # RETURN VALUE |
| # true if container detected or systemd-detect-virt is not found, |
| # false otherwise |
| # |
| ################################################################ |
| |
| proc is_running_in_container {} { |
| global bin_systemd_detect_virt |
| |
| if {[run_command_status -nolog -none "$bin_systemd_detect_virt --version"]} { |
| log_warn "$bin_systemd_detect_virt not found, assuming container" |
| return true |
| } |
| |
| set result [run_command -nolog -none "$bin_systemd_detect_virt -c"] |
| set output [string trimright [dict get $result output] "\r\n"] |
| if { $output != "none"} { |
| log_debug "Detected container type: $output" |
| } |
| if {[dict get $result exit_code]} { |
| return false |
| } |
| return true |
| } |
| |
| ################################################################ |
| # |
| # NAME |
| # is_super_user - determines if user is root or SlurmUser |
| # |
| # SYNOPSIS |
| # is_super_user ?user? |
| # |
| # DESCRIPTION |
| # Determine if user is a Slurm super user (i.e. user |
| # root or configured SlurmUser) |
| # |
| # RETURN VALUE |
| # true is user is root or SlurmUser, false otherwise |
| # |
| ################################################################ |
| |
| proc is_super_user {{user ""}} { |
| global number |
| |
| if {$user == ""} { |
| set user [get_my_user_name] |
| } |
| |
| # Check if user is root |
| if {$user eq "root"} { |
| return true |
| } |
| |
| # Check if user is SlurmUser |
| set slurm_user [get_config_param "SlurmUser"] |
| if {[regexp "${user}\\($number\\)" $slurm_user match]} { |
| return true |
| } |
| |
| return false |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # dec2hex - creates a 32 bit hex number from a signed decimal number |
| # |
| # SYNOPSIS |
| # dec2hex value |
| # |
| # DESCRIPTION |
| # Create a 32 bit hex number from a signed decimal number |
| # |
| # RETURN VALUE |
| # 32 bit hex version of input 'value' |
| # |
| # SOURCE |
| # Courtesy of Chris Cornish |
| # http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982 |
| # |
| ################################################################ |
| # Replace all non-decimal characters |
| |
| proc dec2hex {value} { |
| regsub -all {[^0-x\.-]} $value {} newtemp |
| set value [string trim $newtemp] |
| if {$value < 2147483647 && $value > -2147483648} { |
| set tempvalue [format "%#010X" [expr $value]] |
| return [string range $tempvalue 2 9] |
| } elseif {$value < -2147483647} { |
| return "80000000" |
| } else { |
| return "7FFFFFFF" |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # uint2hex - creates a 32 bit hex number from an unsigned decimal |
| # |
| # SYNOPSIS |
| # uint2hex value |
| # |
| # DESCRIPTION |
| # Create a 32 bit hex number from an unsigned decimal number. |
| # |
| # ARGUMENTS |
| # value |
| # unsigneddecimal number to convert |
| # |
| # RETURN VALUE |
| # 32 bit hex version of input 'value' |
| # |
| # SOURCE |
| # Courtesy of Chris Cornish |
| # http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982 |
| # |
| ################################################################ |
| # Replace all non-decimal characters |
| |
| proc uint2hex {value} { |
| regsub -all {[^0-x\.-]} $value {} newtemp |
| set value [string trim $newtemp] |
| if {$value <= 4294967295 && $value >= 0} { |
| set tempvalue [format "%#010X" [expr $value]] |
| return [string range $tempvalue 2 9] |
| } else { |
| return "FFFFFFFF" |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # partition_oversubscribe - determines the oversubscribe configuration of the specified partition |
| # |
| # SYNOPSIS |
| # partition_oversubscribe ?partition? |
| # |
| # DESCRIPTION |
| # Determine the oversubscribe configuration of the specified partition. |
| # If the partition is not specified, the default partition will be used. |
| # |
| # RETURN VALUE |
| # Return the oversubscribe configuration of the specified partition. |
| # |
| ################################################################ |
| |
| proc partition_oversubscribe { {partition ""} } { |
| global sinfo |
| |
| if {[string length $partition] == 0} { |
| set partition [default_partition] |
| } |
| |
| set oversubscribe "NO" |
| log_debug "$sinfo --noheader --partition $partition --format %h" |
| set fd [open "|$sinfo --noheader --partition $partition --format %h"] |
| gets $fd line |
| catch {close $fd} |
| regexp {[a-zA-Z]+} $line oversubscribe |
| return $oversubscribe |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # default_partition - determines the name of the default partition |
| # |
| # SYNOPSIS |
| # default_partition |
| # |
| # DESCRIPTION |
| # Use scontrol to determine the name of the default partition |
| # |
| # RETURN VALUE |
| # Name of the current default partition, or fail if not found. |
| # |
| ################################################################ |
| |
| proc default_partition {} { |
| global scontrol |
| |
| set name "" |
| set fd [open "|$scontrol --all --oneliner show partition"] |
| while {[gets $fd line] != -1} { |
| if {[regexp {^PartitionName=([^ ]*).*Default=YES} $line frag name] |
| == 1} { |
| break |
| } |
| } |
| catch {close $fd} |
| |
| if {[string length $name] == 0} { |
| fail "Could not identify the default partition" |
| } |
| |
| return $name |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # default_part_exclusive - determines if the default partition allocates whole nodes to jobs |
| # |
| # SYNOPSIS |
| # default_part_exclusive |
| # |
| # DESCRIPTION |
| # Use scontrol to determine if the default partition |
| # allocates whole nodes to jobs |
| # |
| # RETURN VALUE |
| # Name of the current default partition |
| # |
| ################################################################ |
| |
| proc default_part_exclusive {} { |
| set def_part [default_partition] |
| set oversubscribe [partition_oversubscribe $def_part] |
| if {$oversubscribe eq "EXCLUSIVE"} { |
| return 1 |
| } else { |
| return 0 |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # make_bash_script - creates a bash script |
| # |
| # SYNOPSIS |
| # make_bash_script script_name script_contents |
| # |
| # DESCRIPTION |
| # Create a bash script of name "script_name", and |
| # make the body of the script "script_contents". |
| # make_bash_script removes the file if it already exists, |
| # then generates the #! line, and then dumps "script_contents" |
| # to the file. Finally, it makes certain that the script |
| # is executable. |
| # |
| # ARGUMENTS |
| # script_name |
| # file name for the bash script |
| # script_contents |
| # body of the script, not including the initial #! line. |
| # |
| # RETURN VALUE |
| # Nothing. |
| # |
| ################################################################ |
| |
| proc make_bash_script { script_name script_contents } { |
| global bin_bash bin_chmod |
| |
| file delete $script_name |
| set fd [open $script_name "w"] |
| puts $fd "#!$bin_bash" |
| puts $fd $script_contents |
| close $fd |
| exec $bin_chmod 777 $script_name |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # check_acct_associations - checks associations |
| # |
| # SYNOPSIS |
| # check_acct_associations |
| # |
| # DESCRIPTION |
| # Use sacctmgr to check associations |
| # |
| # RETURN VALUE |
| # true if no error is found, false otherwise |
| # |
| ################################################################ |
| |
| proc check_acct_associations { } { |
| global sacctmgr number re_word_str |
| |
| # 2 versions after 23.11 we can remove the below function |
| # until then we should check older clusters lft/rgt |
| set rc [check_acct_associations_lft] |
| |
| if { !$rc } { |
| return $rc; |
| } |
| |
| log_user 0 |
| log_debug "Sanity-Checking Associations" |
| # |
| # Use sacctmgr to check associations |
| # |
| spawn $sacctmgr -n -p list assoc wopi wopl withd format=lineage,cluster |
| expect { |
| -re "($re_word_str)\\|($re_word_str)\\|" { |
| # Here we are checking if we have duplicates and |
| # setting up an array to check for holes later |
| |
| set lineage $expect_out(1,string) |
| set cluster $expect_out(2,string) |
| set first [info exists found($cluster,$lineage)] |
| |
| if { $first } { |
| log_error "$cluster found lineage $lineage again" |
| set rc false |
| } else { |
| set found($cluster,$lineage) 1 |
| } |
| exp_continue |
| } |
| timeout { |
| fail "sacctmgr add not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| log_user 1 |
| return $rc |
| } |
| |
| proc check_acct_associations_lft { } { |
| global sacctmgr number re_word_str |
| |
| set rc true |
| log_user 0 |
| |
| log_debug "Sanity-Checking Associations" |
| |
| set clusters "" |
| |
| spawn $sacctmgr show cluster format=cluster,rpc -p |
| expect { |
| -re "($re_word_str)\\|($number)\\|" { |
| # 9984 == 23.02, the last version where lft/rgt matter |
| if { $expect_out(2,string) > 9984 } { |
| exp_continue |
| } |
| set clusters [ concat $clusters "," $expect_out(1,string) ] |
| exp_continue |
| } |
| } |
| |
| # |
| # Use sacctmgr to check associations |
| # |
| spawn $sacctmgr -n -p list assoc wopi wopl withd format=lft,rgt,cluster clusters="$clusters" |
| expect { |
| -re "($number)\\|($number)\\|($re_word_str)\\|" { |
| # Here we are checking if we have duplicates and |
| # setting up an array to check for holes later |
| |
| set cluster $expect_out(3,string) |
| if { ![info exists c_min($cluster)] } { |
| set c_min($cluster) -1 |
| set c_max($cluster) -1 |
| } |
| |
| set num1 $expect_out(1,string) |
| set num2 $expect_out(2,string) |
| set first [info exists found($cluster,$num1)] |
| set sec [info exists found($cluster,$num2)] |
| #log_debug "$first=$num1 $sec=$num2" |
| if { $first } { |
| log_error "$cluster found lft $num1 again" |
| set rc false |
| } elseif { $sec } { |
| log_error "$cluster found rgt $num2 again" |
| set rc false |
| } else { |
| set found($cluster,$num1) 1 |
| set found($cluster,$num2) 1 |
| if { $c_min($cluster) == -1 |
| || $c_min($cluster) > $num1 } { |
| set c_min($cluster) $num1 |
| } |
| if { $c_max($cluster) == -1 |
| || $c_max($cluster) < $num2 } { |
| set c_max($cluster) $num2 |
| } |
| } |
| exp_continue |
| } |
| timeout { |
| fail "sacctmgr add not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| foreach cluster [array names c_min] { |
| # Here we are checking for holes in the list from above |
| for {set inx $c_min($cluster)} {$inx < $c_max($cluster)} {incr inx} { |
| if { ![info exists found($cluster,$inx)] } { |
| log_error "$cluster No index at $inx" |
| set rc false |
| } |
| } |
| } |
| log_user 1 |
| return $rc |
| } |
| |
| ################################################################ |
| # |
| # NAME |
| # get_job_acct_freq - gets the value of the job account gather frequency |
| # |
| # SYNOPSIS |
| # get_job_acct_freq |
| # |
| # RETURN VALUE |
| # job account gather frequency |
| # |
| ################################################################ |
| |
| proc get_job_acct_freq { } { |
| global scontrol number |
| |
| log_user 0 |
| set freq_val 0 |
| |
| spawn $scontrol show config |
| expect { |
| -re "JobAcctGatherFrequency *= ($number)" { |
| set freq_val $expect_out(1,string) |
| if {$freq_val == 0} { |
| set freq_val 0 |
| } |
| } |
| -re "JobAcctGatherFrequency *= task=($number)" { |
| set freq_val $expect_out(1,string) |
| if {$freq_val == 0} { |
| set freq_val 0 |
| } |
| } |
| eof { |
| wait |
| } |
| } |
| |
| log_user 1 |
| return $freq_val |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_admin_level - gets the AdminLevel of the user |
| # |
| # SYNOPSIS |
| # get_admin_level ?user? |
| # |
| # RETURN VALUE |
| # AdminLevel for the current user |
| # |
| ################################################################ |
| |
| proc get_admin_level {{user_name ""}} { |
| global sacctmgr re_word_str re_word_str bin_id |
| |
| set admin_level "" |
| |
| if {$user_name == ""} { |
| set user_name [get_my_user_name] |
| if { ![string length $user_name] } { |
| log_error "No name returned from id" |
| return "" |
| } |
| } |
| |
| if {[is_super_user $user_name]} { |
| return "Administrator" |
| } |
| |
| # |
| # Use sacctmgr to check admin_level |
| # |
| log_user 0 |
| spawn $sacctmgr -n -P list user $user_name format=admin |
| expect { |
| -re "($re_word_str)" { |
| set admin_level $expect_out(1,string) |
| exp_continue |
| } |
| timeout { |
| fail "sacctmgr add not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| return $admin_level |
| } |
| |
| |
| ################################################# |
| # |
| # NAME |
| # scale_to_megs - scales the value by the factor T|G|M to megabytes |
| # |
| # SYNOPSIS |
| # scale_to_megs value factor |
| # |
| # DESCRIPTION |
| # scale the value by the factor T|G|M to megabytes |
| # |
| # RETURN VALUE |
| # the scaled variable |
| # |
| ################################################# |
| |
| proc scale_to_megs { value factor } { |
| |
| if {$factor == "T"} { |
| set value [expr $value * 1024 * 1024] |
| } elseif {$factor == "G"} { |
| set value [expr $value * 1024] |
| } elseif {$factor == "M"} { |
| set value [expr $value * 1] |
| } elseif {$factor == "K"} { |
| set value [expr $value / 1024] |
| set value [expr {round($value)}] |
| } else { |
| set value [expr $value / (1024 * 1024)] |
| set value [expr {round($value)}] |
| } |
| |
| return $value |
| } |
| |
| |
| ################################################# |
| # |
| # NAME |
| # scale_to_ks - scales the value by the factor G|M|K to kilobytes |
| # |
| # SYNOPSIS |
| # scale_to_ks value factor |
| # |
| # DESCRIPTION |
| # scale the value by the factor G|M|K to kilobytes |
| # |
| # RETURN VALUE |
| # the scaled variable |
| # |
| ################################################# |
| |
| proc scale_to_ks { value factor } { |
| |
| if {$factor == "G"} { |
| set value [expr $value * 1024 * 1024] |
| } elseif {$factor == "M"} { |
| set value [expr $value * 1024] |
| } elseif {$factor == "K"} { |
| set value [expr $value * 1] |
| } else { |
| set value [expr $value / 1024] |
| set value [expr {round($value)}] |
| } |
| |
| return $value |
| } |
| |
| |
| ############################################################ |
| # |
| # NAME |
| # check_config_node_mem - checks that the nodes have memory configured |
| # |
| # SYNOPSIS |
| # check_config_node_mem |
| # |
| # RETURN VALUE |
| # true if all nodes have memory, false otherwise |
| # |
| ############################################################ |
| |
| proc check_config_node_mem { } { |
| |
| set nodes_dict [get_nodes] |
| dict for {node_name node_dict} $nodes_dict { |
| if [dict exists $node_dict "RealMemory"] { |
| if {[dict get $node_dict "RealMemory"] == 1} { |
| return false |
| } |
| } else { |
| log_warn "Parameter RealMemory not found on node $node_name" |
| return false |
| } |
| } |
| |
| return true |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # wait_for_node - waits for nodes in a partition to reach a certain state |
| # |
| # SYNOPSIS |
| # wait_for_node ?options? state num_nodes ?partition? |
| # |
| # DESCRIPTION |
| # Wait for a certain number of nodes in a partition to reach a certain |
| # state. |
| # |
| # OPTIONS |
| # -timeout <integer_number> |
| # time in seconds to wait for the node state before |
| # timing out (default is 3) |
| # -pollinterval <integer_number> |
| # time in seconds between each node state check (default is 1) |
| # |
| # ARGUMENTS |
| # state |
| # The node state to wait for |
| # num_nodes |
| # The number of nodes we want to be in the specified state |
| # partition |
| # Partition name (the default partition is used if not specified) |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS, or non-zero on failure |
| # |
| ################################################################ |
| |
| proc wait_for_node args { |
| global sinfo number |
| |
| set partition "" |
| set timeout 3 |
| set poll_interval 1 |
| set desired_state "PENDING" |
| set desired_reason_list "" |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -time* {set args [lassign $args - timeout]} |
| -poll* {set args [lassign $args - poll_interval]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count < 2} { |
| fail "Too few arguments ($argument_count): $args" |
| } else { |
| lassign $args state num_nodes |
| } |
| if {$argument_count == 3} { set partition [lindex $args 2] } |
| if {$argument_count > 3} { |
| fail "Too many arguments ($argument_count): $args" |
| } |
| |
| set wait_time 0 |
| set done 0 |
| set cnt 0 |
| set rt $::RETURN_SUCCESS |
| |
| if {[string length $partition] == 0} { |
| set partition [default_partition] |
| } |
| |
| while {$done != 1 && $wait_time < $timeout} { |
| set output [run_command_output -fail "$sinfo --noheader --partition $partition --state $state --format %D"] |
| regexp "$number" $output cnt |
| |
| if {$num_nodes <= $cnt} { |
| set done 1 |
| } else { |
| log_debug "Partition $partition has $cnt nodes idle and we want $num_nodes" |
| sleep $poll_interval |
| incr wait_time 1 |
| } |
| } |
| if {$done != 1} { |
| set rt $::RETURN_ERROR |
| } |
| |
| return $rt |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # node_list_to_range - converts a TCL list into a Slurm hostlist using scontrol |
| # |
| # SYNOPSIS |
| # node_list_to_range nodes_list |
| # |
| # ARGUMENTS |
| # |
| # nodes_list |
| # a TCL list of node names |
| # |
| # RETURN VALUE |
| # the hostlist form returned by scrontrol show hostlist |
| # |
| ##################################################################### |
| |
| proc node_list_to_range {nodes_list} { |
| global scontrol |
| |
| set comalist [join $nodes_list ,] |
| set hostlist [run_command_output -nolog -fail "$scontrol show hostlist $comalist"] |
| set hostlist [string trimright $hostlist "\r\n"] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # list_to_range - converts a list of integer numbers to a range expression |
| # |
| # SYNOPSIS |
| # list_to_range ?numeric_list? |
| # |
| # DESCRIPTION |
| # Collapse a numeric list to a range expression defined by the following |
| # EBNF: |
| # <expression> ::= <range> {, <range>}* |
| # <range> ::= <integer> - <integer> | <integer> |
| # |
| # RETURN VALUE |
| # a range expression representing the numeric elements on the given list. |
| # |
| ################################################################ |
| |
| proc list_to_range {numeric_list} { |
| set node_range_expression [node_list_to_range $numeric_list] |
| set range_expression [regsub {^\[(.*)\]$} $node_range_expression {\1}] |
| return $range_expression |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # node_range_to_list - converts a node range expression into a list of nodes using scontrol |
| # |
| # SYNOPSIS |
| # node_range_to_list node_range_expression |
| # |
| # ARGUMENTS |
| # |
| # node_range_expression |
| # a node range expression accepted by scontrol |
| # |
| # RETURN VALUE |
| # a list contining all node names expanded from the node range expression |
| # |
| ##################################################################### |
| |
| proc node_range_to_list {node_range_expression} { |
| global scontrol |
| |
| set node_list [list] |
| set output [run_command_output -nolog -fail "$scontrol show hostnames $node_range_expression"] |
| foreach line [split $output "\n"] { |
| if {$line eq ""} { |
| break |
| } |
| lappend node_list $line |
| } |
| return $node_list |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # range_to_list - converts a range expression into a list with the numbers of the range |
| # |
| # SYNOPSIS |
| # range_to_list ?range_expression? |
| # |
| # DESCRIPTION |
| # Expands a range expression defined by the following EBNF into a numeric list |
| # <expression> ::= <range> {, <range>}* |
| # <range> ::= <integer> - <integer> | <integer> |
| # |
| # RETURN VALUE |
| # Returns the list of integer numbers defined by the range_expression |
| # |
| ################################################################ |
| |
| proc range_to_list {range_expression} { |
| set node_range_expression \[$range_expression\] |
| set range_list [node_range_to_list $node_range_expression] |
| return $range_list |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # get_nodes_by_state - gets the list of node names in a given partition/states |
| # |
| # SYNOPSIS |
| # get_nodes_by_state partition states |
| # |
| # DESCRIPTION |
| # sinfo is used to list node names and states in the specified partition. |
| # This list of nodes is filtered to return only the nodes matching one of |
| # the requested states. |
| # |
| # ARGUMENTS |
| # partition |
| # partition to get nodes off |
| # states |
| # comma-separated list of allowed states |
| # |
| # RETURN VALUE |
| # list of nodes having the required state |
| # |
| ##################################################################### |
| |
| proc get_nodes_by_state {{states ""} {partition ""}} { |
| global sinfo |
| |
| set node_list [list] |
| |
| if {$partition eq ""} { |
| set partition [default_partition] |
| } |
| |
| if {$states eq ""} { |
| set states "idle" |
| } |
| |
| set output [run_command_output -fail "$sinfo -h -N -p $partition -o '%N' -e -t $states"] |
| foreach line [split $output "\n"] { |
| if {$line eq ""} { continue } |
| lappend node_list $line |
| } |
| |
| return $node_list |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # set_partition_maximum_time_limit - sets the maximum time limit in a given partition |
| # |
| # SYNOPSIS |
| # set_partition_maximum_time_limit partition limit |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS, or non-zero on error |
| # |
| ##################################################################### |
| |
| proc set_partition_maximum_time_limit {partition limit} { |
| global scontrol |
| |
| if {[string length $partition] == 0} { |
| set partition [default_partition] |
| if { $partition == "" } { |
| return $::RETURN_ERROR |
| } |
| } |
| |
| if { $limit < -1 } { |
| fail "Trying to set invalid partition time limit of $limit" |
| } |
| if { $limit == -1 } { |
| set expected_lim "UNLIMITED" |
| } else { |
| set expected_lim limit |
| } |
| |
| run_command -fail "$scontrol update partitionname=$partition MaxTime=-1" |
| |
| set maxtime [get_partition_maximum_time_limit $partition] |
| if { $maxtime != $limit } { |
| log_error "Unable to update partition MaxTime, got $maxtime, wanted $limit" |
| return $::RETURN_ERROR |
| } |
| |
| return $::RETURN_SUCCESS |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # get_partition_maximum_time_limit - gets the maximum time limit in a given partition |
| # |
| # SYNOPSIS |
| # get_partition_maximum_time_limit partition |
| # |
| # DESCRIPTION |
| # Get the maximum time limit in a given partition |
| # |
| # RETURN VALUE |
| # time limit in seconds, -1 if undefined or error |
| # |
| ##################################################################### |
| |
| proc get_partition_maximum_time_limit {partition} { |
| global sinfo number |
| |
| if {[string length $partition] == 0} { |
| set partition [default_partition] |
| } |
| |
| set secs 0 |
| log_user 0 |
| spawn -noecho $sinfo -h -p $partition -O time -e |
| expect { |
| -re "infinite" { |
| set secs -1 |
| exp_continue |
| } |
| -re "n/a" { |
| set secs -1 |
| exp_continue |
| } |
| -re "($number)-($number):($number):($number)" { |
| set days [expr $expect_out(1,string) * 24 * 60 * 60] |
| set hours [expr $expect_out(2,string) * 60 * 60] |
| set mins [expr $expect_out(3,string) * 60] |
| set secs [expr $days + $hours + $mins + $expect_out(4,string)] |
| exp_continue |
| } |
| -re "($number):($number):($number)" { |
| set hours [expr $expect_out(1,string) * 60 * 60] |
| set mins [expr $expect_out(2,string) * 60] |
| set secs [expr $hours + $mins + $expect_out(3,string)] |
| exp_continue |
| } |
| -re "($number):($number)" { |
| set mins [expr $expect_out(1,string) * 60] |
| set secs [expr $mins + $expect_out(2,string)] |
| exp_continue |
| } |
| -re "($number)" { |
| set secs [expr $expect_out(1,string) * 60] |
| exp_continue |
| } |
| timeout { |
| fail "sinfo not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| log_user 1 |
| return $secs |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_partition_default_time_limit - gets the default time limit in a given partition |
| # |
| # SYNOPSIS |
| # get_partition_default_time_limit ?partition? |
| # |
| # DESCRIPTION |
| # Get the default time limit in a given partition. |
| # If the partition is not specified, the default partition will be used. |
| # |
| # RETURN VALUE |
| # Returns: time limit in seconds, -1 if undefined or error. |
| # |
| ################################################################ |
| |
| proc get_partition_default_time_limit { {partition ""} } { |
| global sinfo number |
| |
| if {[string length $partition] == 0} { |
| set partition [default_partition] |
| } |
| |
| set secs 0 |
| log_user 0 |
| spawn -noecho $sinfo -h -p $partition -O defaulttime -e |
| expect { |
| -re "infinite" { |
| set secs -1 |
| exp_continue |
| } |
| -re "n/a" { |
| set secs -1 |
| exp_continue |
| } |
| -re "($number)-($number):($number):($number)" { |
| set days [expr $expect_out(1,string) * 24 * 60 * 60] |
| set hours [expr $expect_out(2,string) * 60 * 60] |
| set mins [expr $expect_out(3,string) * 60] |
| set secs [expr $days + $hours + $mins + $expect_out(4,string)] |
| exp_continue |
| } |
| -re "($number):($number):($number)" { |
| set hours [expr $expect_out(1,string) * 60 * 60] |
| set mins [expr $expect_out(2,string) * 60] |
| set secs [expr $hours + $mins + $expect_out(3,string)] |
| exp_continue |
| } |
| -re "($number):($number)" { |
| set mins [expr $expect_out(1,string) * 60] |
| set secs [expr $mins + $expect_out(2,string)] |
| exp_continue |
| } |
| -re "($number)" { |
| set secs [expr $expect_out(1,string) * 60] |
| exp_continue |
| } |
| timeout { |
| fail "sinfo not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| log_user 1 |
| return $secs |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # get_node_cores - given a node, returns its total number of cores |
| # |
| # SYNOPSIS |
| # get_node_cores node |
| # |
| # DESCRIPTION |
| # Given a node, return its total number of cores |
| # (not the CoresPerSocket, but the total cores) |
| # |
| # RETURN VALUE |
| # node cores if retrieved, -1 otherwise |
| # |
| ##################################################################### |
| |
| proc get_node_cores {node} { |
| global sinfo number |
| |
| set cores -1 |
| set sockets_per_node 0 |
| set cores_per_socket 0 |
| |
| if {[string length $node] == 0} { |
| return $cores |
| } |
| |
| log_user 0 |
| spawn -noecho $sinfo -o "%X %Y" -h -n $node |
| expect { |
| -re "($number)" { |
| if {$sockets_per_node == 0} { |
| set sockets_per_node $expect_out(1,string) |
| } else { |
| set cores_per_socket $expect_out(1,string) |
| } |
| exp_continue |
| } |
| timeout { |
| fail "sinfo not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| set cores [expr $sockets_per_node * $cores_per_socket] |
| |
| return $cores |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # get_node_cpus - given a node, returns its total number of threads we account for |
| # |
| # SYNOPSIS |
| # get_node_cpus node |
| # |
| # DESCRIPTION |
| # Given a node, return its total number of threads we account for. |
| # (not always ThreadsPerCore, but how many threads are in use. |
| # i.e. CPUs=6 CoresPerSocket=6 ThreadsPerCore=2 Socket=1 would |
| # result in only 1 thread we care about instead of the 2 listed.) |
| # |
| # RETURN VALUE |
| # list of node [ tot_cpus threads ] if retrieved, [ -1 -1 ] otherwise |
| # |
| ##################################################################### |
| |
| proc get_node_cpus {node} { |
| global scontrol number |
| |
| set nthreads -1 |
| set nsockets 0 |
| set ncores 0 |
| set totcpus -1 |
| |
| if {[string length $node] == 0} { |
| return [list $totcpus $nthreads] |
| } |
| |
| # Get the number of CPUs on a node |
| spawn $scontrol show node $node |
| expect { |
| -re "CoresPerSocket=($number)" { |
| set ncores $expect_out(1,string) |
| exp_continue |
| } |
| -re "CPUTot=($number)" { |
| set totcpus $expect_out(1,string) |
| exp_continue |
| } |
| -re "Sockets=($number)" { |
| set nsockets $expect_out(1,string) |
| exp_continue |
| } |
| -re "ThreadsPerCore=($number)" { |
| set nthreads $expect_out(1,string) |
| exp_continue |
| } |
| timeout { |
| fail "scontrol is not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| set core_cnt [expr $nsockets * $ncores] |
| set thread_cnt [expr $ncores * $nthreads] |
| if {$totcpus != $nthreads && $totcpus == $ncores} { |
| log_debug "Cores rather than threads are being allocated" |
| set nthreads 1 |
| } |
| |
| return [list $totcpus $nthreads] |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # get_part_total_cores - given a partition and/or states, return its total cores |
| # |
| # SYNOPSIS |
| # get_part_total_cores partition states |
| # |
| # DESCRIPTION |
| # Given a partition and/or states, return its total cores |
| # |
| # ARGUMENTS |
| # partition |
| # partition to check cores |
| # states |
| # states to filter on partition cores |
| # |
| # RETURN VALUE |
| # partition cores |
| # |
| ##################################################################### |
| |
| proc get_part_total_cores {part states} { |
| global sinfo number |
| |
| set cores 0 |
| set tmp 0 |
| set i 0 |
| |
| if {[string length $part] == 0} { |
| set part [default_partition] |
| } |
| |
| log_user 0 |
| if {[string length $states] == 0} { |
| spawn -noecho $sinfo -h -N -p $part -o "%X %Y" |
| } else { |
| spawn -noecho $sinfo -h -N -p $part -t $states -o "%X %Y" |
| } |
| expect { |
| -re "($number)" { |
| set is_even [expr {($i % 2) == 0}] |
| if {$is_even == 1} { |
| set tmp $expect_out(1,string) |
| } else { |
| set tmp [expr $tmp * $expect_out(1,string)] |
| set cores [expr $cores + $tmp] |
| } |
| incr i |
| exp_continue |
| } |
| timeout { |
| fail "sinfo not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| return $cores |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # check_hosts_contiguous - verify if all hosts belong to the partition and are contiguous |
| # |
| # SYNOPSIS |
| # check_hosts_contiguous check_hosts_list partition |
| # |
| # DESCRIPTION |
| # Given a partition and a list of hosts, verify if all |
| # hosts belong to the partition and are contiguous. |
| # If the partition argument is empty, the default partition |
| # will be used. |
| # |
| # RETURN VALUE |
| # Returns: true if hosts are contiguous, false otherwise. |
| # |
| ##################################################################### |
| |
| proc check_hosts_contiguous { check_hosts_list {partition ""} } { |
| global sinfo re_word_str |
| |
| if {[string length $partition] == 0} { |
| set partition [default_partition] |
| } |
| |
| set part_hosts_list {} |
| |
| log_user 0 |
| spawn $sinfo --noheader -p $partition -N -o %N |
| expect { |
| -re "($re_word_str)" { |
| lappend part_hosts_list $expect_out(1,string) |
| exp_continue |
| } |
| -re "Unable to contact" { |
| fail "Slurm appears to be down" |
| } |
| timeout { |
| fail "sinfo not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| foreach host $check_hosts_list { |
| set idx_cur [lsearch $part_hosts_list $host] |
| if {$idx_cur == -1} { |
| fail "Host ($host) not found in list of hosts from partition $partition" |
| } |
| if {[info exists idx_old]} { |
| if {$idx_cur != [expr $idx_old + 1]} { |
| log_error "Node sequence number not contiguous" |
| return false |
| } |
| } |
| set idx_old $idx_cur |
| } |
| |
| return true |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_het_job_ids - gets list of component job ids for a het job |
| # |
| # SYNOPSIS |
| # get_het_job_ids job_id ?use_offset? |
| # |
| # DESCRIPTION |
| # Gets list of component job ids for a het job. |
| # |
| # ARGUMENTS |
| # job_id |
| # Slurm job id |
| # use_offset |
| # If zero, returns list of integer job ids, else returns ids in |
| # the form of X+Y where X is het job master id and Y is the |
| # offset. |
| # |
| # RETURN VALUE |
| # A list of ids for a hetjob or an empty list if jobid |
| # is not a het one. |
| # |
| ################################################################ |
| |
| proc get_het_job_ids { jobid {use_offset 0}} { |
| global scontrol number |
| |
| set id_list "" |
| set log_user_save [log_user -info] |
| log_user 0 |
| spawn $scontrol show job $jobid |
| expect { |
| -re "JobId=($number) HetJobId=($number) HetJobOffset=($number)" { |
| if { $use_offset } { |
| lappend id_list "$expect_out(2,string)+$expect_out(3,string)" |
| } else { |
| lappend id_list $expect_out(1,string) |
| } |
| exp_continue |
| } |
| timeout { |
| fail "scontrol not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| log_user $log_user_save |
| return $id_list |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # reconfigure - calls scontrol reconfigure |
| # |
| # SYNOPSIS |
| # reconfigure ?options? ?cluster? |
| # |
| # DESCRIPTION |
| # Calls scontrol reconfigure. This routine takes the same options as |
| # run_command, passing them to the underlying run_command invocation. |
| # This command waits an additional 5 seconds before returning. |
| # |
| # OPTIONS |
| # See OPTIONS of run_command proc. |
| # |
| # ARGUMENTS |
| # cluster |
| # The cluster to reconfigure |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS on success, otherwise RETURN_ERROR |
| # |
| ################################################################ |
| |
| proc reconfigure args { |
| global scontrol |
| |
| set options [list] |
| set cluster "" |
| |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -* { |
| lappend options {*}[lrange $args 0 1] |
| set args [lrange $args 2 end] |
| } |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count > 1} { |
| fail "Too many arguments ($argument_count): $args" |
| } elseif {$argument_count == 1} { |
| lassign $args cluster |
| } |
| |
| set command $scontrol |
| if {$cluster ne ""} { |
| append command " -M$cluster" |
| } |
| append command " reconfigure" |
| set rc [run_command_status {*}$options "$command"] |
| |
| # |
| # Wait 5 seconds for reconfigure to complete, then return. |
| # |
| sleep 5 |
| return $rc |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # log_fatal - prints a fatal message |
| # |
| # SYNOPSIS |
| # log_fatal message |
| # |
| # SEE ALSO |
| # _log_format for options governing the message format and colorization |
| # |
| ##################################################################### |
| |
| proc log_fatal {message} { |
| global testsuite_log_level LOG_LEVEL_FATAL |
| |
| if {$testsuite_log_level >= $LOG_LEVEL_FATAL} { |
| _log_format "fatal" "$message" |
| } |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # log_error - prints an error message |
| # |
| # SYNOPSIS |
| # log_error message |
| # |
| # SEE ALSO |
| # _log_format for options governing the message format and colorization |
| # |
| ##################################################################### |
| |
| proc log_error {message} { |
| global testsuite_log_level LOG_LEVEL_ERROR |
| |
| if {$testsuite_log_level >= $LOG_LEVEL_ERROR} { |
| _log_format "error" "$message" |
| } |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # log_warn - prints a warning message |
| # |
| # SYNOPSIS |
| # log_warn message |
| # |
| # SEE ALSO |
| # _log_format for options governing the message format and colorization |
| # |
| ##################################################################### |
| |
| proc log_warn {message} { |
| global testsuite_log_level LOG_LEVEL_WARNING |
| |
| if {$testsuite_log_level >= $LOG_LEVEL_WARNING} { |
| _log_format "warning" "$message" |
| } |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # log_info - prints an information message |
| # |
| # SYNOPSIS |
| # log_info message |
| # |
| # SEE ALSO |
| # _log_format for options governing the message format and colorization |
| # |
| ##################################################################### |
| |
| proc log_info {message} { |
| global testsuite_log_level LOG_LEVEL_INFO |
| |
| if {$testsuite_log_level >= $LOG_LEVEL_INFO} { |
| _log_format "info" "$message" |
| } |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # log_pass - prints a pass level message |
| # |
| # SYNOPSIS |
| # log_pass message |
| # |
| # SEE ALSO |
| # _log_format for options governing the message format and colorization |
| # |
| ##################################################################### |
| |
| proc log_pass {message} { |
| global testsuite_log_level LOG_LEVEL_PASS |
| |
| if {$testsuite_log_level >= $LOG_LEVEL_PASS} { |
| _log_format "pass" "$message" |
| } |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # log_command - prints a command level message |
| # |
| # SYNOPSIS |
| # log_command message |
| # |
| # SEE ALSO |
| # _log_format for options governing the message format and colorization |
| # |
| ##################################################################### |
| |
| proc log_command {message} { |
| global testsuite_log_level LOG_LEVEL_COMMAND |
| |
| if {$testsuite_log_level >= $LOG_LEVEL_COMMAND} { |
| _log_format "command" "$message" |
| } |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # log_debug - prints a debug level message |
| # |
| # SYNOPSIS |
| # log_debug message |
| # |
| # SEE ALSO |
| # _log_format for options governing the message format and colorization |
| # |
| ##################################################################### |
| |
| proc log_debug {message} { |
| global testsuite_log_level LOG_LEVEL_DEBUG |
| |
| if {$testsuite_log_level >= $LOG_LEVEL_DEBUG} { |
| _log_format "debug" "$message" |
| } |
| } |
| |
| |
| ##################################################################### |
| # |
| # NAME |
| # log_trace - prints a trace level message |
| # |
| # SYNOPSIS |
| # log_trace message |
| # |
| # SEE ALSO |
| # _log_format for options governing the message format and colorization |
| # |
| ##################################################################### |
| |
| proc log_trace {message} { |
| global testsuite_log_level LOG_LEVEL_TRACE |
| |
| if {$testsuite_log_level >= $LOG_LEVEL_TRACE} { |
| _log_format "trace" "$message" |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # in_fed - checks whether this cluster is in a federation |
| # |
| # SYNOPSIS |
| # in_fed |
| # |
| # RETURN VALUE |
| # Returns true if this cluster is in a federation, false otherwise |
| # |
| ################################################################ |
| |
| proc in_fed {} { |
| global scontrol |
| |
| set output [run_command_output -fail -nolog "$scontrol show fed"] |
| if {[regexp "Federation" $output]} { |
| return true |
| } |
| |
| return false |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # check_job_state - checks if the state of a job is the expected one |
| # |
| # SYNOPSIS |
| # check_job_state job state ?het_job? |
| # |
| # DESCRIPTION |
| # Checks if the state of a job is the expected one. |
| # |
| # ARGUMENTS |
| # job |
| # Job ID to check |
| # state |
| # Desired state of the job to match |
| # het_job |
| # If set, checks state of each component job if the |
| # job is a hetjob. |
| # |
| # RETURN VALUE |
| # true if job was on the desired state, or the number of job components |
| # on that state if it's a hetjob and het_job option enabled, false |
| # otherwise. |
| # |
| ################################################################ |
| |
| proc check_job_state { job state {het_job 0}} { |
| global scontrol |
| |
| set jid_list "" |
| if { $het_job } { |
| set jid_list [get_het_job_ids $job 1] |
| } |
| |
| if { $jid_list == "" } { |
| # non-het job |
| set jid_list $job |
| } |
| |
| foreach jid $jid_list { |
| set state_match 0 |
| spawn $scontrol show job $jid |
| expect { |
| -re "JobState=($state)" { |
| incr state_match |
| } |
| timeout { |
| fail "scontrol not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| if {$state_match != 1} { |
| log_error "job $jid should be in $state state, but is not" |
| return false |
| } |
| } |
| |
| return true |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_gres_count - returns a dict of nodes and GRES counts |
| # |
| # SYNOPSIS |
| # get_gres_count gres_name ?node_list? |
| # |
| # DESCRIPTION |
| # Returns a dict of node names and the count of a specifed |
| # GRES aggregating all its types on each node. |
| # |
| # RETURN VALUE |
| # If the node_list is not specified node name is specified, |
| # this function will return a dict with the GRES count for all |
| # the nodes of the default partition. |
| # If specified, a dict only with the nodes of the node_list. |
| # |
| ################################################################ |
| |
| proc get_gres_count { gres_name {node_list ""} } { |
| set nodes_dict [get_nodes $node_list] |
| set nodes_gres_dict [dict create] |
| dict for {node_name node_dict} $nodes_dict { |
| if [dict exists $node_dict "Gres"] { |
| set gres_param [dict get $node_dict "Gres"] |
| set gres_dict [count_gres $gres_param] |
| } |
| if [dict exists $gres_dict $gres_name] { |
| set gres_count [dict get $gres_dict $gres_name] |
| dict set nodes_gres_dict $node_name $gres_count |
| } |
| } |
| return $nodes_gres_dict |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # count_gres - returns a dict of GRES names and their total counts |
| # |
| # SYNOPSIS |
| # count_gres gres_param |
| # |
| # DESCRIPTION |
| # Parses a GRES parameter string typically obtained from nodes or |
| # jobs info, and returns a dict of GRES names and their count |
| # aggregating all the types of each GRES. |
| # |
| # ARGUMENTS |
| # gres_param |
| # The usual coma-separated list of Gres (e.g gpu:2,craynetwork:1). |
| # only_consumable |
| # If true, the no_consume gres are ignored. |
| # |
| # RETURN VALUE |
| # A dict of GRES names and their count aggregating all types of |
| # each GRES. |
| # |
| ################################################################ |
| |
| proc count_gres {gres_param {only_consumable false}} { |
| global gres_regex |
| set gres_dict [dict create] |
| foreach gres [split $gres_param ","] { |
| if {[regexp $gres_regex $gres {} name type nc count] == 1} { |
| if {$only_consumable && ($type eq "no_consume" || $nc eq "no_consume")} { |
| continue |
| } elseif {$nc eq ""} { |
| set count $type |
| } elseif {$count eq ""} { |
| set count $nc |
| } |
| |
| if {[dict exists $gres_dict $name]} { |
| dict set gres_dict $name [expr [dict get $gres_dict $name] + $count] |
| } else { |
| dict set gres_dict $name $count |
| } |
| } |
| } |
| return $gres_dict |
| } |
| |
| ################################################################ |
| # |
| # NAME |
| # get_highest_gres_count - returns highest number of GRES per node on node_count nodes |
| # |
| # SYNOPSIS |
| # get_highest_gres_count node_count gres_name |
| # |
| # DESCRIPTION |
| # For a given number of nodes, returns the highest GRES count per |
| # node available on at least that number of nodes. |
| # |
| # EXAMPLE |
| # For example: node1 has 1 GPU, node2 has 2 GPUs and node3 has 3 GPUs |
| # [get_highest_gres_count 1 "gpu"] returns 3 (i.e. 1 node 3 GPUs) |
| # [get_highest_gres_count 2 "gpu"] returns 2 (i.e. 2 nodes have at least 2 GPUs each) |
| # [get_highest_gres_count 3 "gpu"] returns 1 (i.e. 3 nodes have at least 1 GPU each) |
| # |
| ################################################################ |
| |
| proc get_highest_gres_count { node_count gres_name } { |
| |
| set available_nodes [node_list_to_range [get_nodes_by_state]] |
| set gres_dict [get_gres_count $gres_name $available_nodes] |
| set gres_count [list] |
| |
| dict for {node gres} $gres_dict { |
| lappend gres_count $gres |
| } |
| |
| set count [lindex [lsort -decreasing -integer $gres_count] [expr $node_count - 1]] |
| return $count |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _set_gpu_socket_inx - adds a socket index to the gpu_sock_list if not already on it |
| # |
| # SYNOPSIS |
| # _set_gpu_socket_inx sock_inx |
| # |
| # DESCRIPTION |
| # Add a socket index to the array gpu_sock_list if not already |
| # on the list. Subroutine used by get_gpu_socket_count |
| # |
| ################################################################ |
| |
| proc _set_gpu_socket_inx { sock_inx } { |
| global gpu_sock_list |
| |
| if {$sock_inx == -1} { |
| set gpu_sock_list [lreplace $gpu_sock_list 0 99] |
| return |
| } |
| |
| set sock_cnt [llength $gpu_sock_list] |
| for {set i 0} {$i < $sock_cnt} {incr i} { |
| if {[lindex $gpu_sock_list $i] == $sock_inx} { |
| return |
| } |
| } |
| lappend gpu_sock_list $sock_inx |
| } |
| |
| |
| ################################################################ |
| # Subroutine used by get_gpu_socket_count |
| # Add a socket index to the array gpu_sock_list if not already |
| # on the list. |
| ################################################################ |
| |
| proc _set_gpu_socket_range { sock_first_inx sock_last_inx } { |
| global gpu_sock_list |
| |
| set sock_cnt [llength $gpu_sock_list] |
| for {set s $sock_first_inx} {$s <= $sock_last_inx} {incr s} { |
| set found 0 |
| for {set i 0} {$i < $sock_cnt} {incr i} { |
| if {[lindex $gpu_sock_list $i] == $s} { |
| set found 1 |
| break |
| } |
| } |
| if {$found == 0} { |
| lappend gpu_sock_list $s |
| } |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_gpu_socket_count - returns the number of sockets with GPUS on a node with the given per-node GPU count |
| # |
| # SYNOPSIS |
| # get_gpu_socket_count gpu_cnt sockets_per_node |
| # |
| # DESCRIPTION |
| # Given a per-node GPU count, return the number of sockets with |
| # GPUs on a node with the given per-node GPU count. |
| # If the sockets_per_node has a value of 1 then just return 1 |
| # rather than determine the count (for performance reasons). |
| # |
| ################################################################ |
| |
| proc get_gpu_socket_count { gpu_cnt sockets_per_node } { |
| global test_dir re_word_str bin_rm number scontrol srun gpu_sock_list |
| |
| set sockets_with_gpus 1 |
| set file_in "$test_dir/test_get_gpu_socket_count" |
| |
| if {$sockets_per_node == 1} { |
| return 1 |
| } |
| |
| log_user 0 |
| _set_gpu_socket_inx -1 |
| make_bash_script $file_in "$scontrol show node \$SLURMD_NODENAME" |
| spawn $srun -N1 --gres=gpu:$gpu_cnt $file_in |
| expect { |
| -re "gpu:${number}.S:($number)-($number)" { |
| _set_gpu_socket_range $expect_out(1,string) $expect_out(2,string) |
| exp_continue |
| } |
| -re "gpu:${re_word_str}:${number}.S:($number),($number),($number),($number)" { |
| _set_gpu_socket_inx $expect_out(1,string) |
| _set_gpu_socket_inx $expect_out(2,string) |
| _set_gpu_socket_inx $expect_out(3,string) |
| _set_gpu_socket_inx $expect_out(4,string) |
| exp_continue |
| } |
| -re "gpu:${re_word_str}:${number}.S:($number),($number),($number)" { |
| _set_gpu_socket_inx $expect_out(1,string) |
| _set_gpu_socket_inx $expect_out(2,string) |
| _set_gpu_socket_inx $expect_out(3,string) |
| exp_continue |
| } |
| -re "gpu:${re_word_str}:${number}.S:($number),($number)" { |
| _set_gpu_socket_inx $expect_out(1,string) |
| _set_gpu_socket_inx $expect_out(2,string) |
| exp_continue |
| } |
| -re "gpu:${re_word_str}:${number}.S:($number)" { |
| _set_gpu_socket_inx $expect_out(1,string) |
| exp_continue |
| } |
| -re "gpu:${number}.S:($number),($number),($number),($number)" { |
| _set_gpu_socket_inx $expect_out(1,string) |
| _set_gpu_socket_inx $expect_out(2,string) |
| _set_gpu_socket_inx $expect_out(3,string) |
| _set_gpu_socket_inx $expect_out(4,string) |
| exp_continue |
| } |
| -re "gpu:${number}.S:($number),($number),($number)" { |
| _set_gpu_socket_inx $expect_out(1,string) |
| _set_gpu_socket_inx $expect_out(2,string) |
| _set_gpu_socket_inx $expect_out(3,string) |
| exp_continue |
| } |
| -re "gpu:${number}.S:($number),($number)" { |
| _set_gpu_socket_inx $expect_out(1,string) |
| _set_gpu_socket_inx $expect_out(2,string) |
| exp_continue |
| } |
| -re "gpu:${number}.S:($number)" { |
| _set_gpu_socket_inx $expect_out(1,string) |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| set sock_cnt [llength $gpu_sock_list] |
| if {$sock_cnt > 1} { |
| set sockets_with_gpus $sock_cnt |
| } |
| |
| return $sockets_with_gpus |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_highest_mps_count - get_highest_gres_count nodes mps, but for "mps per GPU" |
| # |
| # SYNOPSIS |
| # get_highest_mps_count node_count |
| # |
| # DESCRIPTION |
| # For a given number of nodes, returns the higest number of MPS per GPU |
| # available at least on those number of nodes. |
| # |
| ################################################################ |
| |
| proc get_highest_mps_count { node_count } { |
| # We cannot use get_highest_gres_count because we need "per gpu", |
| # so we get all the mps per node and all gpus per node, to create |
| # a mps_per_gpu list to sort and get the count. |
| set available_nodes [node_list_to_range [get_nodes_by_state]] |
| set mps_dict [get_gres_count "mps" $available_nodes] |
| set gpu_dict [get_gres_count "gpu" $available_nodes] |
| set mps_per_gpu [list] |
| |
| dict for {node mps} $mps_dict { |
| if { $mps > 0 } { |
| if [dict exists $gpu_dict $node] { |
| set gpu [dict get $gpu_dict $node] |
| if { $gpu > 0 } { |
| lappend mps_per_gpu [expr $mps / $gpu] |
| } else { |
| fail "All nodes with MPS should have a GPU" |
| } |
| } else { |
| fail "All nodes with MPS should have a GPU" |
| } |
| } |
| } |
| |
| set count [lindex [lsort -decreasing -integer $mps_per_gpu] [expr $node_count - 1]] |
| return $count |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_mps_node_count - gets the number of nodes with a positive number of GRES MPS |
| # |
| # SYNOPSIS |
| # get_mps_node_count |
| # |
| # RETURN VALUE |
| # Return the count of nodes with a non-zero count of GRES MPS |
| # |
| ################################################################ |
| |
| proc get_mps_node_count { } { |
| global number sinfo re_word_str |
| set fini 0 |
| set node_inx 0 |
| set def_part [default_partition] |
| |
| log_user 0 |
| spawn $sinfo -N -p$def_part -oGRES=%G -h |
| expect { |
| -re "GRES=($re_word_str)" { |
| set mps_count 0 |
| set parts [split $expect_out(1,string) ",/"] |
| while 1 { |
| set mps_found [lsearch $parts "mps*"] |
| if { $mps_found == -1 } break |
| |
| set parts2 [split [lindex $parts $mps_found] ":(/"] |
| set col [lsearch -regexp $parts2 ^$number$] |
| if { $col == -1 } { |
| incr mps_count |
| } else { |
| set mps_count [expr $mps_count + [lindex $parts2 $col]] |
| } |
| set parts [lreplace $parts $mps_found $mps_found] |
| } |
| |
| if {$mps_count > 0} { |
| incr node_inx |
| } |
| exp_continue |
| } |
| eof { |
| wait |
| } |
| } |
| log_user 1 |
| |
| return $node_inx |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # cuda_count - determines the count of allocated GPUs |
| # |
| # SYNOPSIS |
| # cuda_count cuda_string |
| # |
| # ARGUMENTS |
| # cuda_string |
| # Contents of a CUDA_VISIBLE_DEVICES environment variable |
| # |
| # RETURN VALUE |
| # Return the number of GPUs or -1 on error |
| # |
| ################################################################ |
| |
| proc cuda_count { cuda_string } { |
| set cuda_count 0 |
| set has_number 0 |
| set len [string length $cuda_string] |
| for {set char_inx 0} {$char_inx < $len} {incr char_inx} { |
| set cuda_char [string index $cuda_string $char_inx] |
| if {[string match , $cuda_char]} { |
| if {$has_number > 0} { |
| incr cuda_count |
| set has_number 0 |
| } else { |
| log_error "Invalid input ($cuda_string)" |
| return -1 |
| } |
| } elseif {[string is digit $cuda_char]} { |
| set has_number 1 |
| } |
| } |
| if {$has_number > 0} { |
| incr cuda_count |
| } else { |
| log_error "Invalid input ($cuda_string)" |
| return -1 |
| } |
| return $cuda_count |
| } |
| |
| |
| ################################################################ |
| # NAME |
| # get_conf_path - gets the path to the slurm.conf file |
| # |
| # SYNOPSIS |
| # get_conf_path |
| # |
| # RETURN VALUE |
| # Returns the path to the slurm.conf file |
| # |
| ################################################################ |
| |
| proc get_conf_path { } { |
| global scontrol re_word_str eol |
| |
| if [regexp {(.*)/slurm.conf} [get_config_param "SLURM_CONF"] {} config_dir] { |
| return $config_dir |
| } else { |
| fail "Unable to determine config dir" |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # save_conf - saves a backup of the specfied configuration file |
| # |
| # SYNOPSIS |
| # save_conf file_name |
| # |
| # DESCRIPTION |
| # If the specified file_name exists, a backup is made which will be |
| # restored when restore_conf is called. |
| # If the specified file_name does not exist, a special backup will be |
| # made that will cause the file to be removed when restore_conf is |
| # called. |
| # If a backup already exists, a warning is issued and no backup is made |
| # (honoring the existing backup). |
| # |
| # SEE ALSO |
| # restore_conf |
| # |
| ################################################################ |
| |
| proc save_conf { file_name } { |
| global bin_chmod bin_cp bin_mv bin_touch test_name |
| |
| log_debug "Saving backup of $file_name" |
| |
| # |
| # Check for existing backup |
| # If a backup exists, issue a warning and return (honor existing backup) |
| # |
| set conf_dir [file dirname $file_name] |
| set dir_files [glob -nocomplain -directory $conf_dir *] |
| set preexisting_backup_file [lsearch -inline -regexp $dir_files "$file_name\\\.test\\d+\\\.\\d+\$"] |
| if {$preexisting_backup_file ne ""} { |
| log_warn "Backup file already exists: ($preexisting_backup_file)" |
| return |
| } |
| |
| # |
| # Check if file to backup exists. |
| # If it doesn't exist, warn the user, touch an empty backup file with |
| # the sticky bit set and allow the test to continue. |
| # restore_conf will remove the file. |
| # |
| set new_backup_file "$file_name.$test_name" |
| if {![file exists $file_name]} { |
| log_warn "Backup of a nonexistent file requested: $file_name" |
| run_command -fail -nolog "$bin_touch $new_backup_file" |
| run_command -fail -nolog "$bin_chmod +t $new_backup_file" |
| return |
| } |
| |
| run_command -fail -nolog "$bin_mv $file_name $new_backup_file" |
| run_command -fail -nolog "$bin_cp $new_backup_file $file_name" |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # restore_conf - restores the original confiration file from backup |
| # |
| # SYNOPSIS |
| # restore_conf file_name |
| # |
| # DESCRIPTION |
| # If a backup exists for the specified file_name, it is restored. |
| # If the specified file_name did not exist when originally backed up, |
| # it will be removed. |
| # If no backup exists, a warning is issued. |
| # |
| # SEE ALSO |
| # save_conf |
| # |
| ################################################################ |
| |
| proc restore_conf { file_name } { |
| global bin_mv bin_rm |
| |
| log_debug "Restoring backup of $file_name" |
| |
| set conf_dir [file dirname $file_name] |
| set dir_files [glob -nocomplain -directory $conf_dir *] |
| set backup_file [lsearch -inline -regexp $dir_files "$file_name\\\.test\\d+\\\.\\d+\$"] |
| if {$backup_file ne ""} { |
| file stat $backup_file stat |
| |
| # If the sticky bit is set and the file is empty, remove both |
| if {! $stat(size) && [expr $stat(mode) & 512]} { |
| log_debug "Removing file used for the test: $file_name" |
| run_command -fail -nolog "$bin_rm -f $backup_file $file_name" |
| # Else replace the original with the backup |
| } else { |
| run_command -fail -nolog "$bin_mv $backup_file $file_name" |
| } |
| } else { |
| # |
| # If backup file doesn't exist, it has probably already been |
| # restored by a previous call to restore_conf |
| # |
| log_warn "Backup file does not exist for $file_name. It has probably already been restored" |
| return |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # have_nvml - checks if HAVE_NVML is set in config.h |
| # |
| # SYNOPSIS |
| # have_nvml |
| # |
| # RETURN VALUE |
| # Returns true if HAVE_NVML is set in config.h. Else, returns false |
| # |
| ################################################################ |
| |
| proc have_nvml { } { |
| global bin_grep number config_h |
| |
| return [expr [run_command_status -none -nolog "$bin_grep \"HAVE_NVML 1\" $config_h"] == 0] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # delete_part - deletes partition on system |
| # |
| # SYNOPSIS |
| # delete_part partition |
| # |
| ################################################################ |
| |
| proc delete_part { part_name } { |
| global scontrol |
| |
| # Remove part |
| spawn $scontrol delete partition=$part_name |
| expect { |
| timeout { |
| fail "scontrol is not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # have_lua - checks if HAVE_LUA is set in config.h |
| # |
| # SYNOPSIS |
| # have_lua |
| # |
| # RETURN VALUE |
| # Returns true if HAVE_LUA is set in config.h. Else, returns false |
| # |
| ################################################################ |
| |
| proc have_lua { } { |
| global bin_grep config_h |
| |
| return [expr [run_command_status -none -nolog "$bin_grep HAVE_LUA $config_h"] == 0] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_reservations - returns a dictionary of dictionaries of reservation parameters |
| # |
| # SYNOPSIS |
| # get_reservations ?resv_name? |
| # |
| # RETURN VALUE |
| # Uses `scontrol show reservation` to return a dictionary of dictionaries |
| # of job parameters. Specifying an invalid resv_name result in a failure. |
| # |
| ################################################################ |
| |
| proc get_reservations { {resv_name ""} } { |
| global scontrol |
| |
| set command "$scontrol show reservation -o" |
| if {$resv_name ne ""} { |
| append command " $resv_name" |
| } |
| set output [run_command_output -fail "$command"] |
| |
| # Iterate over each reservation's parameter list |
| foreach line [split $output "\n"] { |
| if {$line eq ""} { continue } |
| |
| # Peel off the resv parameters one at a time |
| # The first quantifier sets the greediness for the whole RE |
| while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { |
| # Remove the consumed parameter from the line |
| set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] |
| # Add it to the temporary job dictionary |
| dict set resv_dict $param_name $param_value |
| } |
| |
| set resv_name_dict [dict get $resv_dict "ReservationName"] |
| |
| # Add the resv dictionary to resvs dictionary |
| dict set resvs_dict $resv_name_dict $resv_dict |
| |
| # Clear the resv dictionary for the next resv |
| set resv_dict {} |
| } |
| |
| return $resvs_dict |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_reservation_param - returns a specific parameter value for a specific reservation |
| # |
| # SYNOPSIS |
| # get_reservation_param resv_name parameter_name |
| # |
| # DESCRIPTION |
| # Returns a specific parameter value for a specified reservation if the |
| # parameter exists for the reservation, or MISSING if it does not exist. |
| # Specifying an invalid reservation name will result in a failure. |
| # |
| ################################################################ |
| |
| proc get_reservation_param { resv_name parameter_name } { |
| |
| set resvs_dict [get_reservations $resv_name] |
| |
| if [dict exists $resvs_dict $resv_name $parameter_name] { |
| return [dict get $resvs_dict $resv_name $parameter_name] |
| } else { |
| return "MISSING" |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # create_res - create new reservation in system |
| # |
| # SYNOPSIS |
| # create_res ?res_name? ?res_params? |
| # |
| # RETURN VALUE |
| # the exit code of the scontrol command run |
| # |
| ################################################################ |
| |
| proc create_res { res_name res_params } { |
| global scontrol |
| |
| set result [run_command "$scontrol create res ReservationName=$res_name $res_params"] |
| set output [dict get $result output] |
| set ret_code [dict get $result exit_code] |
| |
| if { $ret_code } { |
| log_warn "[lindex [info level 0] 0]: error from scontrol: $output" |
| } else { |
| log_debug "[lindex [info level 0] 0]: success from scontrol: $output" |
| } |
| |
| return $ret_code |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # update_res - update exisiting reservation in system |
| # |
| # SYNOPSIS |
| # update_res ?res_name? ?res_params? |
| # |
| # RETURN VALUE |
| # the exit code of the scontrol command run |
| # |
| ################################################################ |
| |
| proc update_res { res_name res_params } { |
| global scontrol |
| |
| set result [run_command "$scontrol update ReservationName=$res_name $res_params"] |
| set output [dict get $result output] |
| set ret_code [dict get $result exit_code] |
| |
| if { $ret_code } { |
| log_warn "Return code from scontrol: $ret_code. Output: $output" |
| } |
| |
| return $ret_code |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # delete_res - delete reservation from system |
| # |
| # SYNOPSIS |
| # delete_res ?res_name? |
| # |
| # RETURN VALUE |
| # the exit code of the scontrol command run |
| # |
| ################################################################ |
| |
| proc delete_res { res_name } { |
| global scontrol |
| |
| set result [run_command "$scontrol delete ReservationName=$res_name"] |
| set output [dict get $result output] |
| set ret_code [dict get $result exit_code] |
| |
| if { $ret_code } { |
| log_warn "Return code from scontrol: $ret_code. Output: $output" |
| } |
| |
| return $ret_code |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # create_part - creates a partition |
| # |
| # SYNOPSIS |
| # create_part partition num_nodes |
| # |
| # ARGUMENTS |
| # partition |
| # Name of partition to create |
| # num_nodes |
| # Number of nodes of partition to create |
| # |
| # RETURN VALUE |
| # RETURN_SUCCESS, or non-zero on error |
| # |
| ################################################################ |
| |
| proc create_part { part_name num_nodes_in } { |
| global scontrol srun bin_printenv number re_word_str |
| |
| set nodes "" |
| set num_nodes_out 0 |
| |
| set found 0 |
| spawn $scontrol show partitionname=$part_name |
| expect { |
| -re "PartitionName=$part_name" { |
| set found 1 |
| exp_continue |
| } |
| timeout { |
| fail "scontrol is not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| if {$found == 1} { |
| log_error "There is already a partition $part_name" |
| return $::RETURN_ERROR |
| } |
| |
| if {[string length [default_partition]] == 0} { |
| log_warn "create_part does not work without a default partition" |
| return $::RETURN_ERROR |
| } |
| |
| if { $num_nodes_in } { |
| set num_nodes $num_nodes_in |
| } else { |
| set num_nodes [llength [get_nodes_by_state]] |
| } |
| |
| log_user 0 |
| # Get a list of nodes |
| spawn $srun -t1 -N1-$num_nodes $bin_printenv |
| expect { |
| -re "SLURM_JOB_NUM_NODES=($number)" { |
| set num_nodes_out $expect_out(1,string) |
| exp_continue |
| } |
| -re "SLURM_NODELIST=($re_word_str)" { |
| set nodes $expect_out(1,string) |
| exp_continue |
| } |
| timeout { |
| fail "srun is not responding getting number of nodes creating part" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| if {[string length $nodes] == 0} { |
| log_error "Did not get a valid node list" |
| return $::RETURN_ERROR |
| } elseif { $num_nodes_out != $num_nodes_in } { |
| log_error "Did not get enough nodes ($num_nodes_out != $num_nodes_in) to run test" |
| return $::RETURN_ERROR |
| } |
| |
| spawn $scontrol create partitionname=$part_name nodes=$nodes |
| expect { |
| timeout { |
| fail "scontrol is not responding creating partition" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| set found 0 |
| spawn $scontrol show partitionname=$part_name |
| expect { |
| -re "PartitionName=$part_name" { |
| set found 1 |
| exp_continue |
| } |
| timeout { |
| fail "scontrol is not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| if { $found == 0 } { |
| log_error "scontrol did not create partition $part_name" |
| return $::RETURN_ERROR |
| } |
| log_user 1 |
| |
| log_debug "Created partition $part_name with $num_nodes_in nodes" |
| return $::RETURN_SUCCESS |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_nodes - returns a dictionary of dictionaries of node parameters |
| # |
| # SYNOPSIS |
| # get_nodes ?hostlist_expression? |
| # |
| # DESCRIPTION |
| # Uses `scontrol show node` to query node parameters, returning a |
| # dictionary of dictionaries with the node names as keys of the first |
| # level dictionary and with the parameters as keys of the second level |
| # dictionary. Specifying an invalid node name will result in a failure. |
| # |
| # RETURN VALUE |
| # If the optional node expression argument is specified, the result will |
| # be constrained by the specified hostlist expression. Otherwise, the |
| # results for all nodes will be returned. |
| # |
| ################################################################ |
| |
| proc get_nodes { {hostlist_expression ""} } { |
| global scontrol |
| |
| set command "$scontrol show node -o" |
| if {$hostlist_expression ne ""} { |
| append command " $hostlist_expression" |
| } |
| set output [run_command_output -fail -nolog "$command"] |
| |
| # Iterate over each node parameter line |
| foreach line [split $output "\n"] { |
| if {$line eq ""} { continue } |
| |
| # Peel off the node parameters one at a time |
| # The first quantifier sets the greediness for the whole RE |
| while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { |
| # Remove the consumed parameter from the line |
| set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] |
| # Add it to the temporary node dictionary |
| dict set node_dict $param_name $param_value |
| } |
| |
| set node_name [dict get $node_dict "NodeName"] |
| |
| # Add the node dictionary to nodes dictionary |
| dict set nodes_dict $node_name $node_dict |
| |
| # Clear the node dictionary for use by the next node |
| set node_dict {} |
| } |
| |
| return $nodes_dict |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_node_param - returns a specific parameter value for a specific node |
| # |
| # SYNOPSIS |
| # get_node_param node_name parameter_name |
| # |
| # DESCRIPTION |
| # Returns a specific parameter value for a specified node if the |
| # parameter exists for the node, or MISSING if it does not exist. |
| # Specifying an invalid node name will result in a failure. |
| # |
| ################################################################ |
| |
| proc get_node_param { node_name parameter_name } { |
| |
| set nodes_dict [get_nodes $node_name] |
| |
| if [dict exists $nodes_dict $node_name $parameter_name] { |
| return [dict get $nodes_dict $node_name $parameter_name] |
| } else { |
| return "MISSING" |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_nodes_by_request - get a list of nodes satisfying requested resources |
| # |
| # SYNOPSIS |
| # get_nodes_by_request ?options? ?request_args? |
| # |
| # DESCRIPTION |
| # Using srun (optionally with the specified arguments), returns a list |
| # of nodes having the requested resources. |
| # If an error occurs, the invoking test will fail. |
| # |
| # OPTIONS |
| # -fail |
| # fail the test if the execution of srun results in an error or timeout |
| # |
| # ARGUMENTS |
| # request_args |
| # Desired resources of a node in form of srun arguments, |
| # e.g. "--gres=gpu:1 -n1 -t1" |
| # |
| # RETURN VALUE |
| # A list of nodes with at least the requested resources, or an empty |
| # list otherwise. |
| # |
| ################################################################ |
| |
| proc get_nodes_by_request args { |
| global srun |
| |
| set options [list] |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fail { |
| lappend options [lrange $args 0 0] |
| set args [lrange $args 1 end] |
| } |
| default break |
| } |
| } |
| |
| if {[llength $args] == 1} { |
| lassign $args request_args |
| } elseif {[llength $args] == 0} { |
| set request_args "-n1 -t1" |
| } else { |
| fail "[lindex [info level 0] 0]: Invalid number of arguments ([llength $args]): $args" |
| } |
| |
| log_debug "Getting nodes that can be allocated with request: $request_args" |
| set command "$srun -Q $request_args printenv SLURMD_NODENAME" |
| set result [run_command {*}$options $command] |
| |
| if [dict get $result exit_code] { |
| return {} |
| } |
| |
| set output [dict get $result output] |
| foreach line [split $output "\n"] { |
| if {$line eq ""} { continue } |
| dict incr allocated_nodes $line |
| } |
| |
| return [lsort [dict keys $allocated_nodes]] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_partitions - returns a dictionary of dictionaries of partition parameters |
| # |
| # SYNOPSIS |
| # get_partitions ?partition_name? |
| # |
| # DESCRIPTION |
| # Uses `scontrol show partitions` to query partition parameters, |
| # returning a dictionary of dictionaries with the partition names |
| # as keys of the first level dictionary and with the parameters as |
| # keys of the second level dictionary. Specifying an invalid partition |
| # name will result in a failure. |
| # |
| # RETURN VALUE |
| # If the optional partition_name argument is specified, the result will |
| # contain only the one patition. Otherwise, the results for all |
| # partitions will be returned. |
| # |
| ################################################################ |
| |
| proc get_partitions { {partition_name ""} } { |
| global scontrol |
| |
| set command "$scontrol show partition -o" |
| if {$partition_name ne ""} { |
| append command " $partition_name" |
| } |
| set output [run_command_output -fail -nolog "$command"] |
| |
| # Iterate over each partition parameter line |
| foreach line [split $output "\n"] { |
| if {$line eq ""} { continue } |
| |
| # Peel off the partition parameters one at a time |
| # The first quantifier sets the greediness for the whole RE |
| while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { |
| # Remove the consumed parameter from the line |
| set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] |
| # Add it to the temporary node dictionary |
| dict set part_dict $param_name $param_value |
| } |
| |
| set part_name [dict get $part_dict "PartitionName"] |
| |
| # Add the node dictionary to nodes dictionary |
| dict set parts_dict $part_name $part_dict |
| |
| # Clear the node dictionary for use by the next node |
| set part_dict {} |
| } |
| |
| return $parts_dict |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_partition_param - returns a specific parameter value for a specific partition |
| # |
| # SYNOPSIS |
| # get_partition_param partitoin_name parameter_name |
| # |
| # DESCRIPTION |
| # Returns a specific parameter value for a specified partition if the |
| # parameter exists for the partition, or MISSING if it does not exist. |
| # Specifying an invalid partition name will result in a failure. |
| # |
| ################################################################ |
| |
| proc get_partition_param { partition_name parameter_name } { |
| |
| set partitions_dict [get_partitions $partition_name] |
| |
| if [dict exists $partitions_dict $partition_name $parameter_name] { |
| return [dict get $partitions_dict $partition_name $parameter_name] |
| } else { |
| return "MISSING" |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_jobs - returns a dictionary of dictionaries of job parameters |
| # |
| # SYNOPSIS |
| # get_jobs ?job_id? |
| # |
| # DESCRIPTION |
| # Uses `scontrol show job` to return a dictionary of dictionaries of job |
| # parameters. Specifying an invalid job id will result in a failure. |
| # |
| ################################################################ |
| |
| proc get_jobs { {job_id_in ""} } { |
| global scontrol |
| |
| set command "$scontrol show job -d -o" |
| if {$job_id_in ne ""} { |
| append command " $job_id_in" |
| } |
| set output [run_command_output -fail "$command"] |
| |
| # Iterate over each job's parameter list |
| foreach line [split $output "\n"] { |
| if {$line eq ""} { continue } |
| |
| # Peel off the job parameters one at a time |
| # The first quantifier sets the greediness for the whole RE |
| while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { |
| # Remove the consumed parameter from the line |
| set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] |
| # Add it to the temporary job dictionary |
| dict set job_dict $param_name $param_value |
| } |
| |
| set job_id [dict get $job_dict "JobId"] |
| |
| # Add the job dictionary to jobs dictionary |
| dict set jobs_dict $job_id $job_dict |
| |
| # Clear the job dictionary for the next job |
| set job_dict {} |
| } |
| |
| return $jobs_dict |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # get_job_param - returns a specific parameter value for a specific job |
| # |
| # SYNOPSIS |
| # get_job_param job_id parameter_name |
| # |
| # DESCRIPTION |
| # Returns a specific parameter value for a specified job if the |
| # parameter exists for the job, or MISSING if it does not exist. |
| # Specifying an invalid job id will result in a failure. |
| # |
| ################################################################ |
| |
| proc get_job_param { job_id parameter_name } { |
| |
| set jobs_dict [get_jobs $job_id] |
| |
| if [dict exists $jobs_dict $job_id $parameter_name] { |
| return [dict get $jobs_dict $job_id $parameter_name] |
| } else { |
| return "MISSING" |
| } |
| } |
| |
| proc check_reason { job_id reason } { |
| global squeue |
| |
| set found 0 |
| spawn $squeue -j $job_id --noheader -o "%r" |
| expect { |
| -re "$reason" { |
| set found 1 |
| exp_continue |
| } |
| timeout { |
| log_error "squeue not responding" |
| } |
| eof { |
| wait |
| } |
| } |
| |
| if {$found == 0} { |
| log_error "Job $job_id should have a wait reason of $reason" |
| } |
| return $found |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # submit_job - submits a job with sbatch and returns its job id |
| # |
| # SYNOPSIS |
| # submit_job ?options? job_args |
| # |
| # DESCRIPTION |
| # Submits a job with sbatch and returns its jobid, or 0 if error. |
| # It accepts all the options of run_command. |
| # |
| # OPTIONS |
| # It accepts and passes all the options of/to run_command and also: |
| # -env env |
| # Prepend $env to the actual sbatch command to set environment |
| # variables. For example "-env 'SLURM_NTASKS_PER_GPU=2'". |
| # |
| # ARGUMENTS |
| # job_args |
| # a string containing all the arguments to pass to sbatch |
| # |
| # RETURN VALUE |
| # the job id, or 0 if an error happen |
| # |
| ################################################################ |
| |
| proc submit_job args { |
| global sbatch |
| |
| set env "" |
| set job_id 0 |
| |
| set idx [lsearch $args -env] |
| if {$idx >= 0} { |
| set env [lindex $args [expr $idx+1]] |
| set args [lreplace $args $idx [expr $idx+1]] |
| } |
| |
| if {[llength $args] < 1} { |
| fail "Wrong number of parameters, should be >=1" |
| } |
| set job_options [lindex $args [expr [llength $args] - 1 ]] |
| set run_options "" |
| if {[llength $args] > 1} { |
| set run_options [lrange $args 0 [expr [llength $args] - 2 ]] |
| } |
| |
| set command "" |
| if {$env ne ""} { |
| set command "$env " |
| } |
| set command "$command$sbatch $job_options" |
| set output [run_command_output {*}$run_options "$command"] |
| regexp {Submitted \S+ job (\d+)} $output - job_id |
| |
| return $job_id |
| } |
| |
| ############################################################################### |
| # |
| # NAME |
| # check_exclusive_gres - Given a job_id and node_name of an exclusive |
| # allocation make sure we allocated all GRES. |
| # |
| ################################################################################ |
| |
| proc check_exclusive_gres { job_id node_name } { |
| # Check all consumable GRES of the node were allocated on the job |
| set gres_dict_job [count_gres [get_job_param $job_id "JOB_GRES"]] |
| set gres_dict_node [count_gres [get_node_param $node_name "Gres"] true] |
| |
| dict for {gres_name gres_count} $gres_dict_node { |
| # We don't want mps to be allocated with --exclusive |
| if {$gres_name eq "mps" || $gres_name eq "shard"} { |
| if {[dict exists $gres_dict_job $gres_name]} { |
| subfail "Gres ($gres_name) on node ($node_name) allocated on job ($job_id) with --exclusive but it shouldn't be" |
| } else { |
| subpass "Gres ($gres_name) on node ($node_name) was not allocated on job ($job_id) with --exclusive, this is expected" |
| } |
| } elseif {![dict exists $gres_dict_job $gres_name]} { |
| subfail "Gres ($gres_name) on node ($node_name) not allocated on job ($job_id) with --exclusive" |
| } else { |
| subpass "Gres ($gres_name) on node ($node_name) was allocated on job ($job_id) with --exclusive" |
| |
| set gres_count_job [dict get $gres_dict_job $gres_name] |
| subtest { $gres_count_job == $gres_count } "Gres ($gres_name) on node ($node_name) fully allocated on job ($job_id) with --exclusive" "$gres_count_job != $gres_count" |
| } |
| } |
| } |
| |
| ################################################################ |
| # |
| # NAME |
| # compile_against_libslurm - compiles a test program against either libslurm.so or libslurmfull.so |
| # |
| # SYNOPSIS |
| # compile_against_libslurm ?options? test_prog ?build_args? |
| # |
| # DESCRIPTION |
| # Compile a test program against either libslurm.so or libslurmfull.so. |
| # |
| # OPTIONS |
| # -full |
| # use libslurmfull.so instead of libslurm.so |
| # -shared |
| # produces a shared library (adds the -shared compiler option |
| # and adds a .so suffix to the output file name) |
| # |
| # ARGUMENTS |
| # test_prog |
| # The name of the test program (and .c file) |
| # build_args |
| # Additional string to be appended to the build command. |
| # E.g. "-DUSING_VALGRIND -lm ${build_dir}/src/slurmctld/locks.o" |
| # (initial space will be added automatically). |
| # |
| ################################################################################ |
| |
| proc compile_against_libslurm args { |
| global slurm_dir bin_cc src_dir build_dir bin_chmod |
| |
| set use_full false |
| set shared false |
| set build_args "" |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -full {set use_full true; set args [lrange $args 1 end]} |
| -shared {set shared true; set args [lrange $args 1 end]} |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count < 1} { |
| fail "Too few arguments ($argument_count): $args" |
| } else { |
| lassign $args test_prog |
| } |
| if {$argument_count == 2} { set build_args [lindex $args 1] } |
| if {$argument_count > 2} { |
| fail "Too many arguments ($argument_count): $args" |
| } |
| |
| if {$use_full} { |
| set libfile "libslurmfull.so" |
| } else { |
| set libfile "libslurm.so" |
| } |
| |
| if [file exists $slurm_dir/lib64/slurm/$libfile] { |
| set libdir "lib64" |
| } else { |
| set libdir "lib" |
| } |
| |
| if {$use_full} { |
| set libline "$slurm_dir/$libdir/slurm" |
| set libfile "slurmfull" |
| } else { |
| set libline "$slurm_dir/$libdir" |
| set libfile "slurm" |
| } |
| |
| set build_cmd "$bin_cc ${test_prog}.c -g -pthread" |
| |
| if {$shared} { |
| set out "${test_prog}.so" |
| append build_cmd " -fPIC -shared" |
| } else { |
| set out "${test_prog}" |
| } |
| append build_cmd " -o $out" |
| |
| append build_cmd " -I$src_dir -I$build_dir -I$slurm_dir/include -Wl,-rpath=$libline -L$libline -l$libfile -lresolv" |
| |
| # Add additional arguments to the build command |
| if {$build_args != ""} { |
| append build_cmd " $build_args" |
| } |
| |
| log_debug "Build command: $build_cmd" |
| catch {exec {*}$build_cmd} out_str out_dict |
| if {[dict get $out_dict -code]} { |
| log_error $out_str |
| return $::RETURN_ERROR |
| } |
| |
| exec $bin_chmod 700 $out |
| return $::RETURN_SUCCESS |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # subtest - tests a boolean condition and updates subtest tallies |
| # |
| # SYNOPSIS |
| # subtest ?options? condition description ?diagnostics? |
| # |
| # DESCRIPTION |
| # Based on the results of testing a boolean expression, increments the |
| # relevant subtest count (pass, fail or skip) and logs a message. |
| # |
| # OPTIONS |
| # -fatal |
| # If the subtest fails, causes a fatal error ending the test |
| # |
| # ARGUMENTS |
| # condition |
| # The boolean expression to test |
| # description |
| # A single-line string describing what is being tested. This is |
| # a subtest "name" that is displayed with the log message |
| # whether the subtest passes or fails |
| # diagnostics |
| # A string providing additional diagnostic information that is |
| # only included in the log message on failure |
| # |
| # RETURN VALUE |
| # the boolean condition evaluated |
| # |
| # ENVIRONMENT |
| # testsuite_subtest_fatal |
| # Specifies whether first failing subtest aborts the test |
| # |
| ################################################################ |
| |
| proc subtest args { |
| |
| set options [list] |
| while {[llength $args]} { |
| switch -glob -- [lindex $args 0] { |
| -fatal - |
| -fail { |
| lappend options -fatal |
| set args [lrange $args 1 end] |
| } |
| -* {fail "Unknown option: [lindex $args 0]"} |
| default break |
| } |
| } |
| set argument_count [llength $args] |
| if {$argument_count < 2} { |
| fail "Too few arguments ($argument_count): $args" |
| } else { |
| set args [lassign $args condition description] |
| } |
| |
| if [uplevel 1 expr [format "{%s}" $condition]] { |
| subpass $description |
| return true |
| } else { |
| subfail {*}$options $description {*}$args |
| return false |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _is_testproc_included - returns if testproc_id was included or not excluded in argv |
| # |
| # SYNOPSIS |
| # _is_testproc_included testproc_id testproc_alias |
| # |
| # DESCRIPTION |
| # From command line the test runner can use -i and -e to include or |
| # exclude some test functions by their number. This function must be |
| # used to check if the test runner included or excluded the given |
| # testproc_id. |
| # |
| # RETURN VALUE |
| # true is testproc_id or testproc_alias was included with -i or not |
| # excluded -e, false otherwise |
| # |
| ################################################################ |
| |
| proc _is_testproc_included {testproc_id testproc_alias} { |
| global _testproc_included _testproc_excluded |
| |
| if {[llength $_testproc_included]} { |
| if {[lsearch $_testproc_included $testproc_id] >= 0 || \ |
| [lsearch $_testproc_included $testproc_alias] >= 0} { |
| return true |
| } |
| return false |
| } |
| |
| if {[llength $_testproc_excluded]} { |
| if {[lsearch $_testproc_excluded $testproc_id] >= 0 || \ |
| [lsearch $_testproc_excluded $testproc_alias] >= 0} { |
| return false |
| } |
| } |
| |
| return true |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # skip_following_testprocs - the following testproc calls will be skipped |
| # |
| # SYNOPSIS |
| # skip_following_testprocs reason |
| # |
| # ARGUMENTS |
| # reason |
| # The string with the reason message to add on the skip message |
| # on each skipped testproc. |
| # |
| # DESCRIPTION |
| # This function disables normal execution of testproc calls. |
| # It is meant to be used when some testprocs cannot be run due config |
| # limitations, but still call testproc to register what testprocs |
| # are skipped for a given reason. |
| # Use run_following_testprocs to reenable the norma execution of testprocs. |
| |
| ################################################################ |
| |
| proc skip_following_testprocs {reason} { |
| global _testproc_skip_next _testproc_skip_reason |
| |
| set _testproc_skip_next true |
| set _testproc_skip_reason $reason |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # run_following_testprocs - the following testproc call will be run (if not excluded from command line) |
| # |
| # SYNOPSIS |
| # run_following_testprocs |
| # |
| # DESCRIPTION |
| # This function reenables the normal execution of testproc calls. |
| # It is meant to be used when skip_following_testprocs was called to skip |
| # previous testproc calls, and we want to normally run the following ones. |
| # Note that it does NOT overwrite what -i and -e included are passed |
| # from command line. |
| # |
| ################################################################ |
| |
| proc run_following_testprocs {} { |
| global _testproc_skip_next _testproc_skip_reason |
| |
| set _testproc_skip_next false |
| set _testproc_skip_reason "" |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # testproc - launcher to run or skip a testproc_call |
| # |
| # SYNOPSIS |
| # testproc testproc_call |
| # |
| # ARGUMENTS |
| # A testproc_call is any normal call to a proc with any arguments that |
| # could be done normally without the testproc launcher. |
| # For example, we could normally do: |
| # |
| # test_my_feature $some_args $expected_out |
| # |
| # Or use the launcher like: |
| # |
| # testproc test_my_feature $some_args $expected_out |
| # |
| # DESCRIPTION |
| # Using the testproc launcher has the following main benefits: |
| # a) Handles the -i and -e terminal options to include or exclude some |
| # testprocs numbers. |
| # b) Runs or skips based on the last call of testproc_{skip,run}_following. |
| # c) Creates extra sections in the status summary |
| # (see testsuite_testproc_details). |
| # |
| # RETURN VALUE |
| # The rc of the testproc_call if it has been run, or $::RETURN_SUCCESS |
| # otherwise. Using it is not recommended, though. |
| # |
| ################################################################ |
| |
| proc testproc args { |
| return [testproc_alias "" {*}$args] |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # testproc_alias - launcher to run or skip a testproc_call, with an alias |
| # |
| # SYNOPSIS |
| # testproc alias testproc_call |
| # |
| # DESCRIPTION |
| # See testproc |
| # |
| ################################################################ |
| |
| proc testproc_alias {alias args} { |
| global _testproc_pass_list _testproc_skip_list _testproc_fail_list |
| global _testproc_skip_next _testproc_skip_reason |
| global _testproc_messages testsuite_testproc_log_calls |
| global _subtest_pass_count |
| global _subtest_skip_count |
| global _subtest_fail_count |
| |
| # Avoid integer alias to avoid confusions with testproc_id |
| if {[llength $alias] && [string is integer -strict $alias]} { |
| fail "testproc_alias doesn't support integer alias, use alphanumeric" |
| } |
| |
| # Save previous subtest counts and next subtest num |
| set prev_pass $_subtest_pass_count |
| set prev_skip $_subtest_skip_count |
| set prev_fail $_subtest_fail_count |
| set prev_subtest [expr $_subtest_pass_count + \ |
| $_subtest_skip_count + \ |
| $_subtest_fail_count + 1] |
| |
| # Get the testproc number |
| set testproc_id [expr [llength $_testproc_pass_list] + \ |
| [llength $_testproc_skip_list] + \ |
| [llength $_testproc_fail_list] + 1] |
| |
| # Run or skip the testproc |
| set rc $::RETURN_SUCCESS |
| set reason "" |
| if {![_is_testproc_included $testproc_id $alias]} { |
| set reason "(Excluded from command line)" |
| if {$testsuite_testproc_log_calls == yes} { |
| subskip -nolog "Skipping testproc $testproc_id: {$args} $reason" |
| } else { |
| subskip -nolog "Skipping testproc $testproc_id: $reason" |
| } |
| } else { |
| if {$_testproc_skip_next} { |
| set reason "($_testproc_skip_reason)" |
| if {$testsuite_testproc_log_calls == yes} { |
| subskip "Skipping testproc $testproc_id: {$args} $reason" |
| } else { |
| subskip "Skipping testproc $testproc_id: $reason" |
| } |
| } else { |
| if {$testsuite_testproc_log_calls != no} { |
| log_info "Running testproc $testproc_id: $args" |
| } else { |
| log_info "Running testproc $testproc_id" |
| } |
| set rc [{*}$args] |
| } |
| } |
| |
| # Get current subtest counts |
| set curr_subtest [expr $_subtest_pass_count + \ |
| $_subtest_skip_count + \ |
| $_subtest_fail_count] |
| |
| # Register the testproc as fail, skip or pass (based on subtests) |
| if {[llength $alias]} { |
| set alias "${alias}: " |
| } |
| if {$_subtest_fail_count > $prev_fail} { |
| set reason "(Subtests: $prev_subtest to $curr_subtest)" |
| lappend _testproc_fail_list $testproc_id |
| if {$testsuite_testproc_log_calls != no} { |
| dict set _testproc_messages $testproc_id [list failed "$alias{$args} $reason"] |
| } else { |
| dict set _testproc_messages $testproc_id [list failed "$alias$reason"] |
| } |
| } elseif {$_subtest_skip_count > $prev_skip} { |
| lappend _testproc_skip_list $testproc_id |
| if {$testsuite_testproc_log_calls == yes} { |
| dict set _testproc_messages $testproc_id [list skipped "$alias{$args} $reason"] |
| } else { |
| dict set _testproc_messages $testproc_id [list skipped "$alias$reason"] |
| } |
| } else { |
| set reason "(Subtests: $prev_subtest to $curr_subtest)" |
| lappend _testproc_pass_list $testproc_id |
| if {$testsuite_testproc_log_calls == yes} { |
| dict set _testproc_messages $testproc_id [list passed "$alias{$args} $reason"] |
| } else { |
| dict set _testproc_messages $testproc_id [list passed "$alias$reason"] |
| } |
| } |
| |
| return $rc |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _log_format - prints a log message with colorization and formatting |
| # |
| # SYNOPSIS |
| # _log_format log_level message |
| # |
| # DESCRIPTION |
| # This procedure is called by the log_<level> procedures and |
| # derives the relevant log level from the caller's procedure name. |
| # |
| # ARGUMENTS |
| # log_level |
| # The logging threshold that triggered the log statement |
| # message |
| # The message to print with colorization and formatting |
| # |
| # ENVIRONMENT |
| # testsuite_log_format |
| # Used as the template for the fields to be output. |
| # Fields must be expressed in the form: |
| # %{<field_name>}<format_conversion_specifier> |
| # Supported fields include the following: |
| # message |
| # The log message |
| # filename |
| # The file name where the log_<log_level> |
| # procedure was called from |
| # lineno |
| # The line number where the log_<log_level> |
| # procedure was called from |
| # timestamp |
| # The date and time when the log_<log_level> |
| # procedure was called at |
| # msecs |
| # The milliseconds when the log_<log_level> |
| # procedure was called at |
| # loglevel |
| # The log level that triggers the log_<log_level> |
| # procedure to be called |
| # backtrace |
| # An abbreviated call stack trace with line |
| # numbers |
| # testsuite_time_format |
| # Used as a template for the timestamp. See the format groups |
| # for the tcl clock format command. |
| # testsuite_colorize |
| # Boolean that turns colorization on or off |
| # testsuite_color_<log_level> |
| # Can be set to define the color used for each log level |
| # |
| ################################################################ |
| |
| proc _log_format { log_level message } { |
| global testsuite_colorize testsuite_log_format testsuite_time_format |
| global COLOR_NONE |
| global testsuite_color_fatal testsuite_color_error testsuite_color_warn |
| global testsuite_color_info testsuite_color_pass testsuite_color_command |
| global testsuite_color_debug testsuite_color_trace |
| |
| set format_string $testsuite_log_format |
| set milliseconds_since_epoch [clock milliseconds] |
| set date_time [clock format [expr {$milliseconds_since_epoch / 1000}] -format "$testsuite_time_format"] |
| set milliseconds [expr {$milliseconds_since_epoch % 1000}] |
| |
| set frame_level -2 |
| while { [dict get [info frame $frame_level] type] != "source" } { |
| incr frame_level -1 |
| } |
| |
| set format_args {} |
| while {[regexp "%{\[a-z]+}" $format_string format_field]} { |
| if {$format_field eq "%{message}"} { |
| lappend format_args $message |
| } elseif {$format_field eq "%{filename}"} { |
| lappend format_args [file tail [dict get [info frame $frame_level] file]] |
| } elseif {$format_field eq "%{lineno}"} { |
| lappend format_args [dict get [info frame $frame_level] line] |
| } elseif {$format_field eq "%{timestamp}"} { |
| lappend format_args $date_time |
| } elseif {$format_field eq "%{msecs}" || $format_field eq "%{milliseconds}"} { |
| lappend format_args $milliseconds |
| } elseif {$format_field eq "%{loglevel}" || $format_field eq "%{levelname}"} { |
| lappend format_args [string totitle $log_level] |
| } elseif {$format_field eq "%{backtrace}"} { |
| lappend format_args [_line_trace] |
| } else { |
| fail "Invalid field ($format_field) specified in testsuite_log_format" |
| } |
| regsub $format_field $format_string "%" format_string |
| } |
| |
| if ($testsuite_colorize) { |
| switch $log_level { |
| fatal { append output $testsuite_color_fatal } |
| error { append output $testsuite_color_error } |
| warning { append output $testsuite_color_warn } |
| info { append output $testsuite_color_info } |
| pass { append output $testsuite_color_pass } |
| command { append output $testsuite_color_command } |
| debug { append output $testsuite_color_debug } |
| trace { append output $testsuite_color_trace } |
| } |
| } |
| append output [format $format_string {*}$format_args] |
| if ($testsuite_colorize) { |
| append output $COLOR_NONE |
| } |
| puts $output |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _print_header - prints a test header |
| # |
| # SYNOPSIS |
| # _print_header |
| # |
| # ENVIRONMENT |
| # testsuite_colorize |
| # Boolean that turns colorization on or off |
| # testsuite_color_header |
| # Can be set to define the color used for the header |
| # |
| ################################################################ |
| |
| proc _print_header { } { |
| global test_name testsuite_color_header testsuite_colorize COLOR_NONE |
| |
| if ($testsuite_colorize) { |
| append output $testsuite_color_header |
| } |
| append output [string repeat = 78]\n |
| append output [format "%-9s" "TEST:"]${test_name}\n |
| append output [string repeat = 78] |
| if ($testsuite_colorize) { |
| append output $COLOR_NONE |
| } |
| puts $output |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _print_summary - prints the final status summary |
| # |
| # SYNOPSIS |
| # _print_summary status completed |
| # |
| # ARGUMENTS |
| # status |
| # The final status of the test. |
| # When status is zero, we print SUCCESS. |
| # When status is negative, we print SKIPPED. |
| # When status is positive, we print FAILURE. |
| # |
| # completed |
| # A boolean value that is true if the test completed and false |
| # if aborted (ended early with exit status != 0) |
| # |
| # ENVIRONMENT |
| # testsuite_colorize |
| # Boolean that turns colorization on or off |
| # testsuite_color_<test_status> |
| # Can be set to define the color used for each test status |
| # |
| ################################################################ |
| |
| proc _print_summary {status completed} { |
| global test_name testsuite_colorize COLOR_NONE |
| global testsuite_color_success testsuite_color_skipped |
| global testsuite_color_failure |
| global _subtest_fail_count _subtest_pass_count _subtest_skip_count |
| global _testproc_pass_list _testproc_skip_list _testproc_fail_list |
| global _incomplete_reason |
| global _subtest_messages _testproc_messages |
| global testsuite_subtest_details testsuite_testproc_details |
| |
| if {$status == 0} { |
| set color $testsuite_color_success |
| set header "SUCCESS" |
| } elseif {$status < 0} { |
| set color $testsuite_color_skipped |
| set header "SKIPPED" |
| } elseif {$status > 0} { |
| set color $testsuite_color_failure; |
| set header "FAILURE" |
| } |
| |
| if ($testsuite_colorize) { |
| append output $color |
| } |
| |
| append output [string repeat = 78]\n |
| |
| # Get subtest and testproc counts |
| set testproc_fail [llength $_testproc_fail_list] |
| set testproc_skip [llength $_testproc_skip_list] |
| set testproc_pass [llength $_testproc_pass_list] |
| set testproc_count [expr $testproc_pass + \ |
| $testproc_skip + \ |
| $testproc_fail] |
| set subtest_count [expr $_subtest_fail_count + \ |
| $_subtest_pass_count + \ |
| $_subtest_skip_count] |
| |
| # Initial summary |
| append output [format "%s : %s\n" $header $test_name] |
| if {$testproc_count > 0} { |
| append output [format " Testprocs failed : %3d (%3d%%)%s\n" $testproc_fail [expr $testproc_fail * 100 / $testproc_count] \ |
| [expr {$testproc_fail ? " List: [join $_testproc_fail_list ,]" : ""}]] |
| append output [format " Testprocs skipped : %3d (%3d%%)%s\n" $testproc_skip [expr $testproc_skip * 100 / $testproc_count] \ |
| [expr {$testproc_skip ? " List: [join $_testproc_skip_list ,]" : ""}]] |
| append output [format " Testprocs passed : %3d (%3d%%)\n" $testproc_pass [expr $testproc_pass * 100 / $testproc_count]] |
| append output [format " Testprocs total : %3d %s\n" $testproc_count [expr {$completed ? "COMPLETE" : "INCOMPLETE: $_incomplete_reason"}]] |
| } |
| if {$subtest_count > 0} { |
| if {$testproc_count > 0} { |
| append output \n |
| } |
| append output [format " Subtests failed : %3d (%3d%%)\n" $_subtest_fail_count [expr $_subtest_fail_count * 100 / $subtest_count]] |
| append output [format " Subtests skipped : %3d (%3d%%)\n" $_subtest_skip_count [expr $_subtest_skip_count * 100 / $subtest_count]] |
| append output [format " Subtests passed : %3d (%3d%%)\n" $_subtest_pass_count [expr $_subtest_pass_count * 100 / $subtest_count]] |
| append output [format " Subtests total : %3d %s\n" $subtest_count [expr {$completed ? "COMPLETE" : "INCOMPLETE: $_incomplete_reason"}]] |
| } |
| append output [string repeat = 78]\n |
| |
| # Detailed information |
| if {$testsuite_subtest_details ne "none"} { |
| set show_details false |
| if {$testsuite_subtest_details eq "all" && $subtest_count > 0} { |
| set show_details true |
| } elseif {$testsuite_subtest_details eq "fail_skip" && $_subtest_skip_count > 0} { |
| set show_details true |
| } elseif {$_subtest_fail_count > 0} { |
| set show_details true |
| } |
| |
| if {$show_details} { |
| append output [format "SUBTESTS DETAILS : %s\n" $test_name] |
| dict for {id result_msg} $_subtest_messages { |
| set result [lindex $result_msg 0] |
| if {$testsuite_subtest_details eq "all"} { |
| append output [format " %s\n" [lindex $result_msg 1]] |
| } elseif {$testsuite_subtest_details eq "fail_skip" && $result eq "skip"} { |
| append output [format " %s\n" [lindex $result_msg 1]] |
| } elseif {$result eq "fail"} { |
| append output [format " %s\n" [lindex $result_msg 1]] |
| } |
| } |
| append output [string repeat = 78]\n |
| } |
| } |
| if {$testsuite_testproc_details ne "none"} { |
| set show_details false |
| if {$testsuite_testproc_details eq "all" && $testproc_count > 0} { |
| set show_details true |
| } elseif {$testsuite_testproc_details eq "fail_skip" && $testproc_skip > 0} { |
| set show_details true |
| } elseif {$testproc_fail > 0} { |
| set show_details true |
| } |
| |
| if {$show_details} { |
| append output [format "TESTPROCS DETAILS : %s\n" $test_name] |
| dict for {num result_msg} $_testproc_messages { |
| set result [lindex $result_msg 0] |
| if {$testsuite_testproc_details eq "all"} { |
| append output [format " Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]] |
| } elseif {$testsuite_testproc_details eq "fail_skip" && $result eq "skipped"} { |
| append output [format " Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]] |
| } elseif {$result eq "failed"} { |
| append output [format " Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]] |
| } |
| } |
| append output [string repeat = 78]\n |
| } |
| } |
| |
| if ($testsuite_colorize) { |
| append output $COLOR_NONE |
| } |
| |
| puts -nonewline $output |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _get_test_name - gets the name of the invoking source script |
| # |
| # SYNOPSIS |
| # _get_test_name |
| # |
| # RETURN VALUE |
| # The name of the originally called script |
| # |
| ################################################################ |
| |
| proc _get_test_name { } { |
| |
| set test_name unknown |
| set frame_level 1 |
| while { $frame_level <= [info frame] } { |
| if { [dict get [info frame $frame_level] type] == "source" } { |
| set test_name [file tail [dict get [info frame $frame_level] file]] |
| break |
| } |
| incr frame_level |
| } |
| |
| return $test_name |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _test_cleanup - performs the test cleanup |
| # |
| # SYNOPSIS |
| # _test_cleanup |
| # |
| # DESCRIPTION |
| # This procedure removes the temporary test_dir and calls the |
| # test-defined cleanup procedure. |
| # |
| # NOTES |
| # This function should be called only from _test_init and _test_fini. |
| # |
| ################################################################ |
| |
| proc _test_cleanup {} { |
| global log_warn test_dir |
| |
| set rc 0 |
| |
| # Call global cleanup procedure if it is defined by the test |
| if {[info procs cleanup] eq "cleanup"} { |
| if {[catch {cleanup} cleanup_error ]} { |
| log_error "Cleanup had errors: $cleanup_error" |
| set rc 1 |
| } |
| } |
| |
| # Remove the temporary test directory |
| exec rm -rf $test_dir |
| |
| return $rc |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _test_init - performs test initialization |
| # |
| # SYNOPSIS |
| # _test_init |
| # |
| # DESCRIPTION |
| # This procedure is called automatically at the beginning of each test. |
| # It prints the header, creates the temporary test dir, etc. |
| # |
| ################################################################ |
| |
| proc _test_init {} { |
| global test_dir test_id test_name testsuite_shared_dir |
| global _testproc_included _testproc_excluded argv bin_chmod scontrol |
| |
| # parse argv to get and remove _testproc_included and _testproc_excluded params |
| set idx [expr {[info exists argv] ? [lsearch $argv -i] : -1}] |
| if {$idx >= 0} { |
| set _testproc_included [split [lindex $argv [expr $idx + 1]] ,] |
| set argv [lreplace $argv $idx [expr $idx + 1]] |
| } |
| set idx [expr {[info exists argv] ? [lsearch $argv -e] : -1}] |
| if {$idx >= 0} { |
| set _testproc_excluded [split [lindex $argv [expr $idx + 1]] ,] |
| set argv [lreplace $argv $idx [expr $idx + 1]] |
| } |
| |
| # Set test name to name of originally invoked test script, e.g. test1.1 |
| set test_name [_get_test_name] |
| |
| # Set test id to suffix of the test script, e.g. 1.1 |
| set test_id [string map {test ""} $test_name] |
| |
| # Temporary test directory used to stash saved configs, output files... |
| set test_dir "$testsuite_shared_dir/${test_name}dir" |
| |
| # Print test header |
| _print_header |
| |
| # Cleanup in case test was not cleaned up on last execution |
| if { [_test_cleanup] } { |
| fail "Error in the initial cleanup" |
| } |
| |
| # Create temporary shared test directory |
| exec mkdir -p $test_dir |
| exec $bin_chmod a+rwx $test_dir |
| |
| # Default precondition checks |
| if {![regexp "is UP" [run_command_output -nolog -none "$scontrol ping"]]} { |
| skip "The expect testsuite requires Slurm to be running, but '$scontrol ping' failed." |
| } |
| set idle_nodes [get_nodes_by_state] |
| if {[llength $idle_nodes] < 1} { |
| skip "The expect testsuite requires there to be at least one idle node in the default partition." |
| } |
| # Ensure that idle nodes are not drained/down |
| # Checking one by one to be able to print a warning if some was drain/down |
| foreach idle_node $idle_nodes { |
| run_command -nolog -xfail "$scontrol update nodename=$idle_node state=resume" |
| } |
| } |
| |
| |
| ################################################################ |
| # |
| # NAME |
| # _test_fini - performs test finalization |
| # |
| # SYNOPSIS |
| # _test_fini ?status? |
| # |
| # DESCRIPTION |
| # This procedure is called automatically from the ending functions |
| # pass, skip and failure. |
| # It cleans up based on the status and the testsuite_cleanup_on_failure |
| # variable, prints the final test status/summary, and exits the test. |
| # |
| # We will always cleanup for SUCCESS or SKIPPED tests. |
| # Whether or not the cleanup procedure is called for FAILURE tests |
| # depends on the setting of the $testsuite_cleanup_on_failure variable |
| # which can be set in the globals.local file or overridden with the |
| # SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment variable. |
| # |
| ################################################################ |
| |
| proc _test_fini { status } { |
| global testsuite_cleanup_on_failure _test_fini_called |
| global _subtest_fail_count _subtest_skip_count |
| global STATUS_FAIL STATUS_PASS STATUS_SKIP test_status |
| |
| # Avoid potential infinite recursive calls. |
| # _test_fini should be called only once, but custom cleanup procs |
| # called from _test_cleanup can potentially call it (eg fail) |
| if {$_test_fini_called} { |
| log_debug "Recursive _test_fini call detected, most probably a fail on a cleanup function" |
| return |
| } |
| set _test_fini_called true |
| |
| |
| # Determine if test completed or was aborted |
| set completed [expr $status == $STATUS_PASS ? true : false] |
| |
| # Override status with subtest status if available and necessary |
| if {$status != $STATUS_FAIL} { |
| if {$_subtest_fail_count > 0} { |
| set status $STATUS_FAIL |
| } elseif {$_subtest_skip_count > 0} { |
| set status $STATUS_SKIP |
| } |
| } |
| |
| # Set final test status global variable so it can be used in cleanup |
| set test_status $status |
| |
| # Only cleanup if test not failed or configured to do so |
| if {$status != $STATUS_FAIL || $testsuite_cleanup_on_failure} { |
| _test_cleanup |
| } |
| _print_summary $status $completed |
| |
| __exit $status |
| } |
| |
| |
| ################################################################ |
| # |
| # Overload the exit routine to ensure that no one is explicitly |
| # calling it, and to enforce _test_fini if exit is called when |
| # the test reach its EOF. |
| # |
| # All tests should exit using pass, skip or fail. |
| # |
| ################################################################ |
| |
| rename exit __exit |
| |
| proc exit { {status 0} } { |
| global test_name |
| |
| # To avoid failures when using "expect -c 'source globals'" |
| if {$test_name eq "globals"} { |
| pass |
| } |
| |
| # |
| # Minor sanity check to detect if exit was explicitly called (not |
| # allowed) or automatically executed when the test ends |
| # |
| if {[info level] > 1} { |
| # exit was called from a function, and it shouldn't |
| fail "Exit should not be directly called, use pass, skip or fail instead" |
| } else { |
| set frame_level 1 |
| while { $frame_level <= [info frame] } { |
| if { [dict get [info frame $frame_level] type] == "source" } { |
| if { [file tail [dict get [info frame $frame_level] file]] eq $test_name } { |
| # exit was called explicitly from the |
| # test, and it shouldn't |
| fail "Exit should not be directly called, use pass, skip or fail instead" |
| } |
| break |
| } |
| incr frame_level |
| } |
| } |
| |
| if {$status != 0} { |
| fail "An Expect/TCL exception occurred" |
| } |
| |
| # The exit was called implicitly when the test ends, allowed but |
| # _test_fini call enforced |
| pass |
| } |
| |
| |
| # Call _test_init at the beginning of each test |
| _test_init |