blob: 9c2f8ff6727242e52c499a7d6203a2be09e260b7 [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test sreport cluster utilization. Also test for
# sreport -M option
############################################################################
# Copyright (C) 2008 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Danny Auble <da@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./globals_federation
source ./inc22.1.1
source ./inc22.1.2
source ./inc22.1.3
source ./inc22.1.4
source ./inc22.1.5
source ./inc22.1.fed
# Prefix shared by every accounting entity created by this test
set test_nu       "test22-1"
set matches       0
set not_support   0

# SQL files used to load and to wipe the synthetic data of each cluster
set sql_in_clus1  "${test_dir}/clus1-in.sql"
set sql_rem_clus1 "${test_dir}/clus1-rem.sql"
set sql_in_clus2  "${test_dir}/clus2-in.sql"
set sql_rem_clus2 "${test_dir}/clus2-rem.sql"
set sql_out       "${test_name}-out.sql"

# Clusters, accounts, wckey and users (all derived from the prefix)
set cluster1      "${test_nu}clus1"
set cluster2      "${test_nu}clus2"
set account1      "${test_nu}acct1"
set account2      "${test_nu}acct2"
set account3      "${test_nu}acct3"
set accounts      "$account1,$account2,$account3"
set wckey1        "${test_nu}wckey"
set user1         "${test_nu}user1"
set user2         "${test_nu}user2"
set users         "$user1,$user2"

# Two nodes named after cluster1, with two CPUs each
set node0         "${cluster1}0"
set node1         "${cluster1}1"
set node_list     "${cluster1}\[0-1\]"
set node0_cpus    2
set node1_cpus    2

# Job names written into the job table
set test_job1     "${test_name}_job1"
set test_job2     "${test_name}_job2"
set test_job3     "${test_name}_job3"
set test_job4     "${test_name}_job4"

set cluster_cpus  [expr {$node0_cpus + $node1_cpus}]
set access_err    0
set uid           [get_my_uid]
set gid           [get_my_gid]
set timeout       180
set fedname       "test22_1"
# Cluster request: no options
array set clus_req {}

# Request used when adding account1 and account2 (both test clusters)
array set acct_req1 [list cluster "$cluster1,$cluster2"]

# Request used when adding account3 (both test clusters, account2 given
# as parent)
array set acct_req2 [list cluster "$cluster1,$cluster2" parent $account2]

# Request used when adding the users (both test clusters, every test
# account, and the test wckey)
array set user_req [list cluster "$cluster1,$cluster2" account $accounts wckey $wckey1]
# Reference midnight, Thu Jan 31 00:00:00 2008. Only the 00:00:00 part
# matters: run times are added to it to format elapsed-time strings.
set midnight_time "Thu Jan 31 00:00:00 2008"
set midnight      [exec date -d $midnight_time +%s]

# Start of the period under test: Mon Dec 31 23:00:00 2007
set start_date    "Mon Dec 31 23:00:00 2007"
set period_start  [exec date -d $start_date +%s]

# End of the period under test: Thu Jan 31 23:59:59 2008
set end_date      "Thu Jan 31 23:59:59 2008"
set period_end    [exec date -d $end_date +%s]

set start_str     [timestamp -format %Y-%m-%dT%X -seconds $period_start]
set end_str       [timestamp -format %Y-%m-%dT%X -seconds $period_end]

# job0 - meant to look like job1, but it runs immediately before it
set job0_start    $period_start
set job0_run      1200
set job0_end      [expr {$job0_start + $job0_run}]

# job1 - starts 1200 seconds into the period and runs for 2700 seconds
set job1_start     [expr {$period_start + 1200}]
set job1_run       2700
set job1_end       [expr {$job1_start + $job1_run}]
# Elapsed time rendered as a clock string (midnight + run time)
set job1_diff_str  [timestamp -format %X -seconds [expr {$midnight + $job1_run}]]
set job1_start_str [timestamp -format %Y-%m-%dT%X -seconds $job1_start]
set job1_end_str   [timestamp -format %Y-%m-%dT%X -seconds $job1_end]
set job1_nodes     $node1
set job1_cpus      $node1_cpus
# The allocation charged to job1 also includes job0's run time
set job1_alloc     [expr {($job1_run + $job0_run) * $job1_cpus}]
set job1_acct      $account1

# job2 - eligible one hour into the period
set job2_elig      [expr {$period_start + 3600}]
# It starts 65 minutes after becoming eligible so reserved time can be checked
set job2_start     [expr {$job2_elig + 3900}]
# It runs for one day
set job2_run       86400
set job2_end       [expr {$job2_start + $job2_run}]
# Elapsed time string for a one day run
set job2_diff_str  "1-00:00:00"
set job2_start_str [timestamp -format %Y-%m-%dT%X -seconds $job2_start]
set job2_end_str   [timestamp -format %Y-%m-%dT%X -seconds $job2_end]
set job2_nodes     "${cluster1}\[0-1\]"
set job2_cpus      [expr {$node0_cpus + $node1_cpus}]
set job2_alloc     [expr {$job2_run * $job2_cpus}]
set job2_acct      $account3

# job3 - eligible one hour before job2 ends
set job3_elig      [expr {$job2_end - 3600}]
# It starts exactly when job2 ends
set job3_start     $job2_end
# It runs for 65 minutes
set job3_run       3900
set job3_end       [expr {$job3_start + $job3_run}]
# Elapsed time rendered as a clock string (midnight + run time)
set job3_diff_str  [timestamp -format %X -seconds [expr {$midnight + $job3_run}]]
set job3_start_str [timestamp -format %Y-%m-%dT%X -seconds $job3_start]
set job3_end_str   [timestamp -format %Y-%m-%dT%X -seconds $job3_end]
# It runs on node0 only
set job3_nodes     $node0
set job3_cpus      $node0_cpus
set job3_alloc     [expr {$job3_run * $job3_cpus}]
set job3_acct      $account2

# job4 - becomes eligible at the same time as job3.
# NOTE(review): an earlier comment said "right at the end of job3", but the
# value assigned is job3's eligible time - confirm which one is intended.
set job4_elig      $job3_elig
# It never starts, so all its time is 'planned'
set job4_start     0
set job4_run       0
set job4_end       0
set job4_diff_str  "0"
# It is assigned node0 only
set job4_nodes     $node0
set job4_cpus      $node0_cpus
set job4_alloc     [expr {$job4_run * $job4_cpus}]
set job4_acct      $account2

# Expected allocation totals per account, per wckey and per user
set acct1_alloc        $job1_alloc
set acct3_alloc        $job2_alloc
set acct2_alloc        [expr {$acct3_alloc + $job3_alloc}]
set total_alloc        [expr {$job1_alloc + $job2_alloc + $job3_alloc}]
set wckey1_alloc       [expr {$job1_alloc + $job2_alloc + $job3_alloc}]
set user1_wckey1_alloc [expr {$job1_alloc + $job3_alloc}]
set user2_wckey1_alloc $job2_alloc

# node0 is down from 45 to 75 minutes after the start of the period
set node0_down_start [expr {$period_start + (60 * 45)}]
set node0_down_end   [expr {$period_start + (60 * 75)}]
set node0_start_str  [timestamp -format %Y-%m-%dT%X -seconds $node0_down_start]
set node0_end_str    [timestamp -format %Y-%m-%dT%X -seconds $node0_down_end]
#
# Preconditions: the test needs slurmdbd accounting and an accounting
# administrator. Skip when either is missing.
#
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test can't be run without a usable AccountStorageType"
}

# Without TrackWCKey in slurmdbd.conf only part of the test can run;
# record that in wc_key_track.
if {[get_config_param -dbd "TrackWCKey"] eq "no"} {
	log_warn "This test can't be totally run without TrackWCKey set in slurmdbd.conf"
	set wc_key_track false
} else {
	set wc_key_track true
}

if {[get_admin_level] ne "Administrator"} {
	skip "This test can't be run without being an Accounting administrator.\nUse: sacctmgr mod user \$USER set admin=admin"
}
# Remove everything this test may have left in the accounting database:
# the test federation, the synthetic rows of both clusters, and then the
# users, accounts and clusters themselves.
proc cleanup {} {
	global sql_rem_clus1 sql_rem_clus2 users accounts
	global cluster1 cluster2 fedname

	# test_fed deletes the federation itself on its normal path; do it
	# here as well so an abort in the middle of the federation sub-test
	# does not leave the federation behind.
	delete_federations $fedname

	# Truncate the per-cluster tables filled by create_sql
	archive_load $sql_rem_clus1
	archive_load $sql_rem_clus2

	remove_user "" "" $users
	remove_acct "" $accounts
	remove_cluster "$cluster1"
	remove_cluster "$cluster2"
}
# Create both test clusters
if {[add_cluster $cluster1 [array get clus_req]]} {
	fail "Unable to add cluster 1"
}
if {[add_cluster $cluster2 [array get clus_req]]} {
	fail "Unable to add cluster 2"
}

# Create account1 and account2 with the first account request
if {[add_acct "$account1,$account2" [array get acct_req1]]} {
	fail "Unable to add accounts ($account1,$account2)"
}

# Create account3 with the second account request
if {[add_acct $account3 [array get acct_req2]]} {
	fail "Unable to add account ($account3)"
}

# Create both users
if {[add_user $users [array get user_req]]} {
	fail "Unable to add user ($users)"
}
# Look up the association id of every user/account pair; the ids are
# written into the job rows later. A value of 0 means "not found".
set user1acct1 0
set user1acct2 0
set user1acct3 0
set user2acct1 0
set user2acct2 0
set user2acct3 0

eval spawn $sacctmgr -n -p list assoc users=$users account=$accounts cluster=$cluster2 format="User,account,id"
expect {
	-re "There was a problem" {
		fail "There was a problem with the sacctmgr command"
	}
	-re "$user1.$account1.($number)." {
		set user1acct1 $expect_out(1,string)
		exp_continue
	}
	-re "$user2.$account1.($number)." {
		set user2acct1 $expect_out(1,string)
		exp_continue
	}
	-re "$user1.$account2.($number)." {
		set user1acct2 $expect_out(1,string)
		exp_continue
	}
	-re "$user2.$account2.($number)." {
		set user2acct2 $expect_out(1,string)
		exp_continue
	}
	-re "$user1.$account3.($number)." {
		set user1acct3 $expect_out(1,string)
		exp_continue
	}
	-re "$user2.$account3.($number)." {
		set user2acct3 $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sacctmgr list associations not responding"
	}
	eof {
		wait
	}
}

# All six ids must have been found
if {!($user1acct1 && $user1acct2 && $user1acct3 && $user2acct1 && $user2acct2 && $user2acct3)} {
	fail "Didn't get one of the user associations ($user1acct1,$user1acct2,$user1acct3,$user2acct1,$user2acct2,$user2acct3)"
}
# Look up the wckey id of each user; the ids are written into the job
# rows later. A value of 0 means "not found".
set user1wckey1 0
set user2wckey1 0

eval spawn $sacctmgr -n -p list wckeys users=$users wckeys=$wckey1 cluster=$cluster1 format="user,wckey,id"
expect {
	-re "There was a problem" {
		fail "There was a problem with the sacctmgr command"
	}
	-re "$user1.$wckey1.($number)." {
		set user1wckey1 $expect_out(1,string)
		exp_continue
	}
	-re "$user2.$wckey1.($number)." {
		set user2wckey1 $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sacctmgr list wckeys not responding"
	}
	eof {
		wait
	}
}

if {!$user1wckey1 || !$user2wckey1} {
	# Undo everything added so far. Both clusters were created above, so
	# both are removed here (previously only cluster1 was).
	remove_user "" "" $users
	remove_acct "" $accounts
	remove_cluster "$cluster1"
	remove_cluster "$cluster2"
	fail "Didn't get one of the user wckeys ($user1wckey1,$user2wckey1)"
}
# Run the federated sreport sub-test (inc22_1_fed).
#
# Creates the test federation containing both test clusters, points
# SLURM_CONF at a copy of the active configuration whose ClusterName is
# cluster1, runs inc22_1_fed, and afterwards deletes the federation and
# points SLURM_CONF back at the active configuration file. An error raised
# by inc22_1_fed is reported through fail only after that cleanup.
proc test_fed {} {
	global scontrol re_word_str sacctmgr env bin_rm cluster1 cluster2 fedname test_dir

	set config_dir  ""
	set config_file "slurm.conf"
	set new_conf    $test_dir/slurm.conf.test22.1

	# Split the SLURM_CONF path reported by scontrol into directory and
	# file name.
	spawn $scontrol show config
	expect {
		-re "SLURM_CONF *= (/.*)/($re_word_str).*SLURM_VERSION" {
			set config_dir  $expect_out(1,string)
			set config_file $expect_out(2,string)
			exp_continue
		}
		timeout {
			fail "scontrol is not responding"
		}
		eof {
			wait
		}
	}
	if {$config_dir eq ""} {
		fail "Unable to find SLURM_CONF in the output of scontrol show config"
	}

	# Start from a clean slate, then create the federation with both
	# test clusters in it.
	delete_federations $fedname

	set cmd1 "$sacctmgr -i add federation $fedname"
	log_debug "$cmd1"
	log_debug "[eval exec $cmd1]"

	set cmd2 "$sacctmgr -i mod fed $fedname set clusters=$cluster1,$cluster2"
	log_debug "$cmd2"
	log_debug "[eval exec $cmd2]"

	log_debug "This is the config dir: $config_dir; new config: $new_conf "

	# Use the file name scontrol reported rather than assuming it is
	# called slurm.conf, so a non-default configuration file also works.
	set old_conf "$config_dir/$config_file"
	log_debug [exec sed "s/ClusterName\\s*=.*/ClusterName=$cluster1/Ig" $old_conf > $new_conf]

	set env(SLURM_CONF) $new_conf
	log_debug "ENV VARIABLE: [exec printenv SLURM_CONF] "

	# Execute the federation tests in a catch block in order to
	# run federation-specific cleanup
	set exception_code [catch {
		inc22_1_fed
	} message] ; # Store any error message in $message

	# Cleanup federation
	delete_federations $fedname
	set env(SLURM_CONF) $old_conf

	# Convert any errors into failures (after cleaning up)
	if {$exception_code == 1} { ; # errors only
		fail "Failure testing federated sreport: $message"
	}
}
# Generate and load synthetic accounting data for one cluster.
#
# cluster - name of the cluster the rows are written for
# sql_in  - path of the SQL file to generate with the insert statements
# sql_rem - path of the SQL file to generate with the truncate statements
#
# After writing both files the procedure loads sql_in with archive_load,
# checks with sacct that three job records are visible, checks with
# sacctmgr that the two events are visible, and finally asks sacctmgr to
# roll up the period. Any mismatch is reported through fail.
proc create_sql {cluster sql_in sql_rem} {
	global bin_rm cluster_cpus period_start period_end node_list
	global node0 node0_cpus node0_down_start node0_down_end
	global user1acct1 wckey1 user1wckey1 uid gid debug job1_acct job0_start job0_end
	global job1_cpus job1_nodes job1_start job1_end user2acct3 user2wckey1
	global job2_acct job2_elig job2_start job2_end job2_cpus job2_nodes
	global user1acct2 job3_acct job3_elig job3_start job3_end job3_cpus
	global job3_nodes sacct start_str end_str account1 job1_start_str
	global job1_end_str job1_diff_str account3 job2_start_str
	global job2_end_str job2_diff_str account2 job3_start_str
	global job3_end_str job3_diff_str sacctmgr node0_start_str
	global node0_end_str matches users accounts
	global test_job1 test_job2 test_job3 test_job4
	global job4_elig job4_start job4_run job4_end job4_diff_str
	global job4_nodes job4_alloc job4_acct job4_cpus

	exec $bin_rm -f $sql_in
	set fd [open $sql_in "w"]

	# DON'T MESS WITH THIS UNLESS YOU REALLY UNDERSTAND WHAT YOU ARE DOING!!!!!
	# THIS COULD SERIOUSLY MESS UP YOUR DATABASE IF YOU ALTER THIS INCORRECTLY
	# JUST A FRIENDLY REMINDER ;)

	# Cluster event: the cluster's processor count for the whole period
	# (period_start to period_end)
	puts $fd "insert into cluster_event_table (node_name, cluster, tres, period_start, period_end, reason, cluster_nodes) values"
	puts $fd "('', '$cluster', '1=$cluster_cpus', $period_start, $period_end, 'Cluster processor count', '$node_list' )"

	# Put a node down for 30 minutes starting at 45 minutes after the start to make sure our rollups work so we should get 15 minutes on one hour and 15 on the other
	puts $fd ", ('$node0', '$cluster', '1=$node0_cpus', $node0_down_start, $node0_down_end, 'down','')"
	#puts $fd ", ('$node1', '$cluster', '1=$node1_cpus', $period_start, $period_end, 'down')"
	puts $fd "on duplicate key update period_start=VALUES(period_start), period_end=VALUES(period_end);"

	# Job rows. The first two (job0 and job1) share job1's name, account,
	# nodes and cpus. Every tuple after the first starts with a comma so
	# the statement is a single multi-row insert.
	puts $fd "insert into job_table (jobid, associd, wckey, wckeyid, uid, gid, `partition`, blockid, cluster, account, eligible, submit, start, end, suspended, name, state, comp_code, priority, req_cpus, tres_alloc, nodelist, kill_requid, qos, deleted) values"
	puts $fd "('65536', '$user1acct1', '$wckey1', '$user1wckey1', '$uid', '$gid', 'debug', '', '$cluster', '$job1_acct', $job0_start, $job0_start, $job0_start, $job0_end, '0', '$test_job1', '3', '0', '$job1_cpus', $job1_cpus, '1=$job1_cpus', '$job1_nodes', '0', '0', '0')"
	puts $fd ", ('65537', '$user1acct1', '$wckey1', '$user1wckey1', '$uid', '$gid', 'debug', '', '$cluster', '$job1_acct', $job1_start, $job1_start, $job1_start, $job1_end, '0', '$test_job1', '3', '0', '$job1_cpus', $job1_cpus, '1=$job1_cpus', '$job1_nodes', '0', '0', '0')"
	puts $fd ", ('65538', '$user2acct3', '$wckey1', '$user2wckey1', '$uid', '$gid', 'debug', '', '$cluster', '$job2_acct', $job2_elig, $job2_elig, $job2_start, $job2_end, '0', '$test_job2', '3', '0', '$job2_cpus', '$job2_cpus', '1=$job2_cpus', '$job2_nodes', '0', '0', '0')"
	puts $fd ", ('65539', '$user1acct2', '$wckey1', '$user1wckey1', '$uid', '$gid', 'debug', '', '$cluster', '$job3_acct', $job3_elig, $job3_elig, $job3_start, $job3_end, '0', '$test_job3', '3', '0', '$job3_cpus', '$job3_cpus', '1=$job3_cpus', '$job3_nodes', '0', '0', '0')"
	puts $fd ", ('65540', '$user1acct2', '$wckey1', '$user1wckey1', '$uid', '$gid', 'debug', '', '$cluster', '$job4_acct', $job4_elig, $job4_elig, $job4_start, $job4_end, '0', '$test_job4', '3', '0', '$job4_cpus', '$job4_cpus', '1=$job4_cpus', '$job4_nodes', '0', '0', '0')"
	puts $fd "on duplicate key update id=LAST_INSERT_ID(id), eligible=VALUES(eligible), submit=VALUES(submit), start=VALUES(start), end=VALUES(end), associd=VALUES(associd), tres_alloc=VALUES(tres_alloc), wckey=VALUES(wckey), wckeyid=VALUES(wckeyid);"
	close $fd

	# Removal script: one truncate statement per per-cluster table
	exec $bin_rm -f $sql_rem
	set fd [open $sql_rem "w"]
	foreach table_suffix {
		event_table
		job_table
		step_table
		usage_day_table
		usage_hour_table
		usage_month_table
		assoc_usage_day_table
		assoc_usage_hour_table
		assoc_usage_month_table
		wckey_usage_day_table
		wckey_usage_hour_table
		wckey_usage_month_table
	} {
		puts $fd "truncate table \"${cluster}_${table_suffix}\";"
	}
	close $fd

	#
	# Use sacctmgr to load info
	#
	if {[archive_load $sql_in]} {
		fail "Unable to load archive"
	}

	#
	# Use sacct to see if the job loaded
	#
	set matches 0
	eval spawn $sacct -p -M $cluster --format=cluster,account,associd,wckey,wckeyid,start,end,elapsed --noheader --start=$start_str --end=$end_str
	expect {
		-re "There was a problem" {
			fail "There was a problem with the sacct command"
		}
		-re "$cluster.$account1.$user1acct1.$wckey1.$user1wckey1.$job1_start_str.$job1_end_str.$job1_diff_str." {
			log_debug "Got 1"
			incr matches
			exp_continue
		}
		-re "$cluster.$account3.$user2acct3.$wckey1.$user2wckey1.$job2_start_str.$job2_end_str.$job2_diff_str." {
			log_debug "Got 2"
			incr matches
			exp_continue
		}
		-re "$cluster.$account2.$user1acct2.$wckey1.$user1wckey1.$job3_start_str.$job3_end_str.$job3_diff_str." {
			log_debug "Got 3"
			incr matches
			exp_continue
		}
		timeout {
			fail "sacct not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 3} {
		fail "Job wasn't loaded correctly ($matches != 3)"
	}

	#
	# Use sacctmgr to see if node event loaded
	#
	log_debug "$cluster..$start_str.$end_str.$cluster_cpus.$node_list"
	set matches 0
	eval spawn $sacctmgr -p list events cluster=$cluster format=cluster,noden,start,end,cpu --noheader start=$start_str end=$end_str
	expect {
		-re "There was a problem" {
			fail "There was a problem with the sacctmgr command"
		}
		-re "($cluster..$start_str.$end_str.$cluster_cpus.)" {
			log_debug "Got 1"
			incr matches
			exp_continue
		}
		-re "($cluster.$node0.$node0_start_str.$node0_end_str.$node0_cpus.)" {
			log_debug "Got 2"
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr list events not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 2} {
		fail "Cluster env wasn't loaded correctly ($matches != 2)"
	}

	#
	# Use sacctmgr to roll up that time period
	#
	set matches 0
	eval spawn $sacctmgr -i roll $start_str $end_str
	expect {
		-re "There was a problem" {
			fail "There was a problem with the sacctmgr command"
		}
		-re "$cluster" {
			incr matches
			exp_continue
		}
		-re "SUCCESS" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr roll not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		fail "sacctmgr wasn't able to roll data"
	}
}
# Generate and load the synthetic accounting data for both clusters
create_sql $cluster1 $sql_in_clus1 $sql_rem_clus1
create_sql $cluster2 $sql_in_clus2 $sql_rem_clus2

#
# Execute sub-tests
#
# The federation sub-test runs first
test_fed

# Then the numbered sub-tests, in order
foreach inc_proc {inc22_1_1 inc22_1_2 inc22_1_3 inc22_1_4 inc22_1_5} {
	$inc_proc
}