blob: 98db0d877903b05e6dc1d0a8e51f8802ef4e2951 [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# to be called from test3.11
# Several cases for core based reservations using nodelists
# Plugin select/cons_tres needed
#
############################################################################
# Copyright (C) 2013 Barcelona Supercomputing Center
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
# Clean up and report a fatal test failure.
#
# Deletes every reservation named in res_names, cancels job_id when it is
# non-zero, then calls fail with the given message.
# NOTE(review): fail is presumably terminal (ends the test) - confirm against
# the test-suite globals; callers below do not rely on it returning.
proc _inc3_11_9_abort { message job_id {res_names {}} } {
	foreach res $res_names {
		delete_res $res
	}
	if {$job_id != 0} {
		cancel_job $job_id
	}
	fail $message
}

# Submit a batch job with sbatch and return its job id.
#
# sbatch_args  list of command-line words passed to sbatch
# res_names    reservations to delete if the submission fails
#
# The job id always starts from 0 here, so a failed submission can never be
# mistaken for a previously submitted job.
proc _inc3_11_9_submit { sbatch_args {res_names {}} } {
	global sbatch number

	set job_id 0
	spawn $sbatch {*}$sbatch_args
	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			_inc3_11_9_abort "sbatch not responding" $job_id $res_names
		}
		eof {
			wait
		}
	}
	if {$job_id == 0} {
		_inc3_11_9_abort "Batch submit failure" 0 $res_names
	}
	return $job_id
}

# Run "scontrol show job" and return the job state it reports.
#
# Returns "RUNNING" or "PENDING" when one of those states is printed, or an
# empty string otherwise. An unknown job id or an scontrol timeout cancels
# the job, deletes the reservations in res_names and fails the test.
proc _inc3_11_9_job_state { job_id {res_names {}} } {
	global scontrol

	set state ""
	spawn $scontrol show job $job_id
	expect {
		-re "Invalid job id specified" {
			_inc3_11_9_abort "Job ($job_id) not found" $job_id $res_names
		}
		-re "JobState=(PENDING|RUNNING)" {
			set state $expect_out(1,string)
			exp_continue
		}
		timeout {
			_inc3_11_9_abort "scontrol not responding" $job_id $res_names
		}
		eof {
			wait
		}
	}
	return $state
}

# Check that a job is in the expected state ("RUNNING" or "PENDING").
#
# The test fails (after cleanup) only when the job is found in the opposite
# state; any other state is tolerated, as in the original checks.
proc _inc3_11_9_expect_state { job_id expected {res_names {}} } {
	set state [_inc3_11_9_job_state $job_id $res_names]
	if {$state eq $expected} {
		log_debug "Job is $expected as expected"
	} elseif {$state ne ""} {
		_inc3_11_9_abort "Job ($job_id) is $state" $job_id $res_names
	}
}

# Test 9 of test3.11: core based reservations using node lists.
#
# Covers three areas:
#   1. CoreCnt reservations over a five-node range while a job occupies half
#      the threads of the nodes (now, far future and near future start).
#   2. Two overlapping core reservations on the first node, and jobs that fit
#      or do not fit inside / outside of them.
#   3. A reservation created with Flags=IGNORE_JOBS over busy cores.
# Takes no arguments and returns nothing; results are reported through
# subtest / subskip / fail.
proc inc3_11_9 {} {
	global user_name node_count
	global file_in bin_sleep sbatch number scontrol
	global re_word_str scancel
	global cluster_cpus cores_per_node def_partition
	global def_node_name def_node_inx_min def_node_inx_max
	global def_node_inx_min_int def_node_inx_max_int

	set res_name      "resv3.11.9"
	set res_name_test "resv3.11.9.0"

	log_info "+++++ STARTING TEST 9 +++++"

	# Assumes nodes have sequential suffix numbers
	set min_node_inx_int $def_node_inx_min_int
	set max_node_inx_int [expr {min($def_node_inx_min_int + 4, $def_node_inx_max_int)}]
	set min_node_inx $def_node_inx_min
	set max_node_inx $max_node_inx_int
	set min_len [string length $min_node_inx]
	set max_len [string length $max_node_inx]
	if {$min_len > $max_len} {
		# Preserve leading zeros
		# (e.g. "nid[00001-00005]" rather than "nid[00001-5]")
		set max_node_inx [format %${min_len}.${min_len}d $max_node_inx_int]
	}

	# Node expressions used throughout: the first node and the node range
	set first_node "$def_node_name\[$min_node_inx\]"
	set node_range "$def_node_name\[$min_node_inx-$max_node_inx\]"
	set both_res   [list $res_name $res_name_test]

	# Make the job script
	make_bash_script $file_in "$bin_sleep 100"

	# Make a reservation, just to get node size information
	if {[create_res $res_name "StartTime=now Duration=1 NodeCnt=1 User=$user_name"] != 0} {
		fail "Unable to create a valid reservation"
	}
	set res_info [get_reservations $res_name]
	if {![dict exists $res_info $res_name]} {
		delete_res $res_name
		fail "Unable to get info about reservation ($res_name)"
	}
	lassign [get_node_cpus [dict get $res_info $res_name "Nodes"]] cputot threadcnt
	if {[delete_res $res_name] != 0} {
		fail "Unable to delete reservation ($res_name)"
	}

	set nodes      [get_nodes_by_state]
	set half_cores [expr {$cores_per_node / 2}]

	# Submit a batch job using half the threads on the nodes
	set job_id [_inc3_11_9_submit [list \
		-w[node_list_to_range $nodes] --time=10:00 \
		--ntasks-per-node=[expr {$half_cores * $threadcnt}] \
		--output=/dev/null $file_in]]

	subtest {[wait_for_job $job_id "RUNNING"] == 0} "A batch job using half the threads on the nodes should run"

	# Show the job; the output is not inspected here
	spawn $scontrol show job $job_id
	expect {
		timeout {
			_inc3_11_9_abort "scontrol not responding" $job_id
		}
		eof {
			wait
		}
	}

	# Make a reservation using 1 core per node in first 5 nodes
	if {($max_node_inx_int - $min_node_inx_int) != 4} {
		subskip "Insufficient node count for remaining subtests (needs at least 5)"
		cancel_job $job_id
		return
	}
	if {[create_res $res_name "StartTime=now Duration=60 Nodes=$node_range CoreCnt=5 User=$user_name"] != 0} {
		_inc3_11_9_abort "Unable to create a valid reservation" $job_id
	}

	# Get reservation info
	set res_info [get_reservations $res_name]
	if {![dict exists $res_info $res_name]} {
		_inc3_11_9_abort "Unable to get info about reservation ($res_name)" $job_id [list $res_name]
	}
	lassign [get_node_cpus [dict get $res_info $res_name "Nodes"]] cputot threadcnt
	set res_nodecnt [dict get $res_info $res_name "NodeCnt"]
	subtest {$res_nodecnt == 5} "Reservation should be created with 5 nodes" "Reservation was created with $res_nodecnt nodes"
	set res_corecnt [dict get $res_info $res_name "CoreCnt"]
	subtest {$res_corecnt == 5} "Reservation should be created with 5 cores" "Reservation was created with $res_corecnt cores"

	# Delete the reservation
	if {[delete_res $res_name] != 0} {
		_inc3_11_9_abort "Unable to delete reservation ($res_name)" $job_id
	}

	# Get the core_cnt left + 1 from the job above.
	set core_cnt [expr {($half_cores * 5) + 1}]

	# Make the reservation using more cores than free in a node
	set ret_code [create_res $res_name "StartTime=now Duration=60 Nodes=$node_range CoreCnt=$core_cnt User=$user_name"]
	if {![subtest {$ret_code != 0} "A reservation using more cores than free in a node should not be created"]} {
		# It was created against expectations: remove it again
		delete_res $res_name
	}

	# Make the reservation using more cores than free in a node (now)
	# but those cores being free at reservation start time
	set ret_code [create_res $res_name "StartTime=now+3600 Duration=60 Nodes=$node_range CoreCnt=$core_cnt User=$user_name"]
	subtest {$ret_code == 0} "A reservation using more cores than currently free but available at reservation start time should succeed"

	# Delete the reservation
	if {[delete_res $res_name] != 0} {
		_inc3_11_9_abort "Unable to delete reservation ($res_name)" $job_id
	}

	# Make the reservation using more cores than free at reservation start time
	set ret_code [create_res $res_name "StartTime=now+300 Duration=60 Nodes=$node_range CoreCnt=$core_cnt User=$user_name"]
	if {![subtest {$ret_code != 0} "A reservation using more cores than free at reservation start time should not be created"]} {
		delete_res $res_name
	}

	cancel_job $job_id

	log_info "Let's check overlapping reservations"

	set total_core_res $half_cores

	# Make a reservation on the first node using just half of its cores
	if {[create_res $res_name "StartTime=now Duration=60 Nodes=$first_node CoreCnt=$total_core_res User=$user_name"] != 0} {
		fail "Unable to create a valid reservation"
	}
	log_info "Reservation was created as expected"

	if {$half_cores < 2} {
		log_warn "Not enough cores for remaining tests"
		if {[delete_res $res_name] != 0} {
			fail "Unable to delete reservation ($res_name)"
		}
		return
	}

	set core_cnt [expr {$total_core_res + 1}]

	# Now creating a reservation using first node and more cores per node than available
	set ret_code [create_res $res_name_test "StartTime=now Duration=60 Nodes=$first_node CoreCnt=$core_cnt User=$user_name"]
	if {![subtest {$ret_code != 0} "A reservation using first node and more cores per node than available should not be created"]} {
		delete_res $res_name_test
	}

	# Now creating a reservation using first node and just 1 core per node
	if {[create_res $res_name_test "StartTime=now Duration=60 Nodes=$first_node CoreCnt=1 User=$user_name"] != 0} {
		delete_res $res_name
		fail "Unable to create a reservation using first node and just 1 core per node"
	}
	log_info "Reservation was created as expected"

	# Get reservation info (first reservation)
	set res_info [get_reservations $res_name]
	if {![dict exists $res_info $res_name]} {
		delete_res $res_name
		delete_res $res_name_test
		fail "Unable to get info about reservation ($res_name)"
	}
	lassign [get_node_cpus [dict get $res_info $res_name "Nodes"]] cputot threadcnt

	# Jobs submitted to the first node while both reservations exist.
	# Each entry: description, tasks per node, reservation ("" = none),
	# and the state the job must be in after the settle delay.
	set overlap_cases [list \
		[list "a job using cores available in first node" \
			[expr {($half_cores - 1) * $threadcnt}] "" "RUNNING"] \
		[list "a job using more cores than available in first node" \
			[expr {$half_cores * $threadcnt}] "" "PENDING"] \
		[list "a job using cores reserved in first reservation" \
			[expr {$half_cores * $threadcnt}] $res_name "RUNNING"] \
		[list "a job using more cores than reserved in first reservation" \
			[expr {($half_cores + 1) * $threadcnt}] $res_name "PENDING"] \
		[list "a job using cores reserved in second reservation" \
			[expr {1 * $threadcnt}] $res_name_test "RUNNING"] \
		[list "a job using more cores than reserved in second reservation" \
			[expr {2 * $threadcnt}] $res_name_test "PENDING"] \
	]
	foreach overlap_case $overlap_cases {
		lassign $overlap_case description ntasks resv expected_state
		log_info "Submit a batch job: $description"

		set sbatch_args [list --ntasks-per-node=$ntasks --nodelist=$first_node]
		if {$resv ne ""} {
			lappend sbatch_args --reservation=$resv
		}
		lappend sbatch_args --output=/dev/null $file_in

		set job_id [_inc3_11_9_submit $sbatch_args $both_res]

		# Give the scheduler time to start (or hold) the job
		sleep 10
		_inc3_11_9_expect_state $job_id $expected_state $both_res
		cancel_job $job_id
	}

	if {[delete_res $res_name] != 0} {
		delete_res $res_name_test
		fail "Failed to delete reservation ($res_name)"
	}
	if {[delete_res $res_name_test] != 0} {
		fail "Failed to delete reservation ($res_name_test)"
	}

	# Create a job that uses all cores on the first node and is not
	# part of a reservation.
	set job_id [_inc3_11_9_submit [list \
		--ntasks-per-node=$cores_per_node --nodelist=$first_node \
		--output=/dev/null $file_in]]

	sleep 10

	# Verify the job is running.
	if {[_inc3_11_9_job_state $job_id] ne "RUNNING"} {
		_inc3_11_9_abort "Unable to verify the job is running" $job_id
	}
	log_debug "Job is RUNNING as expected"

	# Make the reservation using more cores than free, but use the
	# IGNORE_JOBS flags. Verify that it is created with the
	# correct nodes, CoreCnt, and CoreIDs.
	set core_cnt [expr {8 + $cores_per_node}]
	set ret_code [create_res $res_name "StartTime=now Duration=60 Nodes=$node_range CoreCnt=$core_cnt User=$user_name Flags=IGNORE_JOBS"]
	# Any non-zero code is a failure, consistent with every other
	# create_res check in this test
	if {$ret_code != 0} {
		_inc3_11_9_abort "Reservation was not created when it should have been" $job_id
	}
	log_info "Reservation was created as expected"

	set exp_node_cnt [expr {$max_node_inx_int - $min_node_inx_int + 1}]
	set exp_core_cnt [expr {8 + $cores_per_node}]
	set hosts_correct 0
	set node_cnt_correct 0
	set core_cnt_correct 0
	# Accumulates the number of core ids listed across all nodes
	set core_ids_correct 0

	sleep 5
	spawn $scontrol show res
	expect {
		-re "Nodes=$def_node_name\\\[$min_node_inx\-$max_node_inx" {
			set hosts_correct 1
			exp_continue
		}
		-re "NodeCnt=$exp_node_cnt" {
			set node_cnt_correct 1
			exp_continue
		}
		-re "CoreCnt=$exp_core_cnt" {
			set core_cnt_correct 1
			exp_continue
		}
		-re "NodeName=$re_word_str CoreIDs=\($re_word_str\)" {
			set core_inxs [range_to_list $expect_out(1,string)]
			set core_ids_correct [expr {$core_ids_correct + [llength $core_inxs]}]
			exp_continue
		}
		timeout {
			_inc3_11_9_abort "scontrol not responding" $job_id [list $res_name]
		}
		eof {
			wait
		}
	}

	cancel_job $job_id
	set ret_code [delete_res $res_name]
	if {$ret_code != 0} {
		fail "Error $ret_code deleting reservation"
	}

	subtest {$hosts_correct == 1} "Reservation should have correct node list"
	subtest {$node_cnt_correct == 1} "Reservation should have correct node count"
	subtest {$core_cnt_correct == 1} "Reservation should have correct core count"
	subtest {$core_ids_correct == $exp_core_cnt} "Reservation should have correct core ids $core_ids_correct ? $exp_core_cnt"

	# Create a job that runs on a single core on every node of the range
	set job_id [_inc3_11_9_submit [list \
		--ntasks-per-node=1 --nodelist=$node_range \
		--output=/dev/null $file_in]]

	sleep 10

	# Only an unknown job or an scontrol timeout is fatal here; the state
	# is merely logged
	if {[_inc3_11_9_job_state $job_id] eq "RUNNING"} {
		log_debug "Job is RUNNING as expected"
	}

	cancel_job $job_id
}