| #!/usr/bin/env expect |
| ############################################################################# |
| # Purpose: Stress test of per-task output and input files |
| # |
| # Note: To avoid one minute NFS propagation delays in the output files, |
| # we create them ahead of time. Without explicit file creation, |
| # this test requires about one minute per cycle to execute. |
| # |
| # Note: This script generates and then deletes files in the working |
| # directory named test9.4.input, test9.4.[0-9]+.input, and |
| # test9.4.[0-9]+.output |
| ############################################################################ |
| # Copyright (C) 2002-2007 The Regents of the University of California. |
| # Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| # Written by Morris Jette <jette1@llnl.gov> |
| # CODE-OCEC-09-009. All rights reserved. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| source ./globals |
| |
| # Reference input that is copied to every task; it lives under $test_dir. |
| set file_in "$test_dir/input" |
| # Per-task file name templates handed to srun's -i/-o options. The "%t" |
| # placeholder lines up with the task index in the "$test_name.$tsk.*" names |
| # that this script later checks and deletes. |
| set file_in_task "$test_name.%t.input" |
| set file_out_task "$test_name.%t.output" |
| set job_name $test_name |
| # Number of copy/verify cycles to run and number of tasks per srun job. |
| set cycle_count [get_cycle_count] |
| set task_cnt $max_stress_tasks |
| |
| # TODO: Temporary debug for bug 15302 (remove once fixed) |
| # slurm.conf is saved and appended to further down, and restored in cleanup. |
| set config_dir [get_conf_path] |
| set config_file $config_dir/slurm.conf |
| |
| |
| proc cleanup {} { |
|     global test_name task_cnt config_file |
| |
|     # Remove the input/output file pair of every task index |
|     set task_id 0 |
|     while {$task_id < $task_cnt} { |
|         file delete "$test_name.$task_id.input" "$test_name.$task_id.output" |
|         incr task_id |
|     } |
| |
|     # TODO: Temporary debug for bug 15302 (remove once fixed) |
|     # Put the saved slurm.conf back and have Slurm pick it up again |
|     restore_conf $config_file |
|     reconfigure |
| } |
| |
| |
| # Shared by every srun invocation in this test: node_cnt is passed as |
| # -N$node_cnt and other_opts is appended to the srun command line. |
| set node_cnt 1-4 |
| set other_opts "-O" |
| |
| |
| # Run one srun job that cats its standard input to its standard output, |
| # using task_cnt tasks in total (-n) across node_cnt nodes (-N), and wait |
| # for srun to exit. |
| # |
| # input_file  - file name given to srun -i (may contain a %t placeholder) |
| # output_file - file name given to srun -o (may contain a %t placeholder) |
| # |
| # Problems (Slurm unreachable, srun timing out) are reported through fail. |
| # The return value is not meaningful and callers should not rely on it. |
| proc run_cat_job { input_file output_file } { |
|     # timeout must be linked to the global so expect honours the value |
|     # configured by the main script rather than its built-in default |
|     global bin_cat job_name srun node_cnt other_opts task_cnt timeout |
| |
|     # TODO: Temporarily add verbosity to troubleshoot bug 15302 (restore once fixed) |
|     spawn $srun -vvvvv --job-name=$job_name -e - -i $input_file -o $output_file -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat - |
|     #spawn $srun --job-name=$job_name -e - -i $input_file -o $output_file -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat - |
|     expect { |
|         -re "Unable to contact" { |
|             fail "Slurm appears to be down" |
|         } |
|         timeout { |
|             fail "srun not responding" |
|         } |
|         eof { |
|             # Reap the finished srun process |
|             wait |
|         } |
|     } |
| } |
| |
| |
| # TODO: Temporary debug for bug 15302 (remove once fixed) |
| # Back up slurm.conf, append a more verbose slurmd log level, then reconfigure |
| save_conf $config_file |
| run_command -none "$bin_echo SlurmdDebug=debug4 >> $config_file" |
| reconfigure -fail |
| |
| # |
| # Build the reference input from two system files and record its line count |
| # |
| exec $bin_cat /etc/hosts >$file_in |
| exec $bin_cat /etc/passwd >>$file_in |
| set stdin_lines [get_line_cnt $file_in] |
| |
| # Fan the reference input out into one input file per task |
| set timeout $max_job_delay |
| spawn $srun -e /dev/null -i $file_in -o $file_in_task -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat |
| expect { |
|     -re "Unable to contact" { fail "Slurm appears to be down" } |
|     timeout                 { fail "srun not responding" } |
|     eof                     { wait } |
| } |
| |
| # |
| # For each of cycle_count cycles: remove old per-task output files, run the |
| # cat job, then require every task's output to have as many lines as the |
| # reference input. success_cnt counts the task outputs that matched. |
| # |
| set success_cnt 0 |
| set timeout $max_job_delay |
| for {set inx 0} {$inx < $cycle_count} {incr inx} { |
|     # Start the cycle without stale output files |
|     for {set tsk 0} {$tsk < $task_cnt} {incr tsk} { |
|         exec $bin_rm -f "$test_name.$tsk.output" |
|     } |
| |
|     run_cat_job $file_in_task $file_out_task |
| |
|     # Compare each task's output line count with the input line count |
|     for {set tsk 0} {$tsk < $task_cnt} {incr tsk} { |
|         set file_out_glob "$test_name.$tsk.output" |
|         wait_for_file $file_out_glob |
|         set stdout_lines [get_line_cnt $file_out_glob] |
| |
|         # Allow one extra second for the file to be fully written |
|         if {$stdout_lines != $stdin_lines} { |
|             exec $bin_sleep 1 |
|             set stdout_lines [get_line_cnt $file_out_glob] |
|         } |
| |
|         if {$stdout_lines == $stdin_lines} { |
|             incr success_cnt |
|             continue |
|         } |
| |
|         # TODO: Temporary debug for bug 15302 (remove once fixed) |
|         set hostname [string trimright [run_command_output "hostname -s"]] |
|         run_command "sudo dmesg -T | grep -i 'killed process'" |
|         run_command "free -h" |
|         run_command "top -o %MEM -n 1" |
| |
|         if {$stdout_lines == 0} { |
|             fail "stdout is empty. Is current working directory writable from compute nodes?" |
|         } else { |
|             fail "stdout is incomplete" |
|         } |
|     } |
| } |