############################################################################
# Copyright (C) SchedMD LLC.
############################################################################
import atf
import pytest


node_count = 2
slurm_user = atf.properties["slurm-user"]


# Setup
@pytest.fixture(scope="module", autouse=True)
def setup():
    atf.require_nodes(node_count)
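    # A 5 second SlurmdTimeout makes slurmctld mark unresponsive nodes DOWN quickly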
    atf.require_config_parameter("SlurmdTimeout", 5)
    atf.require_slurm_running()


@pytest.fixture(scope="function")
def no_kill_job(request):
    """Submit a job with or without --no-kill, and resume the downed node afterwards"""

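    # no_kill is set via indirect parametrization in test_no_kill below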
    no_kill = request.param
    if no_kill:
        no_kill_arg = "--no-kill "
    else:
        no_kill_arg = ""
    job_id = atf.submit_job_sbatch(
        f"{no_kill_arg} -N{node_count} --wrap 'sleep 300'", fatal=True
    )
    atf.wait_for_job_state(job_id, "RUNNING", fatal=True)

    # Get an allocated node that is not the BatchHost before it is removed
    # from the job, so we can clean it up
    batch_node = atf.get_job_parameter(job_id, "BatchHost")
    node_list = atf.node_range_to_list(atf.get_job_parameter(job_id, "NodeList"))
    node = next((n for n in node_list if n != batch_node), None)
    if not node:
        pytest.fail(
            "Unable to find a node different from BatchHost, this shouldn't happen"
        )

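    # Hand the job id, the chosen non-BatchHost node, and the no_kill flag to the test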
    yield job_id, node, no_kill

    # Return the node to the idle state
    atf.cancel_jobs([job_id])
    atf.run_command(f"scontrol update nodename={node} state=resume", user=slurm_user)


@pytest.mark.parametrize("no_kill_job", [True, False], indirect=True)
def test_no_kill(no_kill_job):
    """Verify a job with --no-kill survives a node failure, while one without it is requeued"""

    job_id, node, no_kill = no_kill_job

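    # Record the job's initial allocation, then bring the non-BatchHost node down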
    job_nodes = atf.node_range_to_list(atf.get_job_parameter(job_id, "NodeList"))
    atf.run_command(
        f"scontrol update nodename={node} state=down reason=test_nokill",
        user=slurm_user,
    )
    atf.wait_for_node_state(node, "DOWN", fatal=True)
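    # Re-read the job's node list now that one of its nodes is DOWN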
    job_nodes_post_node_down = atf.node_range_to_list(
        atf.get_job_parameter(job_id, "NodeList")
    )

    if no_kill:
        # With --no-kill: the job should keep running without the down node
        assert (
            len(job_nodes_post_node_down) == len(job_nodes) - 1
        ), "The job with --no-kill should lose only the node that was brought down"
        assert (
            atf.get_job_parameter(job_id, "JobState") == "RUNNING"
        ), "The job should keep running with --no-kill set"
        assert (
            atf.get_job_parameter(job_id, "Restarts") == 0
        ), "The job should not be restarted/requeued with --no-kill set"
    else:
        # Without --no-kill: the job should be requeued
        assert (
            atf.get_job_parameter(job_id, "JobState") != "RUNNING"
        ), "The job should not keep running without --no-kill set"

        # Wait until the initial job is completed and restarted/requeued
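        # A requeued job typically pends with Reason=BeginTime until its new begin time passes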
        requeued = atf.repeat_until(
            lambda: atf.get_jobs()[job_id],
            lambda job: job["JobState"] == "PENDING"
            and job["Restarts"] == 1
            and job["Reason"] == "BeginTime",
        )
        assert (
            requeued
        ), f"Job should be requeued when run without --no-kill, but got: {atf.get_jobs()[job_id]}"