############################################################################
# Copyright (C) SchedMD LLC.
############################################################################
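"""Regression tests for scheduling on a node shared between two partitions
with different OverSubscribe settings (Ticket 22228)."""
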
import pytest
import atf
import time


def _all_jobs_of_state(job_ids=[], desired="RUNNING", timeout=60):
    """Poll atf.get_jobs() until every job in job_ids (or every job, if
    job_ids is empty) is in the desired state; return False on timeout."""
    while True:
        jobs = atf.get_jobs()
        states = [
            jobs[job_id]["JobState"]
            for job_id in jobs
            if not job_ids or job_id in job_ids
        ]
        if all(st == desired for st in states):
            return True
        timeout -= 1
        if timeout <= 0:
            break
        time.sleep(1)
    return False
@pytest.fixture(scope="module", autouse=True)
def setup():
atf.require_auto_config("Needs to create/reconfigure nodes and partitions")
atf.require_nodes(1, [("CPUs", 2)])
atf.require_config_parameter("SelectType", "select/cons_tres")
atf.require_config_parameter("SelectTypeParameters", "CR_CPU")
atf.require_slurm_running()
@pytest.fixture(scope="function")
def partition_nodes():
"""Create 2 partitions sharing the same node"""
nodes = list(atf.get_nodes().keys())
shared_node = nodes[0]
atf.run_command(
f"scontrol create PartitionName=p1 Nodes={shared_node} OverSubscribe=YES",
fatal=True,
user=atf.properties["slurm-user"],
)
atf.run_command(
f"scontrol create PartitionName=p2 Nodes={shared_node} OverSubscribe=NO",
fatal=True,
user=atf.properties["slurm-user"],
)
yield nodes
atf.cancel_all_jobs()
atf.run_command(
"scontrol delete PartitionName=p1",
user=atf.properties["slurm-user"],
)
atf.run_command(
"scontrol delete PartitionName=p2",
user=atf.properties["slurm-user"],
)


@pytest.mark.xfail(
    atf.get_version() < (25, 11),
    reason="Ticket 22228: Fix the 'is-busy' check for a node shared between partitions",
)
def test_overlapping_oversubscribe_differ_sharing_restriction_1(partition_nodes):
    """
    With SelectTypeParameters=CR_CPU, two partitions sharing a node should be
    able to oversubscribe it as long as the other partition does not have jobs
    on the shared node.
    """
    num_oversub_jobs = 2

    # Submit jobs to p2 *with* --oversubscribe (despite OverSubscribe=NO)
    job_ids = [
        atf.submit_job_sbatch("-p p2 --oversubscribe --wrap='sleep 60'", fatal=True)
        for _ in range(num_oversub_jobs)
    ]

    # Jobs should run simultaneously
    assert _all_jobs_of_state(job_ids, timeout=60)
    atf.cancel_jobs(job_ids, fatal=True)

    # Submit a job to p1 *without* --oversubscribe
    job_ids = [atf.submit_job_sbatch("-p p1 --wrap='sleep 0'", fatal=True)]

    # Job should complete quickly
    atf.wait_for_job_state(job_ids[0], "COMPLETED", fatal=True, timeout=5)

    # Once again, submit oversubscribed jobs to p2 now that p1's job is complete
    job_ids = [
        atf.submit_job_sbatch("-p p2 --oversubscribe --wrap='sleep 60'", fatal=True)
        for _ in range(num_oversub_jobs)
    ]

    # Verify that the last num_oversub_jobs jobs are running
    assert _all_jobs_of_state(job_ids)