| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| ############################################################################ |
| """ |
| Ticket 24603: Het job with different QoS per component. |
| |
| Pre-fix (Slurm < 26.05): slurmctld aggregates TRES across all components |
| that share the same association and validates once using the first component's |
| partition and QoS. With DenyOnLimit: valid het jobs are rejected at submit. |
| Without DenyOnLimit: aggregate limit is not enforced at submit or at schedule, |
| so the job is accepted; on an idle cluster these het jobs typically run to |
| COMPLETED. |
| |
| Post-fix (Slurm >= 26.05): TRES are aggregated only per (assoc, job QOS, |
| partition) group. With or without DenyOnLimit, the same het jobs are accepted |
| and run. |
| |
| Coverage (grouped by scenario; several tests skip if the case only applies before |
| or from Slurm 26.05—see skipif reason strings on those tests). |
| |
| - Job-level --qos=: without DenyOnLimit → accept and complete on idle cluster (pre- and post-fix); |
| post-fix DenyOnLimit |
| → accept and run (skipif on Slurm < 26.05); without Deny → run to completion on all versions; |
| GPU partition fully busy → het stays PENDING (scheduling). |
| - Partition DefaultQOS (no job --qos=): no-Deny pending; post-fix Deny → accept and run |
| (skipif on older); no-Deny → run to completion on all versions. |
| - Min/Max gres/r1 per component (CPU max 0, GPU min 4 r1): no-Deny pending; post-fix Deny and |
| no-Deny → run to completion (skipif on older where marked). |
| - Association GrpTRES: one association—2+2 > cap rejected at submit; 1+1 or same-partition |
| shapes accepted; two associations—2+2 with split -A accepted; running 1-node or |
| het blocker → follow-up het PENDING until capacity frees (GrpTRES=2 and GrpTRES=3 |
| cases; Slurm >= 26.05 where skipif); partition DefaultQOS GrpTRES variants. |
| - Duplicate (assoc, job QOS, partition) on both het lines: stack within MaxTresPerUser |
| → accept or reject at submit. |
| - Combined-limit (stacked TRES in one bucket): QoS limit_factor × GrpTRES; shared |
| (assoc, job QOS); shared (assoc, partition). |
| |
| Requires: AccountingStorageEnforce=limits, accounting, |
| AccountingStorageTRES including gres/r1, SelectType=select/cons_tres, |
| SelectTypeParameters=CR_CPU, GresTypes=r1, two partitions, 4 nodes with |
| Gres=r1:2 each (2 per partition). |
| """ |
| import os |
| import time |
| |
| import atf |
| import pytest |
| |
| test_name = os.path.splitext(os.path.basename(__file__))[0] |
| # Partitions: CPU (2 nodes), GPU (2 nodes) |
| p_cpu = f"{test_name}_p_cpu" |
| p_gpu = f"{test_name}_p_gpu" |
| # QoS with DenyOnLimit: CPU limit 2 nodes, GPU limit 10 (so 2 is within limit) |
| qos_cpu_deny = f"{test_name}_qos_cpu_deny" |
| qos_gpu_deny = f"{test_name}_qos_gpu_deny" |
| # QoS without DenyOnLimit (same limits) |
| qos_cpu_nodeny = f"{test_name}_qos_cpu_nodeny" |
| qos_gpu_nodeny = f"{test_name}_qos_gpu_nodeny" |
| # Partition-level QoS (used as partition DefaultQOS; no --qos= in script) |
| qos_p_cpu_def_deny = f"{test_name}_p_cpu_def_deny" |
| qos_p_gpu_def_deny = f"{test_name}_p_gpu_def_deny" |
| qos_p_cpu_def_nodeny = f"{test_name}_p_cpu_def_nodeny" |
| qos_p_gpu_def_nodeny = f"{test_name}_p_gpu_def_nodeny" |
| acct = f"{test_name}_acct" |
| # Min/Max gres/r1 het scenario (job-level QoS per component) |
| acct_tres_gres = f"{test_name}_acct_tres_gres" |
| qos_tres_cpu_deny = f"{test_name}_qos_tres_cpu_deny" |
| qos_tres_gpu_deny = f"{test_name}_qos_tres_gpu_deny" |
| qos_tres_cpu_nodeny = f"{test_name}_qos_tres_cpu_nodeny" |
| qos_tres_gpu_nodeny = f"{test_name}_qos_tres_gpu_nodeny" |
| # Second account for association GrpTRES tests (9–9c) |
| acct_assoc = f"{test_name}_acct_assoc" |
| qos_assoc_cpu = f"{test_name}_qos_assoc_cpu" |
| qos_assoc_gpu = f"{test_name}_qos_assoc_gpu" |
| # Two accounts: GrpTRES on each (pair to test 9 single-assoc rejection) |
| acct_assoc2_a = f"{test_name}_acct_assoc2_a" |
| acct_assoc2_b = f"{test_name}_acct_assoc2_b" |
| qos_assoc2_cpu = f"{test_name}_qos_assoc2_cpu" |
| qos_assoc2_gpu = f"{test_name}_qos_assoc2_gpu" |
| # Combined-limit tests 10–12: separate accounts and QoS |
| acct_lf = f"{test_name}_acct_lf" |
| qos_lf = f"{test_name}_qos_lf" |
| acct_qos_combined_limit = f"{test_name}_acct_qos_combined_limit" |
| qos_combined_limit = f"{test_name}_qos_combined_limit" |
| acct_part_combined_limit = f"{test_name}_acct_part_combined_limit" |
| qos_part_lim = f"{test_name}_qos_part_lim" |
| qos_job_a = f"{test_name}_qos_ja" |
| qos_job_b = f"{test_name}_qos_jb" |
| # Duplicate same (assoc, job QoS, partition) on two het lines (tests 9d–9e) |
| acct_dup_accept = f"{test_name}_acct_dup_accept" |
| qos_dup_accept = f"{test_name}_qos_dup_accept" |
| acct_dup_reject = f"{test_name}_acct_dup_reject" |
| qos_dup_reject = f"{test_name}_qos_dup_reject" |
| # GrpTRES=node=2 vs running job + het (same assoc; queues until limit frees) |
| acct_grp_tres_run = f"{test_name}_acct_grp_tres_run" |
| qos_grp_tres_run = f"{test_name}_qos_grp_tres_run" |
| |
| # Last Slurm (major, minor) treated as pre-fix: version checks use |
| # get_version() > SLURM_VERSION_HET_JOB_FIX for post-fix behavior (fix in 26.05+). |
| SLURM_VERSION_HET_JOB_FIX = (26, 4) |
| |
| |
| @pytest.fixture(scope="module", autouse=True) |
| def setup(): |
| atf.require_auto_config("wants to create partitions, gres, and accounting") |
| atf.require_accounting(modify=True) |
| atf.require_config_parameter_includes("AccountingStorageEnforce", "limits") |
| atf.require_config_parameter_includes("AccountingStorageTRES", "gres/r1") |
| atf.require_config_parameter("SelectType", "select/cons_tres") |
| atf.require_config_parameter("SelectTypeParameters", "CR_CPU") |
| atf.require_config_parameter_includes("GresTypes", "r1") |
| atf.require_nodes(4, [("Gres", "r1:2")]) |
| atf.require_config_parameter( |
| "PartitionName", |
| { |
| p_cpu: { |
| "Nodes": "node1,node2", |
| "Default": "NO", |
| "State": "UP", |
| }, |
| p_gpu: { |
| "Nodes": "node3,node4", |
| "Default": "NO", |
| "State": "UP", |
| }, |
| }, |
| ) |
| atf.require_config_parameter_includes("SchedulerParameters", "bf_interval=1") |
| atf.require_config_parameter_includes("SchedulerParameters", "sched_interval=1") |
| atf.require_slurm_running() |
| |
| |
| @pytest.fixture(scope="module") |
| def setup_account_and_qos(setup): |
| """Create one account and eight QoS (job-level and partition-level).""" |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_cpu_deny} " |
| f"flags=DenyOnLimit MaxtresPerUser=node=2", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_gpu_deny} " |
| f"flags=DenyOnLimit MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_cpu_nodeny} " f"MaxtresPerUser=node=2", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_gpu_nodeny} " f"MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| # Partition-level QoS (same limits; used as partition DefaultQOS) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_p_cpu_def_deny} " |
| f"flags=DenyOnLimit MaxtresPerUser=node=2", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_p_gpu_def_deny} " |
| f"flags=DenyOnLimit MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_p_cpu_def_nodeny} " f"MaxtresPerUser=node=2", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_p_gpu_def_nodeny} " f"MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add account {acct}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| user = atf.get_user_name() |
| atf.run_command( |
| f"sacctmgr -i add user {user} account={acct} " |
| f"qos={qos_cpu_deny},{qos_gpu_deny},{qos_cpu_nodeny},{qos_gpu_nodeny}," |
| f"{qos_p_cpu_def_deny},{qos_p_gpu_def_deny}," |
| f"{qos_p_cpu_def_nodeny},{qos_p_gpu_def_nodeny}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| yield |
| atf.run_command( |
| f"sacctmgr -i del qos {qos_cpu_deny},{qos_gpu_deny}," |
| f"{qos_cpu_nodeny},{qos_gpu_nodeny}," |
| f"{qos_p_cpu_def_deny},{qos_p_gpu_def_deny}," |
| f"{qos_p_cpu_def_nodeny},{qos_p_gpu_def_nodeny}", |
| user=atf.properties["slurm-user"], |
| quiet=True, |
| ) |
| |
| |
| @pytest.fixture(scope="module") |
| def setup_account_and_qos_tres(setup): |
| """Account + QoS for Min/Max gres/r1 het: CPU max gres/r1=0, GPU min gres/r1=4.""" |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_tres_cpu_deny} " |
| "flags=DenyOnLimit MaxtresPerJob=gres/r1=0", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_tres_gpu_deny} " |
| "flags=DenyOnLimit MintresPerJob=gres/r1=4", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_tres_cpu_nodeny} MaxtresPerJob=gres/r1=0", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_tres_gpu_nodeny} MintresPerJob=gres/r1=4", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add account {acct_tres_gres}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| user = atf.get_user_name() |
| atf.run_command( |
| f"sacctmgr -i add user {user} account={acct_tres_gres} " |
| f"qos={qos_tres_cpu_deny},{qos_tres_gpu_deny}," |
| f"{qos_tres_cpu_nodeny},{qos_tres_gpu_nodeny}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| yield |
| atf.run_command( |
| f"sacctmgr -i del qos {qos_tres_cpu_deny},{qos_tres_gpu_deny}," |
| f"{qos_tres_cpu_nodeny},{qos_tres_gpu_nodeny}", |
| user=atf.properties["slurm-user"], |
| quiet=True, |
| ) |
| |
| |
| @pytest.fixture(scope="module") |
| def setup_assoc_limit_het(setup): |
| """ |
| One account with association GrpTRES=node=3; two QoS with high node limit. |
| |
| Used by tests 9–9c: both het components use the same association; a 2+2 node |
| het stacks 4 nodes against GrpTRES=3 and is rejected at submit (tests 9–9b). |
| """ |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_assoc_cpu} MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_assoc_gpu} MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add account {acct_assoc}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| user = atf.get_user_name() |
| atf.run_command( |
| f"sacctmgr -i add user {user} account={acct_assoc} " |
| f"GrpTRES=node=3 qos={qos_assoc_cpu},{qos_assoc_gpu}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| yield |
| atf.run_command( |
| f"sacctmgr -i del qos {qos_assoc_cpu},{qos_assoc_gpu}", |
| user=atf.properties["slurm-user"], |
| quiet=True, |
| ) |
| |
| |
| @pytest.fixture(scope="module") |
| def setup_assoc_limit_het_two_accounts(setup): |
| """ |
| Two accounts, each with association GrpTRES=node=3; two QoS with high node limit. |
| |
| Used by test_het_job_two_assoc_grp_tres_2plus2_accepted: each het component |
| uses a different -A (2 nodes per account); each association only accrues its |
| own 2 nodes against GrpTRES=3, so submit must succeed (unlike test 9 where one |
| association sees 4 nodes). |
| """ |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_assoc2_cpu} MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_assoc2_gpu} MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add account {acct_assoc2_a},{acct_assoc2_b}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| user = atf.get_user_name() |
| atf.run_command( |
| f"sacctmgr -i add user {user} account={acct_assoc2_a},{acct_assoc2_b} " |
| f"GrpTRES=node=3 qos={qos_assoc2_cpu},{qos_assoc2_gpu}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| yield |
| atf.run_command( |
| f"sacctmgr -i del qos {qos_assoc2_cpu},{qos_assoc2_gpu}", |
| user=atf.properties["slurm-user"], |
| quiet=True, |
| ) |
| |
| |
| @pytest.fixture(scope="module") |
| def setup_assoc_grp_tres_running_contention(setup): |
| """ |
| One account with association GrpTRES=node=2; QoS with high per-user limit. |
| |
| Used to verify that a running job charges GrpTRES so a subsequent het job |
| whose stacked components exceed the remaining association capacity stays |
| PENDING (mirrors salloc --account=X -N1 with salloc het : -N1 : -N1). |
| """ |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_grp_tres_run} MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add account {acct_grp_tres_run}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| user = atf.get_user_name() |
| atf.run_command( |
| f"sacctmgr -i add user {user} account={acct_grp_tres_run} " |
| f"GrpTRES=node=2 qos={qos_grp_tres_run}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| yield |
| atf.run_command( |
| f"sacctmgr -i del qos {qos_grp_tres_run}", |
| user=atf.properties["slurm-user"], |
| quiet=True, |
| ) |
| |
| |
| @pytest.fixture(scope="module") |
| def setup_limit_factor_het(setup): |
| """ |
| Combined-limit test 10: association GrpTRES=node=10, one QoS with limit_factor 0.3 (effective 3). |
| 2+2 node het same assoc same QoS → stacked usage exceeds combined GrpTRES × limit_factor → rejected at submit (4 > 3). |
| """ |
| clear_partition_default_qos() |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_lf} flags=DenyOnLimit " |
| f"LimitFactor=0.3 MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add account {acct_lf}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| user = atf.get_user_name() |
| atf.run_command( |
| f"sacctmgr -i add user {user} account={acct_lf} " |
| f"GrpTRES=node=10 qos={qos_lf}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| yield |
| atf.run_command( |
| f"sacctmgr -i del qos {qos_lf}", |
| user=atf.properties["slurm-user"], |
| quiet=True, |
| ) |
| |
| |
| @pytest.fixture(scope="module") |
| def setup_qos_combined_limit_het(setup): |
| """ |
| Combined-limit test 11: one QoS MaxTRESPerUser=node=3. 2+2 het same assoc same QoS, |
| different partitions → stacked nodes in one (assoc, job QOS) group exceeds cap → rejected at submit (4 > 3). |
| """ |
| clear_partition_default_qos() |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_combined_limit} flags=DenyOnLimit " |
| f"MaxtresPerUser=node=3", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add account {acct_qos_combined_limit}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| user = atf.get_user_name() |
| atf.run_command( |
| f"sacctmgr -i add user {user} account={acct_qos_combined_limit} " |
| f"qos={qos_combined_limit}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| yield |
| atf.run_command( |
| f"sacctmgr -i del qos {qos_combined_limit}", |
| user=atf.properties["slurm-user"], |
| quiet=True, |
| ) |
| |
| |
| @pytest.fixture(scope="module") |
| def setup_partition_combined_limit_het(setup): |
| """ |
| Combined-limit test 12: partition p_cpu QoS limit 1 node. 1+1 het same partition, |
| different job QoS → stacked nodes in one (assoc, partition) group exceeds cap → rejected at submit (2 > 1). |
| """ |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_part_lim} flags=DenyOnLimit " |
| f"MaxtresPerUser=node=1", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_job_a} MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_job_b} MaxtresPerUser=node=10", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add account {acct_part_combined_limit}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| user = atf.get_user_name() |
| atf.run_command( |
| f"sacctmgr -i add user {user} account={acct_part_combined_limit} " |
| f"qos={qos_job_a},{qos_job_b}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"scontrol update PartitionName={p_cpu} QoS={qos_part_lim}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| yield |
| atf.run_command( |
| f"sacctmgr -i del qos {qos_part_lim},{qos_job_a},{qos_job_b}", |
| user=atf.properties["slurm-user"], |
| quiet=True, |
| ) |
| |
| |
| @pytest.fixture(scope="module") |
| def setup_het_dup_group_accept(setup): |
| """ |
| One account; one QoS with DenyOnLimit MaxTresPerUser=node=2. |
| |
| Used by test_het_job_duplicate_same_qos_partition_accept: two het components |
| on p_cpu with the same --qos= (1+1 nodes in one assoc/job QoS/partition group). |
| """ |
| clear_partition_default_qos() |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_dup_accept} flags=DenyOnLimit " |
| f"MaxtresPerUser=node=2", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add account {acct_dup_accept}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| user = atf.get_user_name() |
| atf.run_command( |
| f"sacctmgr -i add user {user} account={acct_dup_accept} " |
| f"qos={qos_dup_accept}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| yield |
| atf.run_command( |
| f"sacctmgr -i del qos {qos_dup_accept}", |
| user=atf.properties["slurm-user"], |
| quiet=True, |
| ) |
| |
| |
| @pytest.fixture(scope="module") |
| def setup_het_dup_group_reject(setup): |
| """ |
| One account; one QoS with DenyOnLimit MaxTresPerUser=node=1. |
| |
| Used by test_het_job_duplicate_same_qos_partition_rejected: same het shape as |
| setup_het_dup_group_accept; stacked 1+1 in one group must be rejected at submit. |
| """ |
| clear_partition_default_qos() |
| atf.run_command( |
| f"sacctmgr -i add qos {qos_dup_reject} flags=DenyOnLimit " |
| f"MaxtresPerUser=node=1", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"sacctmgr -i add account {acct_dup_reject}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| user = atf.get_user_name() |
| atf.run_command( |
| f"sacctmgr -i add user {user} account={acct_dup_reject} " |
| f"qos={qos_dup_reject}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| yield |
| atf.run_command( |
| f"sacctmgr -i del qos {qos_dup_reject}", |
| user=atf.properties["slurm-user"], |
| quiet=True, |
| ) |
| |
| |
| def _expect_submit_accept_het_done(job_id): |
| """After submit acceptance, expect the het leader to finish in COMPLETED on an idle cluster.""" |
| atf.wait_for_job_state(job_id, "DONE", timeout=120, fatal=True) |
| assert ( |
| atf.get_job_parameter(job_id, "JobState", quiet=True) == "COMPLETED" |
| ), "Het leader should finish in COMPLETED on idle cluster" |
| |
| |
| def set_partition_default_qos(qos_cpu, qos_gpu): |
| """Set default QoS (QoS=) for p_cpu and p_gpu (used by partition-level tests).""" |
| atf.run_command( |
| f"scontrol update PartitionName={p_cpu} QoS={qos_cpu}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| atf.run_command( |
| f"scontrol update PartitionName={p_gpu} QoS={qos_gpu}", |
| user=atf.properties["slurm-user"], |
| fatal=True, |
| ) |
| |
| |
| def scancel_started_job_leaders(*job_ids): |
| """Cancel jobs by sbatch leader ids before cancel_all_jobs teardown. |
| |
| cancel_all_jobs passes every job id from scontrol JSON to bulk scancel; cancelling a het |
| component id fails with rc=60. Cancelling leaders first removes the whole het job. |
| """ |
| ids = [j for j in job_ids if j] |
| if not ids: |
| return |
| atf.cancel_jobs( |
| ids, |
| fatal=False, |
| quiet=True, |
| user=atf.properties["slurm-user"], |
| ) |
| |
| |
| def clear_partition_default_qos(): |
| """Clear partition QoS= on p_cpu and p_gpu (called from cancel_jobs teardown and combined-limit fixtures).""" |
| atf.run_command( |
| f"scontrol update PartitionName={p_cpu} QoS=", |
| user=atf.properties["slurm-user"], |
| fatal=False, |
| ) |
| atf.run_command( |
| f"scontrol update PartitionName={p_gpu} QoS=", |
| user=atf.properties["slurm-user"], |
| fatal=False, |
| ) |
| |
| |
| @pytest.fixture(scope="function") |
| def cancel_jobs(): |
| """Cancel all jobs after each test; clear partition QoS= (no leaking DefaultQOS).""" |
| yield |
| atf.cancel_all_jobs(fatal=True) |
| clear_partition_default_qos() |
| |
| |
| def test_het_job_without_deny_on_limit_accepted(setup_account_and_qos, cancel_jobs): |
| """ |
| No DenyOnLimit; same het shape (1+2 nodes). Submit succeeds; on idle cluster the het completes. |
| """ |
| atf.make_bash_script( |
| "het_nodeny.in", |
| f""" |
| #SBATCH -p {p_cpu} --qos={qos_cpu_nodeny} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_gpu_nodeny} -N2 -t1 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_nodeny.in", fatal=False) |
| assert job_id != 0, "Without DenyOnLimit, het job should be accepted at submit" |
| _expect_submit_accept_het_done(job_id) |
| |
| |
| def test_het_job_pending_when_gpu_partition_busy(setup_account_and_qos, cancel_jobs): |
| """ |
| Plain job holds both p_gpu nodes; het needs 1 on p_cpu and 2 on p_gpu (same shape as het_nodeny). |
| Submit validation passes but all-or-nothing scheduling cannot place the GPU component → het stays |
| PENDING until resources free. Scheduler contention, not assoc/QoS submit validation. |
| """ |
| block_jid = atf.submit_job_sbatch( |
| f"-J {test_name}_gpu_all -p {p_gpu} --qos={qos_gpu_nodeny} " |
| '-N2 -t10 --wrap "sleep 600"', |
| fatal=False, |
| ) |
| assert block_jid != 0, "Blocker job should submit" |
| atf.wait_for_job_state(block_jid, "RUNNING", timeout=120, fatal=True) |
| |
| atf.make_bash_script( |
| "het_gpu_busy.in", |
| f""" |
| #SBATCH -p {p_cpu} --qos={qos_cpu_nodeny} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_gpu_nodeny} -N2 -t1 |
| true |
| """, |
| ) |
| het_jid = atf.submit_job_sbatch("het_gpu_busy.in", fatal=False) |
| assert het_jid != 0, "Het job should be accepted at submit" |
| |
| atf.repeat_until( |
| lambda: atf.get_job_parameter(het_jid, "JobState", quiet=True), |
| lambda s: s == "PENDING", |
| timeout=60, |
| poll_interval=0.5, |
| fatal=True, |
| ) |
| assert ( |
| atf.get_job_parameter(block_jid, "JobState", quiet=True) == "RUNNING" |
| ), "Blocker should still be running while het remains pending" |
| |
| time.sleep(2) |
| assert ( |
| atf.get_job_parameter(het_jid, "JobState", quiet=True) == "PENDING" |
| ), "Het leader should stay PENDING while both GPU nodes are consumed" |
| |
| scancel_started_job_leaders(block_jid, het_jid) |
| |
| |
| # --- Partition-level QoS (DefaultQOS; no --qos= in script) --- |
| |
| |
| def test_partition_level_nodeny_pending(setup_account_and_qos, cancel_jobs): |
| """ |
| Partition DefaultQOS, no DenyOnLimit; same het shape as het_part_nodeny. Submit succeeds; |
| on idle cluster the het completes. |
| """ |
| set_partition_default_qos(qos_p_cpu_def_nodeny, qos_p_gpu_def_nodeny) |
| atf.make_bash_script( |
| "het_part_nodeny.in", |
| f""" |
| #SBATCH -p {p_cpu} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} -N2 -t1 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_part_nodeny.in", fatal=False) |
| assert ( |
| job_id != 0 |
| ), "Partition-level without DenyOnLimit, het job should be accepted" |
| _expect_submit_accept_het_done(job_id) |
| |
| |
| # --- Post-fix behavior: skipif on Slurm < 26.05 --- |
| |
| |
| @pytest.mark.skipif( |
| atf.get_version() <= SLURM_VERSION_HET_JOB_FIX, |
| reason="Test case invalid for the current Slurm version (requires Slurm >= 26.05).", |
| ) |
| def test_het_job_with_deny_on_limit_accepted_and_runs( |
| setup_account_and_qos, cancel_jobs |
| ): |
| """ |
| DenyOnLimit; job-level --qos=. Slurm >= 26.05: het (1+2 nodes) accepted at submit and completes. |
| """ |
| atf.make_bash_script( |
| "het_deny.in", |
| f""" |
| #SBATCH -p {p_cpu} --qos={qos_cpu_deny} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_gpu_deny} -N2 -t1 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_deny.in", fatal=False) |
| assert job_id != 0, ( |
| "With DenyOnLimit, het job (1 node CPU + 2 nodes GPU) should be " |
| "accepted at submit after 24603 fix (TRES per assoc/qos/partition)" |
| ) |
| atf.wait_for_job_state(job_id, "DONE", fatal=True) |
| |
| |
| def test_het_job_without_deny_on_limit_accepted_and_runs( |
| setup_account_and_qos, cancel_jobs |
| ): |
| """ |
| No DenyOnLimit; same 1+2 het. Submit succeeds; job runs to completion (no-deny path on all versions). |
| """ |
| atf.make_bash_script( |
| "het_nodeny.in", |
| f""" |
| #SBATCH -p {p_cpu} --qos={qos_cpu_nodeny} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_gpu_nodeny} -N2 -t1 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_nodeny.in", fatal=False) |
| assert job_id != 0, ( |
| "Without DenyOnLimit, het job (1 node CPU + 2 nodes GPU) should be " |
| "accepted at submit" |
| ) |
| atf.wait_for_job_state(job_id, "DONE", fatal=True) |
| |
| |
| @pytest.mark.skipif( |
| atf.get_version() <= SLURM_VERSION_HET_JOB_FIX, |
| reason="Test case invalid for the current Slurm version (requires Slurm >= 26.05).", |
| ) |
| def test_partition_level_deny_accepted_and_runs(setup_account_and_qos, cancel_jobs): |
| """ |
| Partition DefaultQOS + DenyOnLimit. Slurm >= 26.05: het accepted and completes (per-group limits). |
| """ |
| set_partition_default_qos(qos_p_cpu_def_deny, qos_p_gpu_def_deny) |
| atf.make_bash_script( |
| "het_part_deny.in", |
| f""" |
| #SBATCH -p {p_cpu} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} -N2 -t1 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_part_deny.in", fatal=False) |
| assert ( |
| job_id != 0 |
| ), "Partition-level DenyOnLimit het job should be accepted after 24603 fix" |
| atf.wait_for_job_state(job_id, "DONE", fatal=True) |
| |
| |
| def test_partition_level_nodeny_accepted_and_runs(setup_account_and_qos, cancel_jobs): |
| """ |
| Partition DefaultQOS, no DenyOnLimit. Submit succeeds; job runs to completion (no-deny path on all versions). |
| """ |
| set_partition_default_qos(qos_p_cpu_def_nodeny, qos_p_gpu_def_nodeny) |
| atf.make_bash_script( |
| "het_part_nodeny.in", |
| f""" |
| #SBATCH -p {p_cpu} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} -N2 -t1 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_part_nodeny.in", fatal=False) |
| assert job_id != 0, "Partition-level without DenyOnLimit het job should be accepted" |
| atf.wait_for_job_state(job_id, "DONE", fatal=True) |
| |
| |
| # --- Job-level Min/Max TRES (gres/r1) per component --- |
| |
| |
| def test_tres_nodeny_pending(setup_account_and_qos_tres, cancel_jobs): |
| """ |
| No DenyOnLimit; same Min/Max gres/r1 het. Submit succeeds; on idle cluster the job completes. |
| """ |
| atf.make_bash_script( |
| "het_tres_nodeny.in", |
| f""" |
| #SBATCH -A {acct_tres_gres} |
| #SBATCH -p {p_cpu} --qos={qos_tres_cpu_nodeny} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_tres_gpu_nodeny} -N2 -t1 --gres=r1:2 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_tres_nodeny.in", fatal=False) |
| assert ( |
| job_id != 0 |
| ), "Min/Max TRES het job without DenyOnLimit should be accepted at submit" |
| _expect_submit_accept_het_done(job_id) |
| |
| |
| @pytest.mark.skipif( |
| atf.get_version() <= SLURM_VERSION_HET_JOB_FIX, |
| reason="Test case invalid for the current Slurm version (requires Slurm >= 26.05).", |
| ) |
| def test_tres_deny_accepted_and_runs(setup_account_and_qos_tres, cancel_jobs): |
| """ |
| DenyOnLimit; per-component gres/r1 (0 + 4 r1). Slurm >= 26.05: accepted and completes (each component |
| satisfies its QoS). |
| """ |
| atf.make_bash_script( |
| "het_tres_deny.in", |
| f""" |
| #SBATCH -A {acct_tres_gres} |
| #SBATCH -p {p_cpu} --qos={qos_tres_cpu_deny} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_tres_gpu_deny} -N2 -t1 --gres=r1:2 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_tres_deny.in", fatal=False) |
| assert ( |
| job_id != 0 |
| ), "Post-fix: Min/Max TRES het job with DenyOnLimit should be accepted" |
| atf.wait_for_job_state(job_id, "DONE", fatal=True) |
| |
| |
| def test_tres_nodeny_accepted_and_runs(setup_account_and_qos_tres, cancel_jobs): |
| """ |
| No DenyOnLimit; same Min/Max gres/r1 het. Submit succeeds; job runs to completion (no-deny path on all versions). |
| """ |
| atf.make_bash_script( |
| "het_tres_nodeny.in", |
| f""" |
| #SBATCH -A {acct_tres_gres} |
| #SBATCH -p {p_cpu} --qos={qos_tres_cpu_nodeny} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_tres_gpu_nodeny} -N2 -t1 --gres=r1:2 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_tres_nodeny.in", fatal=False) |
| assert job_id != 0, "Min/Max TRES het job without DenyOnLimit should be accepted" |
| atf.wait_for_job_state(job_id, "DONE", fatal=True) |
| |
| |
| @pytest.mark.skipif( |
| atf.get_version() <= SLURM_VERSION_HET_JOB_FIX, |
| reason="Test case invalid for the current Slurm version (requires Slurm >= 26.05).", |
| ) |
| def test_het_job_assoc_total_over_limit_rejected(setup_assoc_limit_het, cancel_jobs): |
| """ |
| GrpTRES=node=3 on one association; job-level --qos=. Slurm >= 26.05: single het 2+2 stacks 4 > 3 → rejected at submit. |
| """ |
| atf.make_bash_script( |
| "het_assoc_total.in", |
| f""" |
| #SBATCH -A {acct_assoc} |
| #SBATCH -p {p_cpu} --qos={qos_assoc_cpu} -N2 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_assoc_gpu} -N2 -t1 |
| true |
| """, |
| ) |
| assert atf.submit_job_sbatch("het_assoc_total.in", fatal=False) == 0, ( |
| "Het job (2+2 nodes) with GrpTRES=node=3 on the shared association should " |
| "be rejected at submit (4 > 3)." |
| ) |
| |
| |
| def test_het_job_two_assoc_grp_tres_2plus2_accepted( |
| setup_assoc_limit_het_two_accounts, cancel_jobs |
| ): |
| """ |
| GrpTRES=node=3 per association; different -A per het component (2+2 nodes total). Each association |
| sees 2 ≤ 3 at submit—not four nodes on one association. Submit succeeds; on idle cluster the job completes. |
| """ |
| atf.make_bash_script( |
| "het_assoc_two_acct.in", |
| f""" |
| #SBATCH -A {acct_assoc2_a} |
| #SBATCH -p {p_cpu} --qos={qos_assoc2_cpu} -N2 -t1 |
| #SBATCH hetjob |
| #SBATCH -A {acct_assoc2_b} |
| #SBATCH -p {p_gpu} --qos={qos_assoc2_gpu} -N2 -t1 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_assoc_two_acct.in", fatal=False) |
| assert job_id != 0, ( |
| "Het job (2+2 nodes) with GrpTRES=node=3 on two different associations " |
| "should be accepted at submit (2 per association, not 4 on one)." |
| ) |
| _expect_submit_accept_het_done(job_id) |
| |
| |
| @pytest.mark.skipif( |
| atf.get_version() <= SLURM_VERSION_HET_JOB_FIX, |
| reason="Test case invalid for the current Slurm version (requires Slurm >= 26.05).", |
| ) |
| def test_het_job_grp_tres_pend_with_running_alloc_same_account( |
| setup_assoc_grp_tres_running_contention, cancel_jobs |
| ): |
| """ |
| GrpTRES=node=2; QoS MaxTresPerUser=node=10. One-node blocker runs on p_cpu (--exclusive); a 1+1 het on |
| the same account/QoS/partition cannot start until that allocation frees capacity under the group cap. |
| Het is accepted at submit and stays PENDING—not submit-time rejection. |
| """ |
| block_jid = atf.submit_job_sbatch( |
| f"-J {test_name}_grp_tres_block -A {acct_grp_tres_run} " |
| f"-p {p_cpu} --qos={qos_grp_tres_run} -N1 --exclusive -t10 " |
| '--wrap "sleep 600"', |
| fatal=False, |
| ) |
| assert block_jid != 0, "Blocker job should submit" |
| atf.wait_for_job_state(block_jid, "RUNNING", timeout=120, fatal=True) |
| |
| atf.make_bash_script( |
| "het_grp_tres_contend.in", |
| f""" |
| #SBATCH -A {acct_grp_tres_run} |
| #SBATCH -J {test_name}_het_grptres |
| #SBATCH -p {p_cpu} --qos={qos_grp_tres_run} -N1 -t1 --exclusive |
| #SBATCH hetjob |
| #SBATCH -p {p_cpu} --qos={qos_grp_tres_run} -N1 -t1 --exclusive |
| true |
| """, |
| ) |
| het_jid = atf.submit_job_sbatch("het_grp_tres_contend.in", fatal=False) |
| assert het_jid != 0, "Het job should be accepted at submit (under GrpTRES cap)" |
| |
| atf.repeat_until( |
| lambda: atf.get_job_parameter(het_jid, "JobState", quiet=True), |
| lambda s: s == "PENDING", |
| timeout=60, |
| poll_interval=0.5, |
| fatal=True, |
| ) |
| assert ( |
| atf.get_job_parameter(block_jid, "JobState", quiet=True) == "RUNNING" |
| ), "Blocker should still be running while het remains pending" |
| |
| time.sleep(2) |
| assert atf.get_job_parameter(het_jid, "JobState", quiet=True) == "PENDING", ( |
| "Het job should stay PENDING: association GrpTRES=node=2 with one node " |
| "in use by the running alloc and two nodes needed for het components." |
| ) |
| |
| scancel_started_job_leaders(block_jid, het_jid) |
| |
| |
| @pytest.mark.skipif( |
| atf.get_version() <= SLURM_VERSION_HET_JOB_FIX, |
| reason="Test case invalid for the current Slurm version (requires Slurm >= 26.05).", |
| ) |
| def test_het_job_grp_tres_runs_after_running_alloc_completes_same_account( |
| setup_assoc_grp_tres_running_contention, cancel_jobs |
| ): |
| """ |
| Same limits as test_het_job_grp_tres_pend_with_running_alloc_same_account (GrpTRES=node=2); blocker uses a |
| short sleep. Het queues while the blocker runs; after blocker DONE, het runs to completion. |
| """ |
| block_jid = atf.submit_job_sbatch( |
| f"-J {test_name}_grp_tres_block_short -A {acct_grp_tres_run} " |
| f"-p {p_cpu} --qos={qos_grp_tres_run} -N1 --exclusive -t2 " |
| '--wrap "sleep 10"', |
| fatal=False, |
| ) |
| assert block_jid != 0, "Blocker job should submit" |
| atf.wait_for_job_state(block_jid, "RUNNING", timeout=120, fatal=True) |
| |
| atf.make_bash_script( |
| "het_grp_tres_after_block.in", |
| f""" |
| #SBATCH -A {acct_grp_tres_run} |
| #SBATCH -J {test_name}_het_grptres_after |
| #SBATCH -p {p_cpu} --qos={qos_grp_tres_run} -N1 -t1 --exclusive |
| #SBATCH hetjob |
| #SBATCH -p {p_cpu} --qos={qos_grp_tres_run} -N1 -t1 --exclusive |
| true |
| """, |
| ) |
| het_jid = atf.submit_job_sbatch("het_grp_tres_after_block.in", fatal=False) |
| assert het_jid != 0, "Het job should be accepted at submit (under GrpTRES cap)" |
| |
| atf.repeat_until( |
| lambda: atf.get_job_parameter(het_jid, "JobState", quiet=True), |
| lambda s: s == "PENDING", |
| timeout=60, |
| poll_interval=0.5, |
| fatal=True, |
| ) |
| assert ( |
| atf.get_job_parameter(block_jid, "JobState", quiet=True) == "RUNNING" |
| ), "Blocker should still be running while het is pending" |
| |
| atf.wait_for_job_state(block_jid, "DONE", timeout=120, fatal=True) |
| atf.wait_for_job_state(het_jid, "DONE", timeout=120, fatal=True) |
| |
| |
| @pytest.mark.skipif( |
| atf.get_version() <= SLURM_VERSION_HET_JOB_FIX, |
| reason="Test case invalid for the current Slurm version (requires Slurm >= 26.05).", |
| ) |
| def test_het_job_grp_tres_three_running_second_smaller_pends_same_account( |
| setup_assoc_limit_het, cancel_jobs |
| ): |
| """ |
| GrpTRES=node=3: first het uses three nodes (1 exclusive on p_cpu + 2 exclusive on p_gpu), saturating |
| the association group. A second het (1+1) is under the per-submit GrpTRES ceiling (2 ≤ 3) but cannot |
| start while the first het holds group node usage; it should be accepted at submit and stay PENDING—not |
| submit-time rejection. |
| """ |
| atf.make_bash_script( |
| "het_grp3_sat_first.in", |
| f""" |
| #SBATCH -A {acct_assoc} |
| #SBATCH -J {test_name}_grp3_sat_first |
| #SBATCH -p {p_cpu} --qos={qos_assoc_cpu} -N1 -t10 --exclusive |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_assoc_gpu} -N2 -t10 --exclusive |
| sleep 600 |
| """, |
| ) |
| first_jid = atf.submit_job_sbatch("het_grp3_sat_first.in", fatal=False) |
| assert first_jid != 0, "First het job (1+2 nodes) should be accepted at submit" |
| atf.wait_for_job_state(first_jid, "RUNNING", timeout=120, fatal=True) |
| |
| atf.make_bash_script( |
| "het_grp3_sat_second.in", |
| f""" |
| #SBATCH -A {acct_assoc} |
| #SBATCH -J {test_name}_grp3_sat_second |
| #SBATCH -p {p_cpu} --qos={qos_assoc_cpu} -N1 -t1 --exclusive |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_assoc_gpu} -N1 -t1 --exclusive |
| true |
| """, |
| ) |
| second_jid = atf.submit_job_sbatch("het_grp3_sat_second.in", fatal=False) |
| assert ( |
| second_jid != 0 |
| ), "Second het (1+1 nodes) should be accepted at submit while first het is running" |
| |
| atf.repeat_until( |
| lambda: atf.get_job_parameter(second_jid, "JobState", quiet=True), |
| lambda s: s == "PENDING", |
| timeout=60, |
| poll_interval=0.5, |
| fatal=True, |
| ) |
| assert ( |
| atf.get_job_parameter(first_jid, "JobState", quiet=True) == "RUNNING" |
| ), "First het should still be running while second het is pending" |
| |
| time.sleep(2) |
| assert atf.get_job_parameter(second_jid, "JobState", quiet=True) == "PENDING", ( |
| "Second het should stay PENDING: association GrpTRES=node=3 is saturated " |
| "by the running first het (1+2 nodes); the follow-up het cannot start yet." |
| ) |
| |
| scancel_started_job_leaders(first_jid, second_jid) |
| |
| |
| @pytest.mark.skipif( |
| atf.get_version() <= SLURM_VERSION_HET_JOB_FIX, |
| reason="Test case invalid for the current Slurm version (requires Slurm >= 26.05).", |
| ) |
| def test_het_job_assoc_total_over_limit_rejected_partition_default_qos( |
| setup_assoc_limit_het, cancel_jobs |
| ): |
| """ |
| GrpTRES=node=3; partition DefaultQOS (no job --qos=). Slurm >= 26.05: single het 2+2 stacks 4 > 3 → rejected at submit. |
| """ |
| set_partition_default_qos(qos_assoc_cpu, qos_assoc_gpu) |
| atf.make_bash_script( |
| "het_assoc_total_partqos.in", |
| f""" |
| #SBATCH -A {acct_assoc} |
| #SBATCH -p {p_cpu} -N2 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} -N2 -t1 |
| true |
| """, |
| ) |
| assert atf.submit_job_sbatch("het_assoc_total_partqos.in", fatal=False) == 0, ( |
| "Het job (2+2 nodes, partition DefaultQOS) with GrpTRES=node=3 on the " |
| "shared association should be rejected at submit (4 > 3)." |
| ) |
| |
| |
| def test_het_job_assoc_total_same_partition_default_qos( |
| setup_assoc_limit_het, cancel_jobs |
| ): |
| """ |
| GrpTRES=node=3; partition DefaultQOS; both het components on p_cpu (1+1 nodes, 2 ≤ 3). Submit succeeds; |
| on idle cluster the job completes. |
| """ |
| set_partition_default_qos(qos_assoc_cpu, qos_assoc_gpu) |
| atf.make_bash_script( |
| "het_assoc_total_samepart.in", |
| f""" |
| #SBATCH -A {acct_assoc} |
| #SBATCH -p {p_cpu} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_cpu} -N1 -t1 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_assoc_total_samepart.in", fatal=False) |
| assert job_id != 0, ( |
| "Same partition + partition DefaultQOS: 1+1 het on p_cpu with " |
| "GrpTRES=node=3 should be accepted (2 nodes in one assoc/qos/part group)." |
| ) |
| _expect_submit_accept_het_done(job_id) |
| |
| |
| def test_het_job_duplicate_same_qos_partition_accept( |
| setup_het_dup_group_accept, cancel_jobs |
| ): |
| """ |
| DenyOnLimit; duplicate (assoc, job QoS, partition) on p_cpu; MaxTresPerUser=node=2; 1+1 het stacks |
| 2 ≤ 2 in one group → accepted at submit; on idle cluster the job completes. |
| """ |
| atf.make_bash_script( |
| "het_dup_qos_accept.in", |
| f""" |
| #SBATCH -A {acct_dup_accept} |
| #SBATCH -p {p_cpu} --qos={qos_dup_accept} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_cpu} --qos={qos_dup_accept} -N1 -t1 |
| true |
| """, |
| ) |
| job_id = atf.submit_job_sbatch("het_dup_qos_accept.in", fatal=False) |
| assert job_id != 0, ( |
| "Duplicate (assoc, qos, partition): 1+1 nodes with MaxTresPerUser=2 " |
| "should be accepted at submit." |
| ) |
| _expect_submit_accept_het_done(job_id) |
| |
| |
| def test_het_job_duplicate_same_qos_partition_rejected( |
| setup_het_dup_group_reject, cancel_jobs |
| ): |
| """ |
| DenyOnLimit; duplicate same group on p_cpu; MaxTresPerUser=node=1; 1+1 stacks 2 > 1 → rejected at submit. |
| """ |
| atf.make_bash_script( |
| "het_dup_qos_reject.in", |
| f""" |
| #SBATCH -A {acct_dup_reject} |
| #SBATCH -p {p_cpu} --qos={qos_dup_reject} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_cpu} --qos={qos_dup_reject} -N1 -t1 |
| true |
| """, |
| ) |
| assert atf.submit_job_sbatch("het_dup_qos_reject.in", fatal=False) == 0, ( |
| "Duplicate (assoc, qos, partition): 1+1 nodes with MaxTresPerUser=1 " |
| "should be rejected at submit (stacked 2 > 1)." |
| ) |
| |
| |
| # --- Combined-limit: limit_factor, (assoc,qos), (assoc,partition) rejection (tests 10–12) --- |
| |
| |
| def test_het_job_limit_factor_combined_limit_rejected( |
| setup_limit_factor_het, cancel_jobs |
| ): |
| """ |
| Association GrpTRES=node=10; QoS limit_factor=0.3 (effective node cap 3). Het 2+2 same assoc and job QoS |
| stacks 4 > 3 → rejected at submit. |
| """ |
| atf.make_bash_script( |
| "het_lf.in", |
| f""" |
| #SBATCH -A {acct_lf} |
| #SBATCH -p {p_cpu} --qos={qos_lf} -N2 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_lf} -N2 -t1 |
| true |
| """, |
| ) |
| assert atf.submit_job_sbatch("het_lf.in", fatal=False) == 0, ( |
| "Combined-limit: het job (2+2 nodes) with assoc GrpTRES=node=10 and QoS " |
| "limit_factor=0.3 (effective 3) should be rejected at submit." |
| ) |
| |
| |
| def test_het_job_qos_combined_limit_rejected(setup_qos_combined_limit_het, cancel_jobs): |
| """ |
| QoS MaxTresPerUser=node=3; het 2+2 across partitions with same assoc and job QoS stacks 4 > 3 in one |
| bucket → rejected at submit. |
| """ |
| atf.make_bash_script( |
| "het_qos_combined_limit.in", |
| f""" |
| #SBATCH -A {acct_qos_combined_limit} |
| #SBATCH -p {p_cpu} --qos={qos_combined_limit} -N2 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_gpu} --qos={qos_combined_limit} -N2 -t1 |
| true |
| """, |
| ) |
| assert atf.submit_job_sbatch("het_qos_combined_limit.in", fatal=False) == 0, ( |
| "Combined-limit: het job (2+2 nodes) with QoS MaxTresPerUser=node=3 should be " |
| "rejected at submit when stacked TRES for (assoc, job QOS) exceeds the cap." |
| ) |
| |
| |
| def test_het_job_partition_combined_limit_rejected( |
| setup_partition_combined_limit_het, cancel_jobs |
| ): |
| """ |
| Partition p_cpu QoS max node=1; 1+1 het in the same partition with different job QoS stacks 2 > 1 in |
| the assoc+partition bucket → rejected at submit. |
| """ |
| atf.make_bash_script( |
| "het_part_combined_limit.in", |
| f""" |
| #SBATCH -A {acct_part_combined_limit} |
| #SBATCH -p {p_cpu} --qos={qos_job_a} -N1 -t1 |
| #SBATCH hetjob |
| #SBATCH -p {p_cpu} --qos={qos_job_b} -N1 -t1 |
| true |
| """, |
| ) |
| assert atf.submit_job_sbatch("het_part_combined_limit.in", fatal=False) == 0, ( |
| "Combined-limit: het job (1+1 nodes in same partition) with partition QoS " |
| "limit 1 should be rejected at submit when stacked TRES for (assoc, partition) exceeds the cap." |
| ) |