Merge branch 'nate-i50644-update_job_str' into 'master'
See merge request SchedMD/dev/slurm!2002
diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h
index 595a429..c51f826 100644
--- a/slurm/slurm_errno.h
+++ b/slurm/slurm_errno.h
@@ -354,6 +354,17 @@
ESLURM_FED_JOB_LOCK,
ESLURM_FED_NO_VALID_CLUSTERS,
+ /* Lua wrapper errors */
+ ESLURM_LUA_INVALID_STATE = 7400,
+ ESLURM_LUA_INVALID_SYNTAX,
+ ESLURM_LUA_FUNC_NOT_FOUND,
+ ESLURM_LUA_FUNC_INVALID_RC,
+ ESLURM_LUA_FUNC_FAILED,
+ ESLURM_LUA_FUNC_FAILED_RUNTIME_ERROR,
+ ESLURM_LUA_FUNC_FAILED_ENOMEM,
+ ESLURM_LUA_FUNC_FAILED_GARBAGE_COLLECTOR,
+ ESLURM_LUA_INVALID_CONVERSION_TYPE,
+
/* plugin and custom errors */
ESLURM_MISSING_TIME_LIMIT = 8000,
ESLURM_INVALID_KNL,
diff --git a/src/common/forward.c b/src/common/forward.c
index 573461c..065b712 100644
--- a/src/common/forward.c
+++ b/src/common/forward.c
@@ -688,10 +688,22 @@
if (header->forward.tree_depth)
header->forward.timeout = (header->forward.timeout * depth) /
header->forward.tree_depth;
- else
- header->forward.timeout *= 2 * depth;
+ else {
+ /*
+ * tree_depth not packed - likely using 24.05- protocol version.
+ * Calculate the timeout based on MessageTimeout instead.
+ */
+ header->forward.timeout =
+ (2 * depth * slurm_conf.msg_timeout * MSEC_IN_SEC);
+ debug3("%s: original tree_depth was 0 so setting new timeout to %d",
+ __func__, header->forward.timeout);
+ }
header->forward.tree_depth = depth;
forward_struct->timeout = header->forward.timeout;
+
+ log_flag(NET, "%s: forwarding messages to %u nodes with timeout of %d",
+ __func__, forward_struct->fwd_cnt, forward_struct->timeout);
+
_forward_msg_internal(NULL, sp_hl, forward_struct, header,
forward_struct->timeout, hl_count);
diff --git a/src/common/log.c b/src/common/log.c
index d26fd97..3042ba9 100644
--- a/src/common/log.c
+++ b/src/common/log.c
@@ -308,6 +308,11 @@
atfork_install_handlers();
}
+ if (syslog_open) {
+ closelog();
+ syslog_open = false;
+ }
+
if (prog) {
if (log->argv0)
xfree(log->argv0);
@@ -340,11 +345,6 @@
log->fbuf = cbuf_create(128, 8192);
}
- if (syslog_open) {
- closelog();
- syslog_open = false;
- }
-
if (log->opt.syslog_level > LOG_LEVEL_QUIET) {
log->facility = fac;
openlog(log->argv0, LOG_PID, log->facility);
@@ -522,6 +522,10 @@
slurm_mutex_lock(&log_lock);
_log_flush(log);
+ if (syslog_open) {
+ closelog();
+ syslog_open = false;
+ }
xfree(log->argv0);
xfree(log->prefix);
if (log->buf)
@@ -530,10 +534,6 @@
cbuf_destroy(log->fbuf);
if (log->logfp)
fclose(log->logfp);
- if (syslog_open) {
- closelog();
- syslog_open = false;
- }
xfree(log);
slurm_mutex_unlock(&log_lock);
}
@@ -578,6 +578,10 @@
void log_set_argv0(char *argv0)
{
slurm_mutex_lock(&log_lock);
+ if (syslog_open) {
+ closelog();
+ syslog_open = false;
+ }
if (log->argv0)
xfree(log->argv0);
if (!argv0)
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index 4b2ff40..6e34c2e 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -632,6 +632,26 @@
{ ERRTAB_ENTRY(ESLURM_FED_NO_VALID_CLUSTERS),
"No eligible clusters for federated job" },
+ /* Lua wrapper errors */
+ { ERRTAB_ENTRY(ESLURM_LUA_INVALID_STATE),
+ "Invalid or NULL Lua state pointer" },
+ { ERRTAB_ENTRY(ESLURM_LUA_INVALID_SYNTAX),
+ "Lua script loading failed due to invalid syntax" },
+ { ERRTAB_ENTRY(ESLURM_LUA_FUNC_NOT_FOUND),
+ "Requested Lua function not found" },
+ { ERRTAB_ENTRY(ESLURM_LUA_FUNC_INVALID_RC),
+ "Lua function returned nil or table instead of numeric result" },
+ { ERRTAB_ENTRY(ESLURM_LUA_FUNC_FAILED),
+ "Lua function execution failed for unknown reasons" },
+ { ERRTAB_ENTRY(ESLURM_LUA_FUNC_FAILED_RUNTIME_ERROR),
+ "Lua function execution failed due to runtime error" },
+ { ERRTAB_ENTRY(ESLURM_LUA_FUNC_FAILED_ENOMEM),
+ "Lua function execution failed due to memory allocation failure. Either not enough memory for Lua script's variables or memory allocation request for a variable was way too large." },
+ { ERRTAB_ENTRY(ESLURM_LUA_FUNC_FAILED_GARBAGE_COLLECTOR),
+ "Lua function execution failed due to unknown failure in Lua's garbage collector" },
+ { ERRTAB_ENTRY(ESLURM_LUA_INVALID_CONVERSION_TYPE),
+ "Unknown or unparsable Lua type found at top of Lua stack" },
+
/* plugin and custom errors */
{ ERRTAB_ENTRY(ESLURM_MISSING_TIME_LIMIT),
"Time limit specification required, but not provided" },
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index dbaad8f..9b897f2 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -102,7 +102,6 @@
/* #DEFINES */
/* STATIC VARIABLES */
-static int message_timeout = -1;
/* STATIC FUNCTIONS */
static char *_global_auth_key(void);
@@ -1531,16 +1530,8 @@
msg->forward_struct->buf_len);
msg->forward_struct->ret_list = msg->ret_list;
- /* take out the amount of timeout from this hop */
- msg->forward_struct->timeout = header.forward.timeout;
- if (!msg->forward_struct->timeout)
- msg->forward_struct->timeout = message_timeout;
msg->forward_struct->fwd_cnt = header.forward.cnt;
- log_flag(NET, "%s: [%s] forwarding messages to %u nodes with timeout of %d",
- __func__, peer, msg->forward_struct->fwd_cnt,
- msg->forward_struct->timeout);
-
if (forward_msg(msg->forward_struct, &header) == SLURM_ERROR) {
/* peer may have not been resolved already */
if (!peer)
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 38fa028..931bbc4 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -4792,12 +4792,21 @@
xfree(msg);
}
-extern void slurm_free_tls_cert_response_msg(tls_cert_response_msg_t *msg)
+extern void slurm_free_tls_cert_response_msg_members(tls_cert_response_msg_t
+ *msg)
{
if (!msg)
return;
xfree(msg->signed_cert);
+}
+
+extern void slurm_free_tls_cert_response_msg(tls_cert_response_msg_t *msg)
+{
+ if (!msg)
+ return;
+
+ slurm_free_tls_cert_response_msg_members(msg);
xfree(msg);
}
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 3b455de..1556bb6 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -1642,6 +1642,8 @@
crontab_update_response_msg_t *msg);
extern void slurm_free_tls_cert_request_msg(tls_cert_request_msg_t *msg);
extern void slurm_free_tls_cert_response_msg(tls_cert_response_msg_t *msg);
+extern void slurm_free_tls_cert_response_msg_members(tls_cert_response_msg_t
+ *msg);
extern void slurm_free_suspend_exc_update_msg(suspend_exc_update_msg_t *msg);
extern void slurm_free_sbcast_cred_req_msg(sbcast_cred_req_msg_t *msg);
diff --git a/src/lua/slurm_lua.c b/src/lua/slurm_lua.c
index d83c092..116b453 100644
--- a/src/lua/slurm_lua.c
+++ b/src/lua/slurm_lua.c
@@ -67,6 +67,64 @@
void *acct_db_conn = NULL;
#endif
+#define T(status_code, string, err) { status_code, #status_code, string, err }
+
+static const struct {
+ lua_status_code_t status_code;
+ const char *status_code_string;
+ const char *string;
+ slurm_err_t err;
+} lua_status_codes[] = {
+ /*
+ * Status codes macros from lua.h and messages derived from:
+ * https://www.lua.org/manual/5.3/manual.html
+ */
+ T(LUA_OK, "SUCCESS", SLURM_SUCCESS),
+ T(LUA_YIELD, "Thread yielded", ESLURM_LUA_FUNC_FAILED),
+ T(LUA_ERRRUN, "Runtime error", ESLURM_LUA_FUNC_FAILED_RUNTIME_ERROR),
+ T(LUA_ERRSYNTAX, "Syntax error during precompilation",
+ ESLURM_LUA_INVALID_SYNTAX),
+ T(LUA_ERRMEM, "Memory allocation error", ESLURM_LUA_FUNC_FAILED_ENOMEM),
+#ifdef LUA_ERRGCMM
+ T(LUA_ERRGCMM, "Error while running a __gc metamethod",
+ ESLURM_LUA_FUNC_FAILED_GARBAGE_COLLECTOR),
+#endif
+ T(LUA_ERRERR, "Error while running the message handler",
+ ESLURM_LUA_FUNC_FAILED_RUNTIME_ERROR),
+};
+#undef T
+
+extern const char *slurm_lua_status_code_string(lua_status_code_t sc)
+{
+ for (int i = 0; i < ARRAY_SIZE(lua_status_codes); i++)
+ if (lua_status_codes[i].status_code == sc)
+ return lua_status_codes[i].string;
+
+ /*
+ * Should never happen, but the set of status codes is defined by Lua and
+ * is therefore out of Slurm's control.
+ */
+ return "Unknown Lua status code";
+}
+
+extern const char *slurm_lua_status_code_stringify(lua_status_code_t sc)
+{
+ for (int i = 0; i < ARRAY_SIZE(lua_status_codes); i++)
+ if (lua_status_codes[i].status_code == sc)
+ return lua_status_codes[i].status_code_string;
+
+ return "INVALID";
+}
+
+extern slurm_err_t slurm_lua_status_error(lua_status_code_t sc)
+{
+ for (int i = 0; i < ARRAY_SIZE(lua_status_codes); i++)
+ if (lua_status_codes[i].status_code == sc)
+ return lua_status_codes[i].err;
+
+ return ESLURM_LUA_FUNC_FAILED;
+}
+
static int _setup_stringarray(lua_State *L, int limit, char **data)
{
/*
@@ -83,17 +141,53 @@
return 1;
}
-/*
- * check that global symbol [name] in lua script is a function
- */
-static int _check_lua_script_function(lua_State *L, const char *name)
+extern int slurm_lua_pcall(lua_State *L, int nargs, int nresults, int msgh,
+ char **err_ptr, const char *caller)
{
- int rc = 0;
+ lua_status_code_t sc;
+ int rc;
+
+ sc = lua_pcall(L, nargs, nresults, msgh);
+ rc = slurm_lua_status_error(sc);
+
+ if (rc) {
+ /*
+ * When a lua_pcall() fails, Lua "pushes a single value on the
+ * stack (the error object)" per lua_pcall() description in the
+ * reference manual:
+ * https://www.lua.org/manual/5.3/manual.html
+ * When msgh == 0, the error object left on the stack is exactly the
+ * original error object.
+ *
+ * This function will lua_pop() that value to remove it from
+ * the stack.
+ */
+ lua_pop(L, 1);
+ *err_ptr = xstrdup(slurm_strerror(rc));
+
+ error("%s: lua_pcall(0x%"PRIxPTR", %d, %d, %d)=%s(%s)=%s",
+ caller, (uintptr_t) L, nargs, nresults, msgh,
+ slurm_lua_status_code_stringify(sc),
+ slurm_lua_status_code_string(sc), *err_ptr);
+ } else {
+ log_flag(SCRIPT, "%s: lua_pcall(0x%"PRIxPTR", %d, %d, %d)=%s(%s)=%s",
+ caller, (uintptr_t) L, nargs, nresults, msgh,
+ slurm_lua_status_code_stringify(sc),
+ slurm_lua_status_code_string(sc), slurm_strerror(rc));
+ }
+
+ return rc;
+}
+
+extern bool slurm_lua_is_function_defined(lua_State *L, const char *name)
+{
+ bool rc = false;
+
lua_getglobal(L, name);
- if (!lua_isfunction(L, -1))
- rc = -1;
+ rc = lua_isfunction(L, -1);
lua_pop(L, -1);
- return (rc);
+
+ return rc;
}
/*
@@ -106,7 +200,7 @@
int rc = 0;
const char **ptr = NULL;
for (ptr = req_fxns; ptr && *ptr; ptr++) {
- if (_check_lua_script_function(L, *ptr) < 0) {
+ if (!slurm_lua_is_function_defined(L, *ptr)) {
error("%s: %s: missing required function %s",
plugin, script_path, *ptr);
rc = -1;
@@ -712,7 +806,7 @@
lua_State *curr = *L;
struct stat st;
int rc = 0;
- char *err_str = NULL;
+ char *err_str = NULL, *ret_err_str = NULL;
if (stat(script_path, &st) != 0) {
err_str = xstrdup_printf("Unable to stat %s: %s",
@@ -755,9 +849,9 @@
/*
* Run the user script:
*/
- if (lua_pcall(new, 0, 1, 0)) {
- err_str = xstrdup_printf("%s: %s",
- script_path, lua_tostring(new, -1));
+ if ((rc = slurm_lua_pcall(new, 0, 1, 0, &ret_err_str, __func__))) {
+ err_str = xstrdup_printf("%s: %s", script_path, ret_err_str);
+ xfree(ret_err_str);
lua_close(new);
goto fini_error;
}
diff --git a/src/lua/slurm_lua.h b/src/lua/slurm_lua.h
index c283eb4..57b2785 100644
--- a/src/lua/slurm_lua.h
+++ b/src/lua/slurm_lua.h
@@ -44,6 +44,21 @@
#include "src/slurmctld/slurmctld.h"
#include "slurm/slurm_errno.h"
+#ifndef LUA_OK
+/* Define LUA_OK if Lua is <5.2 */
+#define LUA_OK 0
+#endif
+
+/* Using typedef as Lua status codes are distinct from POSIX return codes */
+typedef int lua_status_code_t;
+
+/* Get string description of Lua status code */
+extern const char *slurm_lua_status_code_string(lua_status_code_t sc);
+/* Get stringified form of status codes macro from lua.h */
+extern const char *slurm_lua_status_code_stringify(lua_status_code_t sc);
+/* Get slurm_err_t of status codes macro from lua.h */
+extern slurm_err_t slurm_lua_status_error(lua_status_code_t sc);
+
/* Generic stack dump function for debugging purposes */
extern void slurm_lua_stack_dump(const char *plugin,
char *header, lua_State *L);
@@ -88,6 +103,27 @@
extern int slurm_lua_job_record_field(lua_State *L, const job_record_t *job_ptr,
const char *name);
+/*
+ * Check if a function is present in script
+ * IN L - lua state table pointer
+ * IN func_name - name of function to check
+ * RET true if function is present or false if function not found
+ */
+extern bool slurm_lua_is_function_defined(lua_State *L, const char *func_name);
+
+/*
+ * Call lua_pcall() and catch error
+ * IN L - lua state table pointer
+ * IN nargs - number of arguments to function already pushed
+ * IN nresults - number of returns expected from function
+ * IN msgh - message handler
+ * IN/OUT err_ptr - Populate error string on failure. Must xfree(*err_ptr)
+ * IN caller - __func__ from caller
+ * RET SLURM_SUCCESS or error
+ */
+extern int slurm_lua_pcall(lua_State *L, int nargs, int nresults, int msgh,
+ char **err_ptr, const char *caller);
+
#else
# define LUA_VERSION_NUM 0
#endif
diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index 7a3f816..73d1a66 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -2651,8 +2651,8 @@
comp_time_limit = MIN(time_limit, deadline_time_limit);
else if (job_ptr->time_min &&
(job_ptr->time_min < time_limit)) {
- time_limit = job_ptr->time_limit = job_ptr->time_min;
comp_time_limit = time_limit;
+ time_limit = job_ptr->time_limit = job_ptr->time_min;
} else
comp_time_limit = time_limit;
if ((qos_flags & QOS_FLAG_NO_RESERVE) &&
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index fe1bdab..8e48d3a 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -7752,6 +7752,16 @@
uint16_t ntasks_per_node = job_desc->ntasks_per_node;
uint16_t ntasks_per_tres = job_desc->ntasks_per_tres;
+ /*
+ * Don't figure out num tasks / bitflags if updating the job and none
+ * of the relevant influencing fields in job_desc are set.
+ */
+ if (job_ptr &&
+ (job_desc->num_tasks == NO_VAL && job_desc->min_nodes == NO_VAL &&
+ job_desc->ntasks_per_node == NO_VAL16 &&
+ job_desc->ntasks_per_tres == NO_VAL16))
+ return;
+
if (num_tasks != NO_VAL) {
job_desc->bitflags |= JOB_NTASKS_SET;
}
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index ed70940..bc0ade2 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -5963,7 +5963,7 @@
static void _slurm_rpc_tls_cert(slurm_msg_t *msg)
{
tls_cert_request_msg_t *req = msg->data;
- tls_cert_response_msg_t *resp = xmalloc(sizeof(*resp));
+ tls_cert_response_msg_t resp = { 0 };
node_record_t *node = NULL;
bool is_client_auth = false;
@@ -5981,7 +5981,7 @@
is_client_auth = conn_g_is_client_authenticated(msg->tls_conn);
- if (!(resp->signed_cert =
+ if (!(resp.signed_cert =
certmgr_g_sign_csr(req->csr, is_client_auth, req->token,
req->node_name))) {
error("%s: Unable to sign certificate signing request.",
@@ -5991,13 +5991,13 @@
node->cert_last_renewal = time(NULL);
}
- if (resp->signed_cert) {
+ if (resp.signed_cert) {
log_flag(AUDIT_TLS, "Sending signed certificate back to node \'%s\'",
req->node_name);
}
- (void) send_msg_response(msg, RESPONSE_TLS_CERT, resp);
- slurm_free_msg_data(RESPONSE_TLS_CERT, resp);
+ (void) send_msg_response(msg, RESPONSE_TLS_CERT, &resp);
+ slurm_free_tls_cert_response_msg_members(&resp);
}
static void _slurm_rpc_sib_job_lock(slurm_msg_t *msg)
diff --git a/src/slurmrestd/plugins/openapi/slurmdbd/wckeys.c b/src/slurmrestd/plugins/openapi/slurmdbd/wckeys.c
index 4854bee..fe68f06 100644
--- a/src/slurmrestd/plugins/openapi/slurmdbd/wckeys.c
+++ b/src/slurmrestd/plugins/openapi/slurmdbd/wckeys.c
@@ -150,6 +150,6 @@
}
cleanup:
- slurmdb_destroy_wckey_rec(wckey_cond);
+ slurmdb_destroy_wckey_cond(wckey_cond);
return SLURM_SUCCESS;
}
diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c
index 9e7fe4e..a566af8 100644
--- a/src/srun/srun_job.c
+++ b/src/srun/srun_job.c
@@ -1994,6 +1994,7 @@
(setenvf(NULL, key, "%u", resp->segment_size) < 0)) {
error("unable to set %s in environment", key);
}
+ xfree(key);
}
return;
diff --git a/testsuite/expect/globals_accounting b/testsuite/expect/globals_accounting
index 10bd205..be40646 100644
--- a/testsuite/expect/globals_accounting
+++ b/testsuite/expect/globals_accounting
@@ -1190,7 +1190,7 @@
# Use sacctmgr to load info
#
set rc $::RETURN_SUCCESS
- set result [run_command -timeout 180 "$sacctmgr -i -n archive load $file"]
+ set result [run_command -timeout 360 "$sacctmgr -i -n archive load $file"]
if { [dict get $result exit_code] != 0 || [regexp "There was a problem" [dict get $result output]]} {
log_error "sacctmgr didn't load archive correctly"
set rc $::RETURN_ERROR
diff --git a/testsuite/expect/test38.3 b/testsuite/expect/test38.3
index 337383d..ffcb130 100755
--- a/testsuite/expect/test38.3
+++ b/testsuite/expect/test38.3
@@ -107,14 +107,20 @@
set id_set $expect_out(1,string)
exp_continue
}
- -re "CPUs/Task=1" {
+ -re "CPUs/Task=4" {
incr matches
exp_continue
}
- -re "MinMemoryCPU=6M" {
+ -re "MinMemoryCPU=10M" {
incr matches
exp_continue
}
+ -re "JobId=$re_word_str\\s+HetJobId=$job_id" {
+ # this match ensures that the previous regexp are matched before
+ # the 2nd and 3rd components are printed
+ incr matches
+ exp_continue
+ }
timeout {
fail "scontrol not responding"
}
@@ -122,8 +128,8 @@
wait
}
}
-if {$matches != 3} {
- fail "Problem with scontrol ($matches != 3)"
+if {$matches != 5} {
+ fail "Problem with scontrol ($matches != 5)"
}
set js [parse_id_set $id_set $job_id]
diff --git a/testsuite/python/conftest.py b/testsuite/python/conftest.py
index 8928a40..e6c929a 100644
--- a/testsuite/python/conftest.py
+++ b/testsuite/python/conftest.py
@@ -158,6 +158,7 @@
atf.properties["configurations-modified"] = set()
atf.properties["orig-environment"] = dict(os.environ)
atf.properties["orig-pypath"] = list(sys.path)
+ atf.properties["forced_upgrade_setup"] = False
if "old-slurm-prefix" in atf.properties.keys():
del atf.properties["old-slurm-prefix"]
if "new-slurm-prefix" in atf.properties.keys():
@@ -265,7 +266,7 @@
atf.run_command(f"sudo rm -rf {tmpfs_dir}", quiet=True)
# Restore upgrade setup
- if "old-slurm-prefix" in atf.properties.keys():
+ if atf.properties.get("forced_upgrade_setup"):
logging.debug("Restoring upgrade setup...")
if not os.path.exists(f"{atf.module_tmp_path}/upgrade-sbin"):
pytest.fail(
@@ -379,13 +380,100 @@
"""
# The plugin uses ESPANK_NODE_FAILURE, so it needs to compile against 25.05+
- atf.require_version((25, 5), "config.h")
+ # It also needs to be built against the same version of slurmd and submit
+ # clients like sbatch
+ new_prefixes = False
+ if not atf.is_upgrade_setup():
+ atf.require_version((25, 5), "config.h")
+ else:
+ slurmd_version = atf.get_version("sbin/slurmd")
+ sbatch_version = atf.get_version("bin/sbatch")
+
+ if slurmd_version != sbatch_version:
+ pytest.skip(
+ f"We need to build SPANK against Slurm version of submit clients as sbatch {sbatch_version} and slurmd {slurmd_version}, but they differ."
+ )
+
+ if slurmd_version < (25, 5):
+ pytest.skip(
+ f"This SPANK plugin needs a Slurm 25.05+, but slurmd version is {slurmd_version}"
+ )
+
+ if (
+ atf.get_version("config.h", slurm_prefix=atf.properties["new-build-prefix"])
+ == slurmd_version
+ ):
+ new_prefixes = True
+ elif (
+ not atf.get_version(
+ "config.h", slurm_prefix=atf.properties["old-build-prefix"]
+ )
+ == slurmd_version
+ ):
+ # This should never happen, slurmd should be one of those versions
+ pytest.fail(
+ f"Unable to find build dir to match slurmd version {slurmd_version}"
+ )
src_path = atf.properties["testsuite_scripts_dir"] + "/spank_fail_test.c"
bin_path = os.getcwd() + "/spank_fail_test.so"
- atf.compile_against_libslurm(src_path, bin_path, full=True, shared=True)
+ atf.compile_against_libslurm(
+ src_path, bin_path, full=True, shared=True, new_prefixes=new_prefixes
+ )
yield bin_path
atf.run_command(f"rm -f {bin_path}", fatal=True)
+
+
+@pytest.fixture(scope="module")
+def spank_tmp_lib(module_setup):
+ """
+ Compiles a SPANK plugin that will write files in a /tmp directory.
+ Returns the tmp_spank dir and the bin path of the spank .so that will write
+ files in the tmp_spank dir if configured.
+ """
+
+ # In an upgrade setup the plugin needs to be built against the same
+ # version of slurmd and submit clients like sbatch, so select the build
+ # prefixes whose config.h version matches the slurmd version.
+ new_prefixes = False
+ if atf.is_upgrade_setup():
+ slurmd_version = atf.get_version("sbin/slurmd")
+ sbatch_version = atf.get_version("bin/sbatch")
+
+ if slurmd_version != sbatch_version:
+ pytest.skip(
+ f"We need to build SPANK against Slurm version of submit clients as sbatch {sbatch_version} and slurmd {slurmd_version}, but they differ."
+ )
+ if (
+ atf.get_version("config.h", slurm_prefix=atf.properties["new-build-prefix"])
+ == slurmd_version
+ ):
+ new_prefixes = True
+ elif (
+ not atf.get_version(
+ "config.h", slurm_prefix=atf.properties["old-build-prefix"]
+ )
+ == slurmd_version
+ ):
+ # This should never happen, slurmd should be one of those versions
+ pytest.fail(
+ f"Unable to find build dir to match slurmd version {slurmd_version}"
+ )
+
+ src_path = atf.properties["testsuite_scripts_dir"] + "/spank_tmp_plugin.c"
+ bin_path = os.getcwd() + "/spank_tmp_plugin.so"
+
+ atf.compile_against_libslurm(
+ src_path, bin_path, full=True, shared=True, new_prefixes=new_prefixes
+ )
+
+ tmp_spank = "/tmp/spank"
+ atf.run_command(f"mkdir -p {tmp_spank}", fatal=True)
+
+ yield tmp_spank, bin_path
+
+ atf.run_command(f"rm -f {bin_path}", fatal=True)
+ atf.run_command(f"rm -rf {tmp_spank}", fatal=True)
diff --git a/testsuite/python/lib/atf.py b/testsuite/python/lib/atf.py
index dc2c410..13915e9 100644
--- a/testsuite/python/lib/atf.py
+++ b/testsuite/python/lib/atf.py
@@ -1061,7 +1061,13 @@
def is_upgrade_setup(
- old_slurm_prefix="/opt/slurm-old", new_slurm_prefix="/opt/slurm-new"
+ old_slurm_prefix="/opt/slurm-old",
+ new_slurm_prefix="/opt/slurm-new",
+ old_build_prefix="",
+ new_build_prefix="",
+ old_source_prefix="",
+ new_source_prefix="",
+ force_old=False,
):
"""
Return True if we have two Slurms configured in the system.
@@ -1075,11 +1081,27 @@
logging.debug(f"New prefix {new_slurm_prefix} not exists.")
return False
+ # Add the right properties
+ setup_upgrades(
+ old_slurm_prefix,
+ new_slurm_prefix,
+ old_build_prefix,
+ new_build_prefix,
+ old_source_prefix,
+ new_source_prefix,
+ force_old,
+ )
return True
def require_upgrades(
- old_slurm_prefix="/opt/slurm-old", new_slurm_prefix="/opt/slurm-new"
+ old_slurm_prefix="/opt/slurm-old",
+ new_slurm_prefix="/opt/slurm-new",
+ old_build_prefix="",
+ new_build_prefix="",
+ old_source_prefix="",
+ new_source_prefix="",
+ force_old=True,
):
"""Checks if has two different versions installed.
@@ -1088,7 +1110,15 @@
if not properties["auto-config"]:
require_auto_config("to change/upgrade Slurm setup")
- if not is_upgrade_setup():
+ if not is_upgrade_setup(
+ old_slurm_prefix,
+ new_slurm_prefix,
+ old_build_prefix,
+ new_build_prefix,
+ old_source_prefix,
+ new_source_prefix,
+ force_old,
+ ):
pytest.skip("This test needs an upgrade setup")
# Double-check that old_version <= new_version
@@ -1100,37 +1130,71 @@
)
logging.info(f"Required upgrade setup found: {old_version} and {new_version}")
+
+def setup_upgrades(
+ old_slurm_prefix="/opt/slurm-old",
+ new_slurm_prefix="/opt/slurm-new",
+ old_build_prefix="",
+ new_build_prefix="",
+ old_source_prefix="",
+ new_source_prefix="",
+ force_old=False,
+):
+ """
+ Adds the necessary atf.properties[] with the old/new paths.
+ If force_old is specified it also updates the links pointing to the old
+ paths, and they will be restored in the global teardown.
+ """
+ # TODO: We should use slurm-new(-build) instead of slurm-git(-build)
+ if old_build_prefix == "":
+ old_build_prefix = properties["slurm-build-dir"]
+ if new_build_prefix == "":
+ new_build_prefix = f"{properties['slurm-build-dir']}/../slurm-git-build"
+ if old_source_prefix == "":
+ old_source_prefix = properties["slurm-source-dir"]
+ if new_source_prefix == "":
+ new_source_prefix = f"{properties['slurm-source-dir']}/../slurm-git"
+
properties["old-slurm-prefix"] = old_slurm_prefix
properties["new-slurm-prefix"] = new_slurm_prefix
- logging.debug(
- "Setting bin/ and sbin/ pointing to old version and saving a backup..."
- )
- run_command(
- f"sudo mv {properties['slurm-sbin-dir']} {module_tmp_path}/upgrade-sbin",
- quiet=True,
- fatal=True,
- )
- run_command(
- f"sudo mv {properties['slurm-bin-dir']} {module_tmp_path}/upgrade-bin",
- quiet=True,
- fatal=True,
- )
- run_command(
- f"sudo mkdir {properties['slurm-sbin-dir']} {properties['slurm-bin-dir']}",
- quiet=True,
- fatal=True,
- )
- run_command(
- f"sudo ln -s {properties['old-slurm-prefix']}/sbin/* {properties['slurm-sbin-dir']}/",
- quiet=True,
- fatal=True,
- )
- run_command(
- f"sudo ln -s {properties['old-slurm-prefix']}/bin/* {properties['slurm-bin-dir']}/",
- quiet=True,
- fatal=True,
- )
+ properties["old-build-prefix"] = old_build_prefix
+ properties["new-build-prefix"] = new_build_prefix
+
+ properties["old-source-prefix"] = old_source_prefix
+ properties["new-source-prefix"] = new_source_prefix
+
+ properties["forced_upgrade_setup"] = force_old
+
+ if force_old:
+ logging.debug(
+ "Setting bin/ and sbin/ pointing to old version and saving a backup..."
+ )
+ run_command(
+ f"sudo mv {properties['slurm-sbin-dir']} {module_tmp_path}/upgrade-sbin",
+ quiet=True,
+ fatal=True,
+ )
+ run_command(
+ f"sudo mv {properties['slurm-bin-dir']} {module_tmp_path}/upgrade-bin",
+ quiet=True,
+ fatal=True,
+ )
+ run_command(
+ f"sudo mkdir {properties['slurm-sbin-dir']} {properties['slurm-bin-dir']}",
+ quiet=True,
+ fatal=True,
+ )
+ run_command(
+ f"sudo ln -s {properties['old-slurm-prefix']}/sbin/* {properties['slurm-sbin-dir']}/",
+ quiet=True,
+ fatal=True,
+ )
+ run_command(
+ f"sudo ln -s {properties['old-slurm-prefix']}/bin/* {properties['slurm-bin-dir']}/",
+ quiet=True,
+ fatal=True,
+ )
def upgrade_component(component, new_version=True):
@@ -1193,13 +1257,15 @@
component (string): The bin/ or sbin/ component of Slurm to check.
It also supports "config.h" to obtain the VERSION in the header.
slurm_prefix (string): The path where the component is. By default the defined in testsuite.conf.
- Ignored when component is "config.h".
+ If component is "config.h", then it's the build dir.
Returns:
A tuple representing the version. E.g. (25.05.0).
"""
if component == "config.h":
- header = pathlib.Path(f"{properties['slurm-build-dir']}/config.h")
+ if slurm_prefix == "":
+ slurm_prefix = properties["slurm-build-dir"]
+ header = pathlib.Path(f"{slurm_prefix}/config.h")
if not header.exists():
pytest.fail("Unable to access to config.h to get Slurm version")
@@ -4397,6 +4463,7 @@
build_args="",
full=False,
shared=False,
+ new_prefixes=False,
**run_command_kwargs,
):
"""Compiles a test program against either libslurm.so or libslurmfull.so.
@@ -4421,26 +4488,32 @@
>>> compile_against_libslurm("my_test.c", "my_test", build_args="-Wall -Werror")
"""
+ slurm_prefix = properties["slurm-prefix"]
+ slurm_source = properties["slurm-source-dir"]
+ slurm_build = properties["slurm-build-dir"]
+ if new_prefixes:
+ slurm_prefix = properties["new-slurm-prefix"]
+ slurm_source = properties["new-source-prefix"]
+ slurm_build = properties["new-build-prefix"]
+
if full:
slurm_library = "slurmfull"
else:
slurm_library = "slurm"
- if os.path.isfile(
- f"{properties['slurm-prefix']}/lib64/slurm/lib{slurm_library}.so"
- ):
+ if os.path.isfile(f"{slurm_prefix}/lib64/slurm/lib{slurm_library}.so"):
lib_dir = "lib64"
else:
lib_dir = "lib"
if full:
- lib_path = f"{properties['slurm-prefix']}/{lib_dir}/slurm"
+ lib_path = f"{slurm_prefix}/{lib_dir}/slurm"
else:
- lib_path = f"{properties['slurm-prefix']}/{lib_dir}"
+ lib_path = f"{slurm_prefix}/{lib_dir}"
command = f"gcc {source_file} -g -pthread"
if shared:
command += " -fPIC -shared"
command += f" -o {dest_file}"
- command += f" -I{properties['slurm-source-dir']} -I{properties['slurm-build-dir']} -I{properties['slurm-prefix']}/include -Wl,-rpath={lib_path} -L{lib_path} -l{slurm_library} -lresolv"
+ command += f" -I{slurm_source} -I{slurm_build} -I{slurm_prefix}/include -Wl,-rpath={lib_path} -L{lib_path} -l{slurm_library} -lresolv"
if build_args != "":
command += f" {build_args}"
run_command(command, **run_command_kwargs)
diff --git a/testsuite/python/scripts/spank_tmp_plugin.c b/testsuite/python/scripts/spank_tmp_plugin.c
new file mode 100644
index 0000000..635bf5c
--- /dev/null
+++ b/testsuite/python/scripts/spank_tmp_plugin.c
@@ -0,0 +1,46 @@
+/*****************************************************************************\
+ * Copyright (C) SchedMD LLC.
+ \*****************************************************************************/
+#include <fcntl.h>
+#include <slurm/spank.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+SPANK_PLUGIN(spank_tmp_plugin, 1);
+
+int slurm_spank_user_init(spank_t sp, int ac, char **av)
+{
+ FILE *file = fopen("/tmp/spank/slurm_spank_user_init_log", "w");
+ if (file == NULL) {
+ slurm_error("Failed to open slurm_spank_user_init_log file\n");
+ return ESPANK_ERROR;
+ }
+ fprintf(file, "slurm_spank_user_init_executed\n");
+ fclose(file);
+ return ESPANK_SUCCESS;
+}
+
+int slurm_spank_task_post_fork(spank_t sp, int ac, char **av)
+{
+ FILE *file = fopen("/tmp/spank/slurm_spank_task_post_fork_log", "w");
+ if (file == NULL) {
+ slurm_error(
+ "Failed to open slurm_spank_task_post_fork_log file\n");
+ return ESPANK_ERROR;
+ }
+ fprintf(file, "slurm_spank_task_post_fork_executed\n");
+ fclose(file);
+ return ESPANK_SUCCESS;
+}
+
+int slurm_spank_task_exit(spank_t sp, int ac, char **av)
+{
+ FILE *file = fopen("/tmp/spank/slurm_spank_task_exit_log", "w");
+ if (file == NULL) {
+ slurm_error("Failed to open slurm_spank_task_exit_log file\n");
+ return ESPANK_ERROR;
+ }
+ fprintf(file, "slurm_spank_task_exit_executed\n");
+ fclose(file);
+ return ESPANK_SUCCESS;
+}
diff --git a/testsuite/python/scripts/test_147_1_spank_plugin.c b/testsuite/python/scripts/test_147_1_spank_plugin.c
deleted file mode 100644
index c2a8737..0000000
--- a/testsuite/python/scripts/test_147_1_spank_plugin.c
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <slurm/spank.h>
-
-SPANK_PLUGIN(test_spank_plugin, 1);
-
-int slurm_spank_user_init(spank_t sp, int ac, char **av) {
- FILE *file = fopen("/tmp/test_147_1_private/slurm_spank_user_init_log", "w");
- if (file == NULL) {
- slurm_error("Failed to open slurm_spank_user_init_log file\n");
- return ESPANK_ERROR;
- }
- fprintf(file, "slurm_spank_user_init_executed\n");
- fclose(file);
- return ESPANK_SUCCESS;
-}
-
-int slurm_spank_task_post_fork(spank_t sp, int ac, char **av) {
- FILE *file = fopen("/tmp/test_147_1_private/slurm_spank_task_post_fork_log", "w");
- if (file == NULL) {
- slurm_error("Failed to open slurm_spank_task_post_fork_log file\n");
- return ESPANK_ERROR;
- }
- fprintf(file, "slurm_spank_task_post_fork_executed\n");
- fclose(file);
- return ESPANK_SUCCESS;
-}
-
-int slurm_spank_task_exit(spank_t sp, int ac, char **av) {
- FILE *file = fopen("/tmp/test_147_1_private/slurm_spank_task_exit_log", "w");
- if (file == NULL) {
- slurm_error("Failed to open slurm_spank_task_exit_log file\n");
- return ESPANK_ERROR;
- }
- fprintf(file, "slurm_spank_task_exit_executed\n");
- fclose(file);
- return ESPANK_SUCCESS;
-}
diff --git a/testsuite/python/tests/test_103_2.py b/testsuite/python/tests/test_103_2.py
index 6af2245..d9e2c44 100644
--- a/testsuite/python/tests/test_103_2.py
+++ b/testsuite/python/tests/test_103_2.py
@@ -14,6 +14,12 @@
atf.require_slurm_running()
+@pytest.mark.xfail(
+ atf.get_version() < (24, 11, 1)
+ and "use_interactive_step"
+ in atf.get_config_parameter("LaunchParameters", "", live=False),
+ reason="The 'ioctl(TIOCGWINSZ): Inappropriate ioctl for device' error when using LaunchParameters=use_interactive_step was fixed in 24.11.1",
+)
def test_salloc_normal():
"""Test salloc allocations without commands. We test the stderr and
stdout because the exit_codes seem to be 0 even when it has error messages.
diff --git a/testsuite/python/tests/test_116_48.py b/testsuite/python/tests/test_116_48.py
index 0be0563..d606a04 100644
--- a/testsuite/python/tests/test_116_48.py
+++ b/testsuite/python/tests/test_116_48.py
@@ -5,14 +5,10 @@
import pytest
import re
-# TODO: Bug 17619
-# From the docs:
-# "A single srun opens 3 listening ports plus 2 more for every 48 hosts."
-# It seems that docs are not right, but (4 + 2 * ((hosts-1)//48))
-port_range = 8
+port_range = 9
srun_port_lower = 60000
-srun_port_upper = srun_port_lower + port_range
+srun_port_upper = srun_port_lower + port_range - 1 # 60008 inclusive
# Setup
@@ -35,7 +31,7 @@
| grep SrunHost | awk -F: '{print \\$3}')
lsof -P -p \\$task_id 2>/dev/null | grep LISTEN | awk '{print \\$9}' \
| awk -F: '{print \\$2}'"'''
- output = atf.run_command_output(f"srun -N{nodes} {command}").split("\n")
+ output = atf.run_job_output(f"-N{nodes} {command}", timeout=120).split("\n")
count = 0
for port_string in output:
# Ignore blank lines
@@ -47,6 +43,9 @@
port_int >= srun_port_lower and port_int <= srun_port_upper
), f"Port {port_int} is not in range {srun_port_lower}-{srun_port_upper}"
+ # From the docs:
+ # "A single srun opens 4 listening ports plus 2 more for every 48 hosts
+ # beyond the first 48."
ports = nodes * (4 + 2 * ((nodes - 1) // 48))
assert count == ports, f"srun with -N{nodes} should use {ports} ports, not {count}"
@@ -55,14 +54,11 @@
def test_srun_ports_out_of_range(nodes):
"""Test sruns with too many nodes, so with not enough SrunPortRange"""
- result = atf.run_command(f"srun -t1 -N{nodes} sleep 1", xfail=True)
- assert (
- result["exit_code"] != 0
- ), f"srun with -N{nodes} should fail because it needs more than {port_range} ports"
+ result = atf.run_job_error(f"-t1 -N{nodes} sleep 1", fatal=True, xfail=True)
regex = rf"all ports in range .{srun_port_lower}, {srun_port_upper}. exhausted"
assert (
- re.search(regex, result["stderr"]) is not None
+ re.search(regex, result) is not None
), "srun's stderr should contain the 'all ports in range exhausted' message"
@@ -75,12 +71,9 @@
atf.wait_for_step(job_id1, 0, fatal=True)
atf.wait_for_step(job_id2, 0, fatal=True)
- result = atf.run_command("srun -t1 -N1 sleep 1", xfail=True)
- assert (
- result["exit_code"] != 0
- ), "srun should fail because running job shoulb be using the whole SrunPortRange"
+ result = atf.run_job_error("-t1 -N1 sleep 1", fatal=True, xfail=True)
regex = rf"all ports in range .{srun_port_lower}, {srun_port_upper}. exhausted"
assert (
- re.search(regex, result["stderr"]) is not None
+ re.search(regex, result) is not None
), "srun's stderr should contain the 'all ports in range exhausted' message"
diff --git a/testsuite/python/tests/test_147_1.py b/testsuite/python/tests/test_147_1.py
index 5a565d0..7785851 100644
--- a/testsuite/python/tests/test_147_1.py
+++ b/testsuite/python/tests/test_147_1.py
@@ -2,29 +2,22 @@
import pytest
import os
-spank_plugin_script = "test_147_1_spank_plugin.c"
-spank_compiled_plugin = "test_147_1_spank_plugin.so"
+spank_tmp = ""
# Setup
@pytest.fixture(scope="module", autouse=True)
-def setup():
+def setup(spank_tmp_lib):
+ global spank_tmp
+ spank_tmp, spank_lib = spank_tmp_lib
atf.require_config_parameter("JobContainerType", "job_container/tmpfs")
atf.require_config_parameter_includes("SlurmdParameters", "contain_spank")
atf.require_config_parameter_includes("PrologFlags", "Contain")
- # Compile SPANK plugin
- spank_plugin_script_path = (
- f"{atf.properties['testsuite_scripts_dir']}/{spank_plugin_script}"
- )
- atf.compile_against_libslurm(
- spank_plugin_script_path, spank_compiled_plugin, full=True, shared=True
- )
-
# Ensure the SPANK plugin is included in plugstack.conf
atf.require_config_parameter(
"required",
- f"{atf.module_tmp_path}/{spank_compiled_plugin}",
+ f"{spank_lib}",
delimiter=" ",
source="plugstack",
)
@@ -35,11 +28,8 @@
"BasePath", "/tmp/%h_%n_base_path", source="job_container"
)
- # Mount /tmp/test_147_1_private in the job container as a private mount
- atf.require_config_parameter(
- "Dirs", "/tmp/test_147_1_private", source="job_container"
- )
- atf.run_command("mkdir -p /tmp/test_147_1_private", fatal=True)
+ # Mount spank_tmp in the job container as a private mount
+ atf.require_config_parameter("Dirs", spank_tmp, source="job_container")
atf.require_slurm_running()
@@ -49,14 +39,14 @@
Test that SPANK plugin hooks execute correctly in the tmpfs job container.
"""
# Clear out the private mount and create a file outside the container
- atf.run_command("rm -rf /tmp/test_147_1_private/*", fatal=True)
- atf.run_command("touch /tmp/test_147_1_private/file_on_host", fatal=True)
+ atf.run_command(f"rm -rf {spank_tmp}/*", fatal=True)
+ atf.run_command(f"touch {spank_tmp}/file_on_host", fatal=True)
atf.make_bash_script(
"job.sh",
- """
+ f"""
# Check to make sure job_container/tmpfs made a private mount
- if [[ -f /tmp/test_147_1_private/file_on_host ]]; then
+ if [[ -f {spank_tmp}/file_on_host ]]; then
echo "job_container/tmpfs failed to create private mount"
else
echo "job_container/tmpfs created private mount"
@@ -66,21 +56,21 @@
srun hostname
# Check if slurm_spank_user_init executed its functions and left behind a file
- if [[ -f /tmp/test_147_1_private/slurm_spank_user_init_log ]]; then
+ if [[ -f {spank_tmp}/slurm_spank_user_init_log ]]; then
echo "Found log for hook slurm_spank_user_init"
else
echo "Couldn't find log for hook slurm_spank_user_init"
fi
# Check if slurm_spank_task_post_fork executed its functions and left behind a file
- if [[ -f /tmp/test_147_1_private/slurm_spank_task_post_fork_log ]]; then
+ if [[ -f {spank_tmp}/slurm_spank_task_post_fork_log ]]; then
echo "Found log for hook slurm_spank_task_post_fork"
else
echo "Couldn't find log for hook slurm_spank_task_post_fork"
fi
# Check if slurm_spank_task_exit executed its functions and left behind a file
- if [[ -f /tmp/test_147_1_private/slurm_spank_task_exit_log ]]; then
+ if [[ -f {spank_tmp}/slurm_spank_task_exit_log ]]; then
echo "Found log for hook slurm_spank_task_exit"
else
echo "Couldn't find log for hook slurm_spank_task_exit"
@@ -115,7 +105,7 @@
"job_container/tmpfs created private mount" in content
), "job_container/tmpfs failed to create a private mount because we found a pre-existing file on the host when running a job"
assert not (
- os.path.isfile("/tmp/test_147_1_private/slurm_spank_user_init_log")
- or os.path.isfile("/tmp/test_147_1_private/slurm_spank_task_post_fork_log")
- or os.path.isfile("/tmp/test_147_1_private/slurm_spank_task_exit_log")
+ os.path.isfile(f"{spank_tmp}/slurm_spank_user_init_log")
+ or os.path.isfile(f"{spank_tmp}/slurm_spank_task_post_fork_log")
+ or os.path.isfile(f"{spank_tmp}/slurm_spank_task_exit_log")
), "job_container/tmpfs failed to isolate private mount; files created in container appear on host"