/*
* Interface between lower-level ALPS XML-RPC functions and SLURM.
*
* Copyright (c) 2010-11 Centro Svizzero di Calcolo Scientifico (CSCS)
* Licensed under GPLv2.
*/
#include "basil_interface.h"
#include "basil_alps.h"
#include "src/common/gres.h"
#include "src/common/slurm_accounting_storage.h"
#define _DEBUG 0
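/* System dimensions (X/Y/Z), populated from SDB node coordinates in
 * basil_geometry() below. */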
int dim_size[3] = {0, 0, 0};
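/* Arguments passed to the delayed-signal thread (_sig_basil), spawned by
 * queue_basil_signal(). */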
typedef struct args_sig_basil {
uint32_t resv_id;
int signal;
uint16_t delay;
} args_sig_basil_t;
/*
* Following routines are from src/plugins/select/bluegene/plugin/jobinfo.c
*/
static int _set_select_jobinfo(select_jobinfo_t *jobinfo,
enum select_jobdata_type data_type, void *data)
{
uint32_t *uint32 = (uint32_t *) data;
if (jobinfo == NULL) {
error("cray/set_select_jobinfo: jobinfo not set");
return SLURM_ERROR;
}
if (jobinfo->magic != JOBINFO_MAGIC) {
error("cray/set_select_jobinfo: jobinfo magic bad");
return SLURM_ERROR;
}
switch (data_type) {
case SELECT_JOBDATA_RESV_ID:
jobinfo->reservation_id = *uint32;
break;
default:
error("cray/set_select_jobinfo: data_type %d invalid",
data_type);
}
return SLURM_SUCCESS;
}
static int _get_select_jobinfo(select_jobinfo_t *jobinfo,
enum select_jobdata_type data_type, void *data)
{
uint64_t *uint64 = (uint64_t *) data;
uint32_t *uint32 = (uint32_t *) data;
if (jobinfo == NULL) {
error("cray/get_select_jobinfo: jobinfo not set");
return SLURM_ERROR;
}
if (jobinfo->magic != JOBINFO_MAGIC) {
error("cray/get_select_jobinfo: jobinfo magic bad");
return SLURM_ERROR;
}
switch (data_type) {
case SELECT_JOBDATA_RESV_ID:
*uint32 = jobinfo->reservation_id;
break;
case SELECT_JOBDATA_PAGG_ID:
*uint64 = jobinfo->confirm_cookie;
break;
default:
error("cray/get_select_jobinfo: data_type %d invalid",
data_type);
}
return SLURM_SUCCESS;
}
/** Convert between Cray NID and slurm nodename format */
static struct node_record *_find_node_by_basil_id(uint32_t node_id)
{
char nid[9]; /* "nid" + 5 digits + '\0' */
snprintf(nid, sizeof(nid), "nid%05u", node_id);
return find_node_record(nid);
}
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
enum basil_version version = get_basil_version();
struct basil_inventory *inv;
struct basil_node *node;
int rank_count = 0, i;
hostlist_t hl = hostlist_create(NULL);
bool bad_node = false;
/*
* When obtaining the initial configuration, we can not allow ALPS to
* fail. If there is a problem at this stage it is better to restart
* SLURM completely, after investigating (and/or fixing) the cause.
*/
inv = get_full_inventory(version);
if (inv == NULL)
fatal("failed to get BASIL %s ranking", bv_names_long[version]);
else if (!inv->batch_total)
fatal("system has no usable batch compute nodes");
else if (inv->batch_total < node_cnt)
info("Warning: ALPS sees only %d/%d slurm.conf nodes, "
"check DownNodes", inv->batch_total, node_cnt);
debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
bv_names_long[version], inv->batch_avail, inv->batch_total);
/*
* Node ranking is based on a subset of the inventory: only nodes in
* batch allocation mode which are up and not allocated. Assign a
* 'NO_VAL' rank to all other nodes, which will translate as a very
* high value, (unsigned)-2, to put those nodes last in the ranking.
* The rest of the code must ensure that those nodes are never chosen.
*/
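/*
 * Illustrative example: with inv->nodes_total == 100, the first usable
 * batch node returned by ALPS below gets node_rank 100, the next 99,
 * and so on; any node absent from the ALPS inventory keeps NO_VAL and
 * therefore sorts last.
 */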
for (i = 0; i < node_cnt; i++)
node_array[i].node_rank = NO_VAL;
for (node = inv->f->node_head; node; node = node->next) {
struct node_record *node_ptr;
char tmp[50];
/* This will ignore interactive nodes when iterating through
* the apbasil inventory. If we don't do this, SLURM is
* unable to resolve the ID to a nidXXX name since it's not in
* the slurm.conf file. (Chris North)
*/
if (node->role == BNR_INTER)
continue;
node_ptr = _find_node_by_basil_id(node->node_id);
if (node_ptr == NULL) {
error("nid%05u (%s node in state %s) not in slurm.conf",
node->node_id, nam_noderole[node->role],
nam_nodestate[node->state]);
bad_node = 1;
} else
node_ptr->node_rank = inv->nodes_total - rank_count++;
sprintf(tmp, "nid%05u", node->node_id);
hostlist_push(hl, tmp);
}
free_inv(inv);
if (bad_node) {
hostlist_sort(hl);
char *name = hostlist_ranged_string_xmalloc(hl);
info("It appears your slurm.conf nodelist does not "
"match the ALPS system. Here are the nodes ALPS "
"knows about:\n%s", name);
xfree(name);
}
hostlist_destroy(hl);
return SLURM_SUCCESS;
}
/**
* basil_inventory - Periodic node-state query via ALPS XML-RPC.
* This should be run immediately before each scheduling cycle.
* Returns non-SLURM_SUCCESS if
* - INVENTORY method failed (error)
* - no nodes are available (no point in scheduling)
* - orphaned ALPS reservation exists (wait until ALPS resynchronizes)
*/
extern int basil_inventory(void)
{
enum basil_version version = get_basil_version();
struct basil_inventory *inv;
struct basil_node *node;
struct basil_rsvn *rsvn;
int slurm_alps_mismatch = 0;
int rc = SLURM_SUCCESS;
time_t now = time(NULL);
static time_t slurm_alps_mismatch_time = (time_t) 0;
static bool logged_sync_timeout = false;
inv = get_full_inventory(version);
if (inv == NULL) {
error("BASIL %s INVENTORY failed", bv_names_long[version]);
return SLURM_ERROR;
}
debug("BASIL %s INVENTORY: %d/%d batch nodes available",
bv_names_long[version], inv->batch_avail, inv->batch_total);
/* Do not check inv->batch_avail here: when gang scheduling,
returning an error for a fully allocated system would be the
wrong thing to do (the schedule() function in slurmctld would
never run).
*/
if (!inv->f->node_head || !inv->batch_total)
rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
for (node = inv->f->node_head; node; node = node->next) {
int node_inx;
struct node_record *node_ptr;
char *reason = NULL;
/* This will ignore interactive nodes when iterating through
* the apbasil inventory. If we don't do this, SLURM is
* unable to resolve the ID to a nidXXX name since it's not in
* the slurm.conf file. (Chris North)
*/
if (node->role == BNR_INTER)
continue;
node_ptr = _find_node_by_basil_id(node->node_id);
if (node_ptr == NULL) {
error("nid%05u (%s node in state %s) not in slurm.conf",
node->node_id, nam_noderole[node->role],
nam_nodestate[node->state]);
continue;
}
node_inx = node_ptr - node_record_table_ptr;
if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
/*
* ALPS still hangs on to the node while SLURM considers
* it already unallocated. Possible causes are partition
* cleanup taking too long (can be 10sec ... minutes),
* and orphaned ALPS reservations (caught below).
*
* The converse case (SLURM hanging on to the node while
* ALPS has already freed it) happens frequently during
* job completion: select_g_job_fini() is called before
* make_node_comp(). Rely on SLURM logic for this case.
*/
slurm_alps_mismatch++;
}
if (node->state == BNS_DOWN) {
reason = "ALPS marked it DOWN";
} else if (node->state == BNS_UNAVAIL) {
reason = "node is UNAVAILABLE";
} else if (node->state == BNS_ROUTE) {
reason = "node does ROUTING";
} else if (node->state == BNS_SUSPECT) {
reason = "entered SUSPECT mode";
} else if (node->state == BNS_ADMINDOWN) {
reason = "node is ADMINDOWN";
} else if (node->state != BNS_UP) {
reason = "state not UP";
} else if (node->role != BNR_BATCH) {
reason = "mode not BATCH";
} else if (node->arch != BNA_XT) {
reason = "arch not XT/XE";
}
/* Base state entirely derives from ALPS */
if (reason) {
if (node_ptr->down_time == 0)
node_ptr->down_time = now;
if (IS_NODE_DOWN(node_ptr)) {
/* node still down */
} else if ((slurmctld_conf.slurmd_timeout == 0) ||
((now - node_ptr->down_time) <
slurmctld_conf.slurmd_timeout)) {
node_ptr->node_state |= NODE_STATE_NO_RESPOND;
bit_clear(avail_node_bitmap, node_inx);
} else {
xfree(node_ptr->reason);
info("MARKING %s DOWN (%s)",
node_ptr->name, reason);
/* set_node_down also kills any running jobs */
set_node_down_ptr(node_ptr, reason);
}
} else if (IS_NODE_DOWN(node_ptr)) {
xfree(node_ptr->reason);
node_ptr->down_time = 0;
info("MARKING %s UP", node_ptr->name);
/* Reset state, make_node_idle figures out the rest */
node_ptr->node_state &= NODE_STATE_FLAGS;
node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
node_ptr->node_state |= NODE_STATE_UNKNOWN;
make_node_idle(node_ptr, NULL);
if (!IS_NODE_DRAIN(node_ptr) &&
!IS_NODE_FAIL(node_ptr)) {
xfree(node_ptr->reason);
node_ptr->reason_time = 0;
node_ptr->reason_uid = NO_VAL;
clusteracct_storage_g_node_up(
acct_db_conn, node_ptr, now);
}
} else if (IS_NODE_NO_RESPOND(node_ptr)) {
node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
if (!IS_NODE_DRAIN(node_ptr) &&
!IS_NODE_FAIL(node_ptr)) {
bit_set(avail_node_bitmap, node_inx);
}
}
}
if (slurm_alps_mismatch)
debug("ALPS: %d node(s) still held", slurm_alps_mismatch);
/*
* Check that each ALPS reservation corresponds to a SLURM job.
* Purge orphaned reservations, which may result from stale or
* messed up system state, or are indicative of ALPS problems
* (stuck in pending cancel calls).
*/
for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
ListIterator job_iter = list_iterator_create(job_list);
struct job_record *job_ptr;
uint32_t resv_id;
if (job_iter == NULL)
fatal("list_iterator_create: malloc failure");
while ((job_ptr = (struct job_record *)list_next(job_iter))) {
if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
SELECT_JOBDATA_RESV_ID,
&resv_id) == SLURM_SUCCESS
&& resv_id == rsvn->rsvn_id)
break;
}
list_iterator_destroy(job_iter);
/*
* Changed to ignore reservations for "UNKNOWN" batch
* ids (e.g. the interactive region) (Chris North)
*/
if ((job_ptr == NULL) && (strcmp(rsvn->batch_id, "UNKNOWN") != 0)) {
error("orphaned ALPS reservation %u, trying to remove",
rsvn->rsvn_id);
basil_safe_release(rsvn->rsvn_id, inv);
slurm_alps_mismatch = true;
}
}
free_inv(inv);
if (slurm_alps_mismatch) {
/* If SLURM and ALPS state are not in synchronization,
* do not schedule any more jobs until waiting at least
* SyncTimeout seconds. */
if (slurm_alps_mismatch_time == 0) {
slurm_alps_mismatch_time = now;
} else if (cray_conf->sync_timeout == 0) {
/* Wait indefinitely */
} else if (difftime(now, slurm_alps_mismatch_time) <
cray_conf->sync_timeout) {
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
} else if (!logged_sync_timeout) {
error("Could not synchronize SLURM with ALPS for %u "
"seconds, proceeding with job scheduling",
cray_conf->sync_timeout);
logged_sync_timeout = true;
}
} else {
slurm_alps_mismatch_time = 0;
logged_sync_timeout = false;
}
return rc;
}
/** Base-36 encoding of @coord */
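/* e.g. 0..9 encode as '0'..'9', 10 as 'A', 35 as 'Z' */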
static char _enc_coord(uint8_t coord)
{
return coord + (coord < 10 ? '0' : 'A' - 10);
}
/**
* basil_geometry - Check node attributes, resolve (X,Y,Z) coordinates.
*
* Checks both SDB database and ALPS inventory for consistency. The inventory
* part is identical to basil_inventory(), with the difference of being called
* before valid bitmaps exist, from select_g_node_init().
* Its dependencies are:
* - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields,
* - it relies on _sync_nodes_to_jobs() to
* o kill active jobs on nodes now marked DOWN,
* o reset node state to ALLOCATED if it has been marked IDLE here (which is
* an error case, since there is no longer an ALPS reservation for the job,
* this is caught by the subsequent basil_inventory()).
*/
extern int basil_geometry(struct node_record *node_ptr_array, int node_cnt)
{
struct node_record *node_ptr, *end = node_ptr_array + node_cnt;
enum basil_version version = get_basil_version();
struct basil_inventory *inv;
/* MySQL handle and prepared statement */
MYSQL *handle;
MYSQL_STMT *stmt = NULL;
/* Input parameters */
unsigned int node_id;
/*
* Use a left outer join here since the attributes table may not be
* populated for a given nodeid (e.g. when the node has been disabled
* on the SMW via 'xtcli disable').
* The processor table has more authoritative information, if a nodeid
* is not listed there, it does not exist.
*/
const char query[] = "SELECT x_coord, y_coord, z_coord,"
" cab_position, cab_row, cage, slot, cpu,"
" LOG2(coremask+1), availmem, "
" processor_type "
"FROM processor LEFT JOIN attributes "
"ON processor_id = nodeid "
"WHERE processor_id = ? ";
const int PARAM_COUNT = 1; /* node id */
MYSQL_BIND params[PARAM_COUNT];
int x_coord, y_coord, z_coord;
int cab, row, cage, slot, cpu;
unsigned int node_cpus, node_mem;
char proc_type[BASIL_STRING_SHORT];
MYSQL_BIND bind_cols[COLUMN_COUNT];
my_bool is_null[COLUMN_COUNT];
my_bool is_error[COLUMN_COUNT];
int is_gemini, i;
time_t now = time(NULL);
memset(params, 0, sizeof(params));
params[0].buffer_type = MYSQL_TYPE_LONG;
params[0].is_unsigned = true;
params[0].is_null = (my_bool *)0;
params[0].buffer = (char *)&node_id;
memset(bind_cols, 0, sizeof(bind_cols));
for (i = 0; i < COLUMN_COUNT; i ++) {
bind_cols[i].is_null = &is_null[i];
bind_cols[i].error = &is_error[i];
if (i == COL_TYPE) {
bind_cols[i].buffer_type = MYSQL_TYPE_STRING;
bind_cols[i].buffer_length = sizeof(proc_type);
bind_cols[i].buffer = proc_type;
} else {
bind_cols[i].buffer_type = MYSQL_TYPE_LONG;
bind_cols[i].is_unsigned = (i >= COL_CORES);
}
}
bind_cols[COL_X].buffer = (char *)&x_coord;
bind_cols[COL_Y].buffer = (char *)&y_coord;
bind_cols[COL_Z].buffer = (char *)&z_coord;
bind_cols[COL_CAB].buffer = (char *)&cab;
bind_cols[COL_ROW].buffer = (char *)&row;
bind_cols[COL_CAGE].buffer = (char *)&cage;
bind_cols[COL_SLOT].buffer = (char *)&slot;
bind_cols[COL_CPU].buffer = (char *)&cpu;
bind_cols[COL_CORES].buffer = (char *)&node_cpus;
bind_cols[COL_MEMORY].buffer = (char *)&node_mem;
inv = get_full_inventory(version);
if (inv == NULL)
fatal("failed to get initial BASIL inventory");
info("BASIL %s initial INVENTORY: %d/%d batch nodes available",
bv_names_long[version], inv->batch_avail, inv->batch_total);
handle = cray_connect_sdb();
if (handle == NULL)
fatal("can not connect to XTAdmin database on the SDB");
is_gemini = cray_is_gemini_system(handle);
if (is_gemini < 0)
fatal("can not determine Cray XT/XE system type");
stmt = prepare_stmt(handle, query, params, PARAM_COUNT,
bind_cols, COLUMN_COUNT);
if (stmt == NULL)
fatal("can not prepare statement to resolve Cray coordinates");
for (node_ptr = node_record_table_ptr; node_ptr < end; node_ptr++) {
struct basil_node *node;
char *reason = NULL;
if ((node_ptr->name == NULL) ||
(sscanf(node_ptr->name, "nid%05u", &node_id) != 1)) {
error("can not read basil_node_id from %s",
node_ptr->name);
continue;
}
if (exec_stmt(stmt, query, bind_cols, COLUMN_COUNT) < 0)
fatal("can not resolve %s coordinates", node_ptr->name);
if (fetch_stmt(stmt) == 0) {
#if _DEBUG
info("proc_type:%s cpus:%u memory:%u",
proc_type, node_cpus, node_mem);
info("row:%u cage:%u slot:%u cpu:%u xyz:%u:%u:%u",
row, cage, slot, cpu, x_coord, y_coord, z_coord);
#endif
if (strcmp(proc_type, "compute") != 0) {
/*
* Switching a compute node to be a service node
* can not happen at runtime: requires a reboot.
*/
fatal("Node '%s' is a %s node. "
"Only compute nodes can appear in slurm.conf.",
node_ptr->name, proc_type);
} else if (is_null[COL_CORES] || is_null[COL_MEMORY]) {
/*
* This can happen if a node has been disabled
* on the SMW (using 'xtcli disable <nid>'). The
* node will still be listed in the 'processor'
* table, but have no 'attributes' entry (NULL
* values for CPUs/memory). Also, the node will
* be invisible to ALPS, which is why we need to
* set it down here already.
*/
node_cpus = node_mem = 0;
reason = "node data unknown - disabled on SMW?";
} else if (is_null[COL_X] || is_null[COL_Y]
|| is_null[COL_Z]) {
/*
* Similar case to the one above, observed when
* a blade has been removed. Node will not
* likely show up in ALPS.
*/
x_coord = y_coord = z_coord = 0;
reason = "unknown coordinates - hardware failure?";
} else if (node_cpus < node_ptr->config_ptr->cpus) {
/*
* FIXME: Might reconsider this policy.
*
* FastSchedule is ignored here, it requires the
* slurm.conf to be consistent with hardware.
*
* Assumption is that CPU/Memory do not change
* at runtime (Cray has no hot-swappable parts).
*
* Hence checking it in basil_inventory() would
* mean a lot of runtime overhead.
*/
fatal("slurm.conf: node %s has only Procs=%d",
node_ptr->name, node_cpus);
} else if (node_mem < node_ptr->config_ptr->real_memory) {
fatal("slurm.conf: node %s has RealMemory=%d",
node_ptr->name, node_mem);
}
} else if (is_gemini) {
fatal("Non-existing Gemini node '%s' in slurm.conf",
node_ptr->name);
} else {
fatal("Non-existing SeaStar node '%s' in slurm.conf",
node_ptr->name);
}
if (!is_gemini) {
/*
* SeaStar: each node has unique coordinates
*/
if (node_ptr->arch == NULL)
node_ptr->arch = xstrdup("XT");
} else {
/*
* Gemini: each 2 nodes share the same network
* interface (i.e., nodes 0/1 and 2/3 each have
* the same coordinates).
*/
if (node_ptr->arch == NULL)
node_ptr->arch = xstrdup("XE");
}
xfree(node_ptr->node_hostname);
xfree(node_ptr->comm_name);
/*
* Convention: since we are using SLURM in frontend-mode,
* we use Node{Addr,HostName} as follows.
*
* NodeAddr: <X><Y><Z> coordinates in base-36 encoding
*
* NodeHostName: c#-#c#s#n# using the NID convention
* <cabinet>-<row><chassis><slot><node>
* - each cabinet can accommodate 3 chassis (c1..c3)
* - each chassis has 8 slots (s0..s7)
* - each slot contains 2 or 4 nodes (n0..n3)
* o either 2 service nodes (n0/n3)
* o or 4 compute nodes (n0..n3)
* o or 2 gemini chips (g0/g1 serving n0..n3)
*
* Example: c0-0c1s0n1
* - c0- = cabinet 0
* - 0 = row 0
* - c1 = chassis 1
* - s0 = slot 0
* - n1 = node 1
*/
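/*
 * NodeAddr example (illustrative): a node at coordinates (1, 2, 11)
 * gets comm_name "12B" via _enc_coord(), one base-36 digit per axis.
 */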
node_ptr->node_hostname = xstrdup_printf("c%u-%uc%us%un%u", cab,
row, cage, slot, cpu);
node_ptr->comm_name = xstrdup_printf("%c%c%c",
_enc_coord(x_coord),
_enc_coord(y_coord),
_enc_coord(z_coord));
dim_size[0] = MAX(dim_size[0], (x_coord - 1));
dim_size[1] = MAX(dim_size[1], (y_coord - 1));
dim_size[2] = MAX(dim_size[2], (z_coord - 1));
#if _DEBUG
info("%s %s %s cpus=%u, mem=%u reason=%s", node_ptr->name,
node_ptr->node_hostname, node_ptr->comm_name,
node_cpus, node_mem, reason);
#endif
/*
* Check the current state reported by ALPS inventory, unless it
* is already evident that the node has some other problem.
*/
if (reason == NULL) {
for (node = inv->f->node_head; node; node = node->next)
if (node->node_id == node_id)
break;
if (node == NULL) {
reason = "not visible to ALPS - check hardware";
} else if (node->state == BNS_DOWN) {
reason = "ALPS marked it DOWN";
} else if (node->state == BNS_UNAVAIL) {
reason = "node is UNAVAILABLE";
} else if (node->state == BNS_ROUTE) {
reason = "node does ROUTING";
} else if (node->state == BNS_SUSPECT) {
reason = "entered SUSPECT mode";
} else if (node->state == BNS_ADMINDOWN) {
reason = "node is ADMINDOWN";
} else if (node->state != BNS_UP) {
reason = "state not UP";
} else if (node->role != BNR_BATCH) {
reason = "mode not BATCH";
} else if (node->arch != BNA_XT) {
reason = "arch not XT/XE";
}
}
/* Base state entirely derives from ALPS
* NOTE: The node bitmaps are not defined when this code is
* initially executed. */
node_ptr->node_state &= NODE_STATE_FLAGS;
if (reason) {
if (node_ptr->down_time == 0)
node_ptr->down_time = now;
if (IS_NODE_DOWN(node_ptr)) {
/* node still down */
debug("Initial DOWN node %s - %s",
node_ptr->name, node_ptr->reason);
} else if (slurmctld_conf.slurmd_timeout &&
((now - node_ptr->down_time) <
slurmctld_conf.slurmd_timeout)) {
node_ptr->node_state |= NODE_STATE_NO_RESPOND;
} else {
info("Initial DOWN node %s - %s",
node_ptr->name, reason);
node_ptr->reason = xstrdup(reason);
/* Node state flags preserved above */
node_ptr->node_state |= NODE_STATE_DOWN;
clusteracct_storage_g_node_down(acct_db_conn,
node_ptr,
now, NULL,
slurm_get_slurm_user_id());
}
} else {
bool node_up_flag = IS_NODE_DOWN(node_ptr) &&
!IS_NODE_DRAIN(node_ptr) &&
!IS_NODE_FAIL(node_ptr);
node_ptr->down_time = 0;
if (node_is_allocated(node))
node_ptr->node_state |= NODE_STATE_ALLOCATED;
else
node_ptr->node_state |= NODE_STATE_IDLE;
node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
xfree(node_ptr->reason);
if (node_up_flag) {
info("ALPS returned node %s to service",
node_ptr->name);
clusteracct_storage_g_node_up(acct_db_conn,
node_ptr, now);
}
}
free_stmt_result(stmt);
}
if (stmt_close(stmt))
error("error closing statement: %s", mysql_stmt_error(stmt));
cray_close_sdb(handle);
free_inv(inv);
return SLURM_SUCCESS;
}
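/**
 * build_accel_param - build accelerator (GPU) request list for basil_reserve().
 * The job's "gpu_mem" GRES value (0 if unset) becomes the requested
 * accelerator memory (memory_mb); returns NULL if job_ptr or its details
 * are missing.
 */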
struct basil_accel_param* build_accel_param(struct job_record* job_ptr)
{
int gpu_mem_req;
struct basil_accel_param* head,* bap_ptr;
/* Validate job_ptr before dereferencing it for the GRES lookup below */
if (!job_ptr) {
info("The job_ptr is NULL; nothing to do!");
return NULL;
} else if (!job_ptr->details) {
info("The job_ptr->details is NULL; nothing to do!");
return NULL;
}
gpu_mem_req = gres_plugin_get_job_value_by_type(job_ptr->gres_list,
"gpu_mem");
if (gpu_mem_req == NO_VAL)
gpu_mem_req = 0;
head = xmalloc(sizeof(struct basil_accel_param));
bap_ptr = head;
bap_ptr->type = BA_GPU; /* Currently BASIL only permits
* generic resources of type GPU. */
bap_ptr->memory_mb = gpu_mem_req;
bap_ptr->next = NULL;
return head;
}
/**
* do_basil_reserve - create a BASIL reservation.
* IN job_ptr - pointer to job which has just been allocated resources
* RET 0 or error code, job will abort or be requeued on failure
*/
extern int do_basil_reserve(struct job_record *job_ptr)
{
struct nodespec *ns_head = NULL;
uint16_t mppwidth = 0, mppdepth, mppnppn;
/* mppmem must be at least 1 for gang scheduling to work; if gang
* scheduling is not working, check DefMemPerNode in slurm.conf. */
uint32_t mppmem = 0, node_min_mem = 0;
uint32_t resv_id;
int i, first_bit, last_bit;
long rc;
char *user, batch_id[16];
struct basil_accel_param* bap;
if (!job_ptr->job_resrcs || job_ptr->job_resrcs->nhosts == 0)
return SLURM_SUCCESS;
debug3("job #%u: %u nodes = %s, cpus=%u", job_ptr->job_id,
job_ptr->job_resrcs->nhosts,
job_ptr->job_resrcs->nodes,
job_ptr->job_resrcs->ncpus
);
if (job_ptr->job_resrcs->node_bitmap == NULL) {
error("job %u node_bitmap not set", job_ptr->job_id);
return SLURM_SUCCESS;
}
first_bit = bit_ffs(job_ptr->job_resrcs->node_bitmap);
last_bit = bit_fls(job_ptr->job_resrcs->node_bitmap);
if (first_bit == -1 || last_bit == -1)
return SLURM_SUCCESS; /* no nodes allocated */
mppdepth = MAX(1, job_ptr->details->cpus_per_task);
mppnppn = job_ptr->details->ntasks_per_node;
/* mppmem */
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
/* Only honour --mem-per-cpu if --ntasks has been given */
if (job_ptr->details->num_tasks)
mppmem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU;
} else if (job_ptr->details->pn_min_memory) {
node_min_mem = job_ptr->details->pn_min_memory;
}
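/*
 * Illustrative values: --mem-per-cpu=2000 together with --ntasks yields
 * mppmem = 2000 here, while a plain --mem=32000 instead sets
 * node_min_mem = 32000 and mppmem is then derived per node in the loop
 * below.
 */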
for (i = first_bit; i <= last_bit; i++) {
struct node_record *node_ptr = node_record_table_ptr + i;
uint32_t basil_node_id;
if (!bit_test(job_ptr->job_resrcs->node_bitmap, i))
continue;
if (!node_ptr->name || node_ptr->name[0] == '\0')
continue; /* bad node */
if (sscanf(node_ptr->name, "nid%05u", &basil_node_id) != 1)
fatal("can not read basil_node_id from %s",
node_ptr->name);
if (ns_add_node(&ns_head, basil_node_id, false) != 0) {
error("can not add node %s (nid%05u)", node_ptr->name,
basil_node_id);
free_nodespec(ns_head);
return SLURM_ERROR;
}
if (node_min_mem) {
uint32_t node_cpus, node_mem;
int32_t tmp_mppmem;
if (slurmctld_conf.fast_schedule) {
node_cpus = node_ptr->config_ptr->cpus;
node_mem = node_ptr->config_ptr->real_memory;
} else {
node_cpus = node_ptr->cpus;
node_mem = node_ptr->real_memory;
}
/*
* Divide the node memory by the ALPS 'Processing Elements
* per Node' value (aprun -N), which in slurm is
* --ntasks-per-node and 'mppnppn' in PBS; if it is not
* specified, fall back to the number of cores per node
* (also the default for 'aprun -N').
* On a heterogeneous system the nodes are not all the same,
* so keep track of the lowest per-PE memory and use it as
* the level for all nodes (mppmem is 0 when coming in).
*/
node_mem /= mppnppn ? mppnppn : node_cpus;
tmp_mppmem = node_min_mem = MIN(node_mem, node_min_mem);
/* Clamp mppmem to at least 1, since a value of 0 means
give all of the node's memory to the job.
*/
if (tmp_mppmem <= 0)
tmp_mppmem = 1;
if (mppmem)
mppmem = MIN(mppmem, tmp_mppmem);
else
mppmem = tmp_mppmem;
}
}
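/*
 * Worked example (illustrative) for the loop above: a node offering
 * 32000MB with mppnppn = 8 contributes 32000 / 8 = 4000MB per
 * processing element; the smallest such value across all allocated
 * nodes ends up in mppmem.
 */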
/* mppwidth */
for (i = 0; i < job_ptr->job_resrcs->nhosts; i++) {
uint16_t node_tasks = job_ptr->job_resrcs->cpus[i] / mppdepth;
if (mppnppn && mppnppn < node_tasks)
node_tasks = mppnppn;
mppwidth += node_tasks;
}
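/*
 * Illustrative: with cpus[i] = 16 and mppdepth = 4 a host contributes
 * 4 tasks (capped by mppnppn if set); mppwidth is the sum of these
 * per-host task counts over all allocated hosts.
 */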
snprintf(batch_id, sizeof(batch_id), "%u", job_ptr->job_id);
user = uid_to_string(job_ptr->user_id);
if (job_ptr->gres_list)
bap = build_accel_param(job_ptr);
else
bap = NULL;
rc = basil_reserve(user, batch_id, mppwidth, mppdepth, mppnppn,
mppmem, ns_head, bap);
xfree(user);
if (rc <= 0) {
/* errno value will be resolved by select_g_job_begin() */
errno = is_transient_error(rc) ? EAGAIN : ECONNABORTED;
return SLURM_ERROR;
}
resv_id = rc;
if (_set_select_jobinfo(job_ptr->select_jobinfo->data,
SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
/*
* This is a fatal error since it means we will not be able to
* confirm the reservation; no step will be able to run in it.
*/
error("job %u: can not set resId %u", job_ptr->job_id, resv_id);
basil_release(resv_id);
return SLURM_ERROR;
}
if (mppmem)
job_ptr->details->pn_min_memory = mppmem | MEM_PER_CPU;
info("ALPS RESERVATION #%u, JobId %u: BASIL -n %d -N %d -d %d -m %d",
resv_id, job_ptr->job_id, mppwidth, mppnppn, mppdepth, mppmem);
return SLURM_SUCCESS;
}
/**
* do_basil_confirm - confirm an existing BASIL reservation.
* This requires the alloc_sid to equal the session ID (getsid()) of the process
* executing the aprun/mpirun commands.
* Returns: SLURM_SUCCESS if ok, READY_JOB_ERROR/FATAL on transient/fatal error.
*/
extern int do_basil_confirm(struct job_record *job_ptr)
{
uint32_t resv_id;
uint64_t pagg_id;
if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
error("can not read resId for JobId=%u", job_ptr->job_id);
} else if (resv_id == 0) {
/* On Cray XT/XE, a reservation ID of 0 is always invalid. */
error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id);
} else if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
SELECT_JOBDATA_PAGG_ID, &pagg_id) != SLURM_SUCCESS) {
error("can not read pagg ID for JobId=%u", job_ptr->job_id);
} else {
int rc;
if (pagg_id == 0) {
#ifdef HAVE_REAL_CRAY
/* This fallback case is for interactive jobs only */
error("JobId %u has no pagg ID, falling back to SID",
job_ptr->job_id);
#endif
pagg_id = job_ptr->alloc_sid;
}
rc = basil_confirm(resv_id, job_ptr->job_id, pagg_id);
if (rc == 0) {
debug2("confirmed ALPS resId %u for JobId %u, pagg "
"%"PRIu64"", resv_id, job_ptr->job_id, pagg_id);
return SLURM_SUCCESS;
} else if (rc == -BE_NO_RESID) {
/*
* If ALPS can not find the reservation ID we are trying
* to confirm, it may be that the job has already been
* canceled, or that the reservation has timed out after
* waiting for the confirmation.
* It is more likely that this error occurs on a per-job
* basis, hence in this case do not drain frontend node.
*/
error("JobId %u has invalid ALPS resId %u - job "
"already canceled?", job_ptr->job_id, resv_id);
return SLURM_SUCCESS;
} else {
error("confirming ALPS resId %u of JobId %u FAILED: %s",
resv_id, job_ptr->job_id, basil_strerror(rc));
if (is_transient_error(rc))
return READY_JOB_ERROR;
}
}
return READY_JOB_FATAL;
}
/**
* do_basil_signal - pass job signal on to any APIDs
* IN job_ptr - job to be signalled
* IN signal - signal(7) number
* Only signal job if an ALPS reservation exists (non-0 reservation ID).
*/
extern int do_basil_signal(struct job_record *job_ptr, int signal)
{
uint32_t resv_id;
if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
error("can not read resId for JobId=%u", job_ptr->job_id);
} else if (resv_id != 0) {
int rc = basil_signal_apids(resv_id, signal, NULL);
if (rc)
error("could not signal APIDs of resId %u: %s", resv_id,
basil_strerror(rc));
}
return SLURM_SUCCESS;
}
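/* Thread entry point used by queue_basil_signal(): sleep for the requested
 * delay, then signal all APIDs of the given ALPS reservation and free the
 * argument block. */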
void *_sig_basil(void *args)
{
args_sig_basil_t *args_sig_basil = (args_sig_basil_t *) args;
int rc;
sleep(args_sig_basil->delay);
rc = basil_signal_apids(args_sig_basil->resv_id,
args_sig_basil->signal, NULL);
if (rc) {
error("could not signal APIDs of resId %u: %s",
args_sig_basil->resv_id, basil_strerror(rc));
}
xfree(args);
return NULL;
}
/**
* queue_basil_signal - queue job signal on to any APIDs
* IN job_ptr - job to be signalled
* IN signal - signal(7) number
* IN delay - how long to delay the signal, in seconds
* Only signal job if an ALPS reservation exists (non-0 reservation ID).
*/
extern void queue_basil_signal(struct job_record *job_ptr, int signal,
uint16_t delay)
{
args_sig_basil_t *args_sig_basil;
pthread_attr_t attr_sig_basil;
pthread_t thread_sig_basil;
uint32_t resv_id;
if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
error("can not read resId for JobId=%u", job_ptr->job_id);
return;
}
if (resv_id == 0)
return;
if ((delay == 0) || (delay == (uint16_t) NO_VAL)) {
/* Send the signal now */
int rc = basil_signal_apids(resv_id, signal, NULL);
if (rc)
error("could not signal APIDs of resId %u: %s", resv_id,
basil_strerror(rc));
return;
}
/* Create a thread to send the signal later */
slurm_attr_init(&attr_sig_basil);
if (pthread_attr_setdetachstate(&attr_sig_basil,
PTHREAD_CREATE_DETACHED)) {
error("pthread_attr_setdetachstate error %m");
return;
}
args_sig_basil = xmalloc(sizeof(args_sig_basil_t));
args_sig_basil->resv_id = resv_id;
args_sig_basil->signal = signal;
args_sig_basil->delay = delay;
if (pthread_create(&thread_sig_basil, &attr_sig_basil,
_sig_basil, (void *) args_sig_basil)) {
error("pthread_create error %m");
return;
}
slurm_attr_destroy(&attr_sig_basil);
}
/**
* do_basil_release - release an (unconfirmed) BASIL reservation
* IN job_ptr - pointer to job which has just been deallocated resources
* RET SLURM_SUCCESS in all cases (errors are only logged; see below)
*/
extern int do_basil_release(struct job_record *job_ptr)
{
uint32_t resv_id;
if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
error("can not read resId for JobId=%u", job_ptr->job_id);
} else if (resv_id && basil_release(resv_id) == 0) {
/* The resv_id is non-zero only if the job is or was running. */
debug("released ALPS resId %u for JobId %u",
resv_id, job_ptr->job_id);
}
/*
* Error handling: we only print out the errors (basil_release does this
* internally), but do not signal error to select_g_job_fini(). Calling
* contexts of this function (deallocate_nodes, batch_finish) only print
* additional error text: no further action is taken at this stage.
*/
return SLURM_SUCCESS;
}
/**
* do_basil_switch - suspend/resume BASIL reservation
* IN job_ptr - pointer to job to be suspended or resumed
* IN suspend - true to suspend the reservation, false to resume it
* RET SLURM_SUCCESS in all cases (errors are only logged)
*/
extern int do_basil_switch(struct job_record *job_ptr, bool suspend)
{
uint32_t resv_id;
if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
error("can not read resId for JobId=%u", job_ptr->job_id);
} else if (resv_id && basil_switch(resv_id, suspend) == 0) {
/* The resv_id is non-zero only if the job is or was running. */
debug("%s ALPS resId %u for JobId %u",
suspend ? "Suspended" : "Resumed",
resv_id, job_ptr->job_id);
}
return SLURM_SUCCESS;
}