| /* |
| * Interface between lower-level ALPS XML-RPC functions and SLURM. |
| * |
| * Copyright (c) 2010-11 Centro Svizzero di Calcolo Scientifico (CSCS) |
| * Licensed under GPLv2. |
| */ |
| #include "basil_interface.h" |
| #include "basil_alps.h" |
| #include "src/common/gres.h" |
| #include "src/common/slurm_accounting_storage.h" |
| |
| #define _DEBUG 0 |
| |
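/* Torus dimension sizes, derived from node coordinates in basil_geometry() */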
| int dim_size[3] = {0, 0, 0}; |
| |
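/* Argument bundle passed to the detached _sig_basil() signalling thread */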
| typedef struct args_sig_basil { |
| uint32_t resv_id; |
| int signal; |
| uint16_t delay; |
| } args_sig_basil_t; |
| |
| /* |
 * The following routines are adapted from src/plugins/select/bluegene/plugin/jobinfo.c
| */ |
| static int _set_select_jobinfo(select_jobinfo_t *jobinfo, |
| enum select_jobdata_type data_type, void *data) |
| { |
| uint32_t *uint32 = (uint32_t *) data; |
| |
| if (jobinfo == NULL) { |
| error("cray/set_select_jobinfo: jobinfo not set"); |
| return SLURM_ERROR; |
| } |
| if (jobinfo->magic != JOBINFO_MAGIC) { |
| error("cray/set_select_jobinfo: jobinfo magic bad"); |
| return SLURM_ERROR; |
| } |
| |
| switch (data_type) { |
| case SELECT_JOBDATA_RESV_ID: |
| jobinfo->reservation_id = *uint32; |
| break; |
| default: |
| error("cray/set_select_jobinfo: data_type %d invalid", |
| data_type); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _get_select_jobinfo(select_jobinfo_t *jobinfo, |
| enum select_jobdata_type data_type, void *data) |
| { |
| uint64_t *uint64 = (uint64_t *) data; |
| uint32_t *uint32 = (uint32_t *) data; |
| |
| if (jobinfo == NULL) { |
| error("cray/get_select_jobinfo: jobinfo not set"); |
| return SLURM_ERROR; |
| } |
| if (jobinfo->magic != JOBINFO_MAGIC) { |
| error("cray/get_select_jobinfo: jobinfo magic bad"); |
| return SLURM_ERROR; |
| } |
| |
| switch (data_type) { |
| case SELECT_JOBDATA_RESV_ID: |
| *uint32 = jobinfo->reservation_id; |
| break; |
| case SELECT_JOBDATA_PAGG_ID: |
| *uint64 = jobinfo->confirm_cookie; |
| break; |
| default: |
| error("cray/get_select_jobinfo: data_type %d invalid", |
| data_type); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
/** Resolve a Cray NID to the corresponding slurm node record */
| static struct node_record *_find_node_by_basil_id(uint32_t node_id) |
| { |
	char nid[9];	/* "nid%05u" plus '\0' */
| |
| snprintf(nid, sizeof(nid), "nid%05u", node_id); |
| |
| return find_node_record(nid); |
| } |
| |
| extern int basil_node_ranking(struct node_record *node_array, int node_cnt) |
| { |
| enum basil_version version = get_basil_version(); |
| struct basil_inventory *inv; |
| struct basil_node *node; |
| int rank_count = 0, i; |
| hostlist_t hl = hostlist_create(NULL); |
	bool bad_node = false;
| |
| /* |
| * When obtaining the initial configuration, we can not allow ALPS to |
| * fail. If there is a problem at this stage it is better to restart |
| * SLURM completely, after investigating (and/or fixing) the cause. |
| */ |
| inv = get_full_inventory(version); |
| if (inv == NULL) |
| fatal("failed to get BASIL %s ranking", bv_names_long[version]); |
| else if (!inv->batch_total) |
| fatal("system has no usable batch compute nodes"); |
| else if (inv->batch_total < node_cnt) |
| info("Warning: ALPS sees only %d/%d slurm.conf nodes, " |
| "check DownNodes", inv->batch_total, node_cnt); |
| |
| debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes", |
| bv_names_long[version], inv->batch_avail, inv->batch_total); |
| |
| /* |
| * Node ranking is based on a subset of the inventory: only nodes in |
| * batch allocation mode which are up and not allocated. Assign a |
| * 'NO_VAL' rank to all other nodes, which will translate as a very |
| * high value, (unsigned)-2, to put those nodes last in the ranking. |
| * The rest of the code must ensure that those nodes are never chosen. |
| */ |
| for (i = 0; i < node_cnt; i++) |
| node_array[i].node_rank = NO_VAL; |
| |
| for (node = inv->f->node_head; node; node = node->next) { |
| struct node_record *node_ptr; |
| char tmp[50]; |
| |
| /* This will ignore interactive nodes when iterating through |
| * the apbasil inventory. If we don't do this, SLURM is |
| * unable to resolve the ID to a nidXXX name since it's not in |
| * the slurm.conf file. (Chris North) |
| */ |
| if (node->role == BNR_INTER) |
| continue; |
| |
| node_ptr = _find_node_by_basil_id(node->node_id); |
| if (node_ptr == NULL) { |
| error("nid%05u (%s node in state %s) not in slurm.conf", |
| node->node_id, nam_noderole[node->role], |
| nam_nodestate[node->state]); |
| bad_node = 1; |
| } else |
| node_ptr->node_rank = inv->nodes_total - rank_count++; |
		snprintf(tmp, sizeof(tmp), "nid%05u", node->node_id);
| hostlist_push(hl, tmp); |
| } |
| free_inv(inv); |
	if (bad_node) {
		char *name;

		hostlist_sort(hl);
		name = hostlist_ranged_string_xmalloc(hl);
		info("It appears your slurm.conf nodelist doesn't "
		     "match the alps system. Here are the nodes alps knows "
		     "about\n%s", name);
		xfree(name);
	}
| hostlist_destroy(hl); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /** |
| * basil_inventory - Periodic node-state query via ALPS XML-RPC. |
| * This should be run immediately before each scheduling cycle. |
| * Returns non-SLURM_SUCCESS if |
| * - INVENTORY method failed (error) |
| * - no nodes are available (no point in scheduling) |
| * - orphaned ALPS reservation exists (wait until ALPS resynchronizes) |
| */ |
| extern int basil_inventory(void) |
| { |
| enum basil_version version = get_basil_version(); |
| struct basil_inventory *inv; |
| struct basil_node *node; |
| struct basil_rsvn *rsvn; |
| int slurm_alps_mismatch = 0; |
| int rc = SLURM_SUCCESS; |
| time_t now = time(NULL); |
| static time_t slurm_alps_mismatch_time = (time_t) 0; |
| static bool logged_sync_timeout = false; |
| |
| inv = get_full_inventory(version); |
| if (inv == NULL) { |
| error("BASIL %s INVENTORY failed", bv_names_long[version]); |
| return SLURM_ERROR; |
| } |
| |
| debug("BASIL %s INVENTORY: %d/%d batch nodes available", |
| bv_names_long[version], inv->batch_avail, inv->batch_total); |
| |
	/* Do not check inv->batch_avail here: when gang scheduling,
	 * returning an error for a fully allocated system would be the
	 * wrong thing to do, since the schedule() function in the
	 * slurmctld would then never run.
	 */
| if (!inv->f->node_head || !inv->batch_total) |
| rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| |
| for (node = inv->f->node_head; node; node = node->next) { |
| int node_inx; |
| struct node_record *node_ptr; |
| char *reason = NULL; |
| |
| /* This will ignore interactive nodes when iterating through |
| * the apbasil inventory. If we don't do this, SLURM is |
| * unable to resolve the ID to a nidXXX name since it's not in |
| * the slurm.conf file. (Chris North) |
| */ |
| if (node->role == BNR_INTER) |
| continue; |
| |
| node_ptr = _find_node_by_basil_id(node->node_id); |
| if (node_ptr == NULL) { |
| error("nid%05u (%s node in state %s) not in slurm.conf", |
| node->node_id, nam_noderole[node->role], |
| nam_nodestate[node->state]); |
| continue; |
| } |
| node_inx = node_ptr - node_record_table_ptr; |
| |
| if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) { |
| /* |
| * ALPS still hangs on to the node while SLURM considers |
| * it already unallocated. Possible causes are partition |
| * cleanup taking too long (can be 10sec ... minutes), |
| * and orphaned ALPS reservations (caught below). |
| * |
| * The converse case (SLURM hanging on to the node while |
| * ALPS has already freed it) happens frequently during |
| * job completion: select_g_job_fini() is called before |
| * make_node_comp(). Rely on SLURM logic for this case. |
| */ |
| slurm_alps_mismatch++; |
| } |
| |
| if (node->state == BNS_DOWN) { |
| reason = "ALPS marked it DOWN"; |
| } else if (node->state == BNS_UNAVAIL) { |
| reason = "node is UNAVAILABLE"; |
| } else if (node->state == BNS_ROUTE) { |
| reason = "node does ROUTING"; |
| } else if (node->state == BNS_SUSPECT) { |
| reason = "entered SUSPECT mode"; |
| } else if (node->state == BNS_ADMINDOWN) { |
| reason = "node is ADMINDOWN"; |
| } else if (node->state != BNS_UP) { |
| reason = "state not UP"; |
| } else if (node->role != BNR_BATCH) { |
| reason = "mode not BATCH"; |
| } else if (node->arch != BNA_XT) { |
| reason = "arch not XT/XE"; |
| } |
| |
| /* Base state entirely derives from ALPS */ |
| if (reason) { |
| if (node_ptr->down_time == 0) |
| node_ptr->down_time = now; |
| if (IS_NODE_DOWN(node_ptr)) { |
| /* node still down */ |
| } else if ((slurmctld_conf.slurmd_timeout == 0) || |
| ((now - node_ptr->down_time) < |
| slurmctld_conf.slurmd_timeout)) { |
| node_ptr->node_state |= NODE_STATE_NO_RESPOND; |
| bit_clear(avail_node_bitmap, node_inx); |
| } else { |
| xfree(node_ptr->reason); |
| info("MARKING %s DOWN (%s)", |
| node_ptr->name, reason); |
				/* set_node_down_ptr() also kills any running jobs */
| set_node_down_ptr(node_ptr, reason); |
| } |
| } else if (IS_NODE_DOWN(node_ptr)) { |
| xfree(node_ptr->reason); |
| node_ptr->down_time = 0; |
| info("MARKING %s UP", node_ptr->name); |
| |
| /* Reset state, make_node_idle figures out the rest */ |
| node_ptr->node_state &= NODE_STATE_FLAGS; |
| node_ptr->node_state &= (~NODE_STATE_NO_RESPOND); |
| node_ptr->node_state |= NODE_STATE_UNKNOWN; |
| |
| make_node_idle(node_ptr, NULL); |
| if (!IS_NODE_DRAIN(node_ptr) && |
| !IS_NODE_FAIL(node_ptr)) { |
| xfree(node_ptr->reason); |
| node_ptr->reason_time = 0; |
| node_ptr->reason_uid = NO_VAL; |
| clusteracct_storage_g_node_up( |
| acct_db_conn, node_ptr, now); |
| } |
| } else if (IS_NODE_NO_RESPOND(node_ptr)) { |
| node_ptr->node_state &= (~NODE_STATE_NO_RESPOND); |
| if (!IS_NODE_DRAIN(node_ptr) && |
| !IS_NODE_FAIL(node_ptr)) { |
| bit_set(avail_node_bitmap, node_inx); |
| } |
| } |
| } |
| |
| if (slurm_alps_mismatch) |
| debug("ALPS: %d node(s) still held", slurm_alps_mismatch); |
| |
| /* |
| * Check that each ALPS reservation corresponds to a SLURM job. |
| * Purge orphaned reservations, which may result from stale or |
| * messed up system state, or are indicative of ALPS problems |
| * (stuck in pending cancel calls). |
| */ |
| for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) { |
| ListIterator job_iter = list_iterator_create(job_list); |
| struct job_record *job_ptr; |
| uint32_t resv_id; |
| |
| if (job_iter == NULL) |
| fatal("list_iterator_create: malloc failure"); |
| |
| while ((job_ptr = (struct job_record *)list_next(job_iter))) { |
| |
| if (_get_select_jobinfo(job_ptr->select_jobinfo->data, |
| SELECT_JOBDATA_RESV_ID, |
| &resv_id) == SLURM_SUCCESS |
| && resv_id == rsvn->rsvn_id) |
| break; |
| } |
| list_iterator_destroy(job_iter); |
| |
| /* |
| * Changed to ignore reservations for "UNKNOWN" batch |
| * ids (e.g. the interactive region) (Chris North) |
| */ |
| |
| if ((job_ptr == NULL) && (strcmp(rsvn->batch_id, "UNKNOWN"))) { |
| |
| error("orphaned ALPS reservation %u, trying to remove", |
| rsvn->rsvn_id); |
| basil_safe_release(rsvn->rsvn_id, inv); |
| slurm_alps_mismatch = true; |
| } |
| } |
| free_inv(inv); |
| |
| if (slurm_alps_mismatch) { |
| /* If SLURM and ALPS state are not in synchronization, |
| * do not schedule any more jobs until waiting at least |
| * SyncTimeout seconds. */ |
| if (slurm_alps_mismatch_time == 0) { |
| slurm_alps_mismatch_time = now; |
| } else if (cray_conf->sync_timeout == 0) { |
| /* Wait indefinitely */ |
| } else if (difftime(now, slurm_alps_mismatch_time) < |
| cray_conf->sync_timeout) { |
| return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| } else if (!logged_sync_timeout) { |
| error("Could not synchronize SLURM with ALPS for %u " |
| "seconds, proceeding with job scheduling", |
| cray_conf->sync_timeout); |
| logged_sync_timeout = true; |
| } |
| } else { |
| slurm_alps_mismatch_time = 0; |
| logged_sync_timeout = false; |
| } |
| return rc; |
| } |
| |
| /** Base-36 encoding of @coord */ |
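/* (e.g. _enc_coord(4) == '4', _enc_coord(10) == 'A', _enc_coord(35) == 'Z') */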
| static char _enc_coord(uint8_t coord) |
| { |
| return coord + (coord < 10 ? '0' : 'A' - 10); |
| } |
| |
| /** |
| * basil_geometry - Check node attributes, resolve (X,Y,Z) coordinates. |
| * |
| * Checks both SDB database and ALPS inventory for consistency. The inventory |
| * part is identical to basil_inventory(), with the difference of being called |
| * before valid bitmaps exist, from select_g_node_init(). |
| * Its dependencies are: |
| * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields, |
| * - it relies on _sync_nodes_to_jobs() to |
| * o kill active jobs on nodes now marked DOWN, |
| * o reset node state to ALLOCATED if it has been marked IDLE here (which is |
| * an error case, since there is no longer an ALPS reservation for the job, |
| * this is caught by the subsequent basil_inventory()). |
| */ |
| extern int basil_geometry(struct node_record *node_ptr_array, int node_cnt) |
| { |
| struct node_record *node_ptr, *end = node_ptr_array + node_cnt; |
| enum basil_version version = get_basil_version(); |
| struct basil_inventory *inv; |
| |
	/* General MySQL */
| MYSQL *handle; |
| MYSQL_STMT *stmt = NULL; |
| /* Input parameters */ |
| unsigned int node_id; |
| /* |
| * Use a left outer join here since the attributes table may not be |
| * populated for a given nodeid (e.g. when the node has been disabled |
| * on the SMW via 'xtcli disable'). |
| * The processor table has more authoritative information, if a nodeid |
| * is not listed there, it does not exist. |
| */ |
| const char query[] = "SELECT x_coord, y_coord, z_coord," |
| " cab_position, cab_row, cage, slot, cpu," |
| " LOG2(coremask+1), availmem, " |
| " processor_type " |
| "FROM processor LEFT JOIN attributes " |
| "ON processor_id = nodeid " |
| "WHERE processor_id = ? "; |
| const int PARAM_COUNT = 1; /* node id */ |
| MYSQL_BIND params[PARAM_COUNT]; |
| |
| int x_coord, y_coord, z_coord; |
| int cab, row, cage, slot, cpu; |
| unsigned int node_cpus, node_mem; |
| char proc_type[BASIL_STRING_SHORT]; |
| MYSQL_BIND bind_cols[COLUMN_COUNT]; |
| my_bool is_null[COLUMN_COUNT]; |
| my_bool is_error[COLUMN_COUNT]; |
| int is_gemini, i; |
| time_t now = time(NULL); |
| |
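	/* Bind the single input parameter of the query: the node id */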
| memset(params, 0, sizeof(params)); |
| params[0].buffer_type = MYSQL_TYPE_LONG; |
| params[0].is_unsigned = true; |
| params[0].is_null = (my_bool *)0; |
| params[0].buffer = (char *)&node_id; |
| |
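	/* Bind the result columns: integers, plus the processor_type string */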
| memset(bind_cols, 0, sizeof(bind_cols)); |
	for (i = 0; i < COLUMN_COUNT; i++) {
| bind_cols[i].is_null = &is_null[i]; |
| bind_cols[i].error = &is_error[i]; |
| |
| if (i == COL_TYPE) { |
| bind_cols[i].buffer_type = MYSQL_TYPE_STRING; |
| bind_cols[i].buffer_length = sizeof(proc_type); |
| bind_cols[i].buffer = proc_type; |
| } else { |
| bind_cols[i].buffer_type = MYSQL_TYPE_LONG; |
| bind_cols[i].is_unsigned = (i >= COL_CORES); |
| } |
| } |
| bind_cols[COL_X].buffer = (char *)&x_coord; |
| bind_cols[COL_Y].buffer = (char *)&y_coord; |
| bind_cols[COL_Z].buffer = (char *)&z_coord; |
| bind_cols[COL_CAB].buffer = (char *)&cab; |
| bind_cols[COL_ROW].buffer = (char *)&row; |
| bind_cols[COL_CAGE].buffer = (char *)&cage; |
| bind_cols[COL_SLOT].buffer = (char *)&slot; |
| bind_cols[COL_CPU].buffer = (char *)&cpu; |
| bind_cols[COL_CORES].buffer = (char *)&node_cpus; |
| bind_cols[COL_MEMORY].buffer = (char *)&node_mem; |
| |
| inv = get_full_inventory(version); |
| if (inv == NULL) |
| fatal("failed to get initial BASIL inventory"); |
| |
| info("BASIL %s initial INVENTORY: %d/%d batch nodes available", |
| bv_names_long[version], inv->batch_avail, inv->batch_total); |
| |
| handle = cray_connect_sdb(); |
| if (handle == NULL) |
| fatal("can not connect to XTAdmin database on the SDB"); |
| |
| is_gemini = cray_is_gemini_system(handle); |
| if (is_gemini < 0) |
| fatal("can not determine Cray XT/XE system type"); |
| |
| stmt = prepare_stmt(handle, query, params, PARAM_COUNT, |
| bind_cols, COLUMN_COUNT); |
| if (stmt == NULL) |
| fatal("can not prepare statement to resolve Cray coordinates"); |
| |
	for (node_ptr = node_ptr_array; node_ptr < end; node_ptr++) {
| struct basil_node *node; |
| char *reason = NULL; |
| |
| if ((node_ptr->name == NULL) || |
| (sscanf(node_ptr->name, "nid%05u", &node_id) != 1)) { |
| error("can not read basil_node_id from %s", |
| node_ptr->name); |
| continue; |
| } |
| |
| if (exec_stmt(stmt, query, bind_cols, COLUMN_COUNT) < 0) |
| fatal("can not resolve %s coordinates", node_ptr->name); |
| |
| if (fetch_stmt(stmt) == 0) { |
| #if _DEBUG |
| info("proc_type:%s cpus:%u memory:%u", |
| proc_type, node_cpus, node_mem); |
| info("row:%u cage:%u slot:%u cpu:%u xyz:%u:%u:%u", |
| row, cage, slot, cpu, x_coord, y_coord, z_coord); |
| #endif |
| if (strcmp(proc_type, "compute") != 0) { |
| /* |
| * Switching a compute node to be a service node |
| * can not happen at runtime: requires a reboot. |
| */ |
| fatal("Node '%s' is a %s node. " |
| "Only compute nodes can appear in slurm.conf.", |
| node_ptr->name, proc_type); |
| } else if (is_null[COL_CORES] || is_null[COL_MEMORY]) { |
| /* |
| * This can happen if a node has been disabled |
| * on the SMW (using 'xtcli disable <nid>'). The |
| * node will still be listed in the 'processor' |
| * table, but have no 'attributes' entry (NULL |
| * values for CPUs/memory). Also, the node will |
| * be invisible to ALPS, which is why we need to |
| * set it down here already. |
| */ |
| node_cpus = node_mem = 0; |
| reason = "node data unknown - disabled on SMW?"; |
| } else if (is_null[COL_X] || is_null[COL_Y] |
| || is_null[COL_Z]) { |
| /* |
| * Similar case to the one above, observed when |
| * a blade has been removed. Node will not |
| * likely show up in ALPS. |
| */ |
| x_coord = y_coord = z_coord = 0; |
| reason = "unknown coordinates - hardware failure?"; |
| } else if (node_cpus < node_ptr->config_ptr->cpus) { |
| /* |
| * FIXME: Might reconsider this policy. |
| * |
| * FastSchedule is ignored here, it requires the |
| * slurm.conf to be consistent with hardware. |
| * |
| * Assumption is that CPU/Memory do not change |
| * at runtime (Cray has no hot-swappable parts). |
| * |
| * Hence checking it in basil_inventory() would |
| * mean a lot of runtime overhead. |
| */ |
| fatal("slurm.conf: node %s has only Procs=%d", |
| node_ptr->name, node_cpus); |
| } else if (node_mem < node_ptr->config_ptr->real_memory) { |
| fatal("slurm.conf: node %s has RealMemory=%d", |
| node_ptr->name, node_mem); |
| } |
| |
| } else if (is_gemini) { |
| fatal("Non-existing Gemini node '%s' in slurm.conf", |
| node_ptr->name); |
| } else { |
| fatal("Non-existing SeaStar node '%s' in slurm.conf", |
| node_ptr->name); |
| } |
| |
| if (!is_gemini) { |
| /* |
| * SeaStar: each node has unique coordinates |
| */ |
| if (node_ptr->arch == NULL) |
| node_ptr->arch = xstrdup("XT"); |
| } else { |
| /* |
| * Gemini: each 2 nodes share the same network |
| * interface (i.e., nodes 0/1 and 2/3 each have |
| * the same coordinates). |
| */ |
| if (node_ptr->arch == NULL) |
| node_ptr->arch = xstrdup("XE"); |
| } |
| |
| xfree(node_ptr->node_hostname); |
| xfree(node_ptr->comm_name); |
| /* |
| * Convention: since we are using SLURM in frontend-mode, |
| * we use Node{Addr,HostName} as follows. |
| * |
| * NodeAddr: <X><Y><Z> coordinates in base-36 encoding |
| * |
| * NodeHostName: c#-#c#s#n# using the NID convention |
| * <cabinet>-<row><chassis><slot><node> |
		 *   - each cabinet can accommodate 3 chassis (c0..c2)
| * - each chassis has 8 slots (s0..s7) |
| * - each slot contains 2 or 4 nodes (n0..n3) |
| * o either 2 service nodes (n0/n3) |
| * o or 4 compute nodes (n0..n3) |
| * o or 2 gemini chips (g0/g1 serving n0..n3) |
| * |
| * Example: c0-0c1s0n1 |
| * - c0- = cabinet 0 |
| * - 0 = row 0 |
| * - c1 = chassis 1 |
| * - s0 = slot 0 |
| * - n1 = node 1 |
| */ |
| node_ptr->node_hostname = xstrdup_printf("c%u-%uc%us%un%u", cab, |
| row, cage, slot, cpu); |
| node_ptr->comm_name = xstrdup_printf("%c%c%c", |
| _enc_coord(x_coord), |
| _enc_coord(y_coord), |
| _enc_coord(z_coord)); |
| dim_size[0] = MAX(dim_size[0], (x_coord - 1)); |
| dim_size[1] = MAX(dim_size[1], (y_coord - 1)); |
| dim_size[2] = MAX(dim_size[2], (z_coord - 1)); |
| #if _DEBUG |
| info("%s %s %s cpus=%u, mem=%u reason=%s", node_ptr->name, |
| node_ptr->node_hostname, node_ptr->comm_name, |
| node_cpus, node_mem, reason); |
| #endif |
| /* |
| * Check the current state reported by ALPS inventory, unless it |
| * is already evident that the node has some other problem. |
| */ |
| if (reason == NULL) { |
| for (node = inv->f->node_head; node; node = node->next) |
| if (node->node_id == node_id) |
| break; |
| if (node == NULL) { |
| reason = "not visible to ALPS - check hardware"; |
| } else if (node->state == BNS_DOWN) { |
| reason = "ALPS marked it DOWN"; |
| } else if (node->state == BNS_UNAVAIL) { |
| reason = "node is UNAVAILABLE"; |
| } else if (node->state == BNS_ROUTE) { |
| reason = "node does ROUTING"; |
| } else if (node->state == BNS_SUSPECT) { |
| reason = "entered SUSPECT mode"; |
| } else if (node->state == BNS_ADMINDOWN) { |
| reason = "node is ADMINDOWN"; |
| } else if (node->state != BNS_UP) { |
| reason = "state not UP"; |
| } else if (node->role != BNR_BATCH) { |
| reason = "mode not BATCH"; |
| } else if (node->arch != BNA_XT) { |
| reason = "arch not XT/XE"; |
| } |
| } |
| |
| /* Base state entirely derives from ALPS |
| * NOTE: The node bitmaps are not defined when this code is |
| * initially executed. */ |
| node_ptr->node_state &= NODE_STATE_FLAGS; |
| if (reason) { |
| if (node_ptr->down_time == 0) |
| node_ptr->down_time = now; |
| if (IS_NODE_DOWN(node_ptr)) { |
| /* node still down */ |
| debug("Initial DOWN node %s - %s", |
| node_ptr->name, node_ptr->reason); |
| } else if (slurmctld_conf.slurmd_timeout && |
| ((now - node_ptr->down_time) < |
| slurmctld_conf.slurmd_timeout)) { |
| node_ptr->node_state |= NODE_STATE_NO_RESPOND; |
| } else { |
| info("Initial DOWN node %s - %s", |
| node_ptr->name, reason); |
| node_ptr->reason = xstrdup(reason); |
| /* Node state flags preserved above */ |
| node_ptr->node_state |= NODE_STATE_DOWN; |
| clusteracct_storage_g_node_down(acct_db_conn, |
| node_ptr, |
| now, NULL, |
| slurm_get_slurm_user_id()); |
| } |
| } else { |
| bool node_up_flag = IS_NODE_DOWN(node_ptr) && |
| !IS_NODE_DRAIN(node_ptr) && |
| !IS_NODE_FAIL(node_ptr); |
| node_ptr->down_time = 0; |
| if (node_is_allocated(node)) |
| node_ptr->node_state |= NODE_STATE_ALLOCATED; |
| else |
| node_ptr->node_state |= NODE_STATE_IDLE; |
| node_ptr->node_state &= (~NODE_STATE_NO_RESPOND); |
| xfree(node_ptr->reason); |
| if (node_up_flag) { |
| info("ALPS returned node %s to service", |
| node_ptr->name); |
| clusteracct_storage_g_node_up(acct_db_conn, |
| node_ptr, now); |
| } |
| } |
| |
| free_stmt_result(stmt); |
| } |
| |
| if (stmt_close(stmt)) |
| error("error closing statement: %s", mysql_stmt_error(stmt)); |
| cray_close_sdb(handle); |
| free_inv(inv); |
| |
| return SLURM_SUCCESS; |
| } |
| |
struct basil_accel_param* build_accel_param(struct job_record* job_ptr)
{
	int gpu_mem_req;
	struct basil_accel_param* head,* bap_ptr;

	/* Check job_ptr before dereferencing it for the gres lookup */
	if (!job_ptr) {
		info("The job_ptr is NULL; nothing to do!");
		return NULL;
	} else if (!job_ptr->details) {
		info("The job_ptr->details is NULL; nothing to do!");
		return NULL;
	}

	gpu_mem_req = gres_plugin_get_job_value_by_type(job_ptr->gres_list,
							"gpu_mem");
	if (gpu_mem_req == NO_VAL)
		gpu_mem_req = 0;
| |
| head = xmalloc(sizeof(struct basil_accel_param)); |
| bap_ptr = head; |
| bap_ptr->type = BA_GPU; /* Currently BASIL only permits |
| * generic resources of type GPU. */ |
| bap_ptr->memory_mb = gpu_mem_req; |
| bap_ptr->next = NULL; |
| |
| return head; |
| } |
| |
| |
| /** |
| * do_basil_reserve - create a BASIL reservation. |
| * IN job_ptr - pointer to job which has just been allocated resources |
| * RET 0 or error code, job will abort or be requeued on failure |
| */ |
| extern int do_basil_reserve(struct job_record *job_ptr) |
| { |
| struct nodespec *ns_head = NULL; |
	uint32_t mppwidth = 0, mppdepth, mppnppn;
	/* mppmem must be at least 1 for gang scheduling to work; if
	 * gang scheduling is not working, check slurm.conf for
	 * DefMemPerNode. */
| uint32_t mppmem = 0, node_min_mem = 0; |
| uint32_t resv_id; |
| int i, first_bit, last_bit; |
| long rc; |
| char *user, batch_id[16]; |
| struct basil_accel_param* bap; |
| |
| if (!job_ptr->job_resrcs || job_ptr->job_resrcs->nhosts == 0) |
| return SLURM_SUCCESS; |
| |
| debug3("job #%u: %u nodes = %s, cpus=%u" , job_ptr->job_id, |
| job_ptr->job_resrcs->nhosts, |
| job_ptr->job_resrcs->nodes, |
| job_ptr->job_resrcs->ncpus |
| ); |
| |
| if (job_ptr->job_resrcs->node_bitmap == NULL) { |
| error("job %u node_bitmap not set", job_ptr->job_id); |
| return SLURM_SUCCESS; |
| } |
| |
| first_bit = bit_ffs(job_ptr->job_resrcs->node_bitmap); |
| last_bit = bit_fls(job_ptr->job_resrcs->node_bitmap); |
| if (first_bit == -1 || last_bit == -1) |
| return SLURM_SUCCESS; /* no nodes allocated */ |
| |
| mppdepth = MAX(1, job_ptr->details->cpus_per_task); |
| mppnppn = job_ptr->details->ntasks_per_node; |
| |
	/* mppmem: per-PE memory limit, in MB */
| if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { |
| /* Only honour --mem-per-cpu if --ntasks has been given */ |
| if (job_ptr->details->num_tasks) |
| mppmem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU; |
| } else if (job_ptr->details->pn_min_memory) { |
| node_min_mem = job_ptr->details->pn_min_memory; |
| } |
| |
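	/*
	 * Build the ALPS nodespec list and, when a per-node memory limit
	 * was requested, derive the smallest per-PE memory value seen on
	 * any allocated node.
	 */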
| for (i = first_bit; i <= last_bit; i++) { |
| struct node_record *node_ptr = node_record_table_ptr + i; |
| uint32_t basil_node_id; |
| |
| if (!bit_test(job_ptr->job_resrcs->node_bitmap, i)) |
| continue; |
| |
| if (!node_ptr->name || node_ptr->name[0] == '\0') |
| continue; /* bad node */ |
| |
| if (sscanf(node_ptr->name, "nid%05u", &basil_node_id) != 1) |
| fatal("can not read basil_node_id from %s", |
| node_ptr->name); |
| |
| if (ns_add_node(&ns_head, basil_node_id, false) != 0) { |
| error("can not add node %s (nid%05u)", node_ptr->name, |
| basil_node_id); |
| free_nodespec(ns_head); |
| return SLURM_ERROR; |
| } |
| |
| if (node_min_mem) { |
| uint32_t node_cpus, node_mem; |
| int32_t tmp_mppmem; |
| |
| if (slurmctld_conf.fast_schedule) { |
| node_cpus = node_ptr->config_ptr->cpus; |
| node_mem = node_ptr->config_ptr->real_memory; |
| } else { |
| node_cpus = node_ptr->cpus; |
| node_mem = node_ptr->real_memory; |
| } |
| /* |
| * ALPS 'Processing Elements per Node' value (aprun -N), |
| * which in slurm is --ntasks-per-node and 'mppnppn' in |
| * PBS: if --ntasks is specified, default to the number |
| * of cores per node (also the default for 'aprun -N'). |
| * On a heterogeneous system the nodes aren't |
| * always the same so keep track of the lowest |
| * mppmem and use it as the level for all |
| * nodes (mppmem is 0 when coming in). |
| */ |
| node_mem /= mppnppn ? mppnppn : node_cpus; |
| tmp_mppmem = node_min_mem = MIN(node_mem, node_min_mem); |
| |
			/* If the result is zero or negative, use at least
			 * 1 MB, since mppmem = 0 means "give all of the
			 * node's memory to the job".
			 */
| if (tmp_mppmem <= 0) |
| tmp_mppmem = 1; |
| |
| if (mppmem) |
| mppmem = MIN(mppmem, tmp_mppmem); |
| else |
| mppmem = tmp_mppmem; |
| } |
| } |
| |
	/* mppwidth: total number of PEs, summed over all allocated nodes */
| for (i = 0; i < job_ptr->job_resrcs->nhosts; i++) { |
| uint16_t node_tasks = job_ptr->job_resrcs->cpus[i] / mppdepth; |
| |
| if (mppnppn && mppnppn < node_tasks) |
| node_tasks = mppnppn; |
| mppwidth += node_tasks; |
| } |
| |
| snprintf(batch_id, sizeof(batch_id), "%u", job_ptr->job_id); |
| user = uid_to_string(job_ptr->user_id); |
| |
| if (job_ptr->gres_list) |
| bap = build_accel_param(job_ptr); |
| else |
| bap = NULL; |
| |
| rc = basil_reserve(user, batch_id, mppwidth, mppdepth, mppnppn, |
| mppmem, ns_head, bap); |
| xfree(user); |
| if (rc <= 0) { |
| /* errno value will be resolved by select_g_job_begin() */ |
| errno = is_transient_error(rc) ? EAGAIN : ECONNABORTED; |
| return SLURM_ERROR; |
| } |
| |
| resv_id = rc; |
| if (_set_select_jobinfo(job_ptr->select_jobinfo->data, |
| SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { |
| /* |
| * This is a fatal error since it means we will not be able to |
| * confirm the reservation; no step will be able to run in it. |
| */ |
| error("job %u: can not set resId %u", job_ptr->job_id, resv_id); |
| basil_release(resv_id); |
| return SLURM_ERROR; |
| } |
| if (mppmem) |
| job_ptr->details->pn_min_memory = mppmem | MEM_PER_CPU; |
| |
| info("ALPS RESERVATION #%u, JobId %u: BASIL -n %d -N %d -d %d -m %d", |
| resv_id, job_ptr->job_id, mppwidth, mppnppn, mppdepth, mppmem); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /** |
| * do_basil_confirm - confirm an existing BASIL reservation. |
 * This requires the alloc_sid to equal the session ID (getsid()) of the
 * process executing the aprun/mpirun commands.
| * Returns: SLURM_SUCCESS if ok, READY_JOB_ERROR/FATAL on transient/fatal error. |
| */ |
| extern int do_basil_confirm(struct job_record *job_ptr) |
| { |
| uint32_t resv_id; |
| uint64_t pagg_id; |
| |
| if (_get_select_jobinfo(job_ptr->select_jobinfo->data, |
| SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { |
| error("can not read resId for JobId=%u", job_ptr->job_id); |
| } else if (resv_id == 0) { |
| /* On Cray XT/XE, a reservation ID of 0 is always invalid. */ |
| error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id); |
| } else if (_get_select_jobinfo(job_ptr->select_jobinfo->data, |
| SELECT_JOBDATA_PAGG_ID, &pagg_id) != SLURM_SUCCESS) { |
| error("can not read pagg ID for JobId=%u", job_ptr->job_id); |
| } else { |
| int rc; |
| |
| if (pagg_id == 0) { |
| #ifdef HAVE_REAL_CRAY |
| /* This fallback case is for interactive jobs only */ |
| error("JobId %u has no pagg ID, falling back to SID", |
| job_ptr->job_id); |
| #endif |
| pagg_id = job_ptr->alloc_sid; |
| } |
| |
| rc = basil_confirm(resv_id, job_ptr->job_id, pagg_id); |
| if (rc == 0) { |
| debug2("confirmed ALPS resId %u for JobId %u, pagg " |
| "%"PRIu64"", resv_id, job_ptr->job_id, pagg_id); |
| return SLURM_SUCCESS; |
| } else if (rc == -BE_NO_RESID) { |
| /* |
| * If ALPS can not find the reservation ID we are trying |
| * to confirm, it may be that the job has already been |
| * canceled, or that the reservation has timed out after |
| * waiting for the confirmation. |
| * It is more likely that this error occurs on a per-job |
| * basis, hence in this case do not drain frontend node. |
| */ |
| error("JobId %u has invalid ALPS resId %u - job " |
| "already canceled?", job_ptr->job_id, resv_id); |
| return SLURM_SUCCESS; |
| } else { |
| error("confirming ALPS resId %u of JobId %u FAILED: %s", |
| resv_id, job_ptr->job_id, basil_strerror(rc)); |
| |
| if (is_transient_error(rc)) |
| return READY_JOB_ERROR; |
| } |
| } |
| return READY_JOB_FATAL; |
| } |
| |
| /** |
| * do_basil_signal - pass job signal on to any APIDs |
| * IN job_ptr - job to be signalled |
| * IN signal - signal(7) number |
| * Only signal job if an ALPS reservation exists (non-0 reservation ID). |
| */ |
| extern int do_basil_signal(struct job_record *job_ptr, int signal) |
| { |
| uint32_t resv_id; |
| |
| if (_get_select_jobinfo(job_ptr->select_jobinfo->data, |
| SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { |
| error("can not read resId for JobId=%u", job_ptr->job_id); |
| } else if (resv_id != 0) { |
| int rc = basil_signal_apids(resv_id, signal, NULL); |
| |
| if (rc) |
| error("could not signal APIDs of resId %u: %s", resv_id, |
| basil_strerror(rc)); |
| } |
| return SLURM_SUCCESS; |
| } |
| |
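/* Thread entry point: sleep for the requested delay, then signal all APIDs */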
static void *_sig_basil(void *args)
| { |
| args_sig_basil_t *args_sig_basil = (args_sig_basil_t *) args; |
| int rc; |
| |
| sleep(args_sig_basil->delay); |
| rc = basil_signal_apids(args_sig_basil->resv_id, |
| args_sig_basil->signal, NULL); |
| if (rc) { |
| error("could not signal APIDs of resId %u: %s", |
| args_sig_basil->resv_id, basil_strerror(rc)); |
| } |
| xfree(args); |
| return NULL; |
| } |
| |
| /** |
| * queue_basil_signal - queue job signal on to any APIDs |
| * IN job_ptr - job to be signalled |
| * IN signal - signal(7) number |
| * IN delay - how long to delay the signal, in seconds |
| * Only signal job if an ALPS reservation exists (non-0 reservation ID). |
| */ |
| extern void queue_basil_signal(struct job_record *job_ptr, int signal, |
| uint16_t delay) |
| { |
| args_sig_basil_t *args_sig_basil; |
| pthread_attr_t attr_sig_basil; |
| pthread_t thread_sig_basil; |
| uint32_t resv_id; |
| |
| if (_get_select_jobinfo(job_ptr->select_jobinfo->data, |
| SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { |
| error("can not read resId for JobId=%u", job_ptr->job_id); |
| return; |
| } |
| if (resv_id == 0) |
| return; |
| if ((delay == 0) || (delay == (uint16_t) NO_VAL)) { |
| /* Send the signal now */ |
| int rc = basil_signal_apids(resv_id, signal, NULL); |
| |
| if (rc) |
| error("could not signal APIDs of resId %u: %s", resv_id, |
| basil_strerror(rc)); |
| return; |
| } |
| |
| /* Create a thread to send the signal later */ |
| slurm_attr_init(&attr_sig_basil); |
| if (pthread_attr_setdetachstate(&attr_sig_basil, |
| PTHREAD_CREATE_DETACHED)) { |
| error("pthread_attr_setdetachstate error %m"); |
| return; |
| } |
| args_sig_basil = xmalloc(sizeof(args_sig_basil_t)); |
| args_sig_basil->resv_id = resv_id; |
| args_sig_basil->signal = signal; |
| args_sig_basil->delay = delay; |
	if (pthread_create(&thread_sig_basil, &attr_sig_basil,
			   _sig_basil, (void *) args_sig_basil)) {
		error("pthread_create error %m");
		xfree(args_sig_basil);
	}
	slurm_attr_destroy(&attr_sig_basil);
| } |
| |
| /** |
| * do_basil_release - release an (unconfirmed) BASIL reservation |
| * IN job_ptr - pointer to job which has just been deallocated resources |
| * RET see below |
| */ |
| extern int do_basil_release(struct job_record *job_ptr) |
| { |
| uint32_t resv_id; |
| |
| if (_get_select_jobinfo(job_ptr->select_jobinfo->data, |
| SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { |
| error("can not read resId for JobId=%u", job_ptr->job_id); |
| } else if (resv_id && basil_release(resv_id) == 0) { |
| /* The resv_id is non-zero only if the job is or was running. */ |
| debug("released ALPS resId %u for JobId %u", |
| resv_id, job_ptr->job_id); |
| } |
| /* |
| * Error handling: we only print out the errors (basil_release does this |
| * internally), but do not signal error to select_g_job_fini(). Calling |
| * contexts of this function (deallocate_nodes, batch_finish) only print |
| * additional error text: no further action is taken at this stage. |
| */ |
| return SLURM_SUCCESS; |
| } |
| |
| /** |
| * do_basil_switch - suspend/resume BASIL reservation |
| * IN job_ptr - pointer to job which has just been deallocated resources |
| * IN suspend - to suspend or not to suspend |
| * RET see below |
| */ |
| extern int do_basil_switch(struct job_record *job_ptr, bool suspend) |
| { |
| uint32_t resv_id; |
| |
| if (_get_select_jobinfo(job_ptr->select_jobinfo->data, |
| SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { |
| error("can not read resId for JobId=%u", job_ptr->job_id); |
| } else if (resv_id && basil_switch(resv_id, suspend) == 0) { |
| /* The resv_id is non-zero only if the job is or was running. */ |
| debug("%s ALPS resId %u for JobId %u", |
| suspend ? "Suspended" : "Resumed", |
| resv_id, job_ptr->job_id); |
| } |
| return SLURM_SUCCESS; |
| } |