| /*****************************************************************************\ |
| * basil_interface.c - slurmctld interface to BASIL, Cray's Batch Application |
| * Scheduler Interface Layer (BASIL). In order to support development, |
| * these functions will provide basic BASIL-like functionality even |
| * without a BASIL command being present. |
| ***************************************************************************** |
| * Copyright (C) 2009 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <https://computing.llnl.gov/linux/slurm/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| /* FIXME: Document, ALPS must be started before SLURM */ |
| /* FIXME: Document BASIL_RESERVATION_ID env var */ |
| |
| #if HAVE_CONFIG_H |
| # include "config.h" |
| #endif /* HAVE_CONFIG_H */ |
| |
| #include <slurm/slurm_errno.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "src/common/log.h" |
| #include "src/common/node_select.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| #include "src/slurmctld/basil_interface.h" |
| #include "src/slurmctld/slurmctld.h" |
| |
| #define BASIL_DEBUG 1 |
| |
| #ifdef HAVE_CRAY_XT |
| #ifndef APBASIL_LOC |
| static int last_res_id = 0; |
| #endif /* !APBASIL_LOC */ |
| |
| #ifdef APBASIL_LOC |
| /* Make sure that each SLURM node has a BASIL node ID */ |
| static void _validate_basil_node_id(void) |
| { |
| int i; |
| struct node_record *node_ptr = node_record_table_ptr; |
| |
| for (i=0; i<node_record_cnt; i++, node_ptr++) |
| if (node_ptr->basil_node_id != NO_VAL) |
| continue; |
| if (IS_NODE_DOWN(node_ptr)) |
| continue; |
| |
| error("Node %s has no basil node_id", node_ptr->name); |
| last_node_update = time(NULL); |
| set_node_down(node_ptr->name, "No BASIL node_id"); |
| } |
| } |
| #endif /* APBASIL_LOC */ |
| #endif /* HAVE_CRAY_XT */ |
| |
| /* |
| * basil_query - Query BASIL for node and reservation state. |
| * Execute once at slurmctld startup and periodically thereafter. |
| * RET 0 or error code |
| */ |
| extern int basil_query(void) |
| { |
| int error_code = SLURM_SUCCESS; |
| #ifdef HAVE_CRAY_XT |
| #ifdef APBASIL_LOC |
| struct config_record *config_ptr; |
| struct node_record *node_ptr; |
| struct job_record *job_ptr; |
| ListIterator job_iterator; |
| int i; |
| char *reason, *res_id; |
| static bool first_run = true; |
| |
| /* Issue the BASIL QUERY request */ |
| if (request_failure) { |
| fatal("basil query error: %s", "TBD"); |
| return SLURM_ERROR; |
| } |
| debug("basil query initiated"); |
| |
| if (first_run) { |
| /* Set basil_node_id to NO_VAL since the default value |
| * of zero is a valid BASIL node ID */ |
| node_ptr = node_record_table_ptr; |
| for (i=0; i<node_record_cnt; i++, node_ptr++) |
| node_ptr->basil_node_id = NO_VAL; |
| first_run = false; |
| } |
| |
| /* Validate configuration for each node that BASIL reports */ |
| for (each_basil_node) { |
| #if BASIL_DEBUG |
| /* Log node state according to BASIL */ |
| info("basil query: name=%s arch=%s", |
| basil_node_name, basil_node_arch, etc.); |
| #endif /* BASIL_DEBUG */ |
| |
| /* NOTE: Cray should provide X-, Y- and Z-coordinates |
| * in the future. When that happens, we'll want to use |
| * those numbers to generate the hostname: |
| * slurm_host_name = xmalloc(sizeof(conf->node_prefix) + 4); |
| * sprintf(slurm_host_name: %s%d%d%d", basil_node_name, X,Y,Z); |
| * Until then the node name must contain a 3-digit numberic |
| * suffix specifying the X-, Y- and Z-coordinates. |
| */ |
| node_ptr = find_node_record(basil_node_name); |
| if (node_ptr == NULL) { |
| error("basil node %s not found in slurm", |
| basil_node_name); |
| continue; |
| } |
| |
| /* Record BASIL's node_id for use in reservations */ |
| node_ptr->basil_node_id = basil_node_id; |
| |
| /* Update architecture in slurmctld's node record */ |
| if (node_ptr->arch == NULL) { |
| xfree(node_ptr->arch); |
| node_ptr->arch = xstrdup(basil_node_arch); |
| } |
| |
| /* Update slurmctld's node state if necessary */ |
| reason = NULL; |
| if (!IS_NODE_DOWN(node_ptr)) { |
| if (strcmp(basil_state, "UP")) |
| reason = "basil state not UP"; |
| else if (strcmp(basil_role, "BATCH")) |
| reason = "basil role not BATCH"; |
| } |
| |
| /* Calculate the total count of processors and |
| * MB of memory on the node */ |
| config_ptr = node_ptr->config_ptr; |
| if ((slurmctld_conf.fast_schedule != 2) && |
| (basil_cpus < config_ptr->cpus)) { |
| error("Node %s has low cpu count %d", |
| node_ptr->name, basil_cpus); |
| reason = "Low CPUs"; |
| } |
| node_ptr->cpus = basil_cpus; |
| if ((slurmctld_conf.fast_schedule != 2) && |
| (basil_memory < config_ptr->real_memory)) { |
| error("Node %s has low real_memory size %d", |
| node_ptr->name, basil_memory); |
| reason = "Low RealMemory"; |
| } |
| node_ptr->real_memory = basil_memory; |
| |
| if (reason) { |
| last_node_update = time(NULL); |
| set_node_down(node_ptr->name, reason); |
| } |
| } |
| _validate_basil_node_id(); |
| |
| /* Confirm that each BASIL reservation is still valid, |
| * purge vestigial reservations */ |
| for (each_basil_reservation) { |
| bool found = false; |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) |
| list_next(job_iterator))) { |
| select_g_get_jobinfo(job_ptr->select_jobinfo, |
| SELECT_DATA_RESV_ID, &res_id); |
| found = !strcmp(res_id, basil_reservation_id); |
| xfree(res_id); |
| if (found) |
| break; |
| } |
| list_iterator_destroy(job_iterator); |
| if (found) { |
| error("vestigial basil reservation %s being removed", |
| basil_reservation_id); |
| basil_dealloc(basil_reservation_id); |
| } |
| } |
| #else |
| struct job_record *job_ptr; |
| ListIterator job_iterator; |
| char *res_id, *tmp; |
| int job_res_id; |
| |
| /* Capture the highest reservation ID recorded to avoid re-use */ |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| res_id = NULL; |
| select_g_get_jobinfo(job_ptr->select_jobinfo, |
| SELECT_DATA_RESV_ID, &res_id); |
| if (res_id) { |
| tmp = strchr(res_id, '_'); |
| if (tmp) { |
| job_res_id = atoi(tmp+1); |
| last_res_id = MAX(last_res_id, job_res_id); |
| } |
| xfree(res_id); |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| debug("basil_query() executed, last_res_id=%d", last_res_id); |
| #endif /* APBASIL_LOC */ |
| #endif /* HAVE_CRAY_XT */ |
| |
| return error_code; |
| } |
| |
| /* |
| * basil_reserve - create a BASIL reservation. |
| * IN job_ptr - pointer to job which has just been allocated resources |
| * RET 0 or error code, job will abort or be requeued on failure |
| */ |
| extern int basil_reserve(struct job_record *job_ptr) |
| { |
| int error_code = SLURM_SUCCESS; |
| #ifdef HAVE_CRAY_XT |
| #ifdef APBASIL_LOC |
| /* Issue the BASIL RESERVE request */ |
| if (request_failure) { |
| error("basil reserve error: %s", "TBD"); |
| return SLURM_ERROR; |
| } |
| select_g_set_jobinfo(job_ptr->select_jobinfo, |
| SELECT_DATA_RESV_ID, reservation_id); |
| debug("basil reservation made job_id=%u resv_id=%s", |
| job_ptr->job_id, reservation_id); |
| #else |
| char reservation_id[32]; |
| snprintf(reservation_id, sizeof(reservation_id), |
| "resv_%d", ++last_res_id); |
| select_g_set_jobinfo(job_ptr->select_jobinfo, |
| SELECT_DATA_RESV_ID, reservation_id); |
| debug("basil reservation made job_id=%u resv_id=%s", |
| job_ptr->job_id, reservation_id); |
| #endif /* APBASIL_LOC */ |
| #endif /* HAVE_CRAY_XT */ |
| return error_code; |
| } |
| |
| /* |
| * basil_release - release a BASIL reservation by job. |
| * IN job_ptr - pointer to job which has just been deallocated resources |
| * RET 0 or error code |
| */ |
| extern int basil_release(struct job_record *job_ptr) |
| { |
| int error_code = SLURM_SUCCESS; |
| #ifdef HAVE_CRAY_XT |
| char *reservation_id = NULL; |
| select_g_get_jobinfo(job_ptr->select_jobinfo, |
| SELECT_DATA_RESV_ID, &reservation_id); |
| if (reservation_id) { |
| error_code = basil_release_id(reservation_id); |
| xfree(reservation_id); |
| } |
| #endif /* HAVE_CRAY_XT */ |
| return error_code; |
| } |
| |
| /* |
| * basil_release_id - release a BASIL reservation by ID. |
| * IN reservation_id - ID of reservation to release |
| * RET 0 or error code |
| */ |
| extern int basil_release_id(char *reservation_id) |
| { |
| int error_code = SLURM_SUCCESS; |
| #ifdef HAVE_CRAY_XT |
| #ifdef APBASIL_LOC |
| /* Issue the BASIL RELEASE request */ |
| if (request_failure) { |
| error("basil release of %s error: %s", reservation_id, "TBD"); |
| return SLURM_ERROR; |
| } |
| debug("basil release of reservation %s complete", reservation_id); |
| #else |
| debug("basil release of reservation %s complete", reservation_id); |
| #endif /* APBASIL_LOC */ |
| #endif /* HAVE_CRAY_XT */ |
| return error_code; |
| } |