| /*****************************************************************************\ |
| * slurm_acct_gather_energy.c - implementation-independent job energy |
| * accounting plugin definitions |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * Copyright (C) 2012 Bull-HN-PHX. |
| * Written by Bull-HN-PHX/d.rusak |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #ifdef HAVE_CONFIG_H |
| # include "config.h" |
| #endif |
| |
| #include <pthread.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #if HAVE_SYS_PRCTL_H |
| # include <sys/prctl.h> |
| #endif |
| |
| #include "src/common/macros.h" |
| #include "src/common/plugin.h" |
| #include "src/common/plugrack.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| #include "src/interfaces/acct_gather_energy.h" |
| #include "src/slurmd/slurmstepd/slurmstepd_job.h" |
| |
| /* |
| ** Define slurm-specific aliases for use by plugins, see slurm_xlator.h |
| ** for details. |
| */ |
| strong_alias(acct_gather_energy_destroy, slurm_acct_gather_energy_destroy); |
| |
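| /* |
| * Operations that every acct_gather_energy plugin must export. One |
| * entry of the ops[] array below is filled in for each loaded plugin. |
| */ |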
| typedef struct slurm_acct_gather_energy_ops { |
| int (*update_node_energy) (void); |
| int (*get_data) (enum acct_energy_type data_type, void *data); |
| int (*set_data) (enum acct_energy_type data_type, void *data); |
| void (*conf_options) (s_p_options_t **full_options, |
| int *full_options_cnt); |
| void (*conf_set) (int context_id_in, |
| s_p_hashtbl_t *tbl); |
| void (*conf_values) (list_t **data); |
| } slurm_acct_gather_energy_ops_t; |
| |
| /* |
| * These strings must be kept in the same order as the fields |
| * declared in slurm_acct_gather_energy_ops_t above. |
| */ |
| static const char *syms[] = { |
| "acct_gather_energy_p_update_node_energy", |
| "acct_gather_energy_p_get_data", |
| "acct_gather_energy_p_set_data", |
| "acct_gather_energy_p_conf_options", |
| "acct_gather_energy_p_conf_set", |
| "acct_gather_energy_p_conf_values", |
| }; |
| |
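| /* |
| * ops[] and g_context[] are parallel arrays holding one entry per |
| * plugin listed in slurm_conf.acct_gather_energy_type; g_context_num |
| * is their length (-1 until acct_gather_energy_init() has run). They |
| * are protected by g_context_lock. |
| */ |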
| static slurm_acct_gather_energy_ops_t *ops; |
| static plugin_context_t **g_context = NULL; |
| static int g_context_num = -1; |
| static pthread_mutex_t g_context_lock = PTHREAD_MUTEX_INITIALIZER; |
| static bool init_run = false; |
| static bool acct_shutdown = true; |
| static int freq = 0; |
| static pthread_t watch_node_thread_id = 0; |
| static acct_gather_profile_timer_t *profile_timer = |
| &acct_gather_profile_timer[PROFILE_ENERGY]; |
| |
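| /* |
| * Energy polling thread. On each pass it asks every loaded plugin to |
| * record a profiling sample (ENERGY_DATA_PROFILE), then sleeps until |
| * the PROFILE_ENERGY timer signals its condition variable. The loop |
| * ends when init_run is cleared or acct_gather_profile_test() returns |
| * false. |
| */ |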
| static void *_watch_node(void *arg) |
| { |
| int delta = profile_timer->freq - 1; |
| |
| #if HAVE_SYS_PRCTL_H |
| if (prctl(PR_SET_NAME, "acctg_energy", NULL, NULL, NULL) < 0) { |
| error("%s: cannot set my name to %s: %m", |
| __func__, "acctg_energy"); |
| } |
| #endif |
| |
| while (init_run && acct_gather_profile_test()) { |
| /* Do this until shutdown is requested */ |
| slurm_mutex_lock(&g_context_lock); |
| for (int i = 0; i < g_context_num; i++) { |
| if (!g_context[i]) |
| continue; |
| (*(ops[i].set_data))(ENERGY_DATA_PROFILE, &delta); |
| } |
| slurm_mutex_unlock(&g_context_lock); |
| |
| slurm_mutex_lock(&profile_timer->notify_mutex); |
| slurm_cond_wait(&profile_timer->notify, |
| &profile_timer->notify_mutex); |
| slurm_mutex_unlock(&profile_timer->notify_mutex); |
| } |
| |
| return NULL; |
| } |
| |
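| /* |
| * Load every plugin listed in slurm_conf.acct_gather_energy_type (a |
| * comma-separated list) and resolve its symbol table. Only the first |
| * call does any work; later calls return immediately. |
| */ |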
| extern int acct_gather_energy_init(void) |
| { |
| int retval = SLURM_SUCCESS; |
| char *plugin_type = "acct_gather_energy"; |
| char *full_plugin_type = NULL; |
| char *last = NULL, *plugin_entry, *type = NULL; |
| |
| slurm_mutex_lock(&g_context_lock); |
| |
| if (g_context_num >= 0) |
| goto done; |
| |
| g_context_num = 0; /* mark it before anything else */ |
| if (!slurm_conf.acct_gather_energy_type) |
| goto done; |
| |
| full_plugin_type = xstrdup(slurm_conf.acct_gather_energy_type); |
| plugin_entry = full_plugin_type; |
| while ((type = strtok_r(plugin_entry, ",", &last))) { |
| xrealloc(ops, sizeof(slurm_acct_gather_energy_ops_t) * |
| (g_context_num + 1)); |
| xrealloc(g_context, (sizeof(plugin_context_t *) * |
| (g_context_num + 1))); |
| if (!xstrncmp(type, "acct_gather_energy/", 19)) |
| type += 19; /* backward compatibility */ |
| type = xstrdup_printf("%s/%s", plugin_type, type); |
| g_context[g_context_num] = plugin_context_create( |
| plugin_type, type, (void **)&ops[g_context_num], |
| syms, sizeof(syms)); |
| if (!g_context[g_context_num]) { |
| error("cannot create %s context for %s", |
| plugin_type, type); |
| /* keep type for the fatal() message below */ |
| retval = SLURM_ERROR; |
| break; |
| } |
| |
| xfree(type); |
| g_context_num++; |
| plugin_entry = NULL; /* for next iteration */ |
| } |
| xfree(full_plugin_type); |
| init_run = true; |
| done: |
| slurm_mutex_unlock(&g_context_lock); |
| if (retval != SLURM_SUCCESS) |
| fatal("cannot open the %s plugin", type); |
| xfree(type); |
| |
| return retval; |
| } |
| |
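| /* |
| * Stop the polling thread, if one was started, and unload every plugin |
| * context. |
| */ |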
| extern int acct_gather_energy_fini(void) |
| { |
| int rc2, rc = SLURM_SUCCESS; |
| |
| slurm_mutex_lock(&g_context_lock); |
| |
| if (!init_run) { |
| slurm_mutex_unlock(&g_context_lock); |
| return SLURM_SUCCESS; |
| } |
| |
| init_run = false; |
| |
| if (watch_node_thread_id) { |
| slurm_mutex_unlock(&g_context_lock); |
| slurm_mutex_lock(&profile_timer->notify_mutex); |
| slurm_cond_signal(&profile_timer->notify); |
| slurm_mutex_unlock(&profile_timer->notify_mutex); |
| slurm_thread_join(watch_node_thread_id); |
| watch_node_thread_id = 0; |
| slurm_mutex_lock(&g_context_lock); |
| } |
| |
| for (int i = 0; i < g_context_num; i++) { |
| if (!g_context[i]) |
| continue; |
| |
| rc2 = plugin_context_destroy(g_context[i]); |
| if (rc2 != SLURM_SUCCESS) { |
| debug("%s: %s: %s", __func__, |
| g_context[i]->type, |
| slurm_strerror(rc2)); |
| rc = SLURM_ERROR; |
| } |
| } |
| |
| xfree(ops); |
| xfree(g_context); |
| g_context_num = -1; |
| |
| slurm_mutex_unlock(&g_context_lock); |
| |
| return rc; |
| } |
| |
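| /* |
| * Allocate an array of cnt zero-initialized energy records. Free it |
| * with acct_gather_energy_destroy(). |
| */ |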
| extern acct_gather_energy_t *acct_gather_energy_alloc(uint16_t cnt) |
| { |
| return xcalloc(cnt, sizeof(struct acct_gather_energy)); |
| } |
| |
| extern void acct_gather_energy_destroy(acct_gather_energy_t *energy) |
| { |
| xfree(energy); |
| } |
| |
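| /* |
| * Pack an energy record into buffer using the on-the-wire layout of |
| * protocol_version. A NULL energy pointer is packed as all zeroes. |
| */ |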
| extern void acct_gather_energy_pack(acct_gather_energy_t *energy, buf_t *buffer, |
| uint16_t protocol_version) |
| { |
| if (protocol_version >= SLURM_24_11_PROTOCOL_VERSION) { |
| if (!energy) { |
| pack64(0, buffer); |
| pack32(0, buffer); |
| pack64(0, buffer); |
| pack32(0, buffer); |
| pack64(0, buffer); |
| pack_time(0, buffer); |
| pack_time(0, buffer); |
| return; |
| } |
| |
| pack64(energy->base_consumed_energy, buffer); |
| pack32(energy->ave_watts, buffer); |
| pack64(energy->consumed_energy, buffer); |
| pack32(energy->current_watts, buffer); |
| pack64(energy->previous_consumed_energy, buffer); |
| pack_time(energy->poll_time, buffer); |
| pack_time(energy->slurmd_start_time, buffer); |
| } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| if (!energy) { |
| pack64(0, buffer); |
| pack32(0, buffer); |
| pack64(0, buffer); |
| pack32(0, buffer); |
| pack64(0, buffer); |
| pack_time(0, buffer); |
| return; |
| } |
| |
| pack64(energy->base_consumed_energy, buffer); |
| pack32(energy->ave_watts, buffer); |
| pack64(energy->consumed_energy, buffer); |
| pack32(energy->current_watts, buffer); |
| pack64(energy->previous_consumed_energy, buffer); |
| pack_time(energy->poll_time, buffer); |
| } |
| } |
| |
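| /* |
| * Reverse of acct_gather_energy_pack(). When need_alloc is true a new |
| * record is allocated and returned through *energy, otherwise the |
| * caller-supplied record is filled in. On unpack error the record is |
| * destroyed (or zeroed) and SLURM_ERROR is returned. |
| */ |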
| extern int acct_gather_energy_unpack(acct_gather_energy_t **energy, |
| buf_t *buffer, |
| uint16_t protocol_version, bool need_alloc) |
| { |
| acct_gather_energy_t *energy_ptr; |
| |
| if (need_alloc) { |
| energy_ptr = acct_gather_energy_alloc(1); |
| *energy = energy_ptr; |
| } else { |
| energy_ptr = *energy; |
| } |
| |
| if (protocol_version >= SLURM_24_11_PROTOCOL_VERSION) { |
| safe_unpack64(&energy_ptr->base_consumed_energy, buffer); |
| safe_unpack32(&energy_ptr->ave_watts, buffer); |
| safe_unpack64(&energy_ptr->consumed_energy, buffer); |
| safe_unpack32(&energy_ptr->current_watts, buffer); |
| safe_unpack64(&energy_ptr->previous_consumed_energy, buffer); |
| safe_unpack_time(&energy_ptr->poll_time, buffer); |
| safe_unpack_time(&energy_ptr->slurmd_start_time, buffer); |
| } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| safe_unpack64(&energy_ptr->base_consumed_energy, buffer); |
| safe_unpack32(&energy_ptr->ave_watts, buffer); |
| safe_unpack64(&energy_ptr->consumed_energy, buffer); |
| safe_unpack32(&energy_ptr->current_watts, buffer); |
| safe_unpack64(&energy_ptr->previous_consumed_energy, buffer); |
| safe_unpack_time(&energy_ptr->poll_time, buffer); |
| } |
| |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| if (need_alloc) { |
| acct_gather_energy_destroy(energy_ptr); |
| *energy = NULL; |
| } else |
| memset(energy_ptr, 0, sizeof(acct_gather_energy_t)); |
| |
| return SLURM_ERROR; |
| } |
| |
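| /* |
| * Call the update_node_energy operation of every loaded plugin. The |
| * return code of the last plugin called is returned. |
| */ |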
| extern int acct_gather_energy_g_update_node_energy(void) |
| { |
| int retval = SLURM_ERROR; |
| |
| xassert(g_context_num >= 0); |
| |
| if (!g_context_num) |
| return SLURM_SUCCESS; |
| |
| slurm_mutex_lock(&g_context_lock); |
| for (int i = 0; i < g_context_num; i++) { |
| if (!g_context[i]) |
| continue; |
| retval = (*(ops[i].update_node_energy))(); |
| } |
| slurm_mutex_unlock(&g_context_lock); |
| |
| return retval; |
| } |
| |
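| /* |
| * Gather data_type from every loaded plugin and accumulate the |
| * readings into *energy. With a single plugin the reading is fetched |
| * directly; with several, the energy and power fields are summed, |
| * plugins reporting no data (NO_VAL64) are skipped, and poll_time is |
| * the oldest value reported by any sensor. |
| */ |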
| extern int acct_gather_energy_g_get_sum(enum acct_energy_type data_type, |
| acct_gather_energy_t *energy) |
| { |
| int retval = SLURM_ERROR; |
| acct_gather_energy_t *e, *energy_array = NULL; |
| |
| xassert(g_context_num >= 0); |
| |
| if (!g_context_num) |
| return SLURM_SUCCESS; |
| |
| slurm_mutex_lock(&g_context_lock); |
| |
| if (g_context_num == 1) { |
| retval = (*(ops[0].get_data))(data_type, energy); |
| slurm_mutex_unlock(&g_context_lock); |
| return retval; |
| } |
| |
| energy_array = acct_gather_energy_alloc(g_context_num); |
| for (int i = 0; i < g_context_num; i++) { |
| if (!g_context[i]) |
| continue; |
| |
| e = &energy_array[i]; |
| |
| retval = (*(ops[i].get_data))(data_type, e); |
| if (retval != SLURM_SUCCESS || (e->consumed_energy == NO_VAL64)) |
| continue; |
| |
| energy->base_consumed_energy += e->base_consumed_energy; |
| energy->ave_watts += e->ave_watts; |
| energy->consumed_energy += e->consumed_energy; |
| energy->current_watts += e->current_watts; |
| energy->previous_consumed_energy += e->previous_consumed_energy; |
| |
| /* |
| * node poll_time is computed as the oldest poll_time of |
| * the sensors |
| */ |
| if (!energy->poll_time || (energy->poll_time > e->poll_time)) |
| energy->poll_time = e->poll_time; |
| } |
| slurm_mutex_unlock(&g_context_lock); |
| acct_gather_energy_destroy(energy_array); |
| |
| return retval; |
| } |
| |
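| /* |
| * Fetch data_type from a single plugin, identified by context_id (its |
| * index in the configured plugin list). |
| */ |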
| extern int acct_gather_energy_g_get_data(int context_id, |
| enum acct_energy_type data_type, |
| void *data) |
| { |
| int retval = SLURM_ERROR; |
| |
| xassert(g_context_num >= 0); |
| |
| if (!g_context_num) |
| return SLURM_SUCCESS; |
| |
| slurm_mutex_lock(&g_context_lock); |
| |
| xassert((context_id < g_context_num) && (context_id >= 0)); |
| xassert(g_context[context_id]); |
| |
| retval = (*(ops[context_id].get_data))(data_type, data); |
| |
| slurm_mutex_unlock(&g_context_lock); |
| |
| return retval; |
| } |
| |
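| /* |
| * Push data_type/data to every loaded plugin. The return code of the |
| * last plugin called is returned. |
| */ |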
| extern int acct_gather_energy_g_set_data(enum acct_energy_type data_type, |
| void *data) |
| { |
| int retval = SLURM_ERROR; |
| |
| xassert(g_context_num >= 0); |
| |
| if (!g_context_num) |
| return SLURM_SUCCESS; |
| |
| slurm_mutex_lock(&g_context_lock); |
| for (int i = 0; i < g_context_num; i++) { |
| if (!g_context[i]) |
| continue; |
| retval = (*(ops[i].set_data))(data_type, data); |
| } |
| slurm_mutex_unlock(&g_context_lock); |
| |
| return retval; |
| } |
| |
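| /* |
| * Start the background energy polling thread. A frequency of zero |
| * disables dynamic monitoring; calling this again while polling is |
| * already active only logs an error. |
| */ |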
| extern int acct_gather_energy_startpoll(uint32_t frequency) |
| { |
| int retval = SLURM_SUCCESS; |
| |
| xassert(g_context_num >= 0); |
| |
| if (!g_context_num) |
| return SLURM_SUCCESS; |
| |
| if (!acct_shutdown) { |
| error("%s: poll already started!", __func__); |
| return retval; |
| } |
| |
| acct_shutdown = false; |
| |
| freq = frequency; |
| |
| if (frequency == 0) { /* don't want dynamic monitoring? */ |
| debug2("%s: dynamic logging disabled", __func__); |
| return retval; |
| } |
| |
| /* create polling thread */ |
| slurm_thread_create(&watch_node_thread_id, _watch_node, NULL); |
| |
| debug3("%s: dynamic logging enabled", __func__); |
| |
| return retval; |
| } |
| |
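| /* |
| * Let every loaded plugin append its configuration file options to |
| * full_options. |
| */ |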
| extern int acct_gather_energy_g_conf_options(s_p_options_t **full_options, |
| int *full_options_cnt) |
| { |
| xassert(g_context_num >= 0); |
| |
| if (!g_context_num) |
| return SLURM_SUCCESS; |
| |
| slurm_mutex_lock(&g_context_lock); |
| for (int i = 0; i < g_context_num; i++) { |
| if (!g_context[i]) |
| continue; |
| (*(ops[i].conf_options))(full_options, full_options_cnt); |
| } |
| slurm_mutex_unlock(&g_context_lock); |
| return SLURM_SUCCESS; |
| } |
| |
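| /* |
| * Hand the parsed configuration table to every loaded plugin, passing |
| * each one its own context index. |
| */ |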
| extern int acct_gather_energy_g_conf_set(s_p_hashtbl_t *tbl) |
| { |
| xassert(g_context_num >= 0); |
| |
| if (!g_context_num) |
| return SLURM_SUCCESS; |
| |
| slurm_mutex_lock(&g_context_lock); |
| for (int i = 0; i < g_context_num; i++) { |
| if (!g_context[i]) |
| continue; |
| (*(ops[i].conf_set))(i, tbl); |
| } |
| slurm_mutex_unlock(&g_context_lock); |
| return SLURM_SUCCESS; |
| } |
| |
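| /* |
| * Let every loaded plugin append its current configuration values to |
| * the list passed in through data. |
| */ |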
| extern int acct_gather_energy_g_conf_values(void *data) |
| { |
| xassert(g_context_num >= 0); |
| |
| if (!g_context_num) |
| return SLURM_SUCCESS; |
| |
| slurm_mutex_lock(&g_context_lock); |
| for (int i = 0; i < g_context_num; i++) { |
| if (!g_context[i]) |
| continue; |
| (*(ops[i].conf_values))(data); |
| } |
| slurm_mutex_unlock(&g_context_lock); |
| return SLURM_SUCCESS; |
| } |