| /*****************************************************************************\ |
| * cpu_frequency.c - support for srun option --cpu-freq=<frequency> |
| ***************************************************************************** |
| * Copyright (C) 2012 Bull |
| * Written by Don Albert, <don.albert@bull.com> |
| * Modified by Rod Schultz, <rod.schultz@bull.com> for min-max:gov |
| * Modified by Janne Blomqvist, <janne.blomqvist@aalto.fi> for |
| * intel_pstate support |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include <ctype.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <limits.h> |
| #include <stdlib.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| |
| #include "slurm/slurm.h" |
| |
| #include "src/common/cpu_frequency.h" |
| #include "src/common/env.h" |
| #include "src/common/fd.h" |
| #include "src/common/log.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/slurm_protocol_defs.h" |
| #include "src/common/strlcpy.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| #include "src/common/read_config.h" |
| #include "src/common/slurm_resource_info.h" |
| #include "src/slurmd/slurmd/slurmd.h" |
| |
/* Root of the sysfs tree holding the per-CPU cpufreq directories */
#define PATH_TO_CPU	"/sys/devices/system/cpu/"
/* Size of the buffers used to read one line from a sysfs file */
#define LINE_LEN	100
/* Number of slots in each CPU's available-frequency table */
#define FREQ_LIST_MAX	64
/* Size of the buffers that hold a governor name (NUL included) */
#define GOV_NAME_LEN	24

/* Bit flags stored in cpu_freq_data.avail_governors */
#define GOV_CONSERVATIVE	0x01
#define GOV_ONDEMAND		0x02
#define GOV_PERFORMANCE		0x04
#define GOV_POWERSAVE		0x08
#define GOV_USERSPACE		0x10
#define GOV_SCHEDUTIL		0x20

/* Number of entries in the cpufreq table; 0 means no table is in use */
static uint16_t cpu_freq_count = 0;
/*
 * Cached result of looking for "batch_step_set_cpu_freq" in the launch
 * parameters: -1 = not looked up yet, 0 = absent, 1 = present
 */
static int set_batch_freq = -1;

/*
 * Per-CPU frequency/governor record.
 * The table is transferred as raw bytes by cpu_freq_send_info() and
 * cpu_freq_recv_info(), so the member order and types must not change.
 */
static struct cpu_freq_data {
	uint8_t  avail_governors;		/* GOV_* flags */
	uint8_t  nfreq;				/* used slots in avail_freq */
	bool     org_set;			/* org_* values were captured */
	uint32_t avail_freq[FREQ_LIST_MAX];	/* ascending frequency list */
	char     org_governor[GOV_NAME_LEN];	/* governor before any change */
	char     new_governor[GOV_NAME_LEN];	/* governor to apply, "" = none */
	uint32_t org_frequency;
	uint32_t new_frequency;			/* NO_VAL = nothing to apply */
	uint32_t org_min_freq;
	uint32_t new_min_freq;			/* NO_VAL = nothing to apply */
	uint32_t org_max_freq;
	uint32_t new_max_freq;			/* NO_VAL = nothing to apply */
} * cpufreq = NULL;
/* Private copy of the slurmd spool directory; owner lock files live below it */
static char *slurmd_spooldir = NULL;
| |
| static int _cpu_freq_cpu_avail(int cpx); |
| static int _cpu_freq_current_state(int cpx); |
| static uint16_t _cpu_freq_next_cpu(char **core_range, uint16_t *cpx, |
| uint16_t *start, uint16_t *end); |
| static uint32_t _cpu_freq_get_scaling_freq(int cpuidx, char* option); |
| static void _cpu_freq_init_data(int cpx); |
| static void _cpu_freq_setup_data(stepd_step_rec_t *step, int cpx); |
| static bool _cpu_freq_test_scaling_freq(int cpuidx, char *option); |
| static int _derive_avail_freq(int cpuidx); |
| static int _fd_lock_retry(int fd); |
| |
/*
 * Try to obtain a write lock on fd, making at most 10 attempts with a
 * 1000 usec pause before every attempt but the first.
 * Retrying only continues while errno is EACCES or EAGAIN (presumably the
 * lock is held by someone else); any other errno ends the attempts at once.
 * RET the value returned by the last fd_get_write_lock() call
 */
static int _fd_lock_retry(int fd)
{
	int attempts = 0;
	int rc;

	for (;;) {
		rc = fd_get_write_lock(fd);
		if (rc == 0)
			return rc;	/* lock obtained */
		if ((errno != EACCES) && (errno != EAGAIN))
			return rc;	/* not a contention error, give up */
		if (++attempts >= 10)
			return rc;	/* out of retries */
		usleep(1000);	/* 1000 usec */
	}
}
| |
| /* This set of locks it designed to prevent race conditions when changing |
| * CPU frequency or govorner. Specifically, when a job ends it should only |
| * reset CPU frequency if it was the last job to set the CPU frequency. |
| * with gang scheduling and cancellation of suspended or running jobs there |
| * can be timing issues. |
| * _set_cpu_owner_lock - set specified job to own the CPU, file locked at exit |
| * _test_cpu_owner_lock - test if the specified job owns the CPU |
| */ |
| static int _set_cpu_owner_lock(int cpu_id, uint32_t job_id) |
| { |
| char tmp[PATH_MAX]; |
| int fd; |
| |
| snprintf(tmp, sizeof(tmp), "%s/cpu", slurmd_spooldir); |
| if ((mkdir(tmp, 0700) != 0) && (errno != EEXIST)) { |
| error("mkdir failed: %m %s",tmp); |
| return -1; |
| } |
| snprintf(tmp, sizeof(tmp), "%s/cpu/%d", slurmd_spooldir, cpu_id); |
| fd = open(tmp, O_CREAT | O_RDWR, 0600); |
| if (fd < 0) { |
| error("%s: open: %m %s", __func__, tmp); |
| return fd; |
| } |
| if (_fd_lock_retry(fd) < 0) |
| error("%s: fd_get_write_lock: %m %s", __func__, tmp); |
| safe_write(fd, &job_id, sizeof(job_id)); |
| |
| return fd; |
| |
| rwfail: |
| error("%s: write: %m %s", __func__, tmp); |
| return fd; |
| } |
| |
| /* Test if specified job ID owns this CPU for frequency/governor control |
| * RET 0 if owner, -1 otherwise */ |
| static int _test_cpu_owner_lock(int cpu_id, uint32_t job_id) |
| { |
| char tmp[PATH_MAX]; |
| uint32_t in_job_id; |
| int fd; |
| |
| snprintf(tmp, sizeof(tmp), "%s/cpu", slurmd_spooldir); |
| if ((mkdir(tmp, 0700) != 0) && (errno != EEXIST)) { |
| error("%s: mkdir failed: %m %s", __func__, tmp); |
| return -1; |
| } |
| snprintf(tmp, sizeof(tmp), "%s/cpu/%d", slurmd_spooldir, cpu_id); |
| fd = open(tmp, O_RDWR, 0600); |
| if (fd < 0) { |
| if (errno != ENOENT) /* Race condition */ |
| error("%s: open: %m %s", __func__, tmp); |
| return -1; |
| } |
| if (_fd_lock_retry(fd) < 0) { |
| error("%s: fd_get_write_lock: %m %s", __func__, tmp); |
| close(fd); |
| return -1; |
| } |
| safe_read(fd, &in_job_id, sizeof(in_job_id)); |
| (void) fd_release_lock(fd); |
| |
| if (job_id != in_job_id) { |
| /* Result of various race conditions */ |
| debug("%s: CPU %d now owned by job %u rather than job %u", |
| __func__, cpu_id, in_job_id, job_id); |
| close(fd); |
| return -1; |
| } |
| close(fd); |
| debug2("%s: CPU %d owned by job %u as expected", |
| __func__, cpu_id, job_id); |
| |
| return 0; |
| |
| rwfail: |
| error("%s: read: %m %s", __func__, tmp); |
| (void) fd_release_lock(fd); |
| close(fd); |
| return -1; |
| } |
| |
| /* |
| * Try do build a table of available frequencies based upon the min/max values |
| */ |
| static int _derive_avail_freq(int cpuidx) |
| { |
| uint32_t min_freq, max_freq, delta_freq; |
| int i; |
| |
| min_freq = _cpu_freq_get_scaling_freq(cpuidx, "scaling_min_freq"); |
| if (min_freq == 0) |
| return SLURM_ERROR; |
| max_freq = _cpu_freq_get_scaling_freq(cpuidx, "scaling_max_freq"); |
| if (max_freq == 0) |
| return SLURM_ERROR; |
| delta_freq = (max_freq - min_freq) / (FREQ_LIST_MAX - 1); |
| for (i = 0; i < (FREQ_LIST_MAX - 1); i++) |
| cpufreq[cpuidx].avail_freq[i] = min_freq + (delta_freq * i); |
| cpufreq[cpuidx].avail_freq[FREQ_LIST_MAX - 1] = max_freq; |
| cpufreq[cpuidx].nfreq = FREQ_LIST_MAX; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Find available frequencies on this cpu |
| * IN cpuidx - cpu to query |
| * Return: SLURM_SUCCESS or SLURM_ERROR |
| * avail_freq array will be in strictly ascending order |
| */ |
| static int |
| _cpu_freq_cpu_avail(int cpuidx) |
| { |
| FILE *fp = NULL; |
| char path[PATH_MAX]; |
| int i, j, k; |
| uint32_t freq; |
| bool all_avail = false; |
| |
| snprintf(path, sizeof(path), PATH_TO_CPU |
| "cpu%u/cpufreq/scaling_available_frequencies", cpuidx); |
| if ( ( fp = fopen(path, "r") ) == NULL ) { |
| /* |
| * Don't log an error here, scaling_available_frequencies |
| * does not exist when using the intel_pstate driver. |
| * Derive values from min/max values |
| */ |
| return _derive_avail_freq(cpuidx); |
| } |
| for (i = 0; i < (FREQ_LIST_MAX-1); i++) { |
| if ( fscanf(fp, "%u", &freq) == EOF) { |
| all_avail = true; |
| break; |
| } |
| /* make sure list is sorted */ |
| for (j = 0; j < i; j++) { |
| if (freq < cpufreq[cpuidx].avail_freq[j]) { |
| for (k = i; k >= j; k--) { |
| cpufreq[cpuidx].avail_freq[k+1] = |
| cpufreq[cpuidx].avail_freq[k]; |
| } |
| break; |
| } |
| } |
| cpufreq[cpuidx].avail_freq[j] = freq; |
| } |
| cpufreq[cpuidx].nfreq = i; |
| fclose(fp); |
| if (!all_avail) |
| error("all available frequencies not scanned"); |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * called to check if the node supports setting CPU frequency |
| * if so, initialize fields in cpu_freq_data structure |
| */ |
| extern void |
| cpu_freq_init(slurmd_conf_t *conf) |
| { |
| char path[PATH_MAX]; |
| struct stat statbuf; |
| FILE *fp; |
| char value[LINE_LEN]; |
| unsigned int i, j; |
| |
| xfree(slurmd_spooldir); |
| slurmd_spooldir = xstrdup(conf->spooldir); |
| |
| if (running_in_slurmstepd()) |
| return; |
| |
| /* check for cpufreq support */ |
| if ( stat(PATH_TO_CPU "cpu0/cpufreq", &statbuf) != 0 ) { |
| info("CPU frequency setting not configured for this node"); |
| return; |
| } |
| |
| if (!S_ISDIR(statbuf.st_mode)) { |
| error(PATH_TO_CPU "cpu0/cpufreq not a directory"); |
| return; |
| } |
| |
| /* get the cpu frequency info into the cpu_freq_data structure */ |
| cpu_freq_count = conf->block_map_size; |
| if (!cpufreq) { |
| int cpuidx; |
| cpufreq = (struct cpu_freq_data *) |
| xmalloc(cpu_freq_count * |
| sizeof(struct cpu_freq_data)); |
| |
| for (cpuidx = 0; cpuidx < cpu_freq_count; cpuidx++) |
| _cpu_freq_init_data(cpuidx); |
| } |
| |
| debug2("Gathering cpu frequency information for %u cpus", |
| cpu_freq_count); |
| for (i = 0; i < cpu_freq_count; i++) { |
| snprintf(path, sizeof(path), |
| PATH_TO_CPU |
| "cpu%u/cpufreq/scaling_available_governors", i); |
| if ((fp = fopen(path, "r")) == NULL) |
| continue; |
| if (fgets(value, LINE_LEN, fp) == NULL) { |
| fclose(fp); |
| continue; |
| } |
| if (strstr(value, "conservative")) { |
| cpufreq[i].avail_governors |= GOV_CONSERVATIVE; |
| if (i == 0) |
| log_flag(CPU_FREQ, "cpu_freq: Conservative governor defined on cpu 0"); |
| } |
| if (strstr(value, "ondemand")) { |
| cpufreq[i].avail_governors |= GOV_ONDEMAND; |
| if (i == 0) |
| log_flag(CPU_FREQ, "cpu_freq: OnDemand governor defined on cpu 0"); |
| } |
| if (strstr(value, "performance")) { |
| cpufreq[i].avail_governors |= GOV_PERFORMANCE; |
| if (i == 0) |
| log_flag(CPU_FREQ, "cpu_freq: Performance governor defined on cpu 0"); |
| } |
| if (strstr(value, "powersave")) { |
| cpufreq[i].avail_governors |= GOV_POWERSAVE; |
| if (i == 0) |
| log_flag(CPU_FREQ, "cpu_freq: PowerSave governor defined on cpu 0"); |
| } |
| if (strstr(value, "userspace")) { |
| cpufreq[i].avail_governors |= GOV_USERSPACE; |
| if (i == 0) |
| log_flag(CPU_FREQ, "cpu_freq: UserSpace governor defined on cpu 0"); |
| } |
| if (strstr(value, "schedutil")) { |
| cpufreq[i].avail_governors |= GOV_SCHEDUTIL; |
| if (i == 0) |
| log_flag(CPU_FREQ, "cpu_freq: SchedUtil governor defined on cpu 0"); |
| } |
| fclose(fp); |
| if (_cpu_freq_cpu_avail(i) == SLURM_ERROR) |
| continue; |
| if ((i == 0) && |
| (slurm_conf.debug_flags & DEBUG_FLAG_CPU_FREQ)) { |
| for (j = 0; j < cpufreq[i].nfreq; j++) { |
| info("cpu_freq: frequency %u defined on cpu 0", |
| cpufreq[i].avail_freq[j]); |
| } |
| } |
| } |
| return; |
| } |
| |
| extern void |
| cpu_freq_fini(void) |
| { |
| xfree(cpufreq); |
| xfree(slurmd_spooldir); |
| } |
| |
| /* |
| * Send the cpu_frequency table info to slurmstepd |
| */ |
| extern void |
| cpu_freq_send_info(int fd) |
| { |
| if (cpu_freq_count) { |
| safe_write(fd, &cpu_freq_count, sizeof(uint16_t)); |
| safe_write(fd, cpufreq, |
| (cpu_freq_count * sizeof(struct cpu_freq_data))); |
| } else { |
| safe_write(fd, &cpu_freq_count, sizeof(uint16_t)); |
| } |
| return; |
| rwfail: |
| error("Unable to send CPU frequency information for %u CPUs", |
| cpu_freq_count); |
| return; |
| } |
| |
| |
| /* |
| * Receive the cpu_frequency table info from slurmd |
| */ |
| extern void |
| cpu_freq_recv_info(int fd) |
| { |
| safe_read(fd, &cpu_freq_count, sizeof(uint16_t)); |
| |
| if (cpu_freq_count) { |
| if (!cpufreq) { |
| cpufreq = (struct cpu_freq_data *) |
| xmalloc(cpu_freq_count * |
| sizeof(struct cpu_freq_data)); |
| } |
| safe_read(fd, cpufreq, |
| (cpu_freq_count * sizeof(struct cpu_freq_data))); |
| debug2("Received CPU frequency information for %u CPUs", |
| cpu_freq_count); |
| } |
| return; |
| rwfail: |
| error("Unable to receive CPU frequency information for %u CPUs", |
| cpu_freq_count); |
| cpu_freq_count = 0; |
| return; |
| } |
| |
| /* |
| * Validate the cpus and select the frequency to set |
| * Called from task/affinity code with task launch request containing |
| * a pointer to a hex map string of the cpus to be used by this step |
| */ |
| extern void |
| cpu_freq_cpuset_validate(stepd_step_rec_t *step) |
| { |
| int cpuidx, cpu_num; |
| bitstr_t *cpus_to_set; |
| bitstr_t *cpu_map; |
| char *cpu_bind; |
| char *cpu_str; |
| char *savestr = NULL; |
| char cpu_bind_type_string[128]; |
| |
| if (set_batch_freq == -1) { |
| if (xstrcasestr(slurm_conf.launch_params, |
| "batch_step_set_cpu_freq")) |
| set_batch_freq = 1; |
| else |
| set_batch_freq = 0; |
| } |
| |
| if (((step->step_id.step_id == SLURM_BATCH_SCRIPT) && |
| !set_batch_freq) || |
| (step->step_id.step_id == SLURM_INTERACTIVE_STEP) || |
| (step->step_id.step_id == SLURM_EXTERN_CONT)) |
| return; |
| |
| slurm_sprint_cpu_bind_type(cpu_bind_type_string, step->cpu_bind_type); |
| |
| log_flag(CPU_FREQ, "%s: request: min=(%12d %8x) max=(%12d %8x) governor=%8x", |
| __func__, step->cpu_freq_min, step->cpu_freq_min, |
| step->cpu_freq_max, step->cpu_freq_max, step->cpu_freq_gov); |
| log_flag(CPU_FREQ, " jobid=%u, stepid=%u, tasks=%u cpu/task=%u, cpus=%u", |
| step->step_id.job_id, step->step_id.step_id, |
| step->node_tasks, step->cpus_per_task, |
| step->cpus); |
| log_flag(CPU_FREQ, " cpu_bind_type=%4x, cpu_bind map=%s", |
| step->cpu_bind_type, step->cpu_bind); |
| |
| if (!cpu_freq_count) |
| return; |
| |
| if (step->cpu_bind == NULL) { |
| /* |
| * slurm_verify_cpu_bind will set cpu_bind to NULL for manual |
| * binding that doesn't require an argument |
| */ |
| if (!((step->cpu_bind_type & CPU_BIND_NONE) || |
| (step->cpu_bind_type & CPU_BIND_LDRANK))) |
| error("cpu_freq_cpuset_validate: cpu_bind string is null"); |
| return; |
| } |
| cpu_bind = xstrdup(step->cpu_bind); |
| |
| if ( (cpu_str = strtok_r(cpu_bind, ",", &savestr) ) == NULL) { |
| error("cpu_freq_cpuset_validate: cpu_bind string invalid"); |
| xfree(cpu_bind); |
| return; |
| } |
| |
| cpu_map = bit_alloc(cpu_freq_count); |
| cpus_to_set = bit_alloc(cpu_freq_count); |
| |
| do { |
| debug3(" cpu_str = %s", cpu_str); |
| |
| if ((step->cpu_bind_type & CPU_BIND_MAP) == CPU_BIND_MAP) { |
| cpu_num = atoi(cpu_str); |
| if (cpu_num >= cpu_freq_count) { |
| error("cpu_freq_cpuset_validate: invalid cpu " |
| "number %d", cpu_num); |
| FREE_NULL_BITMAP(cpu_map); |
| FREE_NULL_BITMAP(cpus_to_set); |
| xfree(cpu_bind); |
| return; |
| } |
| bit_set(cpu_map, (bitoff_t)cpu_num); |
| } else { |
| if (bit_unfmt_hexmask(cpu_map, cpu_str) == -1) { |
| error("cpu_freq_cpuset_validate: invalid cpu " |
| "mask %s", cpu_bind); |
| FREE_NULL_BITMAP(cpu_map); |
| FREE_NULL_BITMAP(cpus_to_set); |
| xfree(cpu_bind); |
| return; |
| } |
| } |
| bit_or(cpus_to_set, cpu_map); |
| } while ( (cpu_str = strtok_r(NULL, ",", &savestr) ) != NULL); |
| |
| for (cpuidx = 0; cpuidx < cpu_freq_count; cpuidx++) { |
| if (bit_test(cpus_to_set, cpuidx)) { |
| _cpu_freq_setup_data(step, cpuidx); |
| } |
| } |
| cpu_freq_set(step); |
| |
| FREE_NULL_BITMAP(cpu_map); |
| FREE_NULL_BITMAP(cpus_to_set); |
| xfree(cpu_bind); |
| return; |
| } |
| |
| /* |
| * Validate the cpus and select the frequency to set |
| * Called from task cgroup cpuset code with string containing |
| * the list of cpus to be used by this step |
| */ |
| extern void |
| cpu_freq_cgroup_validate(stepd_step_rec_t *step, char *step_alloc_cores) |
| { |
| uint16_t start = USHRT_MAX; |
| uint16_t end = USHRT_MAX; |
| uint16_t cpuidx = 0; |
| char *core_range; |
| |
| if (set_batch_freq == -1) { |
| if (xstrcasestr(slurm_conf.launch_params, |
| "batch_step_set_cpu_freq")) |
| set_batch_freq = 1; |
| else |
| set_batch_freq = 0; |
| } |
| |
| if (((step->step_id.step_id == SLURM_BATCH_SCRIPT) && |
| !set_batch_freq) || |
| (step->step_id.step_id == SLURM_INTERACTIVE_STEP) || |
| (step->step_id.step_id == SLURM_EXTERN_CONT)) |
| return; |
| |
| log_flag(CPU_FREQ, "%s: request: min=(%12d %8x) max=(%12d %8x) governor=%8x", |
| __func__, step->cpu_freq_min, step->cpu_freq_min, |
| step->cpu_freq_max, step->cpu_freq_max, step->cpu_freq_gov); |
| log_flag(CPU_FREQ, " jobid=%u, stepid=%u, tasks=%u cpu/task=%u, cpus=%u", |
| step->step_id.job_id, step->step_id.step_id, |
| step->node_tasks, step->cpus_per_task, |
| step->cpus); |
| log_flag(CPU_FREQ, " cpu_bind_type=%4x, cpu_bind map=%s", |
| step->cpu_bind_type, step->cpu_bind); |
| log_flag(CPU_FREQ, " step logical cores = %s, step physical cores = %s", |
| step->step_alloc_cores, step_alloc_cores); |
| |
| if (!cpu_freq_count) |
| return; |
| |
| /* set entries in cpu frequency table for this step's cpus */ |
| core_range = step_alloc_cores; |
| while ( (cpuidx = _cpu_freq_next_cpu(&core_range, &cpuidx, |
| &start, &end)) != USHRT_MAX) { |
| if (cpuidx >= cpu_freq_count) { |
| error("cpu_freq_validate: index %u exceeds cpu count %u", |
| cpuidx, cpu_freq_count); |
| return; |
| } |
| _cpu_freq_setup_data(step, cpuidx); |
| } |
| cpu_freq_set(step); |
| return; |
| } |
| |
/*
 * Parse the unsigned decimal number at *pp and advance *pp past its digits.
 * The result is capped at USHRT_MAX - 1 so that an oversized number can
 * neither overflow the accumulator nor collide with the USHRT_MAX
 * end-of-list sentinel. Returns 0 when *pp does not start with a digit.
 */
static uint16_t
_cpu_freq_parse_cpu_num(char **pp)
{
	char *p = *pp;
	uint32_t val = 0;

	/* <ctype.h> functions need a value representable as unsigned char */
	while (isdigit((unsigned char) *p)) {
		if (val < USHRT_MAX)
			val = (val * 10) + (uint32_t) (*p - '0');
		p++;
	}
	*pp = p;

	if (val >= USHRT_MAX)
		val = USHRT_MAX - 1;
	return (uint16_t) val;
}

/*
 * get the next number in a range
 * expects a list that is monotonically increasing, either comma separated
 * or dash separated: e.g., "4-6,8,10,13-15"
 *
 * IN/OUT core_range - parse position inside the list
 * IN     cpuidx     - value returned by the previous call
 * IN/OUT start, end - iteration state; both must be USHRT_MAX on first call
 * RET next cpu number, or USHRT_MAX when the list is exhausted
 *
 * Malformed input ends the iteration instead of producing made-up cpus:
 * a range whose end is not above its start ("4-4", "6-4") contributes only
 * its start, and parsing stops at a trailing comma or any other character
 * that is not part of a number.
 */
static uint16_t
_cpu_freq_next_cpu(char **core_range, uint16_t *cpuidx,
		   uint16_t *start, uint16_t *end)
{
	char *p = *core_range;
	uint16_t val;

	/* The previous call returned the first value of an element;
	 * find out whether that element is a range */
	if ((*start != USHRT_MAX) && (*end == USHRT_MAX)) {
		if (*p == '-') {
			p++;
			val = _cpu_freq_parse_cpu_num(&p);
			*core_range = p;
			if (val > *cpuidx)
				*end = val;
			else	/* empty or descending range: already done */
				*start = USHRT_MAX;
		} else {
			/* single value: move on to the next element */
			*start = USHRT_MAX;
		}
	}

	/* Inside a range: hand out its next member */
	if ((*start != USHRT_MAX) && (*end != USHRT_MAX)) {
		if (*cpuidx < *end) {
			val = *cpuidx + 1;
			if (val == *end) {
				*start = USHRT_MAX;
				*end = USHRT_MAX;
			}
			return val;
		}
		/* state no longer matches the range; start a new element */
		*start = USHRT_MAX;
	}

	/* Start of a new element */
	*end = USHRT_MAX;
	if (*p == ',')
		p++;
	if (!isdigit((unsigned char) *p))
		return USHRT_MAX;	/* end of list, or not a number */

	val = _cpu_freq_parse_cpu_num(&p);
	*core_range = p;
	*start = val;
	return val;
}
| |
| /* |
| * Find current governor on this cpu |
| * |
| * Return: SLURM_SUCCESS or SLURM_ERROR |
| */ |
| static int |
| _cpu_freq_get_cur_gov(int cpuidx) |
| { |
| FILE *fp = NULL; |
| char path[PATH_MAX], gov_value[LINE_LEN]; |
| int j; |
| |
| snprintf(path, sizeof(path), |
| PATH_TO_CPU "cpu%u/cpufreq/scaling_governor", cpuidx); |
| if ((fp = fopen(path, "r")) == NULL) { |
| error("%s: Could not open scaling_governor", __func__); |
| return SLURM_ERROR; |
| } |
| if (fgets(gov_value, LINE_LEN, fp) == NULL) { |
| error("%s: Could not read scaling_governor", __func__); |
| fclose(fp); |
| return SLURM_ERROR; |
| } |
| if (strlen(gov_value) >= GOV_NAME_LEN) { |
| error("%s: scaling_governor is to long", __func__); |
| fclose(fp); |
| return SLURM_ERROR; |
| } |
| strcpy(cpufreq[cpuidx].org_governor, gov_value); |
| fclose(fp); |
| j = strlen(cpufreq[cpuidx].org_governor); |
| if ((j > 0) && (cpufreq[cpuidx].org_governor[j - 1] == '\n')) |
| cpufreq[cpuidx].org_governor[j - 1] = '\0'; |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * set cpu governor |
| */ |
| static int |
| _cpu_freq_set_gov(stepd_step_rec_t *step, int cpuidx, char *gov) |
| { |
| char path[PATH_MAX]; |
| FILE *fp; |
| int fd, rc; |
| |
| rc = SLURM_SUCCESS; |
| snprintf(path, sizeof(path), PATH_TO_CPU |
| "cpu%u/cpufreq/scaling_governor", cpuidx); |
| fd = _set_cpu_owner_lock(cpuidx, step->step_id.job_id); |
| if ((fp = fopen(path, "w"))) { |
| fputs(gov, fp); |
| fputc('\n', fp); |
| fclose(fp); |
| } else { |
| error("%s: Can not set CPU governor: %m", __func__); |
| rc = SLURM_ERROR; |
| } |
| if (fd >= 0) { |
| (void) fd_release_lock(fd); |
| (void) close(fd); |
| } |
| return rc; |
| } |
| |
| /* |
| * get one of scalling_min_freq, scaling_max_freq, cpuinfo_cur_freq |
| * |
| * Return: value of scaling_min_freq, or 0 on error |
| */ |
| static uint32_t |
| _cpu_freq_get_scaling_freq(int cpuidx, char* option) |
| { |
| FILE *fp = NULL; |
| char path[PATH_MAX]; |
| uint32_t freq; |
| /* get the value from 'option' */ |
| snprintf(path, sizeof(path), PATH_TO_CPU |
| "cpu%u/cpufreq/%s", cpuidx, option); |
| if ( ( fp = fopen(path, "r") ) == NULL ) { |
| error("%s: Could not open %s", __func__, option); |
| return 0; |
| } |
| if (fscanf (fp, "%u", &freq) < 1) { |
| error("%s: Could not read %s", __func__, option); |
| fclose(fp); |
| return 0; |
| } |
| fclose(fp); |
| return freq; |
| } |
| |
| /* |
| * test for existence of cpufreq file |
| * |
| * Return: true if file found |
| */ |
| static bool |
| _cpu_freq_test_scaling_freq(int cpuidx, char *option) |
| { |
| char path[PATH_MAX]; |
| struct stat stat_buf; |
| |
| /* get the value from 'option' */ |
| snprintf(path, sizeof(path), PATH_TO_CPU |
| "cpu%u/cpufreq/%s", cpuidx, option); |
| if (stat(path, &stat_buf) == 0) |
| return true; |
| return false; |
| } |
| |
| /* |
| * set one of scalling_min_freq, scaling_max_freq, scaling_setspeed |
| * -- assume governor already set to userspace --- |
| * |
| */ |
| static int |
| _cpu_freq_set_scaling_freq(stepd_step_rec_t *step, int cpx, uint32_t freq, |
| char* option) |
| { |
| char path[PATH_MAX]; |
| FILE *fp; |
| int fd, rc; |
| uint32_t newfreq; |
| |
| rc = SLURM_SUCCESS; |
| snprintf(path, sizeof(path), PATH_TO_CPU |
| "cpu%u/cpufreq/%s", cpx, option); |
| fd = _set_cpu_owner_lock(cpx, step->step_id.job_id); |
| if ((fp = fopen(path, "w"))) { |
| fprintf(fp, "%u\n", freq); |
| fclose(fp); |
| } else { |
| error("%s: Can not set %s: %m", __func__, option); |
| rc = SLURM_ERROR; |
| } |
| if (fd >= 0) { |
| (void) fd_release_lock(fd); |
| (void) close(fd); |
| } |
| if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_FREQ) { |
| newfreq = _cpu_freq_get_scaling_freq(cpx, option); |
| if (newfreq != freq) { |
| error("Failed to set freq_scaling %s to %u (org=%u)", |
| option, freq, newfreq); |
| } |
| } |
| return rc; |
| |
| } |
| |
| /* |
| * Get current state |
| * |
| * IN: cpuidx - cpu to query |
| * Return: SLURM_SUCCESS or SLURM_ERROR |
| */ |
| static int |
| _cpu_freq_current_state(int cpuidx) |
| { |
| static int freq_file = -1; |
| uint32_t freq; |
| |
| if (cpufreq[cpuidx].org_set) { |
| /* |
| * The current state was already loaded for this cpu. |
| * Likely caused by stacked task plugins. Prevent |
| * overwriting the original values so they can be |
| * restored correctly after job completion. |
| */ |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Getting 'previous' values using the 'scaling' values rather |
| * than the 'cpuinfo' values. |
| * The 'cpuinfo' values are read only. min/max seem to be raw |
| * hardware capability. |
| * The 'scaling' values are set by the governor. |
| * For the current frequency, use the cpuinfo_cur_freq file |
| * since the intel_pstate driver doesn't necessarily create |
| * the scaling_cur_freq file. |
| */ |
| if (freq_file == -1) { |
| if (_cpu_freq_test_scaling_freq(cpuidx, "cpuinfo_cur_freq")) |
| freq_file = 0; |
| else /* Use "scaling_cur_freq" */ |
| freq_file = 1; |
| } |
| if (freq_file == 0) |
| freq = _cpu_freq_get_scaling_freq(cpuidx, "cpuinfo_cur_freq"); |
| else |
| freq = _cpu_freq_get_scaling_freq(cpuidx, "scaling_cur_freq"); |
| if (freq == 0) |
| return SLURM_ERROR; |
| cpufreq[cpuidx].org_frequency = freq; |
| freq = _cpu_freq_get_scaling_freq(cpuidx, "scaling_min_freq"); |
| if (freq == 0) |
| return SLURM_ERROR; |
| cpufreq[cpuidx].org_min_freq = freq; |
| freq = _cpu_freq_get_scaling_freq(cpuidx, "scaling_max_freq"); |
| if (freq == 0) |
| return SLURM_ERROR; |
| cpufreq[cpuidx].org_max_freq = freq; |
| |
| if (_cpu_freq_get_cur_gov(cpuidx) == SLURM_SUCCESS) { |
| cpufreq[cpuidx].org_set = true; |
| return SLURM_SUCCESS; |
| } else { |
| return SLURM_ERROR; |
| } |
| } |
| |
| |
| /* |
| * Copy string representation of a governor into cpufreq structure for a cpu. |
| */ |
| static int |
| _cpu_freq_govspec_string(uint32_t cpu_freq, int cpuidx) |
| { |
| |
| if ((cpu_freq & CPU_FREQ_RANGE_FLAG) == 0) |
| return SLURM_ERROR; |
| |
| switch(cpu_freq) |
| { |
| case CPU_FREQ_CONSERVATIVE: |
| if (cpufreq[cpuidx].avail_governors & GOV_CONSERVATIVE) |
| strcpy(cpufreq[cpuidx].new_governor, "conservative"); |
| return SLURM_SUCCESS; |
| case CPU_FREQ_ONDEMAND: |
| if (cpufreq[cpuidx].avail_governors & GOV_ONDEMAND) |
| strcpy(cpufreq[cpuidx].new_governor,"ondemand"); |
| return SLURM_SUCCESS; |
| case CPU_FREQ_PERFORMANCE: |
| if (cpufreq[cpuidx].avail_governors & GOV_PERFORMANCE) |
| strcpy(cpufreq[cpuidx].new_governor, "performance"); |
| return SLURM_SUCCESS; |
| case CPU_FREQ_POWERSAVE: |
| if (cpufreq[cpuidx].avail_governors & GOV_POWERSAVE) |
| strcpy(cpufreq[cpuidx].new_governor, "powersave"); |
| return SLURM_SUCCESS; |
| case CPU_FREQ_USERSPACE: |
| if (cpufreq[cpuidx].avail_governors & GOV_USERSPACE) |
| strcpy(cpufreq[cpuidx].new_governor, "userspace"); |
| return SLURM_SUCCESS; |
| case CPU_FREQ_SCHEDUTIL: |
| if (cpufreq[cpuidx].avail_governors & GOV_SCHEDUTIL) |
| strcpy(cpufreq[cpuidx].new_governor, "schedutil"); |
| return SLURM_SUCCESS; |
| default: |
| return SLURM_ERROR; |
| } |
| } |
| |
| /* |
| * Convert frequency_spec into an actual frequency |
| * Returns -- frequency from avail frequency list, or NO_VAL |
| */ |
| uint32_t |
| _cpu_freq_freqspec_num(uint32_t cpu_freq, int cpuidx) |
| { |
| int fx, j; |
| if (!cpufreq || !cpufreq[cpuidx].nfreq) |
| return NO_VAL; |
| /* assume the frequency list is in ascending order */ |
| if (cpu_freq & CPU_FREQ_RANGE_FLAG) { /* Named values */ |
| switch(cpu_freq) |
| { |
| case CPU_FREQ_LOW : |
| return cpufreq[cpuidx].avail_freq[0]; |
| |
| case CPU_FREQ_MEDIUM : |
| if (cpufreq[cpuidx].nfreq == 1) |
| return cpufreq[cpuidx].avail_freq[0]; |
| fx = (cpufreq[cpuidx].nfreq - 1) / 2; |
| return cpufreq[cpuidx].avail_freq[fx]; |
| |
| case CPU_FREQ_HIGHM1 : |
| if (cpufreq[cpuidx].nfreq == 1) |
| return cpufreq[cpuidx].avail_freq[0]; |
| fx = cpufreq[cpuidx].nfreq - 2; |
| return cpufreq[cpuidx].avail_freq[fx]; |
| |
| case CPU_FREQ_HIGH : |
| fx = cpufreq[cpuidx].nfreq - 1; |
| return cpufreq[cpuidx].avail_freq[fx]; |
| |
| default: |
| return NO_VAL; |
| } |
| } |
| |
| /* check for request above or below available values */ |
| if (cpu_freq < cpufreq[cpuidx].avail_freq[0]) { |
| error("Rounding requested frequency %d " |
| "up to lowest available %d", cpu_freq, |
| cpufreq[cpuidx].avail_freq[0]); |
| return cpufreq[cpuidx].avail_freq[0]; |
| } else if (cpufreq[cpuidx].avail_freq[cpufreq[cpuidx].nfreq - 1] |
| < cpu_freq) { |
| error("Rounding requested frequency %d " |
| "down to highest available %d", cpu_freq, |
| cpufreq[cpuidx].avail_freq[cpufreq[cpuidx].nfreq - 1]); |
| return cpufreq[cpuidx].avail_freq[cpufreq[cpuidx].nfreq - 1]; |
| } |
| |
| /* check for frequency, round up if no exact match */ |
| for (j = 0; j < cpufreq[cpuidx].nfreq; ) { |
| if (cpu_freq == cpufreq[cpuidx].avail_freq[j]) { |
| return cpufreq[cpuidx].avail_freq[j]; |
| } |
| j++; /* step up to next element to round up * |
| * safe to advance due to bounds checks above here */ |
| if (cpu_freq < cpufreq[cpuidx].avail_freq[j]) { |
| info("Rounding requested frequency %d " |
| "up to next available %d", cpu_freq, |
| cpufreq[cpuidx].avail_freq[j]); |
| return cpufreq[cpuidx].avail_freq[j]; |
| } |
| } |
| /* loop above must return due to previous bounds checks |
| * but return NO_VAL here anyways to silence compiler warnings */ |
| return NO_VAL; |
| } |
| |
| /* |
| * Initialize data structure |
| */ |
| static void |
| _cpu_freq_init_data(int cpx) |
| { |
| /* avail_governors -- set at initialization */ |
| cpufreq[cpx].org_governor[0] = '\0'; |
| cpufreq[cpx].new_governor[0] = '\0'; |
| cpufreq[cpx].org_frequency = NO_VAL; |
| cpufreq[cpx].new_frequency = NO_VAL; |
| cpufreq[cpx].org_min_freq = NO_VAL; |
| cpufreq[cpx].new_min_freq = NO_VAL; |
| cpufreq[cpx].org_max_freq = NO_VAL; |
| cpufreq[cpx].new_max_freq = NO_VAL; |
| cpufreq[cpx].org_set = false; |
| } |
| /* |
| * Set either current frequency (speed) |
| * Or min/max governor base on --cpu-freq parameter |
| */ |
| static void |
| _cpu_freq_setup_data(stepd_step_rec_t *step, int cpx) |
| { |
| uint32_t freq; |
| |
| /* If no --cpu-freq, use default governor from conf file. */ |
| if (step->cpu_freq_gov == NO_VAL) |
| step->cpu_freq_gov = slurm_conf.cpu_freq_def; |
| if (step->cpu_freq_gov == NO_VAL) |
| return; |
| |
| /* Get current state */ |
| if (_cpu_freq_current_state(cpx) == SLURM_ERROR) |
| return; |
| |
| if (step->cpu_freq_min == NO_VAL && |
| step->cpu_freq_max != NO_VAL && |
| step->cpu_freq_gov == NO_VAL) { |
| /* Pre version 15.08 behavior */ |
| freq = _cpu_freq_freqspec_num(step->cpu_freq_max, cpx); |
| cpufreq[cpx].new_frequency = freq; |
| goto newfreq; |
| } |
| if (step->cpu_freq_gov == CPU_FREQ_USERSPACE) { |
| _cpu_freq_govspec_string(step->cpu_freq_gov, cpx); |
| if (step->cpu_freq_max == NO_VAL) { |
| return; /* pre version 15.08 behavior. */ |
| } |
| /* Power capping */ |
| freq = _cpu_freq_freqspec_num(step->cpu_freq_max, cpx); |
| cpufreq[cpx].new_frequency = freq; |
| cpufreq[cpx].new_min_freq = freq; |
| cpufreq[cpx].new_max_freq = freq; |
| return; |
| } |
| if (step->cpu_freq_min != NO_VAL && step->cpu_freq_max != NO_VAL) { |
| freq = _cpu_freq_freqspec_num(step->cpu_freq_min, cpx); |
| cpufreq[cpx].new_min_freq = freq; |
| freq = _cpu_freq_freqspec_num(step->cpu_freq_max, cpx); |
| cpufreq[cpx].new_max_freq = freq; |
| } |
| |
| if (step->cpu_freq_gov != NO_VAL) { |
| _cpu_freq_govspec_string(step->cpu_freq_gov, cpx); |
| } |
| newfreq: |
| /* Make sure a 'new' frequency is within scaling min/max */ |
| if (cpufreq[cpx].new_frequency != NO_VAL) { |
| if (cpufreq[cpx].new_frequency < cpufreq[cpx].org_min_freq) { |
| cpufreq[cpx].new_min_freq = cpufreq[cpx].new_frequency; |
| } |
| if (cpufreq[cpx].new_frequency > cpufreq[cpx].org_max_freq) { |
| cpufreq[cpx].new_max_freq = cpufreq[cpx].new_frequency; |
| } |
| } |
| } |
| |
| /* |
| * check an argument against valid governors. |
| * |
| * Input: - arg - string value of governor |
| * - illegal - combination of enums for governors not allowed. |
| * Returns - enum of governor found |
| * - or 0 if not found |
| */ |
| static uint32_t |
| _cpu_freq_check_gov(const char* arg, uint32_t illegal) |
| { |
| uint32_t rc = 0; |
| if (xstrncasecmp(arg, "co", 2) == 0) { |
| rc = CPU_FREQ_CONSERVATIVE; |
| } else if (xstrncasecmp(arg, "perf", 4) == 0) { |
| rc = CPU_FREQ_PERFORMANCE; |
| } else if (xstrncasecmp(arg, "pow", 3) == 0) { |
| rc = CPU_FREQ_POWERSAVE; |
| } else if (xstrncasecmp(arg, "user", 4) == 0) { |
| rc = CPU_FREQ_USERSPACE; |
| } else if (xstrncasecmp(arg, "onde", 4) == 0) { |
| rc = CPU_FREQ_ONDEMAND; |
| } else if (xstrncasecmp(arg, "sche", 4) == 0) { |
| rc = CPU_FREQ_SCHEDUTIL; |
| } |
| rc &= (~illegal); |
| if (rc == 0) |
| return 0; |
| return (rc | CPU_FREQ_RANGE_FLAG); |
| } |
| |
| /* |
| * check an argument for a frequency or frequency synonym. |
| * |
| * Input: - arg - string value of frequency |
| * |
| * Returns - frequency |
| * - enum for synonym |
| * 0 on error. |
| */ |
| static uint32_t |
| _cpu_freq_check_freq(const char* arg) |
| { |
| char *end; |
| uint32_t frequency; |
| |
| if (xstrncasecmp(arg, "lo", 2) == 0) { |
| return CPU_FREQ_LOW; |
| } else if (xstrncasecmp(arg, "him1", 4) == 0 || |
| xstrncasecmp(arg, "highm1", 6) == 0) { |
| return CPU_FREQ_HIGHM1; |
| } else if (xstrncasecmp(arg, "hi", 2) == 0) { |
| return CPU_FREQ_HIGH; |
| } else if (xstrncasecmp(arg, "med", 3) == 0) { |
| return CPU_FREQ_MEDIUM; |
| } |
| frequency = strtoul(arg, &end, 10); |
| if ((*end != '\0') || |
| ((frequency == 0) && (errno == EINVAL))) { |
| error("unrecognized --cpu-freq argument \"%s\"", arg); |
| return 0; |
| } |
| return frequency; |
| } |
| |
| /* |
| * set cpu frequency if possible for each cpu of the job step |
| */ |
| extern void |
| cpu_freq_set(stepd_step_rec_t *step) |
| { |
| char freq_detail[100]; |
| uint32_t freq; |
| int i, rc; |
| |
| if ((!cpu_freq_count) || (!cpufreq)) |
| return; |
| |
| for (i = 0; i < cpu_freq_count; i++) { |
| if (cpufreq[i].new_frequency == NO_VAL |
| && cpufreq[i].new_min_freq == NO_VAL |
| && cpufreq[i].new_max_freq == NO_VAL |
| && cpufreq[i].new_governor[0] == '\0') |
| continue; /* Nothing to set on this CPU */ |
| |
| log_flag(CPU_FREQ, "cpu_freq: current_state cpu=%d org_min=%u org_freq=%u org_max=%u org_gpv=%s", |
| i, cpufreq[i].org_min_freq, cpufreq[i].org_frequency, |
| cpufreq[i].org_max_freq, cpufreq[i].org_governor); |
| |
| /* Max must be set before min, per |
| * www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt |
| */ |
| if (cpufreq[i].new_max_freq != NO_VAL ) { |
| freq = cpufreq[i].new_max_freq; |
| if (cpufreq[i].org_frequency > freq) { |
| /* The current frequency is > requested max, |
| * Set it so it is in range |
| * have to go to UserSpace to do it. */ |
| rc = _cpu_freq_set_gov(step, i, "userspace"); |
| if (rc == SLURM_ERROR) |
| return; |
| rc = _cpu_freq_set_scaling_freq(step, i, freq, |
| "scaling_setspeed"); |
| if (rc == SLURM_ERROR) |
| continue; |
| if (cpufreq[i].new_governor[0] == '\0') { |
| /* Not requesting new gov, so restore */ |
| rc = _cpu_freq_set_gov(step, i, |
| cpufreq[i].org_governor); |
| if (rc == SLURM_ERROR) |
| continue; |
| } |
| } |
| rc = _cpu_freq_set_scaling_freq(step, i, freq, |
| "scaling_max_freq"); |
| if (rc == SLURM_ERROR) |
| continue; |
| } |
| if (cpufreq[i].new_min_freq != NO_VAL) { |
| freq = cpufreq[i].new_min_freq; |
| if (cpufreq[i].org_frequency < freq) { |
| /* The current frequency is < requested min, |
| * Set it so it is in range |
| * have to go to UserSpace to do it. */ |
| rc = _cpu_freq_set_gov(step, i, "userspace"); |
| if (rc == SLURM_ERROR) |
| continue; |
| rc = _cpu_freq_set_scaling_freq(step, i, freq, |
| "scaling_setspeed"); |
| if (rc == SLURM_ERROR) |
| continue; |
| if (cpufreq[i].new_governor[0] == '\0') { |
| /* Not requesting new gov, so restore */ |
| rc= _cpu_freq_set_gov(step, i, |
| cpufreq[i].org_governor); |
| if (rc == SLURM_ERROR) |
| continue; |
| } |
| } |
| rc= _cpu_freq_set_scaling_freq(step, i, freq, |
| "scaling_min_freq"); |
| if (rc == SLURM_ERROR) |
| continue; |
| } |
| if (cpufreq[i].new_frequency != NO_VAL) { |
| if (xstrcmp(cpufreq[i].org_governor,"userspace")) { |
| rc = _cpu_freq_set_gov(step, i, "userspace"); |
| if (rc == SLURM_ERROR) |
| continue; |
| } |
| rc = _cpu_freq_set_scaling_freq(step, i, |
| cpufreq[i].new_frequency, |
| "scaling_setspeed"); |
| if (rc == SLURM_ERROR) |
| continue; |
| } |
| if (cpufreq[i].new_governor[0] != '\0') { |
| rc = _cpu_freq_set_gov(step, i, cpufreq[i].new_governor); |
| if (rc == SLURM_ERROR) |
| continue; |
| } |
| if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_FREQ) { |
| cpu_freq_debug(NULL, NULL, |
| freq_detail, sizeof(freq_detail), |
| NO_VAL, cpufreq[i].new_min_freq, |
| cpufreq[i].new_max_freq, |
| cpufreq[i].new_frequency); |
| if (cpufreq[i].new_governor[0] != '\0') { |
| info("cpu_freq: set cpu=%d %s Governor=%s", |
| i, freq_detail, cpufreq[i].new_governor); |
| } else { |
| info("cpu_freq: reset cpu=%d %s", i, |
| freq_detail); |
| } |
| } |
| } |
| } |
| |
| /* |
| * reset the cpus used by the process to their |
| * default frequency and governor type |
| */ |
| extern void |
| cpu_freq_reset(stepd_step_rec_t *step) |
| { |
| int i, rc, fd; |
| char freq_detail[100]; |
| |
| if ((!cpu_freq_count) || (!cpufreq)) |
| return; |
| |
| for (i = 0; i < cpu_freq_count; i++) { |
| if (cpufreq[i].new_frequency == NO_VAL |
| && cpufreq[i].new_min_freq == NO_VAL |
| && cpufreq[i].new_max_freq == NO_VAL |
| && cpufreq[i].new_governor[0] == '\0') |
| continue; /* Nothing to reset on this CPU */ |
| |
| fd = _test_cpu_owner_lock(i, step->step_id.job_id); |
| if (fd < 0) |
| continue; |
| |
| if (cpufreq[i].new_frequency != NO_VAL) { |
| rc = _cpu_freq_set_gov(step, i, "userspace"); |
| if (rc == SLURM_ERROR) |
| continue; |
| rc = _cpu_freq_set_scaling_freq(step, i, |
| cpufreq[i].org_frequency, |
| "scaling_setspeed"); |
| if (rc == SLURM_ERROR) |
| continue; |
| cpufreq[i].new_governor[0] = 'u'; /* force gov reset */ |
| } |
| /* Max must be set before min, per |
| * www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt |
| */ |
| if (cpufreq[i].new_max_freq != NO_VAL) { |
| rc = _cpu_freq_set_scaling_freq(step, i, |
| cpufreq[i].org_max_freq, |
| "scaling_max_freq"); |
| if (rc == SLURM_ERROR) |
| continue; |
| } |
| if (cpufreq[i].new_min_freq != NO_VAL) { |
| rc = _cpu_freq_set_scaling_freq(step, i, |
| cpufreq[i].org_min_freq, |
| "scaling_min_freq"); |
| if (rc == SLURM_ERROR) |
| continue; |
| } |
| if (cpufreq[i].new_governor[0] != '\0') { |
| rc = _cpu_freq_set_gov( |
| step, i, cpufreq[i].org_governor); |
| if (rc == SLURM_ERROR) |
| continue; |
| } |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_FREQ) { |
| cpu_freq_debug(NULL, NULL, |
| freq_detail, sizeof(freq_detail), |
| NO_VAL, cpufreq[i].org_min_freq, |
| cpufreq[i].org_max_freq, |
| cpufreq[i].org_frequency); |
| if (cpufreq[i].new_governor[0] != '\0') { |
| info("cpu_freq: reset cpu=%d %s Governor=%s", |
| i, freq_detail, cpufreq[i].org_governor); |
| } else { |
| info("cpu_freq: reset cpu=%d %s", i, |
| freq_detail); |
| } |
| } |
| } |
| } |
| |
| /* Convert a cpu_freq number to its equivalent string */ |
| extern void |
| cpu_freq_to_string(char *buf, int buf_size, uint32_t cpu_freq) |
| { |
| if (cpu_freq == CPU_FREQ_LOW) |
| snprintf(buf, buf_size, "Low"); |
| else if (cpu_freq == CPU_FREQ_MEDIUM) |
| snprintf(buf, buf_size, "Medium"); |
| else if (cpu_freq == CPU_FREQ_HIGHM1) |
| snprintf(buf, buf_size, "Highm1"); |
| else if (cpu_freq == CPU_FREQ_HIGH) |
| snprintf(buf, buf_size, "High"); |
| else if (cpu_freq == CPU_FREQ_CONSERVATIVE) |
| snprintf(buf, buf_size, "Conservative"); |
| else if (cpu_freq == CPU_FREQ_PERFORMANCE) |
| snprintf(buf, buf_size, "Performance"); |
| else if (cpu_freq == CPU_FREQ_POWERSAVE) |
| snprintf(buf, buf_size, "PowerSave"); |
| else if (cpu_freq == CPU_FREQ_USERSPACE) |
| snprintf(buf, buf_size, "UserSpace"); |
| else if (cpu_freq == CPU_FREQ_ONDEMAND) |
| snprintf(buf, buf_size, "OnDemand"); |
| else if (cpu_freq == CPU_FREQ_SCHEDUTIL) |
| snprintf(buf, buf_size, "SchedUtil"); |
| else if (cpu_freq & CPU_FREQ_RANGE_FLAG) |
| snprintf(buf, buf_size, "Unknown"); |
| else if (fuzzy_equal(cpu_freq, NO_VAL)) { |
| if (buf_size > 0) |
| buf[0] = '\0'; |
| } else |
| convert_num_unit2((double)cpu_freq, buf, buf_size, |
| UNIT_KILO, NO_VAL, 1000, 0); |
| } |
| |
| extern char *cpu_freq_to_cmdline(uint32_t min, uint32_t max, uint32_t gov) |
| { |
| char bfgov[32], bfmin[32], bfmax[32]; |
| char *bfall = NULL; |
| bfgov[0] = '\0'; |
| bfmin[0] = '\0'; |
| bfmax[0] = '\0'; |
| |
| /* |
| * Default value from command line is NO_VAL, |
| * Default value from slurmstepd for batch jobs is 0 |
| * Convert slurmstepd values to command line ones. |
| */ |
| if (min == 0) |
| min = NO_VAL; |
| if (max == 0) |
| max = NO_VAL; |
| if (gov == 0) |
| gov = NO_VAL; |
| |
| if ((min == NO_VAL) && (max == NO_VAL) && (gov == NO_VAL)) |
| return NULL; |
| |
| if (min != NO_VAL) { |
| if (min & CPU_FREQ_RANGE_FLAG) { |
| cpu_freq_to_string(bfmin, sizeof(bfmin), min); |
| } else { |
| snprintf(bfmin, 32, "%u", min); |
| } |
| } |
| if (max != NO_VAL) { |
| if (max & CPU_FREQ_RANGE_FLAG) { |
| cpu_freq_to_string(bfmax, sizeof(bfmax), max); |
| } else { |
| snprintf(bfmax, 32, "%u", max); |
| } |
| } |
| if (gov != NO_VAL) { |
| cpu_freq_to_string(bfgov, sizeof(bfgov), gov); |
| } |
| if ((min != NO_VAL) && (max != NO_VAL) && (gov != NO_VAL)) { |
| xstrfmtcat(bfall, "%s-%s:%s", bfmin, bfmax, bfgov); |
| } else if ((min != NO_VAL) && (max != NO_VAL)) { |
| xstrfmtcat(bfall, "%s-%s", bfmin, bfmax); |
| } else if (max != NO_VAL) { |
| xstrcat(bfall, bfmax); |
| } else if (gov != NO_VAL) { |
| xstrcat(bfall, bfgov); |
| } |
| |
| return bfall; |
| } |
| |
| /* |
| * Set environment variables associated with the frequency variables. |
| */ |
| extern int cpu_freq_set_env(char *var, uint32_t min, uint32_t max, |
| uint32_t gov) |
| { |
| char *bfall = cpu_freq_to_cmdline(min, max, gov); |
| if (bfall && setenvf(NULL, var, "%s", bfall)) { |
| xfree(bfall); |
| error("Unable to set %s", var); |
| return SLURM_ERROR; |
| } |
| xfree(bfall); |
| return SLURM_SUCCESS; |
| } |
| |
| /* Convert a composite cpu governor enum to its equivalent string |
| * |
| * Input: - buf - buffer to contain string |
| * - bufsz - size of buffer |
| * - gpvs - composite enum of governors |
| */ |
| extern void |
| cpu_freq_govlist_to_string(char* buf, uint16_t bufsz, uint32_t govs) |
| { |
| char *list = NULL; |
| char *sep = "", *pos = NULL; |
| |
| if ((govs & CPU_FREQ_CONSERVATIVE) == CPU_FREQ_CONSERVATIVE) { |
| xstrfmtcatat(list, &pos, "%s%s", sep, "Conservative"); |
| sep = ","; |
| } |
| if ((govs & CPU_FREQ_ONDEMAND) == CPU_FREQ_ONDEMAND) { |
| xstrfmtcatat(list, &pos, "%s%s", sep, "OnDemand"); |
| sep = ","; |
| } |
| if ((govs & CPU_FREQ_PERFORMANCE) == CPU_FREQ_PERFORMANCE) { |
| xstrfmtcatat(list, &pos, "%s%s", sep, "Performance"); |
| sep = ","; |
| } |
| if ((govs & CPU_FREQ_POWERSAVE) == CPU_FREQ_POWERSAVE) { |
| xstrfmtcatat(list, &pos, "%s%s", sep, "PowerSave"); |
| sep = ","; |
| } |
| if ((govs & CPU_FREQ_SCHEDUTIL) == CPU_FREQ_SCHEDUTIL) { |
| xstrfmtcatat(list, &pos, "%s%s", sep, "SchedUtil"); |
| sep = ","; |
| } |
| if ((govs & CPU_FREQ_USERSPACE) == CPU_FREQ_USERSPACE) { |
| xstrfmtcatat(list, &pos, "%s%s", sep, "UserSpace"); |
| sep = ","; |
| } |
| |
| if (list) { |
| strlcpy(buf, list, bufsz); |
| xfree(list); |
| } else { |
| strlcpy(buf, "No Governors defined", bufsz); |
| } |
| } |
| |
| /* |
| * Verify slurm.conf CpuFreqDef option |
| * |
| * Input: - arg - governor/frequency value to check: |
| * valid governor, low, medium, highm1, high, |
| * or numeric frequency |
| * - freq - pointer to corresponding enum or numeric value |
| * Returns - -1 on error, else 0 |
| */ |
| extern int |
| cpu_freq_verify_def(const char *arg, uint32_t *freq) |
| { |
| uint32_t cpufreq = 0; |
| |
| cpufreq = _cpu_freq_check_gov(arg, CPU_FREQ_USERSPACE); |
| if (cpufreq) { |
| debug3("cpu_freq_verify_def: %s set", arg); |
| *freq = cpufreq; |
| return 0; |
| } |
| error("%s: CpuFreqDef=%s invalid", __func__, arg); |
| return -1; |
| } |
| |
/*
 * Verify slurm.conf CpuFreqGovernors list
 *
 * The list is split on ',' and every element is looked up with
 * _cpu_freq_check_gov(). *govs is cleared first and then collects the enum
 * of each accepted governor; on a bad element it keeps the governors seen
 * before it.
 *
 * Input:  - arg  - string list of governors
 *         - govs - pointer to composite of enum for each governor in list
 * Returns - -1 on error, else 0
 */
extern int
cpu_freq_verify_govlist(const char *arg, uint32_t *govs)
{
	char *list, *gov, *savestr = NULL;
	uint32_t agov;
	int rc = 0;

	*govs = 0;
	if (arg == NULL) {
		error("cpu_freq_verify_govlist: governor list is empty");
		return -1;
	}

	/* strtok_r() modifies its input, so work on a private copy */
	list = xstrdup(arg);
	gov = strtok_r(list, ",", &savestr);
	if (gov == NULL) {
		error("cpu_freq_verify_govlist: governor list '%s' invalid",
		      arg);
		rc = -1;
		goto cleanup;
	}
	do {
		debug3("cpu_freq_verify_govlist: gov = %s", gov);
		agov = _cpu_freq_check_gov(gov, 0);
		if (agov == 0) {
			/* gov points into list: log before it is freed */
			error("cpu_freq_verify_govlist: governor '%s' invalid",
			      gov);
			rc = -1;
			goto cleanup;
		}
		*govs |= agov;
	} while ((gov = strtok_r(NULL, ",", &savestr)) != NULL);

cleanup:
	/* Single exit so the copy is released on error paths as well */
	xfree(list);
	return rc;
}
| |
| /* |
| * Verify cpu_freq command line option |
| * |
| * --cpu-freq=arg |
| * where arg is p1[-p2][:p3] |
| * |
| * - p1 can be [#### | low | medium | high | highm1] |
| * which will set the current frequency, and set the governor to |
| * UserSpace. |
| * - p1 can be [Conservative | OnDemand | Performance | PowerSave | UserSpace] |
| * which will set the governor to the corresponding value. |
| * - When p2 is present, p1 will be the minimum frequency and p2 will be |
| * the maximum. The governor cannot be UserSpace, so CpuFreqDef must be set in |
| * slurm.conf if there's no p3. |
| * - p2 can be [#### | medium | high | highm1] p2 must be greater than p1. |
| * - If the current frequency is < min, it will be set to min. |
| * Likewise, if the current frequency is > max, it will be set to max. |
| * - p3 can be [Conservative | OnDemand | Performance | PowerSave | UserSpace] |
| * which will set the governor to the corresponding value. |
| * When p3 is UserSpace, p2 must be empty. |
| * p2 will have been set by PowerCapping. |
| * |
| * returns -1 on error, 0 otherwise |
| */ |
| extern int |
| cpu_freq_verify_cmdline(const char *arg, |
| uint32_t *cpu_freq_min, |
| uint32_t *cpu_freq_max, |
| uint32_t *cpu_freq_gov) |
| { |
| char *poscolon, *posdash; |
| char *p1=NULL, *p2=NULL, *p3=NULL; |
| uint32_t frequency; |
| int rc = 0; |
| |
| if (arg == NULL || cpu_freq_min == NULL || cpu_freq_max == NULL |
| || cpu_freq_gov == NULL) { |
| return -1; |
| } |
| *cpu_freq_min = NO_VAL; |
| *cpu_freq_max = NO_VAL; |
| *cpu_freq_gov = NO_VAL; |
| poscolon = strchr(arg,':'); |
| if (poscolon) { |
| p3 = xstrdup((poscolon+1)); |
| } |
| posdash = strchr(arg,'-'); |
| if (posdash) { |
| p1 = xstrndup(arg, (posdash-arg)); |
| if (poscolon) { |
| p2 = xstrndup((posdash+1), ((poscolon-posdash)-1)); |
| } else { |
| p2 = xstrdup((posdash+1)); |
| } |
| } else { |
| if (poscolon) { |
| p1 = xstrndup(arg, (poscolon-arg)); |
| } else { |
| p1 = xstrdup(arg); |
| } |
| } |
| |
| frequency = _cpu_freq_check_gov(p1, 0); |
| if (frequency != 0) { |
| if (p3) { |
| error("governor cannot be specified twice " |
| "%s{-}:%s in --cpu-freq", p1, p3); |
| rc = -1; |
| goto clean; |
| } |
| *cpu_freq_gov = frequency; |
| } else { |
| frequency = _cpu_freq_check_freq(p1); |
| if (frequency == 0) { |
| rc = -1; |
| goto clean; |
| } |
| *cpu_freq_max = frequency; |
| } |
| if (p2) { |
| if (!p3 && (slurm_conf.cpu_freq_def == NO_VAL)) { |
| /* |
| * If the user specified a range without a governor, |
| * (even if userspace is not set), we won't accept the |
| * request. We don't know how the cpus are set and we |
| * won't decide which one to set for the user. Note that |
| * a range is valid for multiple governors. |
| */ |
| error("You must explicitly choose a governor when defining a range. Please specify only one value for the desired frequency (p1) or choose a specific governor (p3)."); |
| rc = -1; |
| goto clean; |
| } |
| frequency = _cpu_freq_check_freq(p2); |
| if (frequency == 0) { |
| rc = -1; |
| goto clean; |
| } |
| *cpu_freq_min = *cpu_freq_max; |
| *cpu_freq_max = frequency; |
| if (*cpu_freq_max < *cpu_freq_min) { |
| error("min cpu-frec (%s) must be < max cpu-freq (%s)", |
| p1, p2); |
| rc = -1; |
| goto clean; |
| } |
| } |
| if (p3) { |
| frequency = _cpu_freq_check_gov(p3, 0); |
| if (frequency == 0) { |
| error("illegal governor: %s on --cpu-freq", p3); |
| rc = -1; |
| goto clean; |
| } |
| if (!p2) { |
| if (frequency != CPU_FREQ_USERSPACE) { |
| error("gov on cpu-frec (%s) illegal without max", |
| p3); |
| rc = -1; |
| goto clean; |
| } |
| } else { |
| if (frequency == CPU_FREQ_USERSPACE) { |
| error("%s governor does not support a range. Please specify only one value for the desired frequency (p1) or choose a different governor.", |
| p3); |
| rc = -1; |
| goto clean; |
| } |
| } |
| *cpu_freq_gov = frequency; |
| } else if (p2 && (*cpu_freq_gov == NO_VAL) && |
| (slurm_conf.cpu_freq_def != NO_VAL)) { |
| /* |
| * No governor specified and a range is specified. |
| * Use slurm.conf CpuFreqDef if defined. Note that this cannot |
| * be UserSpace. |
| */ |
| *cpu_freq_gov = slurm_conf.cpu_freq_def; |
| } |
| |
| /* Also force this in case we specify just one frequency. */ |
| if ((*cpu_freq_gov == NO_VAL) && !p2 && !p3) |
| *cpu_freq_gov = CPU_FREQ_USERSPACE; |
| |
| clean: |
| if (*cpu_freq_gov != NO_VAL) { |
| if (((*cpu_freq_gov & slurm_conf.cpu_freq_govs) |
| & ~CPU_FREQ_RANGE_FLAG) == 0) { |
| error("governor of %s is not allowed in slurm.conf", |
| arg); |
| *cpu_freq_gov = NO_VAL; |
| rc = -1; |
| } |
| } |
| if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_FREQ) { |
| cpu_freq_debug("command", "NO_VAL", NULL, 0, |
| *cpu_freq_gov, *cpu_freq_min, |
| *cpu_freq_max, NO_VAL); |
| } |
| xfree(p1); |
| xfree(p2); |
| xfree(p3); |
| return rc; |
| |
| } |
| |
| /* |
| * Convert frequency parameters to strings |
| * Typically called to produce string for a log or reporting utility. |
| * |
| * When label!=NULL, info message is put to log. This is convenient for |
| * inserting debug calls to verify values in structures or messages. |
| * noval_str==NULL allows missing parameters not to be reported. |
| * freq_str is a buffer to hold the composite string for all input values. |
| * freq_len is length of freq_str |
| * gov is a governor value |
| * min is a minimum value |
| * max is a maximum value |
| * freq is a (current) frequency value. |
| * |
| * Returns 0 if all parameters are NO_VAL (or 0) |
| */ |
| extern int |
| cpu_freq_debug(char* label, char* noval_str, char* freq_str, int freq_len, |
| uint32_t gov, uint32_t min, uint32_t max, uint32_t freq) |
| { |
| int rc = 0; |
| char bfgov[64], bfmin[32], bfmax[32], bffreq[32]; |
| char *sep1 = " ", *sep2 = " ", *sep3 = " "; |
| |
| bfgov[0] = '\0'; |
| bfmin[0] = '\0'; |
| bfmax[0] = '\0'; |
| bffreq[0] = '\0'; |
| |
| if (freq != NO_VAL && freq != 0) { |
| rc = 1; |
| sprintf(bffreq, "cur_freq=%u", freq); |
| } else { |
| sep1 = ""; |
| } |
| if ((min != NO_VAL) && (min != 0)) { |
| rc = 1; |
| if (min & CPU_FREQ_RANGE_FLAG) { |
| strcpy(bfmin, "CPU_min_freq="); |
| cpu_freq_to_string(&bfmin[13], (sizeof(bfmin)-13), min); |
| } else { |
| sprintf(bfmin, "CPU_min_freq=%u", min); |
| } |
| } else if (noval_str) { |
| if (strlen(noval_str) >= sizeof(bfmin)) { |
| error("%s: minimum CPU frequency string too large", |
| __func__); |
| } else { |
| strlcpy(bfmin, noval_str, sizeof(bfmin)); |
| } |
| } else { |
| sep2 = ""; |
| } |
| if ((max != NO_VAL) && (max != 0)) { |
| rc = 1; |
| if (max & CPU_FREQ_RANGE_FLAG) { |
| strcpy(bfmax, "CPU_max_freq="); |
| cpu_freq_to_string(&bfmax[13], (sizeof(bfmax)-13), max); |
| } else { |
| sprintf(bfmax, "CPU_max_freq=%u", max); |
| } |
| } else if (noval_str) { |
| if (strlen(noval_str) >= sizeof(bfmax)) { |
| error("%s: maximum CPU frequency string too large", |
| __func__); |
| } else { |
| strlcpy(bfmax, noval_str, sizeof(bfmax)); |
| } |
| } else { |
| sep3 = ""; |
| } |
| if ((gov != NO_VAL) && (gov != 0)) { |
| rc = 1; |
| strcpy(bfgov, "Governor="); |
| cpu_freq_to_string(&bfgov[9], (sizeof(bfgov)-9), gov); |
| } else if (noval_str) { |
| if (strlen(noval_str) >= sizeof(bfgov)) { |
| error("%s: max CPU governor string too large", |
| __func__); |
| } else { |
| strlcpy(bfgov, noval_str, sizeof(bfgov)); |
| } |
| } |
| if (rc) { |
| if (freq_str) { |
| snprintf(freq_str, freq_len, "%s%s%s%s%s%s%s", |
| bffreq, sep1, bfmin, sep2, bfmax, sep3, bfgov); |
| } |
| } else { |
| if (freq_str) |
| freq_str[0] = '\0'; |
| } |
| if (label) { |
| info("cpu-freq: %s :: %s%s%s%s%s%s%s", label, |
| bffreq, sep1, bfmin, sep2, bfmax, sep3, bfgov); |
| } |
| return rc; |
| } |