blob: 891b0acacbbeaa60e4f80012513a41db24591e31 [file] [log] [blame]
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "bus-log-control-api.h"
#include "bus-util.h"
#include "bus-polkit.h"
#include "cgroup-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "memory-util.h"
#include "oomd-manager-bus.h"
#include "oomd-manager.h"
#include "path-util.h"
#include "percent-util.h"
/* One element of the "cgroups" array of a ManagedOOM varlink reply, as filled in by json_dispatch()
 * through the dispatch table in process_managed_oom_reply(). */
typedef struct ManagedOOMReply ManagedOOMReply;

struct ManagedOOMReply {
        ManagedOOMMode mode; /* parsed from the "mode" string by managed_oom_mode() */
        char *path;          /* "path" string; released by managed_oom_reply_destroy() */
        char *property;      /* "property" string, e.g. "ManagedOOMSwap" or "ManagedOOMMemoryPressure" */
        uint32_t limit;      /* optional "limit"; only consulted when > 0 */
};
/* Releases the strings held by a ManagedOOMReply. The struct itself is not freed: the function is used
 * as a _cleanup_() handler for a ManagedOOMReply that lives on the stack. */
static void managed_oom_reply_destroy(ManagedOOMReply *reply) {
        assert(reply);

        /* presumably heap copies made by json_dispatch_string() — free(NULL) is fine for unset fields */
        free(reply->path);
        free(reply->property);
}
/* JsonDispatch callback for the "mode" field: converts the JSON string `v` into a ManagedOOMMode and
 * stores it in the location `userdata` points to.
 *
 * Returns 0 on success, or the result of json_log() if the string is not a known mode. */
static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
        ManagedOOMMode *mode = userdata, m;
        const char *s;

        assert(mode);
        /* The dispatch table only routes JSON_VARIANT_STRING values here, so this cannot be NULL. */
        assert_se(s = json_variant_string(v));

        m = managed_oom_mode_from_string(s);
        if (m < 0)
                return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s);

        *mode = m;
        return 0;
}
/* Varlink reply callback for the ManagedOOM subscription.
 *
 * Walks the "cgroups" array in `parameters` and, for every well-formed element, adds the cgroup to (or,
 * for mode "auto", removes it from) the hashmap selected by the element's "property": the swap map for
 * "ManagedOOMSwap", the memory pressure map otherwise. Malformed elements are skipped.
 *
 * Returns 0 on success, -EIO if the peer reported an error, -EINVAL if "cgroups" is missing and
 * -ENOMEM on allocation failure. */
static int process_managed_oom_reply(
                Varlink *link,
                JsonVariant *parameters,
                const char *error_id,
                VarlinkReplyFlags flags,
                void *userdata) {

        static const JsonDispatch dispatch_table[] = {
                { "mode",     JSON_VARIANT_STRING,   managed_oom_mode,     offsetof(ManagedOOMReply, mode),     JSON_MANDATORY },
                { "path",     JSON_VARIANT_STRING,   json_dispatch_string, offsetof(ManagedOOMReply, path),     JSON_MANDATORY },
                { "property", JSON_VARIANT_STRING,   json_dispatch_string, offsetof(ManagedOOMReply, property), JSON_MANDATORY },
                { "limit",    JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMReply, limit),    0 },
                {},
        };

        JsonVariant *c, *cgroups;
        Manager *m = userdata;
        int r = 0;

        assert(m);

        if (error_id) {
                r = -EIO;
                log_debug("Error getting ManagedOOM cgroups: %s", error_id);
                goto finish;
        }

        cgroups = json_variant_by_key(parameters, "cgroups");
        if (!cgroups) {
                r = -EINVAL;
                goto finish;
        }

        /* Skip malformed elements and keep processing in case the others are good */
        JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
                _cleanup_(managed_oom_reply_destroy) ManagedOOMReply reply = {};
                OomdCGroupContext *ctx;
                Hashmap *monitor_hm;
                loadavg_t limit;
                int ret;

                if (!json_variant_is_object(c))
                        continue;

                ret = json_dispatch(c, dispatch_table, NULL, 0, &reply);
                if (ret == -ENOMEM) {
                        r = ret;
                        goto finish;
                }
                if (ret < 0)
                        continue;

                monitor_hm = streq(reply.property, "ManagedOOMSwap") ?
                                m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;

                /* "auto" means the unit no longer asks to be managed: drop it and move on. */
                if (reply.mode == MANAGED_OOM_AUTO) {
                        (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(reply.path)));
                        continue;
                }

                limit = m->default_mem_pressure_limit;

                /* A per-unit limit overrides the default, but only for memory pressure monitoring. */
                if (streq(reply.property, "ManagedOOMMemoryPressure") && reply.limit > 0) {
                        int permyriad = UINT32_SCALE_TO_PERMYRIAD(reply.limit);

                        ret = store_loadavg_fixed_point(
                                        (unsigned long) permyriad / 100,
                                        (unsigned long) permyriad % 100,
                                        &limit);
                        if (ret < 0)
                                continue;
                }

                ret = oomd_insert_cgroup_context(NULL, monitor_hm, reply.path);
                if (ret == -ENOMEM) {
                        r = ret;
                        goto finish;
                }
                if (ret < 0 && ret != -EEXIST)
                        log_debug_errno(ret, "Failed to insert reply, ignoring: %m");

                /* Always update the limit in case it was changed. For non-memory pressure detection the value is
                 * ignored so always updating it here is not a problem. */
                ctx = hashmap_get(monitor_hm, empty_to_root(reply.path));
                if (ctx)
                        ctx->mem_pressure_limit = limit;
        }

finish:
        /* The link is an observe-style subscription: only drop it once the peer signals that no further
         * replies will follow, so that the timer handlers reconnect. */
        if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
                m->varlink = varlink_close_unref(link);

        return r;
}
/* Fill `new_h` with `path`'s descendent OomdCGroupContexts. Only include descendent cgroups that are possible
 * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
 *
 * This function ignores most errors in order to handle cgroups that may have been cleaned up while populating
 * the hashmap.
 *
 * `new_h` is of the form { key: cgroup paths -> value: OomdCGroupContext }
 *
 * Returns 0 on success (including when individual cgroups were skipped), -ENOMEM on allocation failure,
 * or the error from enumerating/reading the first subgroup of `path`. */
static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
        _cleanup_free_ char *subpath = NULL;
        _cleanup_closedir_ DIR *d = NULL;
        int r;

        assert(new_h);
        assert(path);

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0)
                return r;

        r = cg_read_subgroup(d, &subpath);
        if (r < 0)
                return r;
        else if (r == 0) { /* No subgroups? We're a leaf node */
                r = oomd_insert_cgroup_context(NULL, new_h, path);
                if (r == -ENOMEM)
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path);
                return 0;
        }

        do {
                _cleanup_free_ char *cg_path = NULL;
                bool oom_group;

                cg_path = path_join(empty_to_root(path), subpath);
                if (!cg_path)
                        return -ENOMEM;

                /* Release the name now so the next cg_read_subgroup() can store a fresh one. */
                subpath = mfree(subpath);

                r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
                /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
                if (r == -ENOMEM)
                        return r;
                if (r < 0) {
                        log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path);
                        return 0;
                }

                /* A cgroup with memory.oom.group set is a candidate as a whole; anything else is descended
                 * into until leaves are reached. */
                if (oom_group)
                        r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
                else
                        r = recursively_get_cgroup_context(new_h, cg_path);
                if (r == -ENOMEM)
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path);
        } while ((r = cg_read_subgroup(d, &subpath)) > 0);

        return 0;
}
/* Rebuilds `*monitored_cgroups` into a fresh hashmap holding a new context for every path currently in
 * the map, then replaces the old map with it. The old map is handed to oomd_insert_cgroup_context()
 * (presumably so that state from the previous context is carried over — confirm in oomd-util).
 *
 * Returns 0 on success or -ENOMEM; on failure `*monitored_cgroups` is left untouched. */
static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
        _cleanup_hashmap_free_ Hashmap *new_base = NULL;
        OomdCGroupContext *ctx;
        int r;

        assert(monitored_cgroups);

        new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!new_base)
                return -ENOMEM;

        HASHMAP_FOREACH(ctx, *monitored_cgroups) {
                /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
                r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
                if (r == -ENOMEM)
                        return r;
                if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT))
                        log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path);
        }

        /* Release the superseded map before overwriting the only pointer to it. */
        hashmap_free(*monitored_cgroups);
        *monitored_cgroups = TAKE_PTR(new_base);

        return 0;
}
/* Builds a new hashmap of kill candidates: for every context in `monitored_cgroups`, the candidate
 * descendants found by recursively_get_cgroup_context() are collected.
 *
 * On success returns 0 and stores the new map in `*ret_candidates` (caller owns it). Returns -ENOMEM on
 * allocation failure; other per-cgroup errors are logged and ignored. */
static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
        _cleanup_hashmap_free_ Hashmap *candidates = NULL;
        OomdCGroupContext *ctx;
        int r;

        assert(ret_candidates);

        candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!candidates)
                return -ENOMEM;

        HASHMAP_FOREACH(ctx, monitored_cgroups) {
                r = recursively_get_cgroup_context(candidates, ctx->path);
                if (r == -ENOMEM)
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path);
        }

        *ret_candidates = TAKE_PTR(candidates);

        return 0;
}
/* Replaces `*candidates` with a freshly collected candidate map for `monitored_cgroups`. Before the
 * swap, oomd_update_cgroup_contexts_between_hashmaps() is called with the old and the new map.
 *
 * Returns 0 on success; on failure the error is logged at debug level, returned, and `*candidates` is
 * left untouched. */
static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) {
        _cleanup_hashmap_free_ Hashmap *new_candidates = NULL;
        int r;

        assert(candidates);

        r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates);
        if (r < 0)
                return log_debug_errno(r, "Failed to get candidate contexts: %m");

        oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates);

        /* Release the superseded map before overwriting the only pointer to it. */
        hashmap_free(*candidates);
        *candidates = TAKE_PTR(new_candidates);

        return 0;
}
/* Connects to the ManagedOOM varlink socket, attaches the connection to the manager's event loop and
 * subscribes to cgroup updates; replies are delivered to process_managed_oom_reply().
 *
 * On success returns 0 and stores the connection in m->varlink. On failure the error is logged and
 * returned, and the partially set up connection is closed by the cleanup handler. */
static int acquire_managed_oom_connect(Manager *m) {
        _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
        int r;

        assert(m);
        assert(m->event);

        r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM);
        if (r < 0)
                return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM);

        (void) varlink_set_userdata(link, m);
        (void) varlink_set_description(link, "oomd");
        /* The subscription stays open indefinitely, so it must never time out. */
        (void) varlink_set_relative_timeout(link, USEC_INFINITY);

        r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
        if (r < 0)
                return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");

        r = varlink_bind_reply(link, process_managed_oom_reply);
        if (r < 0)
                return log_error_errno(r, "Failed to bind reply callback: %m");

        r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to observe varlink call: %m");

        m->varlink = TAKE_PTR(link);
        return 0;
}
/* Timer callback for swap monitoring. Re-arms itself every SWAP_INTERVAL_USEC, re-establishes the
 * varlink subscription if it was dropped, refreshes the system context from /proc/meminfo and, when
 * both free memory and free swap are below the configured limit, asks oomd_kill_by_swap_usage() to act
 * on a candidate below the monitored cgroups.
 *
 * Returns 0 normally and a negative errno on failures that should stop the event source. */
static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
        Manager *m = userdata;
        usec_t usec_now;
        int r;

        assert(s);
        assert(userdata);

        /* Reset timer */
        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
        if (r < 0)
                return log_error_errno(r, "Failed to reset event timer: %m");

        r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
        if (r < 0)
                return log_error_errno(r, "Failed to set relative time for timer: %m");

        /* Reconnect if our connection dropped */
        if (!m->varlink) {
                r = acquire_managed_oom_connect(m);
                if (r < 0)
                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
        }

        /* We still try to acquire system information for oomctl even if no units want swap monitoring */
        r = oomd_system_context_acquire("/proc/meminfo", &m->system_context);
        /* If there are no units depending on swap actions, the only error we exit on is ENOMEM. */
        if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
                return log_error_errno(r, "Failed to acquire system context: %m");

        /* Return early if nothing is requesting swap monitoring */
        if (hashmap_isempty(m->monitored_swap_cgroup_contexts))
                return 0;

        /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
         * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
         * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent
         * nodes are the ones that matter). */

        /* Check amount of memory free and swap free so we don't free up swap when memory is still available. */
        if (oomd_mem_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) &&
            oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
                _cleanup_hashmap_free_ Hashmap *candidates = NULL;
                _cleanup_free_ char *selected = NULL;
                uint64_t threshold;

                log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and "
                          "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
                          m->system_context.mem_used, m->system_context.mem_total,
                          m->system_context.swap_used, m->system_context.swap_total,
                          PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));

                r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
                if (r == -ENOMEM)
                        return log_oom();
                if (r < 0)
                        log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");

                threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
                r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
                if (r == -ENOMEM)
                        return log_oom();
                if (r < 0)
                        log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
                else {
                        if (selected && r > 0)
                                log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and "
                                           "swap used (%"PRIu64") / total (%"PRIu64") being more than "
                                           PERMYRIAD_AS_PERCENT_FORMAT_STR,
                                           selected,
                                           m->system_context.mem_used, m->system_context.mem_total,
                                           m->system_context.swap_used, m->system_context.swap_total,
                                           PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
                        return 0;
                }
        }

        return 0;
}
/* _cleanup_() helper: if `*m` is still set when the scope is left, empty the manager's memory pressure
 * candidate hashmap so that stale candidate data is not reused. Callers disarm it by setting the
 * guarded pointer to NULL. */
static void clear_candidate_hashmapp(Manager **m) {
        if (*m)
                hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates);
}
/* Timer callback for memory pressure monitoring. Re-arms itself every MEM_PRESSURE_INTERVAL_USEC,
 * re-establishes the varlink subscription if needed, refreshes the monitored contexts and then either
 *   - acts on the first over-limit cgroup that saw recent reclaim activity (outside the post-action
 *     delay), or
 *   - refreshes the candidate data if any monitored cgroup has started exceeding its limit.
 *
 * Returns 0 normally and a negative errno on failures that should stop the event source. */
static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
        /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we
         * update the candidate data (in which case clear_candidates will be NULL). */
        _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata;
        _cleanup_set_free_ Set *targets = NULL;
        bool in_post_action_delay = false;
        Manager *m = userdata;
        usec_t usec_now;
        int r;

        assert(s);
        assert(userdata);

        /* Reset timer */
        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
        if (r < 0)
                return log_error_errno(r, "Failed to reset event timer: %m");

        r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
        if (r < 0)
                return log_error_errno(r, "Failed to set relative time for timer: %m");

        /* Reconnect if our connection dropped */
        if (!m->varlink) {
                r = acquire_managed_oom_connect(m);
                if (r < 0)
                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
        }

        /* Return early if nothing is requesting memory pressure monitoring */
        if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts))
                return 0;

        /* Update the cgroups used for detection/action */
        r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0)
                log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");

        /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
         * values and go on a kill storm. */
        if (m->mem_pressure_post_action_delay_start > 0) {
                if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
                        in_post_action_delay = true;
                else
                        m->mem_pressure_post_action_delay_start = 0;
        }

        r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0)
                log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
        else if (r == 1 && !in_post_action_delay) {
                OomdCGroupContext *t;

                SET_FOREACH(t, targets) {
                        _cleanup_free_ char *selected = NULL;
                        char ts[FORMAT_TIMESPAN_MAX];

                        /* Check if there was reclaim activity in the given interval. The concern is the following case:
                         * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
                         * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
                         * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
                         * to kill something (it won't help anyways). */
                        if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
                                continue;

                        log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
                                  t->path,
                                  LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
                                  LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
                                  format_timespan(ts, sizeof ts,
                                                  m->default_mem_pressure_duration_usec,
                                                  USEC_PER_SEC));

                        r = update_monitored_cgroup_contexts_candidates(
                                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
                        if (r == -ENOMEM)
                                return log_oom();
                        if (r < 0)
                                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
                        else
                                /* Candidates are fresh now: keep them past the end of this call. */
                                clear_candidates = NULL;

                        r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
                        if (r == -ENOMEM)
                                return log_oom();
                        if (r < 0)
                                log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
                        else {
                                /* Don't act on all the high pressure cgroups at once; return as soon as we kill one.
                                 * If r == 0 then it means there were not eligible candidates, the candidate cgroup
                                 * disappeared, or the candidate cgroup has no processes by the time we tried to kill
                                 * it. In either case, go through the event loop again and select a new candidate if
                                 * pressure is still high. */
                                m->mem_pressure_post_action_delay_start = usec_now;
                                if (selected && r > 0)
                                        log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
                                                   " for > %s with reclaim activity",
                                                   selected, t->path,
                                                   LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
                                                   LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
                                                   format_timespan(ts, sizeof ts,
                                                                   m->default_mem_pressure_duration_usec,
                                                                   USEC_PER_SEC));
                                return 0;
                        }
                }
        } else {
                /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every
                 * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill
                 * might happen.
                 * Candidate cgroup data will continue to get updated during the post-action delay period in case
                 * pressure continues to be high after a kill. */
                OomdCGroupContext *c;

                HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) {
                        if (c->mem_pressure_limit_hit_start == 0)
                                continue;

                        r = update_monitored_cgroup_contexts_candidates(
                                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
                        if (r == -ENOMEM)
                                return log_oom();
                        if (r < 0)
                                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
                        else {
                                /* One refresh covers every monitored cgroup, so stop after the first success. */
                                clear_candidates = NULL;
                                break;
                        }
                }
        }

        return 0;
}
/* Registers the swap monitoring timer on the manager's event loop. The timer is created with an
 * expiry of 0 on CLOCK_MONOTONIC and is re-armed by monitor_swap_contexts_handler() itself.
 * A failing handler makes the event loop exit.
 *
 * Returns 0 on success and stores the source in m->swap_context_event_source; negative errno otherwise. */
static int monitor_swap_contexts(Manager *m) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
        int r;

        assert(m);
        assert(m->event);

        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
        if (r < 0)
                return r;

        r = sd_event_source_set_exit_on_failure(s, true);
        if (r < 0)
                return r;

        /* SD_EVENT_ON (rather than one-shot) keeps the source enabled after each dispatch. */
        r = sd_event_source_set_enabled(s, SD_EVENT_ON);
        if (r < 0)
                return r;

        (void) sd_event_source_set_description(s, "oomd-swap-timer");

        m->swap_context_event_source = TAKE_PTR(s);
        return 0;
}
/* Registers the memory pressure monitoring timer on the manager's event loop. The timer is created
 * with an expiry of 0 on CLOCK_MONOTONIC and is re-armed by
 * monitor_memory_pressure_contexts_handler() itself. A failing handler makes the event loop exit.
 *
 * Returns 0 on success and stores the source in m->mem_pressure_context_event_source; negative errno
 * otherwise. */
static int monitor_memory_pressure_contexts(Manager *m) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
        int r;

        assert(m);
        assert(m->event);

        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
        if (r < 0)
                return r;

        r = sd_event_source_set_exit_on_failure(s, true);
        if (r < 0)
                return r;

        /* SD_EVENT_ON (rather than one-shot) keeps the source enabled after each dispatch. */
        r = sd_event_source_set_enabled(s, SD_EVENT_ON);
        if (r < 0)
                return r;

        (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");

        m->mem_pressure_context_event_source = TAKE_PTR(s);
        return 0;
}
/* Tears down a Manager: closes the varlink subscription, drops the timer sources, the event loop and
 * the bus connection, frees the three cgroup context hashmaps and finally the struct itself.
 * Accepts NULL. Always returns NULL so callers can write `m = manager_free(m)`. */
Manager* manager_free(Manager *m) {
        if (!m)
                return NULL;

        varlink_close_unref(m->varlink);
        sd_event_source_unref(m->swap_context_event_source);
        sd_event_source_unref(m->mem_pressure_context_event_source);
        sd_event_unref(m->event);

        /* NOTE(review): bus-polkit.h is included by this file; if Manager also carries a polkit
         * registry it has to be released here as well — confirm against oomd-manager.h. */
        sd_bus_flush_close_unref(m->bus);

        hashmap_free(m->monitored_swap_cgroup_contexts);
        hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
        hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates);

        return mfree(m);
}
/* Allocates a Manager, acquires the default event loop (with watchdog support enabled and SIGINT and
 * SIGTERM handled by the loop's default signal handling) and creates the three, initially empty,
 * cgroup context hashmaps.
 *
 * On success returns 0 and stores the new object in `*ret`. On failure returns a negative errno; the
 * partially constructed object is released through manager_freep. */
int manager_new(Manager **ret) {
        _cleanup_(manager_freep) Manager *m = NULL;
        int r;

        assert(ret);

        m = new0(Manager, 1);
        if (!m)
                return -ENOMEM;

        r = sd_event_default(&m->event);
        if (r < 0)
                return r;

        (void) sd_event_set_watchdog(m->event, true);

        r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
        if (r < 0)
                return r;

        r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
        if (r < 0)
                return r;

        m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!m->monitored_swap_cgroup_contexts)
                return -ENOMEM;

        m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!m->monitored_mem_pressure_cgroup_contexts)
                return -ENOMEM;

        m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!m->monitored_mem_pressure_cgroup_contexts_candidates)
                return -ENOMEM;

        *ret = TAKE_PTR(m);
        return 0;
}
/* Opens the system bus connection, registers the manager object and the log control API on it,
 * asynchronously requests the name "org.freedesktop.oom1" and attaches the bus to the event loop.
 *
 * Returns 0 on success, negative errno otherwise. */
static int manager_connect_bus(Manager *m) {
        int r;

        assert(m);
        assert(!m->bus);

        r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
        if (r < 0)
                return log_error_errno(r, "Failed to connect to bus: %m");

        r = bus_add_implementation(m->bus, &manager_object, m);
        if (r < 0)
                return r;

        r = bus_log_control_api_register(m->bus);
        if (r < 0)
                return r;

        r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to request name: %m");

        r = sd_bus_attach_event(m->bus, m->event, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to attach bus to event loop: %m");

        return 0;
}
/* Applies the runtime configuration to `m` and starts monitoring: connects to the bus, subscribes to
 * ManagedOOM updates over varlink and registers both timers.
 *
 *   dry_run                       stored in m->dry_run and passed on to the kill helpers
 *   swap_used_limit_permyriad     0..10000, or negative to use DEFAULT_SWAP_USED_LIMIT_PERCENT
 *   mem_pressure_limit_permyriad  0..10000, or negative to use DEFAULT_MEM_PRESSURE_LIMIT_PERCENT
 *   mem_pressure_usec             0 to use DEFAULT_MEM_PRESSURE_DURATION_USEC
 *
 * Returns 0 on success, negative errno otherwise. */
int manager_start(
                Manager *m,
                bool dry_run,
                int swap_used_limit_permyriad,
                int mem_pressure_limit_permyriad,
                usec_t mem_pressure_usec) {

        unsigned long l, f;
        int r;

        assert(m);

        m->dry_run = dry_run;

        m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100;
        assert(m->swap_used_limit_permyriad <= 10000);

        /* Split the limit into integer and fractional percent for the fixed-point load average format. */
        if (mem_pressure_limit_permyriad >= 0) {
                assert(mem_pressure_limit_permyriad <= 10000);

                l = mem_pressure_limit_permyriad / 100;
                f = mem_pressure_limit_permyriad % 100;
        } else {
                l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT;
                f = 0;
        }
        r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit);
        if (r < 0)
                return r;

        m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC;

        r = manager_connect_bus(m);
        if (r < 0)
                return r;

        r = acquire_managed_oom_connect(m);
        if (r < 0)
                return r;

        r = monitor_memory_pressure_contexts(m);
        if (r < 0)
                return r;

        r = monitor_swap_contexts(m);
        if (r < 0)
                return r;

        return 0;
}
/* Renders the manager's configuration, the system context and all monitored cgroup contexts into a
 * newly allocated, human readable string.
 *
 * On success returns 0 and stores the string in `*ret` (caller frees). Returns -errno if the memory
 * stream cannot be opened, or the error from flushing it. */
int manager_get_dump_string(Manager *m, char **ret) {
        _cleanup_free_ char *dump = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        char buf[FORMAT_TIMESPAN_MAX];
        OomdCGroupContext *c;
        size_t size;
        char *key;
        int r;

        assert(m);
        assert(ret);

        f = open_memstream_unlocked(&dump, &size);
        if (!f)
                return -errno;

        fprintf(f,
                "Dry Run: %s\n"
                "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
                "Default Memory Pressure Limit: %lu.%02lu%%\n"
                "Default Memory Pressure Duration: %s\n"
                "System Context:\n",
                yes_no(m->dry_run),
                PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad),
                LOAD_INT(m->default_mem_pressure_limit), LOAD_FRAC(m->default_mem_pressure_limit),
                format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC));
        oomd_dump_system_context(&m->system_context, f, "\t");

        fprintf(f, "Swap Monitored CGroups:\n");
        HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts)
                oomd_dump_swap_cgroup_context(c, f, "\t");

        fprintf(f, "Memory Pressure Monitored CGroups:\n");
        HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts)
                oomd_dump_memory_pressure_cgroup_context(c, f, "\t");

        r = fflush_and_check(f);
        if (r < 0)
                return r;

        /* Close the stream first so that `dump` is finalized before ownership is handed out. */
        f = safe_fclose(f);

        *ret = TAKE_PTR(dump);
        return 0;
}