mirror of
https://github.com/morgan9e/systemd
synced 2026-04-15 17:06:39 +09:00
core: rework how we track cgroup realized state (#37733)
This commit is contained in:
@@ -687,13 +687,10 @@ int bpf_firewall_install(Unit *u) {
|
||||
cc = unit_get_cgroup_context(u);
|
||||
if (!cc)
|
||||
return -EINVAL;
|
||||
|
||||
crt = unit_get_cgroup_runtime(u);
|
||||
if (!crt)
|
||||
return -EINVAL;
|
||||
if (!crt->cgroup_path)
|
||||
return -EINVAL;
|
||||
if (!crt->cgroup_realized)
|
||||
return -EINVAL;
|
||||
if (!crt || !crt->cgroup_path)
|
||||
return -EOWNERDEAD;
|
||||
|
||||
if (bpf_program_supported() <= 0)
|
||||
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
|
||||
|
||||
@@ -1909,7 +1909,7 @@ void unit_invalidate_cgroup_members_masks(Unit *u) {
|
||||
unit_invalidate_cgroup_members_masks(slice);
|
||||
}
|
||||
|
||||
int unit_default_cgroup_path(const Unit *u, char **ret) {
|
||||
static int unit_default_cgroup_path(const Unit *u, char **ret) {
|
||||
_cleanup_free_ char *p = NULL;
|
||||
int r;
|
||||
|
||||
@@ -1942,7 +1942,7 @@ int unit_default_cgroup_path(const Unit *u, char **ret) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int unit_set_cgroup_path(Unit *u, const char *path) {
|
||||
static int unit_set_cgroup_path(Unit *u, const char *path) {
|
||||
_cleanup_free_ char *p = NULL;
|
||||
CGroupRuntime *crt;
|
||||
int r;
|
||||
@@ -1950,7 +1950,6 @@ int unit_set_cgroup_path(Unit *u, const char *path) {
|
||||
assert(u);
|
||||
|
||||
crt = unit_get_cgroup_runtime(u);
|
||||
|
||||
if (crt && streq_ptr(crt->cgroup_path, path))
|
||||
return 0;
|
||||
|
||||
@@ -1976,7 +1975,18 @@ int unit_set_cgroup_path(Unit *u, const char *path) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int unit_watch_cgroup(Unit *u) {
|
||||
/* Stores a copy of the unit's cgroup path in *ret (the caller owns and frees it).
 *
 * If the unit has a cgroup runtime with cgroup_path set, that path is duplicated via
 * strdup_to_full(). Otherwise the call is forwarded to unit_default_cgroup_path() and its
 * result is returned as-is.
 *
 * Callers distinguish the two cases by the return value: > 0 means the unit's existing
 * cgroup_path was used, 0 is treated as "fell back to the default path", and < 0 is an error. */
int unit_get_cgroup_path_with_fallback(const Unit *u, char **ret) {
|
||||
assert(u);
|
||||
assert(ret);
|
||||
|
||||
const CGroupRuntime *crt = unit_get_cgroup_runtime(u);
|
||||
/* No runtime object, or no path recorded in it: hand back the default path instead. */
if (!crt || !crt->cgroup_path)
|
||||
return unit_default_cgroup_path(u, ret);
|
||||
|
||||
return strdup_to_full(ret, crt->cgroup_path); /* returns 1 -> cgroup_path is alive */
|
||||
}
|
||||
|
||||
static int unit_watch_cgroup(Unit *u) {
|
||||
_cleanup_free_ char *events = NULL;
|
||||
int r;
|
||||
|
||||
@@ -2021,7 +2031,7 @@ int unit_watch_cgroup(Unit *u) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int unit_watch_cgroup_memory(Unit *u) {
|
||||
static int unit_watch_cgroup_memory(Unit *u) {
|
||||
_cleanup_free_ char *events = NULL;
|
||||
int r;
|
||||
|
||||
@@ -2077,42 +2087,14 @@ int unit_watch_cgroup_memory(Unit *u) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Makes sure the unit's cgroup runtime has a cgroup path assigned, using the default path if
 * none is set yet.
 *
 * Returns 0 if a path was already set or the default one was assigned, -EINVAL if the unit has
 * no cgroup context, -ENOMEM if the cgroup runtime could not be set up, or another negative
 * errno (already logged) if generating or setting the path failed. */
int unit_pick_cgroup_path(Unit *u) {
|
||||
_cleanup_free_ char *path = NULL;
|
||||
int r;
|
||||
|
||||
assert(u);
|
||||
|
||||
if (!UNIT_HAS_CGROUP_CONTEXT(u))
|
||||
return -EINVAL;
|
||||
|
||||
CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
|
||||
if (!crt)
|
||||
return -ENOMEM;
|
||||
/* A path has been picked before, nothing to do. */
if (crt->cgroup_path)
|
||||
return 0;
|
||||
|
||||
r = unit_default_cgroup_path(u, &path);
|
||||
if (r < 0)
|
||||
return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m");
|
||||
|
||||
r = unit_set_cgroup_path(u, path);
|
||||
/* -EEXIST gets its own message; every other failure is reported generically below. */
if (r == -EEXIST)
|
||||
return log_unit_error_errno(u, r, "Control group %s exists already.", empty_to_root(path));
|
||||
if (r < 0)
|
||||
return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", empty_to_root(path));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int unit_update_cgroup(
|
||||
Unit *u,
|
||||
CGroupMask target_mask,
|
||||
CGroupMask enable_mask,
|
||||
ManagerState state) {
|
||||
|
||||
bool created;
|
||||
_cleanup_free_ char *cgroup_full_path = NULL;
|
||||
_cleanup_free_ char *cgroup = NULL, *cgroup_full_path = NULL;
|
||||
bool set_path, created;
|
||||
int r;
|
||||
|
||||
assert(u);
|
||||
@@ -2123,19 +2105,28 @@ static int unit_update_cgroup(
|
||||
if (u->freezer_state != FREEZER_RUNNING)
|
||||
return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY), "Cannot realize cgroup for frozen unit.");
|
||||
|
||||
/* Figure out our cgroup path */
|
||||
r = unit_pick_cgroup_path(u);
|
||||
r = unit_get_cgroup_path_with_fallback(u, &cgroup);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
|
||||
return log_unit_error_errno(u, r, "Failed to get cgroup path: %m");
|
||||
set_path = r == 0;
|
||||
|
||||
/* First, create our own group */
|
||||
r = cg_create(crt->cgroup_path);
|
||||
r = cg_create(cgroup);
|
||||
if (r < 0)
|
||||
return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(crt->cgroup_path));
|
||||
return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(cgroup));
|
||||
created = r;
|
||||
|
||||
if (set_path) {
|
||||
r = unit_set_cgroup_path(u, cgroup);
|
||||
if (r == -EEXIST)
|
||||
return log_unit_error_errno(u, r, "Picked control group '%s' as default, but it's in use already.", empty_to_root(cgroup));
|
||||
if (r < 0)
|
||||
return log_unit_error_errno(u, r, "Failed to set unit's control group path to '%s': %m", empty_to_root(cgroup));
|
||||
assert(r > 0);
|
||||
}
|
||||
|
||||
CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
|
||||
|
||||
uint64_t cgroup_id = 0;
|
||||
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_full_path);
|
||||
if (r == 0) {
|
||||
@@ -2153,7 +2144,7 @@ static int unit_update_cgroup(
|
||||
(void) unit_watch_cgroup_memory(u);
|
||||
|
||||
/* For v2 we preserve enabled controllers in delegated units, adjust others, */
|
||||
if (created || !crt->cgroup_realized || !unit_cgroup_delegate(u)) {
|
||||
if (created || !unit_cgroup_delegate(u)) {
|
||||
CGroupMask result_mask = 0;
|
||||
|
||||
/* Enable all controllers we need */
|
||||
@@ -2166,7 +2157,6 @@ static int unit_update_cgroup(
|
||||
}
|
||||
|
||||
/* Keep track that this is now realized */
|
||||
crt->cgroup_realized = true;
|
||||
crt->cgroup_realized_mask = target_mask;
|
||||
|
||||
/* Set attributes */
|
||||
@@ -2317,10 +2307,6 @@ int unit_remove_subcgroup(Unit *u, const char *suffix_path) {
|
||||
if (!unit_cgroup_delegate(u))
|
||||
return -ENOMEDIUM;
|
||||
|
||||
r = unit_pick_cgroup_path(u);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
CGroupRuntime *crt = unit_get_cgroup_runtime(u);
|
||||
if (!crt || !crt->cgroup_path)
|
||||
return -EOWNERDEAD;
|
||||
@@ -2375,7 +2361,7 @@ static bool unit_has_mask_realized(
|
||||
* enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
|
||||
* simply don't matter. */
|
||||
|
||||
return crt->cgroup_realized &&
|
||||
return crt->cgroup_path &&
|
||||
((crt->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
|
||||
((crt->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
|
||||
crt->cgroup_invalidated_mask == 0;
|
||||
@@ -2397,7 +2383,7 @@ static bool unit_has_mask_disables_realized(
|
||||
* Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
|
||||
* already removed. */
|
||||
|
||||
return !crt->cgroup_realized ||
|
||||
return !crt->cgroup_path ||
|
||||
(FLAGS_SET(crt->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
|
||||
FLAGS_SET(crt->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
|
||||
}
|
||||
@@ -2418,7 +2404,7 @@ static bool unit_has_mask_enables_realized(
|
||||
* Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
|
||||
* we want to add is already added. */
|
||||
|
||||
return crt->cgroup_realized &&
|
||||
return crt->cgroup_path &&
|
||||
((crt->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (crt->cgroup_realized_mask & CGROUP_MASK_V1) &&
|
||||
((crt->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (crt->cgroup_enabled_mask & CGROUP_MASK_V2);
|
||||
}
|
||||
@@ -2497,7 +2483,7 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
|
||||
|
||||
/* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
|
||||
* holding any controllers open anyway. */
|
||||
if (!rt->cgroup_realized)
|
||||
if (!rt->cgroup_path)
|
||||
continue;
|
||||
|
||||
/* We must disable those below us first in order to release the controller. */
|
||||
@@ -2669,7 +2655,7 @@ void unit_add_family_to_cgroup_realize_queue(Unit *u) {
|
||||
/* We only enqueue siblings if they were realized once at least, in the main
|
||||
* hierarchy. */
|
||||
crt = unit_get_cgroup_runtime(m);
|
||||
if (!crt || !crt->cgroup_realized)
|
||||
if (!crt || !crt->cgroup_path)
|
||||
continue;
|
||||
|
||||
/* If the unit doesn't need any new controllers and has current ones
|
||||
@@ -2885,7 +2871,6 @@ void unit_prune_cgroup(Unit *u) {
|
||||
assert(crt == unit_get_cgroup_runtime(u));
|
||||
assert(!crt->cgroup_path);
|
||||
|
||||
crt->cgroup_realized = false;
|
||||
crt->cgroup_realized_mask = 0;
|
||||
crt->cgroup_enabled_mask = 0;
|
||||
|
||||
@@ -3413,7 +3398,7 @@ Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
|
||||
|
||||
e = strrchr(p, '/');
|
||||
if (!e || e == p)
|
||||
return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
|
||||
return NULL; /* reached cgroup root? return NULL and possibly fall back to manager_get_unit_by_pidref_watching() */
|
||||
|
||||
*e = 0;
|
||||
|
||||
@@ -3423,7 +3408,7 @@ Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
|
||||
}
|
||||
}
|
||||
|
||||
Unit *manager_get_unit_by_pidref_cgroup(Manager *m, const PidRef *pid) {
|
||||
Unit* manager_get_unit_by_pidref_cgroup(Manager *m, const PidRef *pid) {
|
||||
_cleanup_free_ char *cgroup = NULL;
|
||||
|
||||
assert(m);
|
||||
@@ -4199,6 +4184,8 @@ CGroupRuntime* cgroup_runtime_new(void) {
|
||||
.ipv6_deny_map_fd = -EBADF,
|
||||
|
||||
.cgroup_invalidated_mask = _CGROUP_MASK_ALL,
|
||||
|
||||
.deserialized_cgroup_realized = -1,
|
||||
};
|
||||
|
||||
unit_reset_cpu_accounting(/* unit = */ NULL, crt);
|
||||
@@ -4335,7 +4322,6 @@ int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds) {
|
||||
if (crt->cgroup_id != 0)
|
||||
(void) serialize_item_format(f, "cgroup-id", "%" PRIu64, crt->cgroup_id);
|
||||
|
||||
(void) serialize_bool(f, "cgroup-realized", crt->cgroup_realized);
|
||||
(void) serialize_cgroup_mask(f, "cgroup-realized-mask", crt->cgroup_realized_mask);
|
||||
(void) serialize_cgroup_mask(f, "cgroup-enabled-mask", crt->cgroup_enabled_mask);
|
||||
(void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", crt->cgroup_invalidated_mask);
|
||||
@@ -4435,15 +4421,13 @@ int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value,
|
||||
if (r < 0)
|
||||
log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", value);
|
||||
|
||||
(void) unit_watch_cgroup(u);
|
||||
(void) unit_watch_cgroup_memory(u);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-id", key, value, safe_atou64, cgroup_id))
|
||||
return 1;
|
||||
|
||||
if (MATCH_DESERIALIZE(u, "cgroup-realized", key, value, parse_boolean, cgroup_realized))
|
||||
if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized", key, value, parse_tristate, deserialized_cgroup_realized))
|
||||
return 1;
|
||||
|
||||
if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized-mask", key, value, cg_mask_from_string, cgroup_realized_mask))
|
||||
|
||||
@@ -323,7 +323,6 @@ typedef struct CGroupRuntime {
|
||||
struct bpf_link *restrict_ifaces_egress_bpf_link;
|
||||
#endif
|
||||
|
||||
bool cgroup_realized:1;
|
||||
bool cgroup_members_mask_valid:1;
|
||||
|
||||
/* Reset cgroup accounting next time we fork something off */
|
||||
@@ -331,6 +330,8 @@ typedef struct CGroupRuntime {
|
||||
|
||||
/* Whether we warned about clamping the CPU quota period */
|
||||
bool warned_clamping_cpu_quota_period:1;
|
||||
|
||||
int deserialized_cgroup_realized; /* tristate, for backwards compat */
|
||||
} CGroupRuntime;
|
||||
|
||||
uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);
|
||||
@@ -377,14 +378,10 @@ void unit_invalidate_cgroup_members_masks(Unit *u);
|
||||
|
||||
void unit_add_family_to_cgroup_realize_queue(Unit *u);
|
||||
|
||||
int unit_default_cgroup_path(const Unit *u, char **ret);
|
||||
int unit_set_cgroup_path(Unit *u, const char *path);
|
||||
int unit_pick_cgroup_path(Unit *u);
|
||||
int unit_get_cgroup_path_with_fallback(const Unit *u, char **ret);
|
||||
|
||||
int unit_realize_cgroup(Unit *u);
|
||||
void unit_prune_cgroup(Unit *u);
|
||||
int unit_watch_cgroup(Unit *u);
|
||||
int unit_watch_cgroup_memory(Unit *u);
|
||||
void unit_add_to_cgroup_realize_queue(Unit *u);
|
||||
|
||||
int unit_cgroup_is_empty(Unit *u);
|
||||
|
||||
@@ -88,14 +88,10 @@ static void bad_specifier(const Unit *u, char specifier) {
|
||||
|
||||
static int specifier_cgroup(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
|
||||
const Unit *u = ASSERT_PTR(userdata);
|
||||
CGroupRuntime *crt = unit_get_cgroup_runtime(u);
|
||||
|
||||
bad_specifier(u, specifier);
|
||||
|
||||
if (crt && crt->cgroup_path)
|
||||
return strdup_to(ret, crt->cgroup_path);
|
||||
|
||||
return unit_default_cgroup_path(u, ret);
|
||||
return unit_get_cgroup_path_with_fallback(u, ret);
|
||||
}
|
||||
|
||||
static int specifier_cgroup_root(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
|
||||
@@ -112,14 +108,8 @@ static int specifier_cgroup_slice(char specifier, const void *data, const char *
|
||||
bad_specifier(u, specifier);
|
||||
|
||||
slice = UNIT_GET_SLICE(u);
|
||||
if (slice) {
|
||||
CGroupRuntime *crt = unit_get_cgroup_runtime(slice);
|
||||
|
||||
if (crt && crt->cgroup_path)
|
||||
return strdup_to(ret, crt->cgroup_path);
|
||||
|
||||
return unit_default_cgroup_path(slice, ret);
|
||||
}
|
||||
if (slice)
|
||||
return unit_get_cgroup_path_with_fallback(slice, ret);
|
||||
|
||||
return strdup_to(ret, u->manager->cgroup_root);
|
||||
}
|
||||
|
||||
@@ -364,22 +364,20 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Versions before 228 did not carry a state change timestamp. In this case, take the current
|
||||
* time. This is useful, so that timeouts based on this timestamp don't trigger too early, and is
|
||||
* in-line with the logic from before 228 where the base for timeouts was not persistent across
|
||||
* reboots. */
|
||||
|
||||
if (!dual_timestamp_is_set(&u->state_change_timestamp))
|
||||
dual_timestamp_now(&u->state_change_timestamp);
|
||||
|
||||
/* Let's make sure that everything that is deserialized also gets any potential new cgroup settings
|
||||
* applied after we are done. For that we invalidate anything already realized, so that we can
|
||||
* realize it again. */
|
||||
CGroupRuntime *crt;
|
||||
crt = unit_get_cgroup_runtime(u);
|
||||
if (crt && crt->cgroup_realized) {
|
||||
unit_invalidate_cgroup(u, _CGROUP_MASK_ALL);
|
||||
unit_invalidate_cgroup_bpf(u);
|
||||
CGroupRuntime *crt = unit_get_cgroup_runtime(u);
|
||||
if (crt && crt->cgroup_path) {
|
||||
/* Since v258, CGroupRuntime.cgroup_path is coupled with cgroup realized state, which however
|
||||
* wasn't the case in prior versions with the realized state tracked in a discrete field.
|
||||
* Patch cgroup_realized == 0 back to no cgroup_path here hence. */
|
||||
if (crt->deserialized_cgroup_realized == 0)
|
||||
unit_release_cgroup(u, /* drop_cgroup_runtime = */ false);
|
||||
else {
|
||||
unit_invalidate_cgroup(u, _CGROUP_MASK_ALL);
|
||||
unit_invalidate_cgroup_bpf(u);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -512,11 +510,9 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
|
||||
|
||||
fprintf(f,
|
||||
"%s\tSlice: %s\n"
|
||||
"%s\tCGroup: %s\n"
|
||||
"%s\tCGroup realized: %s\n",
|
||||
"%s\tCGroup: %s\n",
|
||||
prefix, strna(unit_slice_name(u)),
|
||||
prefix, strna(crt ? crt->cgroup_path : NULL),
|
||||
prefix, yes_no(crt ? crt->cgroup_realized : false));
|
||||
prefix, strna(crt ? crt->cgroup_path : NULL));
|
||||
|
||||
if (crt && crt->cgroup_realized_mask != 0) {
|
||||
_cleanup_free_ char *s = NULL;
|
||||
|
||||
@@ -3395,7 +3395,7 @@ int unit_set_slice(Unit *u, Unit *slice) {
|
||||
/* Disallow slice changes if @u is already bound to cgroups */
|
||||
if (UNIT_GET_SLICE(u)) {
|
||||
CGroupRuntime *crt = unit_get_cgroup_runtime(u);
|
||||
if (crt && crt->cgroup_realized)
|
||||
if (crt && crt->cgroup_path)
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
@@ -6006,16 +6006,17 @@ static int unit_log_leftover_process_stop(const PidRef *pid, int sig, void *user
|
||||
}
|
||||
|
||||
int unit_warn_leftover_processes(Unit *u, bool start) {
|
||||
_cleanup_free_ char *cgroup = NULL;
|
||||
int r;
|
||||
|
||||
assert(u);
|
||||
|
||||
(void) unit_pick_cgroup_path(u);
|
||||
|
||||
CGroupRuntime *crt = unit_get_cgroup_runtime(u);
|
||||
if (!crt || !crt->cgroup_path)
|
||||
return 0;
|
||||
r = unit_get_cgroup_path_with_fallback(u, &cgroup);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
return cg_kill_recursive(
|
||||
crt->cgroup_path,
|
||||
cgroup,
|
||||
/* sig= */ 0,
|
||||
/* flags= */ 0,
|
||||
/* killed_pids= */ NULL,
|
||||
|
||||
Reference in New Issue
Block a user