core: assorted fixes and cleanups for cgroup (#39094)

This commit is contained in:
Yu Watanabe
2025-09-26 13:56:25 +09:00
committed by GitHub
6 changed files with 60 additions and 22 deletions

View File

@@ -1149,7 +1149,7 @@ NFTSet=cgroup:inet:filter:my_service user:inet:filter:serviceuser
one more restricted, depending on the use case.</para>
<para>Note that these settings might not be supported on some systems (for example if eBPF control group
support is not enabled in the underlying kernel or container manager). These settings will fail the service in
support is not enabled in the underlying kernel or container manager). These settings will have no effect in
that case. If compatibility with such systems is desired it is hence recommended to attach your filter manually
(requires <varname>Delegate=</varname><constant>yes</constant>) instead of using this setting.</para>

View File

@@ -547,9 +547,9 @@ int bpf_firewall_compile(Unit *u) {
if (!cc)
return -EINVAL;
crt = unit_setup_cgroup_runtime(u);
crt = unit_get_cgroup_runtime(u);
if (!crt)
return -ENOMEM;
return -ESTALE;
if (bpf_program_supported() <= 0)
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),

View File

@@ -2204,12 +2204,6 @@ int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
if (set_isempty(pids))
return 0;
/* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
* Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
r = bpf_firewall_load_custom(u);
if (r < 0)
return r;
r = unit_realize_cgroup(u);
if (r < 0)
return r;

View File

@@ -5559,11 +5559,11 @@ int unit_fork_helper_process(Unit *u, const char *name, bool into_cgroup, PidRef
* with the child's PID. */
if (into_cgroup) {
(void) unit_realize_cgroup(u);
r = unit_realize_cgroup(u);
if (r < 0)
return r;
crt = unit_setup_cgroup_runtime(u);
if (!crt)
return -ENOMEM;
crt = unit_get_cgroup_runtime(u);
}
r = safe_fork(name, FORK_REOPEN_LOG|FORK_DEATHSIG_SIGTERM, &pid);
@@ -6005,15 +6005,11 @@ int unit_prepare_exec(Unit *u) {
assert(u);
/* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
* Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
r = bpf_firewall_load_custom(u);
if (r < 0)
return r;
/* Prepares everything so that we can fork off a process for this unit */
(void) unit_realize_cgroup(u);
r = unit_realize_cgroup(u);
if (r < 0)
return r;
CGroupRuntime *crt = unit_get_cgroup_runtime(u);
if (crt && crt->reset_accounting) {

View File

@@ -49,7 +49,8 @@ int main(int argc, char *argv[]) {
if (!can_memlock())
return log_tests_skipped("Can't use mlock()");
r = enter_cgroup_subroot(NULL);
_cleanup_free_ char *cgroup_path = NULL;
r = enter_cgroup_subroot(&cgroup_path);
if (r == -ENOMEDIUM)
return log_tests_skipped("cgroupfs not available");
@@ -128,6 +129,8 @@ int main(int argc, char *argv[]) {
SERVICE(u)->type = SERVICE_ONESHOT;
u->load_state = UNIT_LOADED;
CGroupRuntime *crt = ASSERT_PTR(unit_setup_cgroup_runtime(u));
unit_dump(u, stdout, NULL);
r = bpf_firewall_compile(u);
@@ -135,7 +138,6 @@ int main(int argc, char *argv[]) {
return log_tests_skipped("Kernel doesn't support the necessary bpf bits (masked out via seccomp?)");
ASSERT_OK(r);
CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
ASSERT_NOT_NULL(crt->ip_bpf_ingress);
ASSERT_NOT_NULL(crt->ip_bpf_egress);

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
set -ex
set -o pipefail

# Test that the service is not invoked if the cgroup cannot be created.

# It seems the openSUSE kernel (at least kernel-default-6.16.8-1.1.x86_64.rpm) has a
# bug in the kernel OOM killer or the clone3 syscall: spawning the executor on a cgroup
# with a too small MemoryMax= triggers an infinite loop of OOM kills, posix_spawn()
# never returns, and the service manager gets stuck.
####
# [ 119.776797] systemd invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0
# [ 119.776859] CPU: 1 UID: 0 PID: 1472 Comm: systemd Not tainted 6.16.8-1-default #1 PREEMPT(voluntary) openSUSE Tumbleweed 6c85865973e4ae641870ed68afe8933a6986c974
# [ 119.776865] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-5.fc42 04/01/2014
# [ 119.776867] Call Trace:
# (snip)
# [ 119.778126] Out of memory and no killable processes...
####
# On other distributions, the OOM killer is triggered, but clone3 immediately
# fails with ENOMEM, and such a problematic loop does not happen.
. /etc/os-release
if [[ "$ID" =~ opensuse ]]; then
    echo "Skipping cgroup test with too small MemoryMax= setting on openSUSE."
    exit 0
fi

# Remove the temporary units on every exit path. With 'set -e' the script aborts
# as soon as an expectation fails, and a slice with MemoryMax=1 must not be left
# behind for whatever runs next. The exit status of the script is preserved
# because the handler never calls 'exit' itself.
at_exit() {
    set +e
    rm -f /run/systemd/system/testslice.slice
    rm -f /run/systemd/system/testservice.service
    # Make the manager forget the unit files that were just removed.
    systemctl daemon-reload
}
trap at_exit EXIT

# A slice whose memory limit is so small that nothing can be spawned inside it.
cat >/run/systemd/system/testslice.slice <<EOF
[Slice]
MemoryMax=1
EOF

cat >/run/systemd/system/testservice.service <<EOF
[Service]
Type=oneshot
ExecStart=cat /proc/self/cgroup
Slice=testslice.slice
EOF

systemctl daemon-reload

# Starting the service is expected to fail; a success here fails the test.
(! systemctl start testservice.service)

exit 0