From 8b4ee3d68d2e70d9a396b74d155eab3b11763311 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Wed, 24 Sep 2025 05:29:22 +0900 Subject: [PATCH 1/3] core/unit: fail earlier before spawning executor when we failed to realize cgroup Before 23ac08115af83e3a0a937fa207fc52511aba2ffa, even if we failed to create the cgroup for a unit, a cgroup runtime object for the cgroup is created with the cgroup path. Hence, the creation of cgroup is failed, execution of the unit will fail in posix_spawn_wrapper() and logged something like the following: ``` systemd[1]: testservice.service: Failed to create cgroup /testslice.slice/testservice.service: Cannot allocate memory systemd[1]: testservice.service: Failed to spawn executor: No such file or directory systemd[1]: testservice.service: Failed to spawn 'start' task: No such file or directory systemd[1]: testservice.service: Failed with result 'resources'. systemd[1]: Failed to start testservice.service. ``` However, after the commit, when we failed to create the cgroup, a cgroup runtime object is not created, hence NULL will be assigned to ExecParameters.cgroup_path in unit_set_exec_params(). Hence, the unit process will be invoked in the init.scope. ``` systemd[1]: testservice.service: Failed to create cgroup /testslice.slice/testservice.service: Cannot allocate memory systemd[1]: Starting testservice.service... cat[1094]: 0::/init.scope systemd[1]: testservice.service: Deactivated successfully. systemd[1]: Finished testservice.service. ``` where the test service calls 'cat /proc/self/cgroup'. To fix the issue, let's fail earlier when we failed to create cgroup. Follow-up for 23ac08115af83e3a0a937fa207fc52511aba2ffa (v258). --- src/core/unit.c | 12 +++-- ...CGROUP.abort-on-cgroup-creation-failure.sh | 46 +++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) create mode 100755 test/units/TEST-19-CGROUP.abort-on-cgroup-creation-failure.sh diff --git a/src/core/unit.c b/src/core/unit.c index e99f8f2b3e..147bc8f5f2 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -5559,11 +5559,11 @@ int unit_fork_helper_process(Unit *u, const char *name, bool into_cgroup, PidRef * with the child's PID. */ if (into_cgroup) { - (void) unit_realize_cgroup(u); + r = unit_realize_cgroup(u); + if (r < 0) + return r; - crt = unit_setup_cgroup_runtime(u); - if (!crt) - return -ENOMEM; + crt = unit_get_cgroup_runtime(u); } r = safe_fork(name, FORK_REOPEN_LOG|FORK_DEATHSIG_SIGTERM, &pid); @@ -6013,7 +6013,9 @@ int unit_prepare_exec(Unit *u) { /* Prepares everything so that we can fork of a process for this unit */ - (void) unit_realize_cgroup(u); + r = unit_realize_cgroup(u); + if (r < 0) + return r; CGroupRuntime *crt = unit_get_cgroup_runtime(u); if (crt && crt->reset_accounting) { diff --git a/test/units/TEST-19-CGROUP.abort-on-cgroup-creation-failure.sh b/test/units/TEST-19-CGROUP.abort-on-cgroup-creation-failure.sh new file mode 100755 index 0000000000..41f8b96e25 --- /dev/null +++ b/test/units/TEST-19-CGROUP.abort-on-cgroup-creation-failure.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -ex +set -o pipefail + +# Test that the service is not invoked if the cgroup cannot be created. + +# It seems openSUSE kernel (at least kernel-default-6.16.8-1.1.x86_64.rpm) has a +# bag in kernel oom killer or clone3 syscall, and spawning executor on a cgroup +# with too small MemoryMax= triggers infinite loop of OOM kill, and posix_spawn() +# will never return, and the service manager will stuck. +#### +# [ 119.776797] systemd invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 +# [ 119.776859] CPU: 1 UID: 0 PID: 1472 Comm: systemd Not tainted 6.16.8-1-default #1 PREEMPT(voluntary) openSUSE Tumbleweed 6c85865973e4ae641870ed68afe8933a6986c974 +# [ 119.776865] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-5.fc42 04/01/2014 +# [ 119.776867] Call Trace: +# (snip) +# [ 119.778126] Out of memory and no killable processes... +#### +# On other distributions, the oom killer is triggered, but clone3 immediately +# fails with ENOMEM, and such problematic loop does not happen. +. /etc/os-release +if [[ "$ID" =~ opensuse ]]; then + echo "Skipping cgroup test with too small MemoryMax= setting on openSUSE." + exit 0 +fi + +cat >/run/systemd/system/testslice.slice </run/systemd/system/testservice.service < Date: Wed, 24 Sep 2025 06:02:22 +0900 Subject: [PATCH 2/3] core/bpf-firewall: make failures in loading custom BPF program not critical All other resource control features work as 'best-effort', and failures in applying them are handled gracefully. However, unlike the other features, we tested if the BPF programs can be loaded and refuse execution on failure. Moreover, the previous behavior of testing loading BPF programs had inconsistency: the test was silently skipped if the cgroup for the unit does not exist yet, but tested when the cgroup already exists. Let's not handle failures in loading custom BPF programs as critical, but gracefully ignore them, like we do for the other resource control features. Follow-up for fab347489fcfafbc8367c86afc637ce1b81ae59e. --- man/systemd.resource-control.xml | 2 +- src/core/cgroup.c | 6 ------ src/core/unit.c | 6 ------ 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 73b6734ddd..16de3caf5f 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1149,7 +1149,7 @@ NFTSet=cgroup:inet:filter:my_service user:inet:filter:serviceuser one more restricted, depending on the use case. Note that these settings might not be supported on some systems (for example if eBPF control group - support is not enabled in the underlying kernel or container manager). These settings will fail the service in + support is not enabled in the underlying kernel or container manager). These settings will have no effect in that case. If compatibility with such systems is desired it is hence recommended to attach your filter manually (requires Delegate=yes) instead of using this setting. diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 6d37c03127..64bbcadd8f 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -2204,12 +2204,6 @@ int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) { if (set_isempty(pids)) return 0; - /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable. - * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */ - r = bpf_firewall_load_custom(u); - if (r < 0) - return r; - r = unit_realize_cgroup(u); if (r < 0) return r; diff --git a/src/core/unit.c b/src/core/unit.c index 147bc8f5f2..a4ddf4d91e 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -6005,12 +6005,6 @@ int unit_prepare_exec(Unit *u) { assert(u); - /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable. - * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */ - r = bpf_firewall_load_custom(u); - if (r < 0) - return r; - /* Prepares everything so that we can fork of a process for this unit */ r = unit_realize_cgroup(u); From e8a5cda4714fc6fe622a03cfca6c75888d63e354 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Wed, 24 Sep 2025 04:45:21 +0900 Subject: [PATCH 3/3] core/bpf-firewall: replace unnecessary unit_setup_cgroup_runtime() with unit_get_cgroup_runtime() Except for the test, bpf_firewall_compile() is only called by the following: cgroup_context_apply() -> cgroup_apply_firewall() -> bpf_firewall_compile() and in the early stage of cgroup_context_apply(), it checks if the cgroup runtime exists. Hence, it is not necessary to try to allocate the runtime in bpf_firewall_compile(). --- src/core/bpf-firewall.c | 4 ++-- src/test/test-bpf-firewall.c | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c index e0cbe16463..fce885da87 100644 --- a/src/core/bpf-firewall.c +++ b/src/core/bpf-firewall.c @@ -547,9 +547,9 @@ int bpf_firewall_compile(Unit *u) { if (!cc) return -EINVAL; - crt = unit_setup_cgroup_runtime(u); + crt = unit_get_cgroup_runtime(u); if (!crt) - return -ENOMEM; + return -ESTALE; if (bpf_program_supported() <= 0) return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), diff --git a/src/test/test-bpf-firewall.c b/src/test/test-bpf-firewall.c index f431529ff6..7f06589907 100644 --- a/src/test/test-bpf-firewall.c +++ b/src/test/test-bpf-firewall.c @@ -49,7 +49,8 @@ int main(int argc, char *argv[]) { if (!can_memlock()) return log_tests_skipped("Can't use mlock()"); - r = enter_cgroup_subroot(NULL); + _cleanup_free_ char *cgroup_path = NULL; + r = enter_cgroup_subroot(&cgroup_path); if (r == -ENOMEDIUM) return log_tests_skipped("cgroupfs not available"); @@ -128,6 +129,8 @@ int main(int argc, char *argv[]) { SERVICE(u)->type = SERVICE_ONESHOT; u->load_state = UNIT_LOADED; + CGroupRuntime *crt = ASSERT_PTR(unit_setup_cgroup_runtime(u)); + unit_dump(u, stdout, NULL); r = bpf_firewall_compile(u); @@ -135,7 +138,6 @@ int main(int argc, char *argv[]) { return log_tests_skipped("Kernel doesn't support the necessary bpf bits (masked out via seccomp?)"); ASSERT_OK(r); - CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u)); ASSERT_NOT_NULL(crt->ip_bpf_ingress); ASSERT_NOT_NULL(crt->ip_bpf_egress);