diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index d5f270c681..95fb54d2f2 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -3374,6 +3374,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivateBPF = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
@@ -3975,6 +3977,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
@@ -4701,6 +4705,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
@@ -5583,6 +5589,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivateBPF = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
@@ -6204,6 +6212,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
+
+
@@ -6910,6 +6920,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
+
+
@@ -7616,6 +7628,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivateBPF = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
@@ -8159,6 +8173,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
+
+
@@ -8773,6 +8789,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
+
+
@@ -9612,6 +9630,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivateBPF = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
@@ -10137,6 +10157,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
+
+
@@ -10733,6 +10755,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
+
+
@@ -12316,6 +12340,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
PrivatePIDs were added in version 257.
ProtectHostnameEx,
DelegateNamespaces,
+ PrivateBPF,
RemoveSubGroup(),
StateDirectoryQuota,
StateDirectoryQuotaUsage,
@@ -12374,6 +12399,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
PassPIDFD,
AcceptFileDescriptors,
DelegateNamespaces,
+ PrivateBPF,
RemoveSubgroup(),
DeferTrigger,
DeferTriggerMaxUSec,
@@ -12429,6 +12455,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
PrivatePIDs were added in version 257.
ProtectHostnameEx,
DelegateNamespaces,
+ PrivateBPF,
RemoveSubgroup(),
ReloadResult,
CleanResult,
@@ -12484,6 +12511,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
PrivatePIDs were added in version 257.
ProtectHostnameEx,
DelegateNamespaces,
+ PrivateBPF,
RemoveSubgroup(),
StateDirectoryQuota,
StateDirectoryQuotaUsage,
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 813ea02313..85db1de264 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -2555,6 +2555,16 @@ RestrictNamespaces=~cgroup net
+
+ PrivateBPF=
+
+ Takes a boolean argument. If true, a private instance of the BPF filesystem is mounted
+ on /sys/fs/bpf/. Otherwise, if ProtectKernelTunables= is set,
+ the instance from the host is inherited but mounted read-only. Defaults to false.
+
+
+
+
LockPersonality=
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index b1e3df1688..7e4d6fa6db 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -54,6 +54,7 @@ BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_preserve_mode, exec_preserve_
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_private_bpf, private_bpf, PrivateBPF);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
@@ -1316,6 +1317,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHostname", "b", property_get_protect_hostname, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHostnameEx", "(ss)", property_get_protect_hostname_ex, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateBPF", "s", property_get_private_bpf, offsetof(ExecContext, private_bpf), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1753,6 +1755,7 @@ static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_fr
static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(private_bpf, PrivateBPF, private_bpf_from_string);
BUS_DEFINE_SET_TRANSIENT_PARSE(exec_preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
@@ -2279,6 +2282,9 @@ int bus_exec_context_set_transient_property(
if (streq(name, "ProcSubset"))
return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
+ if (streq(name, "PrivateBPF"))
+ return bus_set_transient_private_bpf(u, name, &c->private_bpf, message, flags, error);
+
if (streq(name, "RuntimeDirectoryPreserve"))
return bus_set_transient_exec_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index 09deb0f5c1..e6fce99340 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -2270,6 +2270,61 @@ static int setup_private_users_child(int unshare_ready_fd, const char *uid_map,
return 0;
}
+/* Forks off the "(sd-bpffs)" helper process that performs the fsconfig() step of a bpffs mount on behalf
+ * of the caller.
+ *
+ * The helper receives a file descriptor over a socket pair, issues FSCONFIG_CMD_CREATE on it and then
+ * writes a single byte back over the same socket as acknowledgement. If one of these steps fails, the
+ * helper passes a negative errno-style value to report_errno_and_exit() together with the write end of
+ * a pipe.
+ *
+ * On success returns 0 and stores the helper's pidref in ret_pid, the caller's end of the socket pair in
+ * ret_sock_fd and the read end of the errno pipe in ret_errno_pipe; the caller takes ownership of the
+ * two file descriptors. On failure returns a negative errno. */
+static int bpffs_prepare(
+                PidRef *ret_pid,
+                int *ret_sock_fd,
+                int *ret_errno_pipe) {
+
+        _cleanup_close_pair_ int socket_fds[2] = EBADF_PAIR, bpffs_errno_pipe[2] = EBADF_PAIR;
+        int r;
+
+        assert(ret_sock_fd);
+        assert(ret_pid);
+        assert(ret_errno_pipe);
+
+        r = pipe2(bpffs_errno_pipe, O_CLOEXEC|O_NONBLOCK);
+        if (r < 0)
+                return log_debug_errno(errno, "Failed to create pipe: %m");
+
+        r = socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, socket_fds);
+        if (r < 0)
+                return log_debug_errno(errno, "Failed to create socket pair: %m");
+
+        r = pidref_safe_fork("(sd-bpffs)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, ret_pid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to fork bpffs privileged helper: %m");
+        if (r == 0) {
+                /* Helper process: only keep our own ends of the pipe and of the socket pair. */
+                _cleanup_close_ int fs_fd = -EBADF;
+
+                bpffs_errno_pipe[0] = safe_close(bpffs_errno_pipe[0]);
+                socket_fds[0] = safe_close(socket_fds[0]);
+
+                fs_fd = receive_one_fd(socket_fds[1], /* flags = */ 0);
+                if (fs_fd < 0) {
+                        log_debug_errno(fs_fd, "Failed to receive file descriptor from parent: %m");
+                        report_errno_and_exit(bpffs_errno_pipe[1], fs_fd);
+                }
+
+                r = fsconfig(fs_fd, FSCONFIG_CMD_CREATE, /* key = */ NULL, /* value = */ NULL, /* aux = */ 0);
+                if (r < 0) {
+                        /* Report the negative errno-style value, like the receive_one_fd() branch above
+                         * does; errno itself is positive. */
+                        r = log_debug_errno(errno, "Failed to create bpffs superblock: %m");
+                        report_errno_and_exit(bpffs_errno_pipe[1], r);
+                }
+
+                /* Tell the parent that the superblock has been created. */
+                if (write(socket_fds[1], (uint8_t[1]) {}, 1) < 0) {
+                        r = log_debug_errno(errno, "Failed to send data to parent: %m");
+                        report_errno_and_exit(bpffs_errno_pipe[1], r);
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Parent: hand out our ends; the helper's ends are closed by the cleanup handlers on return. */
+        *ret_sock_fd = TAKE_FD(socket_fds[0]);
+        *ret_errno_pipe = TAKE_FD(bpffs_errno_pipe[0]);
+
+        return 0;
+}
+
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
_cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
@@ -3600,9 +3655,10 @@ static int apply_mount_namespace(
ExecRuntime *runtime,
const char *memory_pressure_path,
bool needs_sandboxing,
- char **reterr_path,
uid_t exec_directory_uid,
- gid_t exec_directory_gid) {
+ gid_t exec_directory_gid,
+ int bpffs_socket_fd,
+ char **reterr_path) {
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
@@ -3814,6 +3870,9 @@ static int apply_mount_namespace(
.protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
.protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
.proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
+ .private_bpf = needs_sandboxing ? context->private_bpf : PRIVATE_BPF_NO,
+
+ .bpffs_socket_fd = bpffs_socket_fd,
};
r = setup_namespace(¶meters, reterr_path);
@@ -4454,6 +4513,7 @@ static int setup_delegated_namespaces(
const ExecCommand *command,
bool needs_sandboxing,
bool have_cap_sys_admin,
+ int bpffs_socket_fd,
int *reterr_exit_status) {
int r;
@@ -4574,9 +4634,10 @@ static int setup_delegated_namespaces(
runtime,
memory_pressure_path,
needs_sandboxing,
- &error_path,
uid,
- gid);
+ gid,
+ bpffs_socket_fd,
+ &error_path);
if (r < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m",
@@ -4911,7 +4972,9 @@ int exec_invoke(
_cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL;
int ngids = 0, ngids_after_pam = 0;
int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
+ _cleanup_close_ int bpffs_socket_fd = -EBADF, bpffs_errno_pipe = -EBADF;
size_t n_storage_fds, n_socket_fds, n_extra_fds;
+ _cleanup_(pidref_done_sigkill_wait) PidRef bpffs_pidref = PIDREF_NULL;
assert(command);
assert(context);
@@ -5627,6 +5690,26 @@ int exec_invoke(
}
}
+ if (context->private_bpf != PRIVATE_BPF_NO) {
+ /* To create a BPF token, the bpffs has to be mounted with the fsopen()/fsmount() API.
+ * More specifically, fsopen() must be called within the user namespace, then all the
+ * fsconfig() calls as privileged user, and finally fsmount() and move_mount() in
+ * the user namespace.
+ * To do this, we split the code into the bpffs_prepare() and mount_bpffs() functions:
+ * the first runs as privileged user, the second as unprivileged one, and they coordinate
+ * by sending messages and file descriptors via a socket pair.
+ * The user and mount namespaces need to be unshared in this exact order and before
+ * the fsopen() call for the fsopen() API to work as unprivileged.
+ * This is the kernel sample doing this:
+ * https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/prog_tests/token.c
+ */
+ r = bpffs_prepare(&bpffs_pidref, &bpffs_socket_fd, &bpffs_errno_pipe);
+ if (r < 0) {
+ *exit_status = EXIT_BPF;
+ return log_error_errno(r, "Failed to mount bpffs in bpffs_prepare(): %m");
+ }
+ }
+
if (needs_sandboxing && !have_cap_sys_admin && exec_needs_cap_sys_admin(context, params)) {
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
@@ -5665,6 +5748,7 @@ int exec_invoke(
command,
needs_sandboxing,
have_cap_sys_admin,
+ bpffs_socket_fd,
exit_status);
if (r < 0)
return r;
@@ -5724,10 +5808,30 @@ int exec_invoke(
command,
needs_sandboxing,
have_cap_sys_admin,
+ bpffs_socket_fd,
exit_status);
if (r < 0)
return r;
+ if (context->private_bpf != PRIVATE_BPF_NO) {
+ r = pidref_wait_for_terminate_and_check("(sd-bpffs)", &bpffs_pidref, /* flags = */ 0);
+ if (r < 0) {
+ *exit_status = EXIT_BPF;
+ return r;
+ }
+ /* If something strange happened with the child, let's consider this fatal, too */
+ if (r != EXIT_SUCCESS) {
+ *exit_status = EXIT_BPF;
+ ssize_t ss = read(bpffs_errno_pipe, &r, sizeof(r));
+ if (ss == sizeof(r))
+ return log_debug_errno(r, "bpffs helper exited with error: %m");
+ if (ss < 0)
+ return log_debug_errno(errno, "Failed to read from the bpffs helper errno pipe: %m");
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short read from the bpffs helper errno pipe.");
+ }
+ pidref_done(&bpffs_pidref);
+ }
+
if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
/* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
* ensures the root of the cgroup namespace is the top level service cgroup and not the
diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c
index 0a1af05e51..167e4dfd7f 100644
--- a/src/core/execute-serialize.c
+++ b/src/core/execute-serialize.c
@@ -1803,6 +1803,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
if (r < 0)
return r;
+ r = serialize_item(f, "exec-context-private-bpf", private_bpf_to_string(c->private_bpf));
+ if (r < 0)
+ return r;
+
r = serialize_item(f, "exec-context-runtime-directory-preserve-mode", exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
if (r < 0)
return r;
@@ -2741,6 +2745,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
c->proc_subset = proc_subset_from_string(val);
if (c->proc_subset < 0)
return -EINVAL;
+ } else if ((val = startswith(l, "exec-context-private-bpf="))) {
+ c->private_bpf = private_bpf_from_string(val);
+ if (c->private_bpf < 0)
+ return -EINVAL;
} else if ((val = startswith(l, "exec-context-runtime-directory-preserve-mode="))) {
c->runtime_directory_preserve_mode = exec_preserve_mode_from_string(val);
if (c->runtime_directory_preserve_mode < 0)
diff --git a/src/core/execute.c b/src/core/execute.c
index 5d5cc41207..9fc9e549de 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -324,6 +324,7 @@ bool exec_needs_mount_namespace(
exec_needs_cgroup_mount(context) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
+ context->private_bpf != PRIVATE_BPF_NO ||
exec_needs_ipc_namespace(context) ||
exec_needs_pid_namespace(context, params))
return true;
@@ -1124,7 +1125,8 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
"%sKeyringMode: %s\n"
"%sProtectHostname: %s%s%s\n"
"%sProtectProc: %s\n"
- "%sProcSubset: %s\n",
+ "%sProcSubset: %s\n"
+ "%sPrivateBPF: %s\n",
prefix, c->umask,
prefix, empty_to_root(c->working_directory),
prefix, empty_to_root(c->root_directory),
@@ -1151,7 +1153,8 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
prefix, exec_keyring_mode_to_string(c->keyring_mode),
prefix, protect_hostname_to_string(c->protect_hostname), c->private_hostname ? ":" : "", strempty(c->private_hostname),
prefix, protect_proc_to_string(c->protect_proc),
- prefix, proc_subset_to_string(c->proc_subset));
+ prefix, proc_subset_to_string(c->proc_subset),
+ prefix, private_bpf_to_string(c->private_bpf));
if (c->set_login_environment >= 0)
fprintf(f, "%sSetLoginEnvironment: %s\n", prefix, yes_no(c->set_login_environment > 0));
diff --git a/src/core/execute.h b/src/core/execute.h
index da1600a044..6f1df610a8 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -300,6 +300,8 @@ typedef struct ExecContext {
ProtectProc protect_proc; /* hidepid= */
ProcSubset proc_subset; /* subset= */
+ PrivateBPF private_bpf;
+
int private_mounts;
int mount_apivfs;
int bind_log_sockets;
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
index 7d4d174d84..edb0639539 100644
--- a/src/core/load-fragment-gperf.gperf.in
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -67,6 +67,7 @@
{{type}}.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof({{type}}, exec_context.keyring_mode)
{{type}}.ProtectProc, config_parse_protect_proc, 0, offsetof({{type}}, exec_context.protect_proc)
{{type}}.ProcSubset, config_parse_proc_subset, 0, offsetof({{type}}, exec_context.proc_subset)
+{{type}}.PrivateBPF, config_parse_private_bpf, 0, offsetof({{type}}, exec_context.private_bpf)
{% if HAVE_SECCOMP %}
{{type}}.SystemCallFilter, config_parse_syscall_filter, 0, offsetof({{type}}, exec_context)
{{type}}.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof({{type}}, exec_context.syscall_archs)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index c1e704b1c6..9c544a35e0 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -133,6 +133,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGrou
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc);
DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset);
+DEFINE_CONFIG_PARSE_ENUM(config_parse_private_bpf, private_bpf, PrivateBPF);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs);
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
index a31ad750d3..ba226e2e5c 100644
--- a/src/core/load-fragment.h
+++ b/src/core/load-fragment.h
@@ -129,6 +129,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
+CONFIG_PARSER_PROTOTYPE(config_parse_private_bpf);
CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
diff --git a/src/core/namespace.c b/src/core/namespace.c
index faa84ced20..0768eafac2 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -79,6 +79,7 @@ typedef enum MountMode {
MOUNT_EXTENSION_IMAGE, /* Mounted outside the root directory, and used by subsequent mounts */
MOUNT_MQUEUEFS,
MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */
+ MOUNT_BPFFS, /* Special mount for bpffs, which is mounted with fsmount() and move_mount() */
_MOUNT_MODE_MAX,
_MOUNT_MODE_INVALID = -EINVAL,
} MountMode;
@@ -161,13 +162,17 @@ static const MountEntry protect_kernel_tunables_proc_table[] = {
static const MountEntry protect_kernel_tunables_sys_table[] = {
{ "/sys", MOUNT_READ_ONLY, false },
- { "/sys/fs/bpf", MOUNT_READ_ONLY, true },
{ "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */
{ "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true },
{ "/sys/kernel/debug", MOUNT_READ_ONLY, true },
{ "/sys/kernel/tracing", MOUNT_READ_ONLY, true },
};
+/* PrivateBPF= option: static mounts that append_private_bpf() adds when PrivateBPF=no is combined with
+ * ProtectKernelTunables=, so that /sys/fs/bpf is mounted read-only. */
+static const MountEntry private_bpf_no_table[] = {
+ { "/sys/fs/bpf", MOUNT_READ_ONLY, true },
+};
+
/* ProtectKernelModules= option */
static const MountEntry protect_kernel_modules_table[] = {
{ "/usr/lib/modules", MOUNT_INACCESSIBLE, true },
@@ -927,6 +932,36 @@ static int append_protect_system(MountList *ml, ProtectSystem protect_system, bo
}
}
+/* Adds the /sys/fs/bpf entry matching the PrivateBPF= mode to the mount list.
+ *
+ * PRIVATE_BPF_NO:  nothing is added unless protect_kernel_tunables is set, in which case the entries of
+ *                  private_bpf_no_table are appended (honouring ignore_protect).
+ * PRIVATE_BPF_YES: a single MOUNT_BPFFS entry for /sys/fs/bpf is appended.
+ *
+ * Returns 0 when nothing had to be added or the MOUNT_BPFFS entry was added, the result of
+ * append_static_mounts() for the read-only case, or the result of log_oom_debug() if the list could
+ * not be extended. */
+static int append_private_bpf(
+                MountList *ml,
+                PrivateBPF private_bpf,
+                bool protect_kernel_tunables,
+                bool ignore_protect,
+                const NamespaceParameters *p) {
+
+        MountEntry *entry;
+
+        assert(ml);
+
+        switch (private_bpf) {
+
+        case PRIVATE_BPF_NO:
+                if (!protect_kernel_tunables)
+                        return 0;
+
+                return append_static_mounts(ml, private_bpf_no_table, ELEMENTSOF(private_bpf_no_table), ignore_protect);
+
+        case PRIVATE_BPF_YES:
+                entry = mount_list_extend(ml);
+                if (!entry)
+                        return log_oom_debug();
+
+                *entry = (MountEntry) {
+                        .mode = MOUNT_BPFFS,
+                        .path_const = "/sys/fs/bpf",
+                };
+                return 0;
+
+        default:
+                assert_not_reached();
+        }
+}
+
static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
int d;
@@ -1697,6 +1732,34 @@ static int mount_overlay(const MountEntry *m) {
return 1;
}
+/* Mounts a new bpffs instance on the path of the given mount entry, in cooperation with the peer
+ * reachable through socket_fd: the filesystem context is opened here with fsopen(), its file descriptor
+ * is sent to the peer, and once the peer has acknowledged with a single byte the context is turned into
+ * a mount with fsmount() and attached to the target path with move_mount().
+ *
+ * Returns 1 on success and a negative errno on failure. */
+static int mount_bpffs(const MountEntry *m, int socket_fd) {
+        ssize_t n;
+        int r;
+
+        assert(m);
+        assert(socket_fd >= 0);
+
+        _cleanup_close_ int fs_fd = fsopen("bpf", FSOPEN_CLOEXEC);
+        if (fs_fd < 0)
+                return log_debug_errno(errno, "Failed to fsopen: %m");
+
+        r = send_one_fd(socket_fd, fs_fd, /* flags = */ 0);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to send bpffs fd to child: %m");
+
+        /* Wait for the acknowledgement byte. A return value of 0 means the peer closed its end of the
+         * socket without acknowledging, in which case the filesystem context was not set up and must
+         * not be passed on to fsmount(). */
+        n = read(socket_fd, (uint8_t[1]) {}, 1);
+        if (n < 0)
+                return log_debug_errno(errno, "Failed to receive data from child: %m");
+        if (n == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Child closed the socket without acknowledging the bpffs setup.");
+
+        _cleanup_close_ int mnt_fd = fsmount(fs_fd, /* flags = */ 0, /* mount_attrs = */ 0);
+        if (mnt_fd < 0)
+                return log_debug_errno(errno, "Failed to fsmount bpffs: %m");
+
+        r = move_mount(mnt_fd, "", AT_FDCWD, mount_entry_path(m), MOVE_MOUNT_F_EMPTY_PATH);
+        if (r < 0)
+                return log_debug_errno(errno, "Failed to move bpffs mount to %s: %m", mount_entry_path(m));
+
+        return 1;
+}
+
static int follow_symlink(
const char *root_directory,
MountEntry *m) {
@@ -1953,6 +2016,9 @@ static int apply_one_mount(
case MOUNT_OVERLAY:
return mount_overlay(m);
+ case MOUNT_BPFFS:
+ return mount_bpffs(m, p->bpffs_socket_fd);
+
default:
assert_not_reached();
}
@@ -2151,6 +2217,7 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
p->protect_kernel_tunables ||
p->protect_proc != PROTECT_PROC_DEFAULT ||
p->proc_subset != PROC_SUBSET_ALL ||
+ p->private_bpf != PRIVATE_BPF_NO ||
p->private_pids != PRIVATE_PIDS_NO;
}
@@ -2653,6 +2720,10 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
if (r < 0)
return r;
+ r = append_private_bpf(&ml, p->private_bpf, p->protect_kernel_tunables, /* ignore_protect = */ false, p);
+ if (r < 0)
+ return r;
+
if (namespace_parameters_mount_apivfs(p)) {
r = append_static_mounts(&ml,
apivfs_table,
@@ -3888,6 +3959,13 @@ static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
+/* String names of the PrivateBPF values; backs private_bpf_to_string() and private_bpf_from_string(). */
+static const char* const private_bpf_table[_PRIVATE_BPF_MAX] = {
+ [PRIVATE_BPF_NO] = "no",
+ [PRIVATE_BPF_YES] = "yes",
+};
+
+/* NOTE(review): the _WITH_BOOLEAN variant presumably also accepts generic boolean spellings, mapping a
+ * true value to PRIVATE_BPF_YES — confirm against the macro definition. */
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_bpf, PrivateBPF, PRIVATE_BPF_YES);
+
static const char* const private_tmp_table[_PRIVATE_TMP_MAX] = {
[PRIVATE_TMP_NO] = "no",
[PRIVATE_TMP_CONNECTED] = "connected",
diff --git a/src/core/namespace.h b/src/core/namespace.h
index eadd991ed2..178ed1e548 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -51,6 +51,13 @@ typedef enum ProcSubset {
_PROC_SUBSET_INVALID = -EINVAL,
} ProcSubset;
+/* Modes of the PrivateBPF= setting. */
+typedef enum PrivateBPF {
+ PRIVATE_BPF_NO, /* no private bpffs instance */
+ PRIVATE_BPF_YES, /* a private bpffs instance is mounted on /sys/fs/bpf */
+ _PRIVATE_BPF_MAX, /* number of valid modes; sizes the string table */
+ _PRIVATE_BPF_INVALID = -EINVAL,
+} PrivateBPF;
+
typedef enum PrivateTmp {
PRIVATE_TMP_NO,
PRIVATE_TMP_CONNECTED, /* Bind mounted from the host's filesystem */
@@ -188,9 +195,12 @@ typedef struct NamespaceParameters {
ProtectSystem protect_system;
ProtectProc protect_proc;
ProcSubset proc_subset;
+ PrivateBPF private_bpf;
PrivateTmp private_tmp;
PrivateTmp private_var_tmp;
PrivatePIDs private_pids;
+
+ int bpffs_socket_fd;
} NamespaceParameters;
int setup_namespace(const NamespaceParameters *p, char **reterr_path);
@@ -223,6 +233,9 @@ ProtectProc protect_proc_from_string(const char *s) _pure_;
const char* proc_subset_to_string(ProcSubset i) _const_;
ProcSubset proc_subset_from_string(const char *s) _pure_;
+const char* private_bpf_to_string(PrivateBPF i) _const_;
+PrivateBPF private_bpf_from_string(const char *s) _pure_;
+
const char* private_tmp_to_string(PrivateTmp i) _const_;
PrivateTmp private_tmp_from_string(const char *s) _pure_;
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index 0e60cd63d4..8fc97db191 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -2425,6 +2425,7 @@ static const BusProperty execute_properties[] = {
{ "MountImagePolicy", bus_append_string },
{ "ExtensionImagePolicy", bus_append_string },
{ "PrivatePIDs", bus_append_string },
+ { "PrivateBPF", bus_append_string },
{ "IgnoreSIGPIPE", bus_append_parse_boolean },
{ "TTYVHangup", bus_append_parse_boolean },
{ "TTYReset", bus_append_parse_boolean },
diff --git a/test/units/TEST-07-PID1.private-bpf.sh b/test/units/TEST-07-PID1.private-bpf.sh
new file mode 100755
index 0000000000..f0c1dcf73e
--- /dev/null
+++ b/test/units/TEST-07-PID1.private-bpf.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# Exercises the PrivateBPF= setting via transient units started with systemd-run. Each case greps
+# /proc/mounts from inside the unit; "set -e" aborts the script if an invocation returns non-zero.
+set -eux
+set -o pipefail
+
+# Check that with ProtectKernelTunables=yes and PrivateBPF=no, the host bpffs is remounted ro
+# (expects a /proc/mounts line for /sys/fs/bpf whose mount options begin with "ro,")
+systemd-run --wait \
+ -p PrivateUsers=yes \
+ -p PrivateMounts=yes \
+ -p DelegateNamespaces=mnt \
+ -p ProtectKernelTunables=yes \
+ -p PrivateBPF=no \
+ grep -q '/sys/fs/bpf .* ro,' /proc/mounts
+
+# Check that with PrivateBPF=yes, a new bpffs instance is mounted
+# (expects a read-write mount of type "bpf" on /sys/fs/bpf whose source is "none")
+systemd-run --wait \
+ -p PrivateUsers=yes \
+ -p PrivateMounts=yes \
+ -p DelegateNamespaces=mnt \
+ -p PrivateBPF=yes \
+ grep -q '^none /sys/fs/bpf bpf rw' /proc/mounts