Files
systemd/src/nsresourced/userns-restrict.c
Lennart Poettering 8aee931e7a nsresourced: add new daemon for granting clients user namespaces and assigning resources to them
This adds a small, socket-activated Varlink daemon that can delegate UID
ranges for user namespaces to clients asking for it.

The primary call is AllocateUserRange() where the user passes in an
uninitialized userns fd, which is then set up.

There are other calls that allow assigning a mount fd to a userns
allocated that way, to set up permissions for a cgroup subtree, and to
allocate a veth for such a user namespace.

Since the UID assignments are supposed to be transient, i.e. not
permanent, care is taken to ensure that users cannot create inodes owned
by these UIDs, so that persistence cannot be acquired. This is
implemented via a BPF-LSM module that ensures that any member of a
userns allocated that way cannot create files unless the mount it
operates on is owned by the userns itself, or is explicitly
allowlisted.

BPF LSM program with contributions from Alexei Starovoitov.
2024-04-06 16:08:24 +02:00

347 lines
13 KiB
C

/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "userns-restrict.h"
#if HAVE_VMLINUX_H
#include <sched.h>
#include "bpf-dlopen.h"
#include "bpf-link.h"
#include "fd-util.h"
#include "fs-util.h"
#include "lsm-util.h"
#include "missing_mount.h"
#include "mkdir.h"
#include "mount-util.h"
#include "mountpoint-util.h"
#include "namespace-util.h"
#include "path-util.h"
#define USERNS_MAX (16U*1024U)
#define MOUNTS_MAX 4096U
#define PROGRAM_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/programs"
#define MAP_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/maps"
/* Destructor for the BPF skeleton object; returns NULL so callers can write `p = userns_restrict_bpf_free(p)`. */
struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
        (void) userns_restrict_bpf__destroy(obj); /* this call is fine with NULL */
        return NULL;
}
/* Allocates one inner hash map (mount ID → dummy flag) for use as a leaf of the
 * BPF_MAP_TYPE_HASH_OF_MAPS outer map. Returns the new map fd on success, negative errno on
 * failure. The caller owns the returned fd. */
static int make_inner_hash_map(void) {
        int fd;

        fd = compat_bpf_map_create(
                        BPF_MAP_TYPE_HASH,
                        NULL,
                        sizeof(int),      /* key: mount ID */
                        sizeof(uint32_t), /* value: dummy, only key presence matters */
                        MOUNTS_MAX,
                        NULL);
        if (fd < 0)
                /* NOTE(review): assumes compat_bpf_map_create() reports failure via errno — confirm */
                return log_debug_errno(errno, "Failed to allocate inner BPF map: %m");

        return fd;
}
/* Opens, sizes, loads and attaches the userns-restrict BPF-LSM skeleton. When 'pin' is true, maps
 * and program links are additionally pinned in bpffs below MAP_LINK_PREFIX/PROGRAM_LINK_PREFIX so
 * they outlive this process. On success returns 0 and — if 'ret' is non-NULL — transfers ownership
 * of the skeleton object to the caller; on failure returns a negative errno. */
int userns_restrict_install(
                bool pin,
                struct userns_restrict_bpf **ret) {

        _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL;
        _cleanup_close_ int dummy_mnt_id_hash_fd = -EBADF;
        int r;

        /* Without the "bpf" LSM nothing can be enforced, hence refuse early */
        r = lsm_supported("bpf");
        if (r < 0)
                return r;
        if (r == 0)
                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm not supported, can't lock down user namespace.");

        r = dlopen_bpf();
        if (r < 0)
                return r;

        /* bpf_object__next_map() is not available in libbpf pre-0.7.0, and we want to use it. */
        if (!sym_bpf_object__next_map)
                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "libbpf too old for locking down user namespace.");

        obj = userns_restrict_bpf__open();
        if (!obj)
                return log_error_errno(errno, "Failed to open userns_restrict BPF object: %m");

        if (pin) {
                struct bpf_map *map;

                /* libbpf will only create one level of dirs. Let's create the rest */
                (void) mkdir_p(MAP_LINK_PREFIX, 0755);
                (void) mkdir_p(PROGRAM_LINK_PREFIX, 0755);

                /* Assign every map in the skeleton a pin path below MAP_LINK_PREFIX; the actual
                 * pinning happens in bpf_object__pin_maps() further down, after loading. */
                map = sym_bpf_object__next_map(obj->obj, NULL);
                while (map) {
                        _cleanup_free_ char *fn = NULL;

                        fn = path_join(MAP_LINK_PREFIX, sym_bpf_map__name(map));
                        if (!fn)
                                return log_oom();

                        r = sym_bpf_map__set_pin_path(map, fn);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set pin path to '%s': %m", fn);

                        map = sym_bpf_object__next_map(obj->obj, map);
                }
        }

        /* Size the maps before loading; max_entries can only be set pre-load */
        r = sym_bpf_map__set_max_entries(obj->maps.userns_mnt_id_hash, USERNS_MAX);
        if (r < 0)
                return log_error_errno(r, "Failed to size userns/mnt_id hash table: %m");

        r = sym_bpf_map__set_max_entries(obj->maps.userns_ringbuf, USERNS_MAX * sizeof(unsigned int));
        if (r < 0)
                return log_error_errno(r, "Failed to size userns ring buffer: %m");

        /* Dummy map to satisfy the verifier: a HASH_OF_MAPS map needs an inner-map template fd at
         * load time; the fd is closed again when this function returns. */
        dummy_mnt_id_hash_fd = make_inner_hash_map();
        if (dummy_mnt_id_hash_fd < 0)
                return dummy_mnt_id_hash_fd;

        r = sym_bpf_map__set_inner_map_fd(obj->maps.userns_mnt_id_hash, dummy_mnt_id_hash_fd);
        if (r < 0)
                return log_error_errno(r, "Failed to set inner BPF map: %m");

        r = userns_restrict_bpf__load(obj);
        if (r < 0)
                return log_error_errno(r, "Failed to load BPF object: %m");

        /* Attach every program in the skeleton, reusing a pre-existing pinned link if there is one */
        for (int i = 0; i < obj->skeleton->prog_cnt; i++) {
                _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
                struct bpf_prog_skeleton *ps = obj->skeleton->progs + i;
                _cleanup_free_ char *fn = NULL;
                bool linked = false;
                const char *e;

                /* Program names are expected to carry this prefix; the suffix is used as pin name */
                e = startswith(ps->name, "userns_restrict_");
                assert(e);

                if (pin) {
                        fn = path_join(PROGRAM_LINK_PREFIX, e);
                        if (!fn)
                                return log_oom();

                        link = sym_bpf_link__open(fn);
                        r = sym_libbpf_get_error(link);
                        if (r < 0) {
                                /* ENOENT just means nothing was pinned yet, attach freshly below */
                                if (r != -ENOENT)
                                        return log_error_errno(r, "Unable to open pinned program link: %m");
                                link = NULL;
                        } else {
                                linked = true;
                                log_info("userns-restrict BPF-LSM program %s already attached.", ps->name);
                        }
                }

                if (!link) {
                        link = sym_bpf_program__attach(*ps->prog);
                        r = sym_libbpf_get_error(link);
                        if (r < 0)
                                return log_error_errno(r, "Failed to attach LSM BPF program: %m");

                        log_info("userns-restrict BPF-LSM program %s now attached.", ps->name);
                }

                /* Only pin links we attached ourselves; reused links are pinned already */
                if (pin && !linked) {
                        assert(fn);

                        r = sym_bpf_link__pin(link, fn);
                        if (r < 0)
                                return log_error_errno(r, "Failed to pin LSM attachment: %m");
                }

                *ps->link = TAKE_PTR(link);
        }

        if (pin) {
                r = sym_bpf_object__pin_maps(obj->obj, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to pin BPF maps: %m");
        }

        if (ret)
                *ret = TAKE_PTR(obj);

        return 0;
}
/* Allowlists the specified mounts (given as an array of mount fds) for the user namespace
 * identified by its nsfs inode number. If 'replace' is true any previously registered set of
 * mounts for this userns is replaced wholesale, otherwise the mount IDs are added to the existing
 * set. Returns 0 on success, negative errno on failure. */
int userns_restrict_put_by_inode(
                struct userns_restrict_bpf *obj,
                uint64_t userns_inode,
                bool replace,
                const int mount_fds[],
                size_t n_mount_fds) {

        _cleanup_close_ int inner_map_fd = -EBADF;
        _cleanup_free_ int *mnt_ids = NULL;
        uint64_t ino = userns_inode;
        int r, outer_map_fd;

        assert(obj);
        assert(userns_inode != 0);
        assert(n_mount_fds == 0 || mount_fds);

        /* The BPF map type BPF_MAP_TYPE_HASH_OF_MAPS only supports 32bit keys, and user namespace inode
         * numbers are 32bit too, even though ino_t is 64bit these days. Should we ever run into a 64bit
         * inode let's refuse early, we can't support this with the current BPF code for now. */
        if (userns_inode > UINT32_MAX)
                return -EINVAL;

        /* Resolve all mount fds to their mount IDs first, before touching the maps */
        mnt_ids = new(int, n_mount_fds);
        if (!mnt_ids)
                return -ENOMEM;

        for (size_t i = 0; i < n_mount_fds; i++) {
                r = path_get_mnt_id_at(mount_fds[i], "", mnt_ids + i);
                if (r < 0)
                        return log_debug_errno(r, "Failed to get mount ID: %m");
        }

        outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
        if (outer_map_fd < 0)
                return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");

        if (replace) {
                /* Add if missing, replace if already exists */
                inner_map_fd = make_inner_hash_map();
                if (inner_map_fd < 0)
                        return inner_map_fd;

                r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_ANY);
                if (r < 0)
                        return log_debug_errno(errno, "Failed to replace map in inode hash: %m");
        } else {
                /* Let's add an entry for this userns inode if missing. If it exists just extend the existing map. We
                 * might race against each other, hence we try a couple of times */
                for (size_t n_try = 10;; n_try--) {
                        uint32_t innermap_id;

                        if (n_try == 0)
                                /* fix: the original message read "Stillcan't", missing the space */
                                return log_debug_errno(SYNTHETIC_ERRNO(EEXIST),
                                                       "Still can't create inode entry in BPF map after 10 tries.");

                        r = sym_bpf_map_lookup_elem(outer_map_fd, &ino, &innermap_id);
                        if (r >= 0) {
                                /* Entry exists already: lookup on HASH_OF_MAPS yields the inner
                                 * map's ID, convert that into an fd we can update below */
                                inner_map_fd = sym_bpf_map_get_fd_by_id(innermap_id);
                                if (inner_map_fd < 0)
                                        return log_debug_errno(inner_map_fd, "Failed to get file descriptor for inner map: %m");

                                break;
                        }
                        if (errno != ENOENT)
                                return log_debug_errno(errno, "Failed to look up inode hash entry: %m");

                        /* No entry for this user namespace yet. Let's create one */
                        inner_map_fd = make_inner_hash_map();
                        if (inner_map_fd < 0)
                                return inner_map_fd;

                        r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_NOEXIST);
                        if (r >= 0)
                                break;
                        if (errno != EEXIST) /* EEXIST → we lost the creation race, retry with lookup */
                                return log_debug_errno(errno, "Failed to add mount ID list to inode hash: %m");
                }
        }

        /* Mark each mount ID as allowed in the inner map; the value is a dummy, only key presence matters */
        FOREACH_ARRAY(mntid, mnt_ids, n_mount_fds) {
                uint32_t dummy_value = 1;

                r = sym_bpf_map_update_elem(inner_map_fd, mntid, &dummy_value, BPF_ANY);
                if (r < 0)
                        return log_debug_errno(errno, "Failed to add mount ID to map: %m");

                log_debug("Allowing mount %i on userns inode %" PRIu64, *mntid, ino);
        }

        return 0;
}
/* Convenience wrapper around userns_restrict_put_by_inode(): validates that 'userns_fd' really
 * refers to a user namespace, resolves it to its nsfs inode number, and delegates. */
int userns_restrict_put_by_fd(
                struct userns_restrict_bpf *obj,
                int userns_fd,
                bool replace,
                const int mount_fds[],
                size_t n_mount_fds) {

        struct stat ns_st;
        int is_userns;

        assert(obj);
        assert(userns_fd >= 0);
        assert(n_mount_fds == 0 || mount_fds);

        /* Reject fds that are not user namespace fds before doing anything with them */
        is_userns = fd_is_ns(userns_fd, CLONE_NEWUSER);
        if (is_userns < 0)
                return log_debug_errno(is_userns, "Failed to determine if file descriptor is user namespace: %m");
        if (is_userns == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd.");

        /* The nsfs inode number is the key we track namespaces by */
        if (fstat(userns_fd, &ns_st) < 0)
                return log_debug_errno(errno, "Failed to fstat() user namespace: %m");

        return userns_restrict_put_by_inode(obj, ns_st.st_ino, replace, mount_fds, n_mount_fds);
}
/* Removes the mount allowlist entry for the specified user namespace inode from the outer BPF map.
 * Inodes beyond the 32bit range were never inserted (see userns_restrict_put_by_inode()), hence
 * there's nothing to remove for them. Returns 0 on success, negative errno on failure. */
int userns_restrict_reset_by_inode(
                struct userns_restrict_bpf *obj,
                uint64_t ino) {

        int r, outer_map_fd;
        unsigned u;

        assert(obj);
        assert(ino != 0);

        if (ino > UINT32_MAX) /* inodes larger than 32bit are definitely not included in our map, exit early */
                return 0;

        outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
        if (outer_map_fd < 0)
                return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");

        u = (uint32_t) ino;

        r = sym_bpf_map_delete_elem(outer_map_fd, &u);
        if (r < 0)
                /* fix: previously this passed 'outer_map_fd' (a valid, positive fd) as the error
                 * code; use errno like the other map-manipulation failure paths in this file.
                 * NOTE(review): if no entry exists this presumably fails with ENOENT and is
                 * reported as an error — confirm whether that should be tolerated. */
                return log_debug_errno(errno, "Failed to remove entry for inode %" PRIu64 " from outer map: %m", ino);

        return 0;
}
#else
/* Stub for builds without vmlinux.h/BPF support: always refuses with EOPNOTSUPP. */
int userns_restrict_install(bool pin, struct userns_restrict_bpf **ret) {
        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
}
/* Stub for builds without vmlinux.h/BPF support: nothing is ever allocated, so nothing to free. */
struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
        return NULL;
}
/* Stub for builds without vmlinux.h/BPF support: always refuses with EOPNOTSUPP. */
int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds) {
        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
}
/* Stub for builds without vmlinux.h/BPF support: always refuses with EOPNOTSUPP. */
int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds) {
        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
}
/* Stub for builds without vmlinux.h/BPF support: always refuses with EOPNOTSUPP. */
int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) {
        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
}
#endif