From 9c0fad5fb5f47da125bb768dbb4cd0e824cccc7c Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 7 Jul 2017 18:30:03 -0400 Subject: [PATCH 1/6] nspawn: Simplify mkdir_userns() usage, and trickle that up One of the things that mkdir_userns{,_p}() does is take an (optional) UID, and chown the directory to that. So we need a uid_t argument, and a way of telling if we should use that uid_t argument. Fortunately, that is built in to the uid_t type by having UID_INVALID as a possible value. However, currently mkdir_userns() also takes a MountSettingsMask and checks a couple of bits in it to decide if it should perform the chown. Drop the mask argument, and instead have the caller pass UID_INVALID if it shouldn't chown. --- src/nspawn/nspawn-mount.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index b5df65e2a4..3613a179fe 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -442,7 +442,7 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL); } -static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) { +static int mkdir_userns(const char *path, mode_t mode, uid_t uid_shift) { int r; assert(path); @@ -451,10 +451,7 @@ static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, u if (r < 0 && r != -EEXIST) return r; - if ((mask & MOUNT_USE_USERNS) == 0) - return 0; - - if (mask & MOUNT_IN_USERNS) + if (uid_shift == UID_INVALID) return 0; if (lchown(path, uid_shift, uid_shift) < 0) @@ -463,7 +460,7 @@ static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, u return 0; } -static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) { +static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, uid_t uid_shift) { const char 
*p, *e; int r; @@ -490,12 +487,12 @@ static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, Mou if (prefix && path_startswith(prefix, t)) continue; - r = mkdir_userns(t, mode, mask, uid_shift); + r = mkdir_userns(t, mode, uid_shift); if (r < 0) return r; } - return mkdir_userns(path, mode, mask, uid_shift); + return mkdir_userns(path, mode, uid_shift); } int mount_all(const char *dest, @@ -634,7 +631,7 @@ int mount_all(const char *dest, if (what && r > 0) continue; - r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift); + r = mkdir_userns_p(dest, where, 0755, (use_userns && !in_userns) ? uid_shift : UID_INVALID); if (r < 0 && r != -EEXIST) { if (fatal && r != -EROFS) return log_error_errno(r, "Failed to create directory %s: %m", where); From 2fa017f16922776ff9751dc22031c7ee49920729 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 13 Jun 2017 18:06:09 -0400 Subject: [PATCH 2/6] nspawn: Simplify tmpfs_patch_options() usage, and trickle that up One of the things that tmpfs_patch_options does is take an (optional) UID, and insert "uid=${UID},gid=${UID}" into the options string. So we need a uid_t argument, and a way of telling if we should use it. Fortunately, that is built in to the uid_t type by having UID_INVALID as a possible value. So this is really a feature that requires one argument. Yet, it is somehow taking 4! That is absurd. Simplify it to only take one argument, and have that trickle all the way up to mount_all()'s usage. Now, in many of the uses, the argument becomes uid_shift == 0 ? UID_INVALID : uid_shift because it used to treat uid_shift=0 as invalid unless the patch_ids flag was also set. This keeps the behavior the same. Note that in all cases where it is invoked, if !use_userns (sometimes called !userns), then uid_shift is 0; we don't have to add any checks for that. 
That said, I'm pretty sure that "uid=0" and not setting "uid=" are the same, but Christian Brauner seemed to not think so when implementing the cgns support. https://github.com/systemd/systemd/pull/3589 --- src/nspawn/nspawn-mount.c | 25 +++++++++---------------- src/nspawn/nspawn-mount.h | 2 +- src/nspawn/nspawn.c | 2 -- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 3613a179fe..21684aa49b 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -329,17 +329,13 @@ int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_onl static int tmpfs_patch_options( const char *options, - bool userns, - uid_t uid_shift, uid_t uid_range, - bool patch_ids, + uid_t uid_shift, const char *selinux_apifs_context, char **ret) { char *buf = NULL; - if ((userns && uid_shift != 0) || patch_ids) { - assert(uid_shift != UID_INVALID); - + if (uid_shift != UID_INVALID) { if (asprintf(&buf, "%s%suid=" UID_FMT ",gid=" UID_FMT, strempty(options), options ? "," : "", uid_shift, uid_shift) < 0) @@ -497,7 +493,7 @@ static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, uid int mount_all(const char *dest, MountSettingsMask mount_settings, - uid_t uid_shift, uid_t uid_range, + uid_t uid_shift, const char *selinux_apifs_context) { #define PROC_INACCESSIBLE(path) \ @@ -646,10 +642,7 @@ int mount_all(const char *dest, o = mount_table[k].options; if (streq_ptr(mount_table[k].type, "tmpfs")) { - if (in_userns) - r = tmpfs_patch_options(o, use_userns, 0, uid_range, true, selinux_apifs_context, &options); - else - r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options); + r = tmpfs_patch_options(o, in_userns ? 
0 : uid_shift, selinux_apifs_context, &options); if (r < 0) return log_oom(); if (r > 0) @@ -752,7 +745,7 @@ static int mount_tmpfs( return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where); } - r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf); + r = tmpfs_patch_options(m->options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); if (r < 0) return log_oom(); options = r > 0 ? buf : m->options; @@ -985,7 +978,7 @@ static int mount_legacy_cgns_supported( * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply * pass uid 0 and not uid_shift to tmpfs_patch_options(). */ - r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options); + r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options); if (r < 0) return log_oom(); @@ -1087,7 +1080,7 @@ static int mount_legacy_cgns_unsupported( if (r == 0) { _cleanup_free_ char *options = NULL; - r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options); + r = tmpfs_patch_options("mode=755", uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options); if (r < 0) return log_oom(); @@ -1298,7 +1291,7 @@ int setup_volatile_state( return log_error_errno(errno, "Failed to create %s: %m", directory); options = "mode=755"; - r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf); + r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); if (r < 0) return log_oom(); if (r > 0) @@ -1331,7 +1324,7 @@ int setup_volatile( return log_error_errno(errno, "Failed to create temporary directory: %m"); options = "mode=755"; - r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf); + r = tmpfs_patch_options(options, uid_shift == 0 ? 
UID_INVALID : uid_shift, selinux_apifs_context, &buf); if (r < 0) return log_oom(); if (r > 0) diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h index b823282cbd..e948d02c2c 100644 --- a/src/nspawn/nspawn-mount.h +++ b/src/nspawn/nspawn-mount.h @@ -43,7 +43,7 @@ int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only); int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s); int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only); -int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); +int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, const char *selinux_apifs_context); int mount_sysfs(const char *dest, MountSettingsMask mount_settings); int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 9ea1c87590..9a2f72bf29 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2558,7 +2558,6 @@ static int inner_child( r = mount_all(NULL, arg_mount_settings | MOUNT_IN_USERNS, arg_uid_shift, - arg_uid_range, arg_selinux_apifs_context); if (r < 0) return r; @@ -2990,7 +2989,6 @@ static int outer_child( r = mount_all(directory, arg_mount_settings, arg_uid_shift, - arg_uid_range, arg_selinux_apifs_context); if (r < 0) return r; From 0402948206203ccbd6b81b10d4bf8973b87b2c60 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 7 Jul 2017 18:57:08 -0400 Subject: [PATCH 3/6] nspawn: Move cgroup mount stuff from nspawn-mount.c to nspawn-cgroup.c --- src/nspawn/nspawn-cgroup.c | 417 +++++++++++++++++++++++++++++++++++++ src/nspawn/nspawn-cgroup.h | 3 + src/nspawn/nspawn-mount.c | 415 +----------------------------------- src/nspawn/nspawn-mount.h | 5 +- 4 files changed, 423 insertions(+), 417 deletions(-) diff --git 
a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index d8a39a6959..0be911999a 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -5,12 +5,16 @@ #include "alloc-util.h" #include "fd-util.h" #include "fileio.h" +#include "fs-util.h" #include "mkdir.h" #include "mount-util.h" #include "nspawn-cgroup.h" +#include "nspawn-mount.h" +#include "path-util.h" #include "rm-rf.h" #include "string-util.h" #include "strv.h" +#include "user-util.h" #include "util.h" static int chown_cgroup_path(const char *path, uid_t uid_shift) { @@ -188,3 +192,416 @@ int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) (void) cg_enable_everywhere(supported, supported, cgroup); return 0; } + +/* Retrieve existing subsystems. This function is called in a new cgroup + * namespace. + */ +static int get_process_controllers(Set **ret) { + _cleanup_set_free_free_ Set *controllers = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(ret); + + controllers = set_new(&string_hash_ops); + if (!controllers) + return -ENOMEM; + + f = fopen("/proc/self/cgroup", "re"); + if (!f) + return errno == ENOENT ? 
-ESRCH : -errno; + + for (;;) { + _cleanup_free_ char *line = NULL; + char *e, *l; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + l = strchr(line, ':'); + if (!l) + continue; + + l++; + e = strchr(l, ':'); + if (!e) + continue; + + *e = 0; + + if (STR_IN_SET(l, "", "name=systemd", "name=unified")) + continue; + + r = set_put_strdup(controllers, l); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(controllers); + + return 0; +} + +static int mount_legacy_cgroup_hierarchy( + const char *dest, + const char *controller, + const char *hierarchy, + bool read_only) { + + const char *to, *fstype, *opts; + int r; + + to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy); + + r = path_is_mount_point(to, dest, 0); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to); + if (r > 0) + return 0; + + mkdir_p(to, 0755); + + /* The superblock mount options of the mount point need to be + * identical to the hosts', and hence writable... */ + if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) { + fstype = "cgroup2"; + opts = NULL; + } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) { + fstype = "cgroup"; + opts = "none,name=systemd,xattr"; + } else { + fstype = "cgroup"; + opts = controller; + } + + r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); + if (r < 0) + return r; + + /* ... hence let's only make the bind mount read-only, not the superblock. */ + if (read_only) { + r = mount_verbose(LOG_ERR, NULL, to, NULL, + MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); + if (r < 0) + return r; + } + + return 1; +} + +/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. 
*/ +static int mount_legacy_cgns_supported( + const char *dest, + CGroupUnified unified_requested, + bool userns, + uid_t uid_shift, + uid_t uid_range, + const char *selinux_apifs_context) { + + _cleanup_set_free_free_ Set *controllers = NULL; + const char *cgroup_root = "/sys/fs/cgroup", *c; + int r; + + (void) mkdir_p(cgroup_root, 0755); + + /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ + r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); + if (r == 0) { + _cleanup_free_ char *options = NULL; + + /* When cgroup namespaces are enabled and user namespaces are + * used then the mount of the cgroupfs is done *inside* the new + * user namespace. We're root in the new user namespace and the + * kernel will happily translate our uid/gid to the correct + * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply + * pass uid 0 and not uid_shift to tmpfs_patch_options(). + */ + r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options); + if (r < 0) + return log_oom(); + + r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs", + MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options); + if (r < 0) + return r; + } + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + goto skip_controllers; + + r = get_process_controllers(&controllers); + if (r < 0) + return log_error_errno(r, "Failed to determine cgroup controllers: %m"); + + for (;;) { + _cleanup_free_ const char *controller = NULL; + + controller = set_steal_first(controllers); + if (!controller) + break; + + r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns); + if (r < 0) + return r; + + /* When multiple hierarchies are co-mounted, make their + * constituting individual hierarchies a symlink to the + * co-mount. 
+ */ + c = controller; + for (;;) { + _cleanup_free_ char *target = NULL, *tok = NULL; + + r = extract_first_word(&c, &tok, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m"); + if (r == 0) + break; + + if (streq(controller, tok)) + break; + + target = prefix_root("/sys/fs/cgroup/", tok); + if (!target) + return log_oom(); + + r = symlink_idempotent(controller, target); + if (r == -EINVAL) + return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m"); + if (r < 0) + return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); + } + } + +skip_controllers: + if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) { + r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false); + if (r < 0) + return r; + } + + r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false); + if (r < 0) + return r; + + if (!userns) + return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL, + MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755"); + + return 0; +} + +/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */ +static int mount_legacy_cgns_unsupported( + const char *dest, + CGroupUnified unified_requested, + bool userns, + uid_t uid_shift, + uid_t uid_range, + const char *selinux_apifs_context) { + + _cleanup_set_free_free_ Set *controllers = NULL; + const char *cgroup_root; + int r; + + cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); + + (void) mkdir_p(cgroup_root, 0755); + + /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ + r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); + if (r == 0) { + _cleanup_free_ char *options = NULL; + + r = tmpfs_patch_options("mode=755", uid_shift == 0 ? 
UID_INVALID : uid_shift, selinux_apifs_context, &options); + if (r < 0) + return log_oom(); + + r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs", + MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options); + if (r < 0) + return r; + } + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + goto skip_controllers; + + r = cg_kernel_controllers(&controllers); + if (r < 0) + return log_error_errno(r, "Failed to determine cgroup controllers: %m"); + + for (;;) { + _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL; + + controller = set_steal_first(controllers); + if (!controller) + break; + + origin = prefix_root("/sys/fs/cgroup/", controller); + if (!origin) + return log_oom(); + + r = readlink_malloc(origin, &combined); + if (r == -EINVAL) { + /* Not a symbolic link, but directly a single cgroup hierarchy */ + + r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true); + if (r < 0) + return r; + + } else if (r < 0) + return log_error_errno(r, "Failed to read link %s: %m", origin); + else { + _cleanup_free_ char *target = NULL; + + target = prefix_root(dest, origin); + if (!target) + return log_oom(); + + /* A symbolic link, a combination of controllers in one hierarchy */ + + if (!filename_is_valid(combined)) { + log_warning("Ignoring invalid combined hierarchy %s.", combined); + continue; + } + + r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true); + if (r < 0) + return r; + + r = symlink_idempotent(combined, target); + if (r == -EINVAL) + return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m"); + if (r < 0) + return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); + } + } + +skip_controllers: + if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) { + r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false); + if (r < 0) + return r; + } + + r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", 
false); + if (r < 0) + return r; + + return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL, + MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755"); +} + +static int mount_unified_cgroups(const char *dest) { + const char *p; + int r; + + assert(dest); + + p = prefix_roota(dest, "/sys/fs/cgroup"); + + (void) mkdir_p(p, 0755); + + r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p); + if (r > 0) { + p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs"); + if (access(p, F_OK) >= 0) + return 0; + if (errno != ENOENT) + return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p); + + log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p); + return -EINVAL; + } + + return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); +} + +int mount_cgroups( + const char *dest, + CGroupUnified unified_requested, + bool userns, + uid_t uid_shift, + uid_t uid_range, + const char *selinux_apifs_context, + bool use_cgns) { + + if (unified_requested >= CGROUP_UNIFIED_ALL) + return mount_unified_cgroups(dest); + if (use_cgns) + return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context); + + return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context); +} + +static int mount_systemd_cgroup_writable_one(const char *root, const char *own) { + int r; + + assert(root); + assert(own); + + /* Make our own cgroup a (writable) bind mount */ + r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL); + if (r < 0) + return r; + + /* And then remount the systemd cgroup root read-only */ + return mount_verbose(LOG_ERR, NULL, root, NULL, + MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); +} + +int mount_systemd_cgroup_writable( + const char 
*dest, + CGroupUnified unified_requested) { + + _cleanup_free_ char *own_cgroup_path = NULL; + const char *root, *own; + int r; + + assert(dest); + + r = cg_pid_get_path(NULL, 0, &own_cgroup_path); + if (r < 0) + return log_error_errno(r, "Failed to determine our own cgroup path: %m"); + + /* If we are living in the top-level, then there's nothing to do... */ + if (path_equal(own_cgroup_path, "/")) + return 0; + + if (unified_requested >= CGROUP_UNIFIED_ALL) { + + root = prefix_roota(dest, "/sys/fs/cgroup"); + own = strjoina(root, own_cgroup_path); + + } else { + + if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) { + root = prefix_roota(dest, "/sys/fs/cgroup/unified"); + own = strjoina(root, own_cgroup_path); + + r = mount_systemd_cgroup_writable_one(root, own); + if (r < 0) + return r; + } + + root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); + own = strjoina(root, own_cgroup_path); + } + + return mount_systemd_cgroup_writable_one(root, own); +} diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h index 6783c3a39f..035e8fbd0f 100644 --- a/src/nspawn/nspawn-cgroup.h +++ b/src/nspawn/nspawn-cgroup.h @@ -9,3 +9,6 @@ int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift); int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift); int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested); + +int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns); +int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested); diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 21684aa49b..46b76b6ade 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -327,7 +327,7 @@ int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_onl return 0; } -static int tmpfs_patch_options( +int tmpfs_patch_options( const char 
*options, uid_t uid_shift, const char *selinux_apifs_context, @@ -850,419 +850,6 @@ int mount_custom( return 0; } -/* Retrieve existing subsystems. This function is called in a new cgroup - * namespace. - */ -static int get_process_controllers(Set **ret) { - _cleanup_set_free_free_ Set *controllers = NULL; - _cleanup_fclose_ FILE *f = NULL; - int r; - - assert(ret); - - controllers = set_new(&string_hash_ops); - if (!controllers) - return -ENOMEM; - - f = fopen("/proc/self/cgroup", "re"); - if (!f) - return errno == ENOENT ? -ESRCH : -errno; - - for (;;) { - _cleanup_free_ char *line = NULL; - char *e, *l; - - r = read_line(f, LONG_LINE_MAX, &line); - if (r < 0) - return r; - if (r == 0) - break; - - l = strchr(line, ':'); - if (!l) - continue; - - l++; - e = strchr(l, ':'); - if (!e) - continue; - - *e = 0; - - if (STR_IN_SET(l, "", "name=systemd", "name=unified")) - continue; - - r = set_put_strdup(controllers, l); - if (r < 0) - return r; - } - - *ret = TAKE_PTR(controllers); - - return 0; -} - -static int mount_legacy_cgroup_hierarchy( - const char *dest, - const char *controller, - const char *hierarchy, - bool read_only) { - - const char *to, *fstype, *opts; - int r; - - to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy); - - r = path_is_mount_point(to, dest, 0); - if (r < 0 && r != -ENOENT) - return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to); - if (r > 0) - return 0; - - mkdir_p(to, 0755); - - /* The superblock mount options of the mount point need to be - * identical to the hosts', and hence writable... */ - if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) { - fstype = "cgroup2"; - opts = NULL; - } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) { - fstype = "cgroup"; - opts = "none,name=systemd,xattr"; - } else { - fstype = "cgroup"; - opts = controller; - } - - r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); - if (r < 0) - return r; - - /* ... 
hence let's only make the bind mount read-only, not the superblock. */ - if (read_only) { - r = mount_verbose(LOG_ERR, NULL, to, NULL, - MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); - if (r < 0) - return r; - } - - return 1; -} - -/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */ -static int mount_legacy_cgns_supported( - const char *dest, - CGroupUnified unified_requested, - bool userns, - uid_t uid_shift, - uid_t uid_range, - const char *selinux_apifs_context) { - - _cleanup_set_free_free_ Set *controllers = NULL; - const char *cgroup_root = "/sys/fs/cgroup", *c; - int r; - - (void) mkdir_p(cgroup_root, 0755); - - /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ - r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); - if (r == 0) { - _cleanup_free_ char *options = NULL; - - /* When cgroup namespaces are enabled and user namespaces are - * used then the mount of the cgroupfs is done *inside* the new - * user namespace. We're root in the new user namespace and the - * kernel will happily translate our uid/gid to the correct - * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply - * pass uid 0 and not uid_shift to tmpfs_patch_options(). 
- */ - r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options); - if (r < 0) - return log_oom(); - - r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs", - MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options); - if (r < 0) - return r; - } - - r = cg_all_unified(); - if (r < 0) - return r; - if (r > 0) - goto skip_controllers; - - r = get_process_controllers(&controllers); - if (r < 0) - return log_error_errno(r, "Failed to determine cgroup controllers: %m"); - - for (;;) { - _cleanup_free_ const char *controller = NULL; - - controller = set_steal_first(controllers); - if (!controller) - break; - - r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns); - if (r < 0) - return r; - - /* When multiple hierarchies are co-mounted, make their - * constituting individual hierarchies a symlink to the - * co-mount. - */ - c = controller; - for (;;) { - _cleanup_free_ char *target = NULL, *tok = NULL; - - r = extract_first_word(&c, &tok, ",", 0); - if (r < 0) - return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m"); - if (r == 0) - break; - - if (streq(controller, tok)) - break; - - target = prefix_root("/sys/fs/cgroup/", tok); - if (!target) - return log_oom(); - - r = symlink_idempotent(controller, target); - if (r == -EINVAL) - return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m"); - if (r < 0) - return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); - } - } - -skip_controllers: - if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) { - r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false); - if (r < 0) - return r; - } - - r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false); - if (r < 0) - return r; - - if (!userns) - return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL, - MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755"); - - return 0; -} - -/* Mount legacy 
cgroup hierarchy when cgroup namespaces are unsupported. */ -static int mount_legacy_cgns_unsupported( - const char *dest, - CGroupUnified unified_requested, - bool userns, - uid_t uid_shift, - uid_t uid_range, - const char *selinux_apifs_context) { - - _cleanup_set_free_free_ Set *controllers = NULL; - const char *cgroup_root; - int r; - - cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); - - (void) mkdir_p(cgroup_root, 0755); - - /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ - r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); - if (r == 0) { - _cleanup_free_ char *options = NULL; - - r = tmpfs_patch_options("mode=755", uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options); - if (r < 0) - return log_oom(); - - r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs", - MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options); - if (r < 0) - return r; - } - - r = cg_all_unified(); - if (r < 0) - return r; - if (r > 0) - goto skip_controllers; - - r = cg_kernel_controllers(&controllers); - if (r < 0) - return log_error_errno(r, "Failed to determine cgroup controllers: %m"); - - for (;;) { - _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL; - - controller = set_steal_first(controllers); - if (!controller) - break; - - origin = prefix_root("/sys/fs/cgroup/", controller); - if (!origin) - return log_oom(); - - r = readlink_malloc(origin, &combined); - if (r == -EINVAL) { - /* Not a symbolic link, but directly a single cgroup hierarchy */ - - r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true); - if (r < 0) - return r; - - } else if (r < 0) - return log_error_errno(r, "Failed to read link %s: %m", origin); - else { - _cleanup_free_ char *target = NULL; - - target = prefix_root(dest, origin); - if (!target) - return log_oom(); - - /* A symbolic link, a combination of 
controllers in one hierarchy */ - - if (!filename_is_valid(combined)) { - log_warning("Ignoring invalid combined hierarchy %s.", combined); - continue; - } - - r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true); - if (r < 0) - return r; - - r = symlink_idempotent(combined, target); - if (r == -EINVAL) - return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m"); - if (r < 0) - return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); - } - } - -skip_controllers: - if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) { - r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false); - if (r < 0) - return r; - } - - r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false); - if (r < 0) - return r; - - return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL, - MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755"); -} - -static int mount_unified_cgroups(const char *dest) { - const char *p; - int r; - - assert(dest); - - p = prefix_roota(dest, "/sys/fs/cgroup"); - - (void) mkdir_p(p, 0755); - - r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p); - if (r > 0) { - p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs"); - if (access(p, F_OK) >= 0) - return 0; - if (errno != ENOENT) - return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p); - - log_error("%s is already mounted but not a unified cgroup hierarchy. 
Refusing.", p); - return -EINVAL; - } - - return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); -} - -int mount_cgroups( - const char *dest, - CGroupUnified unified_requested, - bool userns, - uid_t uid_shift, - uid_t uid_range, - const char *selinux_apifs_context, - bool use_cgns) { - - if (unified_requested >= CGROUP_UNIFIED_ALL) - return mount_unified_cgroups(dest); - if (use_cgns) - return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context); - - return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context); -} - -static int mount_systemd_cgroup_writable_one(const char *root, const char *own) { - int r; - - assert(root); - assert(own); - - /* Make our own cgroup a (writable) bind mount */ - r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL); - if (r < 0) - return r; - - /* And then remount the systemd cgroup root read-only */ - return mount_verbose(LOG_ERR, NULL, root, NULL, - MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); -} - -int mount_systemd_cgroup_writable( - const char *dest, - CGroupUnified unified_requested) { - - _cleanup_free_ char *own_cgroup_path = NULL; - const char *root, *own; - int r; - - assert(dest); - - r = cg_pid_get_path(NULL, 0, &own_cgroup_path); - if (r < 0) - return log_error_errno(r, "Failed to determine our own cgroup path: %m"); - - /* If we are living in the top-level, then there's nothing to do... 
*/ - if (path_equal(own_cgroup_path, "/")) - return 0; - - if (unified_requested >= CGROUP_UNIFIED_ALL) { - - root = prefix_roota(dest, "/sys/fs/cgroup"); - own = strjoina(root, own_cgroup_path); - - } else { - - if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) { - root = prefix_roota(dest, "/sys/fs/cgroup/unified"); - own = strjoina(root, own_cgroup_path); - - r = mount_systemd_cgroup_writable_one(root, own); - if (r < 0) - return r; - } - - root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); - own = strjoina(root, own_cgroup_path); - } - - return mount_systemd_cgroup_writable_one(root, own); -} - int setup_volatile_state( const char *directory, VolatileMode mode, diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h index e948d02c2c..db55759ec3 100644 --- a/src/nspawn/nspawn-mount.h +++ b/src/nspawn/nspawn-mount.h @@ -46,9 +46,6 @@ int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_onl int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, const char *selinux_apifs_context); int mount_sysfs(const char *dest, MountSettingsMask mount_settings); -int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns); -int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested); - int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); @@ -56,3 +53,5 @@ int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s); int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old); + +int tmpfs_patch_options(const char 
*options,uid_t uid_shift, const char *selinux_apifs_context, char **ret); From 93dbdf6cb1466133def725986a4605f8594959ae Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 7 Jul 2017 15:17:41 -0400 Subject: [PATCH 4/6] nspawn: sync_cgroup(): Rename arg_uid_shift -> uid_shift Naming it arg_uid_shift is confusing because of the global arg_uid_shift in nspawn.c --- src/nspawn/nspawn-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index 0be911999a..4a3cd29094 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -75,7 +75,7 @@ int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) { return 0; } -int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t arg_uid_shift) { +int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) { _cleanup_free_ char *cgroup = NULL; char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1]; bool undo_mount = false; @@ -129,7 +129,7 @@ int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t arg_uid_shift) } fn = strjoina(tree, cgroup); - r = chown_cgroup_path(fn, arg_uid_shift); + r = chown_cgroup_path(fn, uid_shift); if (r < 0) log_error_errno(r, "Failed to chown() cgroup %s: %m", fn); finish: From f09e86bcaa012d64addd2314fa6054657a02f64c Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sat, 10 Jun 2017 00:06:45 -0400 Subject: [PATCH 5/6] cgroup-util: cg_kernel_controllers(): Fix comment about including "name=" Remove "arbitrary named hierarchies" from the list of things that cg_kernel_controllers() might return, and clarify that "name=" pseudo-controllers are not included in the returned list. /proc/cgroups does not contain "name=" pseudo-controllers, and cg_kernel_controllers() makes no effort to enumerate them via a different mechanism. 
--- src/basic/cgroup-util.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index 038ece4b06..daa15dbfcb 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -2384,10 +2384,9 @@ int cg_kernel_controllers(Set **ret) { assert(ret); - /* Determines the full list of kernel-known controllers. Might - * include controllers we don't actually support, arbitrary - * named hierarchies and controllers that aren't currently - * accessible (because not mounted). */ + /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support + * and controllers that aren't currently accessible (because not mounted). This does not include "name=" + * pseudo-controllers. */ controllers = set_new(&string_hash_ops); if (!controllers) From 677a72cd3efdfde9d544b2d1fe62f352d6d8472c Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 1 Jun 2017 13:59:20 -0400 Subject: [PATCH 6/6] nspawn: mount_sysfs(): Unconditionally mkdir /sys/fs/cgroup Currently, mount_sysfs() only creates /sys/fs/cgroup if cg_ns_supported(). The comment explains that we need to "Create mountpoint for cgroups. Otherwise we are not allowed since we remount /sys read-only."; that is: that we need to do it now, rather than later. However, the comment doesn't do anything to explain why we only need to do this if cg_ns_supported(); shouldn't we _always_ need to do it? The answer is that if !use_cgns, then this was already done by the outer child, so mount_sysfs() only needs to do it if use_cgns. Now, mount_sysfs() doesn't know whether use_cgns, but !cg_ns_supported() implies !use_cgns, so we can "optimize" the case where we _know_ !use_cgns, and deal with a no-op mkdir_p() in the false-positive where cg_ns_supported() but !use_cgns. But is it really much of an optimization?
We're potentially spending an access(2) (cg_ns_supported() could be cached from a previous call) to potentially save an lstat(2) and mkdir(2); and all of them are on virtual filesystems, so they should all be pretty cheap. So, simplify and drop the conditional. It's a dubious optimization that requires more text to explain than it's worth. --- src/nspawn/nspawn-mount.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 46b76b6ade..1279b9bb3e 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -429,10 +429,8 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { /* Create mountpoint for cgroups. Otherwise we are not allowed since we * remount /sys read-only. */ - if (cg_ns_supported()) { - x = prefix_roota(top, "/fs/cgroup"); - (void) mkdir_p(x, 0755); - } + x = prefix_roota(top, "/fs/cgroup"); + (void) mkdir_p(x, 0755); return mount_verbose(LOG_ERR, NULL, top, NULL, MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);