From 6af52c3a458691b016bedeba34c1e72294a67c81 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Thu, 13 Aug 2020 14:01:34 +0100 Subject: [PATCH 1/9] machine/basic: factor out helper function to add airlocked mount to namespace --- src/machine/machine-dbus.c | 214 ++---------------------------------- src/shared/mount-util.c | 217 +++++++++++++++++++++++++++++++++++++ src/shared/mount-util.h | 2 + 3 files changed, 227 insertions(+), 206 deletions(-) diff --git a/src/machine/machine-dbus.c b/src/machine/machine-dbus.c index 5d8d4276e2..e5d8915be0 100644 --- a/src/machine/machine-dbus.c +++ b/src/machine/machine-dbus.c @@ -810,17 +810,9 @@ int bus_machine_method_open_shell(sd_bus_message *message, void *userdata, sd_bu } int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error) { - _cleanup_close_pair_ int errno_pipe_fd[2] = { -1, -1 }; - char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p; - bool mount_slave_created = false, mount_slave_mounted = false, - mount_tmp_created = false, mount_tmp_mounted = false, - mount_outside_created = false, mount_outside_mounted = false; - _cleanup_free_ char *chased_src = NULL; int read_only, make_file_or_directory; - const char *dest, *src; + const char *dest, *src, *propagate_directory; Machine *m = userdata; - struct stat st; - pid_t child; uid_t uid; int r; @@ -862,205 +854,15 @@ int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bu if (uid != 0) return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Can't bind mount on container with user namespacing applied."); - /* One day, when bind mounting /proc/self/fd/n works across - * namespace boundaries we should rework this logic to make - * use of it... */ - - p = strjoina("/run/systemd/nspawn/propagate/", m->name, "/"); - if (laccess(p, F_OK) < 0) - return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Container does not allow propagation of mount points."); - - r = chase_symlinks(src, NULL, CHASE_TRAIL_SLASH, &chased_src, NULL); + propagate_directory = strjoina("/run/systemd/nspawn/propagate/", m->name); + r = bind_mount_in_namespace(m->leader, + propagate_directory, + "/run/host/incoming/", + src, dest, read_only, make_file_or_directory); if (r < 0) - return sd_bus_error_set_errnof(error, r, "Failed to resolve source path: %m"); + return sd_bus_error_set_errnof(error, r, "Failed to mount %s on %s in machine's namespace: %m", src, dest); - if (lstat(chased_src, &st) < 0) - return sd_bus_error_set_errnof(error, errno, "Failed to stat() source path: %m"); - if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */ - return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Source directory can't be a symbolic link"); - - /* Our goal is to install a new bind mount into the container, - possibly read-only. This is irritatingly complex - unfortunately, currently. - - First, we start by creating a private playground in /tmp, - that we can mount MS_SLAVE. (Which is necessary, since - MS_MOVE cannot be applied to mounts with MS_SHARED parent - mounts.) */ - - if (!mkdtemp(mount_slave)) - return sd_bus_error_set_errnof(error, errno, "Failed to create playground %s: %m", mount_slave); - - mount_slave_created = true; - - r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL); - if (r < 0) { - sd_bus_error_set_errnof(error, r, "Failed to make bind mount %s: %m", mount_slave); - goto finish; - } - - mount_slave_mounted = true; - - r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL); - if (r < 0) { - sd_bus_error_set_errnof(error, r, "Failed to remount slave %s: %m", mount_slave); - goto finish; - } - - /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */ - mount_tmp = strjoina(mount_slave, "/mount"); - r = make_mount_point_inode_from_stat(&st, mount_tmp, 0700); - if (r < 0) { - sd_bus_error_set_errnof(error, r, "Failed to create temporary mount point %s: %m", mount_tmp); - goto finish; - } - - mount_tmp_created = true; - - r = mount_nofollow_verbose(LOG_DEBUG, chased_src, mount_tmp, NULL, MS_BIND, NULL); - if (r < 0) { - sd_bus_error_set_errnof(error, r, "Failed to mount %s: %m", chased_src); - goto finish; - } - - mount_tmp_mounted = true; - - /* Third, we remount the new bind mount read-only if requested. */ - if (read_only) { - r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL); - if (r < 0) { - sd_bus_error_set_errnof(error, r, "Failed to remount read-only %s: %m", mount_tmp); - goto finish; - } - } - - /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only - * right-away. */ - - mount_outside = strjoina("/run/systemd/nspawn/propagate/", m->name, "/XXXXXX"); - if (S_ISDIR(st.st_mode)) - r = mkdtemp(mount_outside) ? 0 : -errno; - else { - r = mkostemp_safe(mount_outside); - safe_close(r); - } - if (r < 0) { - sd_bus_error_set_errnof(error, r, "Cannot create propagation file or directory %s: %m", mount_outside); - goto finish; - } - - mount_outside_created = true; - - r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL); - if (r < 0) { - sd_bus_error_set_errnof(error, r, "Failed to move %s to %s: %m", mount_tmp, mount_outside); - goto finish; - } - - mount_outside_mounted = true; - mount_tmp_mounted = false; - - if (S_ISDIR(st.st_mode)) - (void) rmdir(mount_tmp); - else - (void) unlink(mount_tmp); - mount_tmp_created = false; - - (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW); - mount_slave_mounted = false; - - (void) rmdir(mount_slave); - mount_slave_created = false; - - if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) { - r = sd_bus_error_set_errnof(error, errno, "Failed to create pipe: %m"); - goto finish; - } - - r = safe_fork("(sd-bindmnt)", FORK_RESET_SIGNALS, &child); - if (r < 0) { - sd_bus_error_set_errnof(error, r, "Failed to fork(): %m"); - goto finish; - } - if (r == 0) { - const char *mount_inside, *q; - int mntfd; - - errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); - - q = procfs_file_alloca(m->leader, "ns/mnt"); - mntfd = open(q, O_RDONLY|O_NOCTTY|O_CLOEXEC); - if (mntfd < 0) { - r = log_error_errno(errno, "Failed to open mount namespace of leader: %m"); - goto child_fail; - } - - if (setns(mntfd, CLONE_NEWNS) < 0) { - r = log_error_errno(errno, "Failed to join namespace of leader: %m"); - goto child_fail; - } - - if (make_file_or_directory) { - (void) mkdir_parents(dest, 0755); - (void) make_mount_point_inode_from_stat(&st, dest, 0700); - } - - mount_inside = strjoina("/run/host/incoming/", basename(mount_outside)); - r = mount_nofollow_verbose(LOG_ERR, mount_inside, dest, NULL, MS_MOVE, NULL); - if (r < 0) - goto child_fail; - - _exit(EXIT_SUCCESS); - - child_fail: - (void) write(errno_pipe_fd[1], &r, sizeof(r)); - errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); - - _exit(EXIT_FAILURE); - } - - errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); - - r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0); - if (r < 0) { - r = sd_bus_error_set_errnof(error, r, "Failed to wait for child: %m"); - goto finish; - } - if (r != EXIT_SUCCESS) { - if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r)) - r = sd_bus_error_set_errnof(error, r, "Failed to mount: %m"); - else - r = sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Child failed."); - goto finish; - } - - r = sd_bus_reply_method_return(message, NULL); - -finish: - if (mount_outside_mounted) - (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW); - if (mount_outside_created) { - if (S_ISDIR(st.st_mode)) - (void) rmdir(mount_outside); - else - (void) unlink(mount_outside); - } - - if (mount_tmp_mounted) - (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW); - if (mount_tmp_created) { - if (S_ISDIR(st.st_mode)) - (void) rmdir(mount_tmp); - else - (void) unlink(mount_tmp); - } - - if (mount_slave_mounted) - (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW); - if (mount_slave_created) - (void) rmdir(mount_slave); - - return r; + return sd_bus_reply_method_return(message, NULL); } int bus_machine_method_copy(sd_bus_message *message, void *userdata, sd_bus_error *error) { diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c index a6480b93a4..24c68c747f 100644 --- a/src/shared/mount-util.c +++ b/src/shared/mount-util.c @@ -14,15 +14,18 @@ #include "fs-util.h" #include "hashmap.h" #include "libmount-util.h" +#include "mkdir.h" #include "mount-util.h" #include "mountpoint-util.h" #include "parse-util.h" #include "path-util.h" +#include "process-util.h" #include "set.h" #include "stat-util.h" #include "stdio-util.h" #include "string-util.h" #include "strv.h" +#include "tmpfile-util.h" int mount_fd(const char *source, int target_fd, @@ -742,3 +745,217 @@ int mount_option_mangle( return 0; } + +int bind_mount_in_namespace( + pid_t target, + const char *propagate_path, + const char *incoming_path, + const char *src, + const char *dest, + bool read_only, + bool make_file_or_directory) { + + _cleanup_close_pair_ int errno_pipe_fd[2] = { -1, -1 }; + char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p; + bool mount_slave_created = false, mount_slave_mounted = false, + mount_tmp_created = false, mount_tmp_mounted = false, + mount_outside_created = false, mount_outside_mounted = false; + _cleanup_free_ char *chased_src = NULL; + struct stat st; + pid_t child; + int r; + + assert(target > 0); + assert(propagate_path); + assert(incoming_path); + assert(src); + assert(dest); + + /* One day, when bind mounting /proc/self/fd/n works across + * namespace boundaries we should rework this logic to make + * use of it... */ + + p = strjoina(propagate_path, "/"); + r = laccess(p, F_OK); + if (r < 0) + return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points"); + + r = chase_symlinks(src, NULL, CHASE_TRAIL_SLASH, &chased_src, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to resolve source path of %s: %m", src); + + if (lstat(chased_src, &st) < 0) + return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", chased_src); + if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */ + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", chased_src); + + /* Our goal is to install a new bind mount into the container, + possibly read-only. This is irritatingly complex + unfortunately, currently. + + First, we start by creating a private playground in /tmp, + that we can mount MS_SLAVE. (Which is necessary, since + MS_MOVE cannot be applied to mounts with MS_SHARED parent + mounts.) */ + + if (!mkdtemp(mount_slave)) + return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave); + + mount_slave_created = true; + + r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL); + if (r < 0) + goto finish; + + mount_slave_mounted = true; + + r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL); + if (r < 0) + goto finish; + + /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */ + mount_tmp = strjoina(mount_slave, "/mount"); + r = make_mount_point_inode_from_stat(&st, mount_tmp, 0700); + if (r < 0) { + log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp); + goto finish; + } + + mount_tmp_created = true; + + r = mount_nofollow_verbose(LOG_DEBUG, chased_src, mount_tmp, NULL, MS_BIND, NULL); + if (r < 0) + goto finish; + + mount_tmp_mounted = true; + + /* Third, we remount the new bind mount read-only if requested. */ + if (read_only) { + r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL); + if (r < 0) + goto finish; + } + + /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only + * right-away. */ + + mount_outside = strjoina(propagate_path, "/XXXXXX"); + if (S_ISDIR(st.st_mode)) + r = mkdtemp(mount_outside) ? 0 : -errno; + else { + r = mkostemp_safe(mount_outside); + safe_close(r); + } + if (r < 0) { + log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside); + goto finish; + } + + mount_outside_created = true; + + r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL); + if (r < 0) + goto finish; + + mount_outside_mounted = true; + mount_tmp_mounted = false; + + if (S_ISDIR(st.st_mode)) + (void) rmdir(mount_tmp); + else + (void) unlink(mount_tmp); + mount_tmp_created = false; + + (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW); + mount_slave_mounted = false; + + (void) rmdir(mount_slave); + mount_slave_created = false; + + if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) { + log_debug_errno(errno, "Failed to create pipe: %m"); + goto finish; + } + + r = safe_fork("(sd-bindmnt)", FORK_RESET_SIGNALS, &child); + if (r < 0) + goto finish; + if (r == 0) { + const char *mount_inside, *q; + int mntfd; + + errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); + + q = procfs_file_alloca(target, "ns/mnt"); + mntfd = open(q, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (mntfd < 0) { + r = log_error_errno(errno, "Failed to open mount namespace of leader: %m"); + goto child_fail; + } + + if (setns(mntfd, CLONE_NEWNS) < 0) { + r = log_error_errno(errno, "Failed to join namespace of leader: %m"); + goto child_fail; + } + + if (make_file_or_directory) { + (void) mkdir_parents(dest, 0755); + (void) make_mount_point_inode_from_stat(&st, dest, 0700); + } + + /* Fifth, move the mount to the right place inside */ + mount_inside = strjoina(incoming_path, basename(mount_outside)); + r = mount_nofollow_verbose(LOG_ERR, mount_inside, dest, NULL, MS_MOVE, NULL); + if (r < 0) + goto child_fail; + + _exit(EXIT_SUCCESS); + + child_fail: + (void) write(errno_pipe_fd[1], &r, sizeof(r)); + errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + _exit(EXIT_FAILURE); + } + + errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0); + if (r < 0) { + log_debug_errno(r, "Failed to wait for child: %m"); + goto finish; + } + if (r != EXIT_SUCCESS) { + if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r)) + log_debug_errno(r, "Failed to mount: %m"); + else + log_debug("Child failed."); + goto finish; + } + +finish: + if (mount_outside_mounted) + (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW); + if (mount_outside_created) { + if (S_ISDIR(st.st_mode)) + (void) rmdir(mount_outside); + else + (void) unlink(mount_outside); + } + + if (mount_tmp_mounted) + (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW); + if (mount_tmp_created) { + if (S_ISDIR(st.st_mode)) + (void) rmdir(mount_tmp); + else + (void) unlink(mount_tmp); + } + + if (mount_slave_mounted) + (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW); + if (mount_slave_created) + (void) rmdir(mount_slave); + + return r; +} diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h index 089fd69e29..fa36dd7875 100644 --- a/src/shared/mount-util.h +++ b/src/shared/mount-util.h @@ -97,3 +97,5 @@ static inline char* umount_and_rmdir_and_free(char *p) { return mfree(p); } DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_rmdir_and_free); + +int bind_mount_in_namespace(pid_t target, const char *propagate_path, const char *incoming_path, const char *src, const char *dest, bool read_only, bool make_file_or_directory); From 724e689715c8d9f23d035ab20d8c87b6b6c06e33 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Tue, 15 Dec 2020 18:26:34 +0000 Subject: [PATCH 2/9] machine: adjust error message to use 'normalized' instead of ../ --- src/machine/machine-dbus.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/machine/machine-dbus.c b/src/machine/machine-dbus.c index e5d8915be0..7d6e1c7163 100644 --- a/src/machine/machine-dbus.c +++ b/src/machine/machine-dbus.c @@ -827,12 +827,12 @@ int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bu return r; if (!path_is_absolute(src) || !path_is_normalized(src)) - return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path must be absolute and not contain ../."); + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path must be absolute and normalized."); if (isempty(dest)) dest = src; else if (!path_is_absolute(dest) || !path_is_normalized(dest)) - return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path must be absolute and not contain ../."); + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path must be absolute and normalized."); r = bus_verify_polkit_async( message, From 2338a175fdec3859eab03115ca82a0d58453f5d7 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Thu, 13 Aug 2020 14:47:01 +0100 Subject: [PATCH 3/9] shared/mount-util: use namespace_fork utils --- src/shared/mount-util.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c index 24c68c747f..d4b49cf5fd 100644 --- a/src/shared/mount-util.c +++ b/src/shared/mount-util.c @@ -17,6 +17,7 @@ #include "mkdir.h" #include "mount-util.h" #include "mountpoint-util.h" +#include "namespace-util.h" #include "parse-util.h" #include "path-util.h" #include "process-util.h" @@ -756,12 +757,13 @@ int bind_mount_in_namespace( bool make_file_or_directory) { _cleanup_close_pair_ int errno_pipe_fd[2] = { -1, -1 }; + _cleanup_close_ int self_mntns_fd = -1, mntns_fd = -1, root_fd = -1; char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p; bool mount_slave_created = false, mount_slave_mounted = false, mount_tmp_created = false, mount_tmp_mounted = false, mount_outside_created = false, mount_outside_mounted = false; _cleanup_free_ char *chased_src = NULL; - struct stat st; + struct stat st, self_mntns_st; pid_t child; int r; @@ -771,6 +773,24 @@ int bind_mount_in_namespace( assert(src); assert(dest); + r = namespace_open(target, NULL, &mntns_fd, NULL, NULL, &root_fd); + if (r < 0) + return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m"); + + if (fstat(mntns_fd, &st) < 0) + return log_debug_errno(errno, "Failed to fstat mount namespace FD of target process: %m"); + + r = namespace_open(0, NULL, &self_mntns_fd, NULL, NULL, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to retrieve FDs of systemd's namespace: %m"); + + if (fstat(self_mntns_fd, &self_mntns_st) < 0) + return log_debug_errno(errno, "Failed to fstat mount namespace FD of systemd: %m"); + + /* We can't add new mounts at runtime if the process wasn't started in a namespace */ + if (st.st_ino == self_mntns_st.st_ino && st.st_dev == self_mntns_st.st_dev) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace"); + /* One day, when bind mounting /proc/self/fd/n works across * namespace boundaries we should rework this logic to make * use of it... */ @@ -877,27 +897,15 @@ int bind_mount_in_namespace( goto finish; } - r = safe_fork("(sd-bindmnt)", FORK_RESET_SIGNALS, &child); + r = namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG, + -1, mntns_fd, -1, -1, root_fd, &child); if (r < 0) goto finish; if (r == 0) { - const char *mount_inside, *q; - int mntfd; + const char *mount_inside; errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); - q = procfs_file_alloca(target, "ns/mnt"); - mntfd = open(q, O_RDONLY|O_NOCTTY|O_CLOEXEC); - if (mntfd < 0) { - r = log_error_errno(errno, "Failed to open mount namespace of leader: %m"); - goto child_fail; - } - - if (setns(mntfd, CLONE_NEWNS) < 0) { - r = log_error_errno(errno, "Failed to join namespace of leader: %m"); - goto child_fail; - } - if (make_file_or_directory) { (void) mkdir_parents(dest, 0755); (void) make_mount_point_inode_from_stat(&st, dest, 0700); From 98f654fdeab1e1b6df2be76e29e4ccbb6624898d Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Wed, 13 Jan 2021 23:52:00 +0000 Subject: [PATCH 4/9] machine: enter target PID namespace when adding a live mount machinectl fails since 21935150a0c42b91a322105f6a9129116bfc8e2e as it's now mounting onto a file descriptor in a target namespace, without joining the target's PID namespace. Note that it's not enough to setns CLONE_NEWPID, but a double-fork is required as well, as implemented by namespace_fork(). Add a test case to TEST-13-NSPAWN to cover this use case. --- src/shared/mount-util.c | 6 +++--- test/create-busybox-container | 3 +++ test/units/testsuite-13.sh | 25 +++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c index d4b49cf5fd..0aaba49852 100644 --- a/src/shared/mount-util.c +++ b/src/shared/mount-util.c @@ -757,7 +757,7 @@ int bind_mount_in_namespace( bool make_file_or_directory) { _cleanup_close_pair_ int errno_pipe_fd[2] = { -1, -1 }; - _cleanup_close_ int self_mntns_fd = -1, mntns_fd = -1, root_fd = -1; + _cleanup_close_ int self_mntns_fd = -1, mntns_fd = -1, root_fd = -1, pidns_fd = -1; char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p; bool mount_slave_created = false, mount_slave_mounted = false, mount_tmp_created = false, mount_tmp_mounted = false, @@ -773,7 +773,7 @@ int bind_mount_in_namespace( assert(src); assert(dest); - r = namespace_open(target, NULL, &mntns_fd, NULL, NULL, &root_fd); + r = namespace_open(target, &pidns_fd, &mntns_fd, NULL, NULL, &root_fd); if (r < 0) return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m"); @@ -898,7 +898,7 @@ int bind_mount_in_namespace( } r = namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG, - -1, mntns_fd, -1, -1, root_fd, &child); + pidns_fd, mntns_fd, -1, -1, root_fd, &child); if (r < 0) goto finish; if (r == 0) { diff --git a/test/create-busybox-container b/test/create-busybox-container index 5ded42950a..b2b7b26294 100755 --- a/test/create-busybox-container +++ b/test/create-busybox-container @@ -28,6 +28,9 @@ ln -s busybox "$root/bin/cat" ln -s busybox "$root/bin/tr" ln -s busybox "$root/bin/ps" ln -s busybox "$root/bin/ip" +ln -s busybox "$root/bin/seq" +ln -s busybox "$root/bin/sleep" +ln -s busybox "$root/bin/test" mkdir -p "$root/sbin" cat <<'EOF' >"$root/sbin/init" diff --git a/test/units/testsuite-13.sh b/test/units/testsuite-13.sh index 969ca4a8d9..1844323d2f 100755 --- a/test/units/testsuite-13.sh +++ b/test/units/testsuite-13.sh @@ -93,6 +93,29 @@ if echo test >> /run/host/os-release; then exit 1; fi fi } +function check_machinectl_bind { + local _cmd='for i in $(seq 1 20); do if test -f /tmp/marker; then exit 0; fi; sleep 0.5; done; exit 1;' + + cat < /run/systemd/system/nspawn_machinectl_bind.service +[Service] +Type=notify +ExecStart=systemd-nspawn $SUSE_OPTS -D /testsuite-13.nc-container --notify-ready=no /bin/sh -x -e -c "$_cmd" +EOF + + systemctl start nspawn_machinectl_bind.service + + touch /tmp/marker + + machinectl bind --mkdir testsuite-13.nc-container /tmp/marker + + while systemctl show -P SubState nspawn_machinectl_bind.service | grep -q running + do + sleep 0.1 + done + + return $(systemctl show -P ExecMainStatus nspawn_machinectl_bind.service) +} + function run { if [[ "$1" = "yes" && "$is_v2_supported" = "no" ]]; then printf "Unified cgroup hierarchy is not supported. Skipping.\n" >&2 @@ -186,4 +209,6 @@ for api_vfs_writable in yes no network; do run yes yes $api_vfs_writable done +check_machinectl_bind + touch /testok From f7c18d3de8102056e4e78b08c9c8508644c5507f Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Fri, 15 Jan 2021 15:50:45 +0000 Subject: [PATCH 5/9] machine: use file descriptor when chasing bind mount sources Allows to always operate on pinned inodes, rather than paths, so that races are less likely --- src/shared/mount-util.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c index 0aaba49852..9d0d7c73df 100644 --- a/src/shared/mount-util.c +++ b/src/shared/mount-util.c @@ -757,12 +757,12 @@ int bind_mount_in_namespace( bool make_file_or_directory) { _cleanup_close_pair_ int errno_pipe_fd[2] = { -1, -1 }; - _cleanup_close_ int self_mntns_fd = -1, mntns_fd = -1, root_fd = -1, pidns_fd = -1; - char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p; + _cleanup_close_ int self_mntns_fd = -1, mntns_fd = -1, root_fd = -1, pidns_fd = -1, chased_src_fd = -1; + char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p, + chased_src[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)]; bool mount_slave_created = false, mount_slave_mounted = false, mount_tmp_created = false, mount_tmp_mounted = false, mount_outside_created = false, mount_outside_mounted = false; - _cleanup_free_ char *chased_src = NULL; struct stat st, self_mntns_st; pid_t child; int r; @@ -800,14 +800,15 @@ int bind_mount_in_namespace( if (r < 0) return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points"); - r = chase_symlinks(src, NULL, CHASE_TRAIL_SLASH, &chased_src, NULL); + r = chase_symlinks(src, NULL, CHASE_TRAIL_SLASH, NULL, &chased_src_fd); if (r < 0) return log_debug_errno(r, "Failed to resolve source path of %s: %m", src); + xsprintf(chased_src, "/proc/self/fd/%i", chased_src_fd); - if (lstat(chased_src, &st) < 0) - return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", chased_src); + if (fstat(chased_src_fd, &st) < 0) + return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", src); if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */ - return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", chased_src); + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", src); /* Our goal is to install a new bind mount into the container, possibly read-only. This is irritatingly complex @@ -843,7 +844,7 @@ int bind_mount_in_namespace( mount_tmp_created = true; - r = mount_nofollow_verbose(LOG_DEBUG, chased_src, mount_tmp, NULL, MS_BIND, NULL); + r = mount_follow_verbose(LOG_DEBUG, chased_src, mount_tmp, NULL, MS_BIND, NULL); if (r < 0) goto finish; From 94293d65cd4125347e21b3e423d0e245226b1be2 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Fri, 18 Dec 2020 16:16:46 +0000 Subject: [PATCH 6/9] MountAPIVFS: always mount a tmpfs on /run We need a writable /run for most operations, but in case a read-only RootImage (or similar) is used, by default there's no additional tmpfs mount on /run. Change this behaviour and document it. --- NEWS | 10 ++++++++++ man/systemd.exec.xml | 9 +++++---- src/core/namespace.c | 21 ++++++++++++++++++++- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/NEWS b/NEWS index bdac561bc0..c7ee3dbb40 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,15 @@ systemd System and Service Manager +CHANGES WITH 248: + + * The MountAPIVFS= service file setting now additionally mounts a tmpfs + on /run/ if it is not already a mount point. A writable /run/ has always + been a requirement for a functioning system, but this was not + guaranteed when using a read-only image. + Users can always specify BindPaths= or InaccessiblePaths= as overrides, + and they will take precedence. If the host's root mount point is used, + there is no change in behaviour. + CHANGES WITH 247: * KERNEL API INCOMPATIBILITY: Linux 4.14 introduced two new uevents diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 7328f9b965..568839e0d9 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -275,11 +275,12 @@ MountAPIVFS= Takes a boolean argument. If on, a private mount namespace for the unit's processes is created - and the API file systems /proc/, /sys/, and /dev/ - are mounted inside of it, unless they are already mounted. Note that this option has no effect unless used in - conjunction with RootDirectory=/RootImage= as these three mounts are + and the API file systems /proc/, /sys/, /dev/ and + /run/ (as an empty tmpfs) are mounted inside of it, unless they are + already mounted. Note that this option has no effect unless used in conjunction with + RootDirectory=/RootImage= as these four mounts are generally mounted in the host anyway, and unless the root directory is changed, the private mount namespace - will be a 1:1 copy of the host's, and include these three mounts. Note that the /dev/ file + will be a 1:1 copy of the host's, and include these four mounts. Note that the /dev/ file system of the host is bind mounted if this option is used without PrivateDevices=. To run the service with a private, minimal version of /dev/, combine this option with PrivateDevices=. diff --git a/src/core/namespace.c b/src/core/namespace.c index e32336a7ff..73a8fa73a4 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -51,6 +51,7 @@ typedef enum MountMode { EMPTY_DIR, SYSFS, PROCFS, + RUN, READONLY, READWRITE, TMPFS, @@ -76,12 +77,13 @@ typedef struct MountEntry { LIST_HEAD(MountOptions, image_options); } MountEntry; -/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted +/* If MountAPIVFS= is used, let's mount /sys, /proc, /dev and /run into the it, but only as a fallback if the user hasn't mounted * something there already. These mounts are hence overridden by any other explicitly configured mounts. */ static const MountEntry apivfs_table[] = { { "/proc", PROCFS, false }, { "/dev", BIND_DEV, false }, { "/sys", SYSFS, false }, + { "/run", RUN, false, .options_const = "mode=755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME }, }; /* ProtectKernelTunables= option and the related filesystem APIs */ @@ -945,6 +947,20 @@ static int mount_tmpfs(const MountEntry *m) { return 1; } +static int mount_run(const MountEntry *m) { + int r; + + assert(m); + + r = path_is_mount_point(mount_entry_path(m), NULL, 0); + if (r < 0 && r != -ENOENT) + return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m"); + if (r > 0) /* make this a NOP if /run is already a mount point */ + return 0; + + return mount_tmpfs(m); +} + static int mount_images(const MountEntry *m) { _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL; @@ -1170,6 +1186,9 @@ static int apply_mount( case PROCFS: return mount_procfs(m, ns_info); + case RUN: + return mount_run(m); + case MOUNT_IMAGES: return mount_images(m); From 5e8deb94c6f05137942b10b5288a37d9b09fd43f Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Thu, 30 Jul 2020 19:37:10 +0100 Subject: [PATCH 7/9] core: add DBUS method to bind mount new nodes without service restart Allow to setup new bind mounts for a service at runtime (via either DBUS or a new 'systemctl bind' verb) with a new helper that forks into the unit's mount namespace. Add a new integration test to cover this. Useful for zero-downtime addition to services that are running inside mount namespaces, especially when using RootImage/RootDirectory. If a service runs with a read-only root, a tmpfs is added on /run to ensure we can create the airlock directory for incoming mounts under /run/host/incoming. --- man/org.freedesktop.systemd1.xml | 27 ++++++ man/systemctl.xml | 32 +++++++ man/systemd.exec.xml | 4 + meson.build | 2 + shell-completion/bash/systemctl.in | 2 +- shell-completion/zsh/_systemctl.in | 5 ++ src/core/dbus-manager.c | 16 ++++ src/core/dbus-service.c | 87 +++++++++++++++++++ src/core/dbus-service.h | 1 + src/core/dbus-unit.c | 32 ------- src/core/dbus-util.c | 33 +++++++ src/core/dbus-util.h | 1 + src/core/execute.c | 18 +++- src/core/execute.h | 2 + src/core/namespace.c | 37 +++++++- src/core/namespace.h | 2 + src/core/org.freedesktop.systemd1.conf | 8 ++ src/systemctl/systemctl-mount.c | 41 +++++++++ src/systemctl/systemctl-mount.h | 4 + src/systemctl/systemctl.c | 20 +++++ src/systemctl/systemctl.h | 2 + src/test/test-namespace.c | 2 + src/test/test-ns.c | 2 + test/TEST-57-RUNTIME-BIND-PATHS/Makefile | 1 + test/TEST-57-RUNTIME-BIND-PATHS/test.sh | 7 ++ test/units/testsuite-57-namespaced.service | 12 +++ .../units/testsuite-57-non-namespaced.service | 5 ++ test/units/testsuite-57.service | 7 ++ test/units/testsuite-57.sh | 43 +++++++++ 29 files changed, 415 insertions(+), 40 deletions(-) create mode 100644 src/systemctl/systemctl-mount.c create mode 100644 src/systemctl/systemctl-mount.h create mode 120000 test/TEST-57-RUNTIME-BIND-PATHS/Makefile create mode 100755 test/TEST-57-RUNTIME-BIND-PATHS/test.sh create mode 100644 test/units/testsuite-57-namespaced.service create mode 100644 test/units/testsuite-57-non-namespaced.service create mode 100644 test/units/testsuite-57.service create mode 100755 test/units/testsuite-57.sh diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 78fd0b3378..90d0d66414 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -116,6 +116,11 @@ node /org/freedesktop/systemd1 { SetUnitProperties(in s name, in b runtime, in a(sv) properties); + BindMountUnit(in s name, + in s source, + in s destination, + in b read_only, + in b mkdir); RefUnit(in s name); UnrefUnit(in s name); StartTransientUnit(in s name, @@ -767,6 +772,8 @@ node /org/freedesktop/systemd1 { + + @@ -1156,6 +1163,9 @@ node /org/freedesktop/systemd1 { the "Try" flavor is used in which case a service that isn't running is not affected by the restart. The "ReloadOrRestart" flavors attempt a reload if the unit supports it and use a restart otherwise. + BindMountUnit() can be used to bind mount new files or directories into + a running service mount namespace. + KillUnit() may be used to kill (i.e. send a signal to) all processes of a unit. It takes the unit name, an enum who and a UNIX signal number to send. The who enum is one of @@ -2193,6 +2203,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { interface org.freedesktop.systemd1.Service { methods: + BindMount(in s source, + in s destination, + in b read_only, + in b mkdir); GetProcesses(out a(sus) processes); AttachProcesses(in s subcgroup, in au pids); @@ -3252,6 +3266,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -3810,6 +3826,17 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + Methods + + BindMount() implements the same operation as the respective method on the + Manager object (see above). However, this method operates on the service + object and hence does not take a unit name parameter. Invoking the methods directly on the Manager + object has the advantage of not requiring a GetUnit() call to get the unit object + for a specific unit name. Calling the methods on the Manager object is hence a round trip + optimization. + + Properties diff --git a/man/systemctl.xml b/man/systemctl.xml index bb702cb078..a954e8727b 100644 --- a/man/systemctl.xml +++ b/man/systemctl.xml @@ -550,6 +550,23 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err + + bind UNIT PATH [PATH] + + Bind mounts a file or directory from the host into the specified unit's view. The first path + argument is the source file or directory on the host, the second path argument is the destination file or + directory in the unit's view. When the latter is omitted, the destination path in the unit's view is the same as + the source path on the host. When combined with the switch, a ready-only bind + mount is created. When combined with the switch, the destination path is first created + before the mount is applied. Note that this option is currently only supported for units that run within a mount + namespace (e.g.: with , , etc.). This command supports bind + mounting directories, regular files, device nodes, AF_UNIX socket nodes, as well as FIFOs. + The bind mount is ephemeral, and it is undone as soon as the current unit process exists. + Note that the namespace mentioned here, where the bind mount will be added to, is the one where the main service + process runs, as other processes run in distinct namespaces (e.g.: , + , etc.) + + service-log-level SERVICE [LEVEL] @@ -2246,6 +2263,21 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err + + + + When used with bind, creates the destination file or directory before + applying the bind mount. Note that even though the name of this option suggests that it is suitable only for + directories, this option also creates the destination file node to mount over if the object to mount is not + a directory, but a regular file, device node, socket or FIFO. + + + + + + When used with bind, creates a read-only bind mount. + + diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 568839e0d9..9adb6a298e 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -285,6 +285,10 @@ the service with a private, minimal version of /dev/, combine this option with PrivateDevices=. + In order to allow propagating mounts at runtime in a safe manner, /run/systemd/propagate + on the host will be used to set up new mounts, and /run/host/incoming/ in the private namespace + will be used as an intermediate step to store them before being moved to the final mount point. + diff --git a/meson.build b/meson.build index e360ec95c9..2caf069602 100644 --- a/meson.build +++ b/meson.build @@ -2198,6 +2198,8 @@ public_programs += executable( 'src/systemctl/systemctl-log-setting.h', 'src/systemctl/systemctl-logind.c', 'src/systemctl/systemctl-logind.h', + 'src/systemctl/systemctl-mount.c', + 'src/systemctl/systemctl-mount.h', 'src/systemctl/systemctl-preset-all.c', 'src/systemctl/systemctl-preset-all.h', 'src/systemctl/systemctl-reset-failed.c', diff --git a/shell-completion/bash/systemctl.in b/shell-completion/bash/systemctl.in index 4bd80948d6..7e99ef4bf3 100644 --- a/shell-completion/bash/systemctl.in +++ b/shell-completion/bash/systemctl.in @@ -214,7 +214,7 @@ _systemctl () { list-timers list-units list-unit-files poweroff reboot rescue show-environment suspend get-default is-system-running preset-all' - [FILE]='link switch-root' + [FILE]='link switch-root bind' [TARGETS]='set-default' [MACHINES]='list-machines' [LOG_LEVEL]='log-level' diff --git a/shell-completion/zsh/_systemctl.in b/shell-completion/zsh/_systemctl.in index 10af6d5121..c4ea78b3c1 100644 --- a/shell-completion/zsh/_systemctl.in +++ b/shell-completion/zsh/_systemctl.in @@ -31,6 +31,7 @@ "reset-failed:Reset failed state for all, one, or more units" "list-dependencies:Show unit dependency tree" "clean:Remove configuration, state, cache, logs or runtime data of units" + "bind:Bind mount a path from the host into a unit's namespace" ) local -a machine_commands=( @@ -378,6 +379,10 @@ done _files } +(( $+functions[_systemctl_bind] )) || _systemctl_bind() { + _files + } + # no systemctl completion for: # [STANDALONE]='daemon-reexec daemon-reload default # emergency exit halt kexec list-jobs list-units diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index 3e1d609aa3..4b88f0d9f0 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -16,6 +16,7 @@ #include "dbus-job.h" #include "dbus-manager.h" #include "dbus-scope.h" +#include "dbus-service.h" #include "dbus-unit.h" #include "dbus.h" #include "env-util.h" @@ -725,6 +726,11 @@ static int method_set_unit_properties(sd_bus_message *message, void *userdata, s return method_generic_unit_operation(message, userdata, error, bus_unit_method_set_properties, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED); } +static int method_bind_mount_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Only add mounts on fully loaded units */ + return method_generic_unit_operation(message, userdata, error, bus_service_method_bind_mount, GENERIC_UNIT_VALIDATE_LOADED); +} + static int method_ref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { /* Only allow reffing of fully loaded units, and make sure reffing a unit loads it. */ return method_generic_unit_operation(message, userdata, error, bus_unit_method_ref, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED); @@ -2760,6 +2766,16 @@ const sd_bus_vtable bus_manager_vtable[] = { NULL,, method_set_unit_properties, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("BindMountUnit", + "sssbb", + SD_BUS_PARAM(name) + SD_BUS_PARAM(source) + SD_BUS_PARAM(destination) + SD_BUS_PARAM(read_only) + SD_BUS_PARAM(mkdir), + NULL,, + method_bind_mount_unit, + SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD_WITH_NAMES("RefUnit", "s", SD_BUS_PARAM(name), diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c index 64f9d4ab36..6df93e44a4 100644 --- a/src/core/dbus-service.c +++ b/src/core/dbus-service.c @@ -11,11 +11,15 @@ #include "dbus-manager.h" #include "dbus-service.h" #include "dbus-util.h" +#include "execute.h" #include "exit-status.h" #include "fd-util.h" #include "fileio.h" +#include "locale-util.h" +#include "mount-util.h" #include "parse-util.h" #include "path-util.h" +#include "selinux-access.h" #include "service.h" #include "signal-util.h" #include "string-util.h" @@ -91,6 +95,79 @@ static int property_get_exit_status_set( return sd_bus_message_close_container(reply); } +int bus_service_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error) { + int read_only, make_file_or_directory; + const char *dest, *src, *propagate_directory; + Unit *u = userdata; + ExecContext *c; + pid_t unit_pid; + int r; + + assert(message); + assert(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Adding bind mounts at runtime is only supported for system managers."); + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "ssbb", &src, &dest, &read_only, &make_file_or_directory); + if (r < 0) + return r; + + if (!path_is_absolute(src) || !path_is_normalized(src)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path must be absolute and normalized."); + + if (isempty(dest)) + dest = src; + else if (!path_is_absolute(dest) || !path_is_normalized(dest)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path must be absolute and normalized."); + + r = bus_verify_manage_units_async_full( + u, + "bind-mount", + CAP_SYS_ADMIN, + N_("Authentication is required to bind mount on '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + if (u->type != UNIT_SERVICE) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not of type .service"); + + /* If it would be dropped at startup time, return an error. The context should always be available, but + * there's an assert in exec_needs_mount_namespace, so double-check just in case. */ + c = unit_get_exec_context(u); + if (!c) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot access unit execution context"); + if (path_startswith_strv(dest, c->inaccessible_paths)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s is not accessible to this unit", dest); + + /* Ensure that the unit was started in a private mount namespace */ + if (!exec_needs_mount_namespace(c, NULL, unit_get_exec_runtime(u))) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit not running in private mount namespace, cannot activate bind mount"); + + unit_pid = unit_main_pid(u); + if (unit_pid == 0 || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not running"); + + propagate_directory = strjoina("/run/systemd/propagate/", u->id); + r = bind_mount_in_namespace(unit_pid, + propagate_directory, + "/run/systemd/incoming/", + src, dest, read_only, make_file_or_directory); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to mount %s on %s in unit's namespace: %m", src, dest); + + return sd_bus_reply_method_return(message, NULL); +} + const sd_bus_vtable bus_service_vtable[] = { SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Service, type), SD_BUS_VTABLE_PROPERTY_CONST), @@ -146,6 +223,16 @@ const sd_bus_vtable bus_service_vtable[] = { BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopPostEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_METHOD_WITH_NAMES("BindMount", + "ssbb", + SD_BUS_PARAM(source) + SD_BUS_PARAM(destination) + SD_BUS_PARAM(read_only) + SD_BUS_PARAM(mkdir), + NULL,, + bus_service_method_bind_mount, + SD_BUS_VTABLE_UNPRIVILEGED), + /* The following four are obsolete, and thus marked hidden here. They moved into the Unit interface */ SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), diff --git a/src/core/dbus-service.h b/src/core/dbus-service.h index 69311675c9..5b7b7b757b 100644 --- a/src/core/dbus-service.h +++ b/src/core/dbus-service.h @@ -9,4 +9,5 @@ extern const sd_bus_vtable bus_service_vtable[]; int bus_service_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); +int bus_service_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error); int bus_service_commit_properties(Unit *u); diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c index 427152a757..67cc58ee9e 100644 --- a/src/core/dbus-unit.c +++ b/src/core/dbus-unit.c @@ -323,38 +323,6 @@ static int property_get_load_error( return sd_bus_message_append(reply, "(ss)", NULL, NULL); } -static int bus_verify_manage_units_async_full( - Unit *u, - const char *verb, - int capability, - const char *polkit_message, - bool interactive, - sd_bus_message *call, - sd_bus_error *error) { - - const char *details[9] = { - "unit", u->id, - "verb", verb, - }; - - if (polkit_message) { - details[4] = "polkit.message"; - details[5] = polkit_message; - details[6] = "polkit.gettext_domain"; - details[7] = GETTEXT_PACKAGE; - } - - return bus_verify_polkit_async( - call, - capability, - "org.freedesktop.systemd1.manage-units", - details, - interactive, - UID_INVALID, - &u->manager->polkit_registry, - error); -} - static const char *const polkit_message_for_job[_JOB_TYPE_MAX] = { [JOB_START] = N_("Authentication is required to start '$(unit)'."), [JOB_STOP] = N_("Authentication is required to stop '$(unit)'."), diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c index d6223db305..2d22bc699a 100644 --- a/src/core/dbus-util.c +++ b/src/core/dbus-util.c @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include "bus-polkit.h" #include "bus-util.h" #include "dbus-util.h" #include "parse-util.h" @@ -153,3 +154,35 @@ int bus_set_transient_usec_internal( return 1; } + +int bus_verify_manage_units_async_full( + Unit *u, + const char *verb, + int capability, + const char *polkit_message, + bool interactive, + sd_bus_message *call, + sd_bus_error *error) { + + const char *details[9] = { + "unit", u->id, + "verb", verb, + }; + + if (polkit_message) { + details[4] = "polkit.message"; + details[5] = polkit_message; + details[6] = "polkit.gettext_domain"; + details[7] = GETTEXT_PACKAGE; + } + + return bus_verify_polkit_async( + call, + capability, + "org.freedesktop.systemd1.manage-units", + details, + interactive, + UID_INVALID, + &u->manager->polkit_registry, + error); +} diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h index 4e7c68e843..e35c632d37 100644 --- a/src/core/dbus-util.h +++ b/src/core/dbus-util.h @@ -248,3 +248,4 @@ static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, s static inline int bus_set_transient_usec_fix_0(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { return bus_set_transient_usec_internal(u, name, p, true, message, flags, error); } +int bus_verify_manage_units_async_full(Unit *u, const char *verb, int capability, const char *polkit_message, bool interactive, sd_bus_message *call, sd_bus_error *error); diff --git a/src/core/execute.c b/src/core/execute.c index ee5f082783..5f170db8d1 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1987,13 +1987,12 @@ static int build_pass_environment(const ExecContext *c, char ***ret) { return 0; } -static bool exec_needs_mount_namespace( +bool exec_needs_mount_namespace( const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime) { assert(context); - assert(params); if (context->root_image) return true; @@ -2035,7 +2034,7 @@ static bool exec_needs_mount_namespace( return true; for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { - if (!params->prefix[t]) + if (params && !params->prefix[t]) continue; if (!strv_isempty(context->directories[t].paths)) @@ -3115,7 +3114,7 @@ static int apply_mount_namespace( _cleanup_strv_free_ char **empty_directories = NULL; const char *tmp_dir = NULL, *var_tmp_dir = NULL; const char *root_dir = NULL, *root_image = NULL; - _cleanup_free_ char *creds_path = NULL; + _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL; NamespaceInfo ns_info; bool needs_sandboxing; BindMount *bind_mounts = NULL; @@ -3192,6 +3191,15 @@ static int apply_mount_namespace( } } + if (MANAGER_IS_SYSTEM(u->manager)) { + propagate_dir = path_join("/run/systemd/propagate/", u->id); + if (!propagate_dir) + return -ENOMEM; + incoming_dir = strdup("/run/systemd/incoming"); + if (!incoming_dir) + return -ENOMEM; + } + r = setup_namespace(root_dir, root_image, context->root_image_options, &ns_info, context->read_write_paths, needs_sandboxing ? context->read_only_paths : NULL, @@ -3211,6 +3219,8 @@ static int apply_mount_namespace( context->root_hash, context->root_hash_size, context->root_hash_path, context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path, context->root_verity, + propagate_dir, + incoming_dir, DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK, error_path); diff --git a/src/core/execute.h b/src/core/execute.h index da8d6ae272..2da4699df1 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -471,3 +471,5 @@ ExecDirectoryType exec_directory_type_from_string(const char *s) _pure_; const char* exec_resource_type_to_string(ExecDirectoryType i) _const_; ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_; + +bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime); diff --git a/src/core/namespace.c b/src/core/namespace.c index 73a8fa73a4..4b5519e11b 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1301,7 +1301,8 @@ static size_t namespace_calculate_mounts( const char* tmp_dir, const char* var_tmp_dir, const char *creds_path, - const char* log_namespace) { + const char* log_namespace, + bool setup_propagate) { size_t protect_home_cnt; size_t protect_system_cnt = @@ -1328,6 +1329,7 @@ static size_t namespace_calculate_mounts( n_bind_mounts + n_mount_images + n_temporary_filesystems + + (setup_propagate ? 1 : 0) + /* /run/systemd/incoming */ ns_info->private_dev + (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + @@ -1487,6 +1489,8 @@ int setup_namespace( size_t root_hash_sig_size, const char *root_hash_sig_path, const char *verity_data_path, + const char *propagate_dir, + const char *incoming_dir, DissectImageFlags dissect_image_flags, char **error_path) { @@ -1495,13 +1499,16 @@ int setup_namespace( _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; MountEntry *m = NULL, *mounts = NULL; - bool require_prefix = false; + bool require_prefix = false, setup_propagate = false; const char *root; size_t n_mounts; int r; assert(ns_info); + if (!isempty(propagate_dir) && !isempty(incoming_dir)) + setup_propagate = true; + if (mount_flags == 0) mount_flags = MS_SHARED; @@ -1585,7 +1592,8 @@ int setup_namespace( n_mount_images, tmp_dir, var_tmp_dir, creds_path, - log_namespace); + log_namespace, + setup_propagate); if (n_mounts > 0) { m = mounts = new0(MountEntry, n_mounts); @@ -1754,6 +1762,15 @@ int setup_namespace( }; } + /* Will be used to add bind mounts at runtime */ + if (setup_propagate) + *(m++) = (MountEntry) { + .source_const = propagate_dir, + .path_const = incoming_dir, + .mode = BIND_MOUNT, + .read_only = true, + }; + assert(mounts + n_mounts == m); /* Prepend the root directory where that's necessary */ @@ -1778,6 +1795,10 @@ int setup_namespace( goto finish; } + /* Create the source directory to allow runtime propagation of mounts */ + if (setup_propagate) + (void) mkdir_p(propagate_dir, 0600); + /* Remount / as SLAVE so that nothing now mounted in the namespace * shows up in the parent */ if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { @@ -1919,6 +1940,16 @@ int setup_namespace( goto finish; } + /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only + * supported for non-shared mounts. This needs to happen after remounting / or it will fail. */ + if (setup_propagate) { + r = mount(NULL, incoming_dir, NULL, MS_SLAVE, NULL); + if (r < 0) { + log_error_errno(r, "Failed to remount %s with MS_SLAVE: %m", incoming_dir); + goto finish; + } + } + r = 0; finish: diff --git a/src/core/namespace.h b/src/core/namespace.h index da0861c406..91ee44cd51 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -127,6 +127,8 @@ int setup_namespace( size_t root_hash_sig_size, const char *root_hash_sig_path, const char *root_verity, + const char *propagate_dir, + const char *incoming_dir, DissectImageFlags dissected_image_flags, char **error_path); diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf index 8b32379835..0cea4d2b02 100644 --- a/src/core/org.freedesktop.systemd1.conf +++ b/src/core/org.freedesktop.systemd1.conf @@ -222,6 +222,10 @@ send_interface="org.freedesktop.systemd1.Manager" send_member="ReloadOrTryRestartUnit"/> + + @@ -392,6 +396,10 @@ send_interface="org.freedesktop.systemd1.Service" send_member="AttachProcesses"/> + + /run/testservice-57-fixed +mkdir -p /run/inaccessible + +systemctl start testsuite-57-namespaced.service + +# Ensure that inaccessible paths aren't bypassed by the runtime setup +set +e +systemctl bind --mkdir testsuite-57-namespaced.service /run/testservice-57-fixed /run/inaccessible/testfile_fixed && exit 1 +set -e + +echo "MARKER_RUNTIME" > /run/testservice-57-runtime + +systemctl bind --mkdir testsuite-57-namespaced.service /run/testservice-57-runtime /tmp/testfile_runtime + +while systemctl show -P SubState testsuite-57-namespaced.service | grep -q running +do + sleep 0.1 +done + +systemctl is-active testsuite-57-namespaced.service + +# Now test that systemctl bind fails when attempted on a non-namespaced unit +systemctl start testsuite-57-non-namespaced.service + +set +e +systemctl bind --mkdir testsuite-57-non-namespaced.service /run/testservice-57-runtime /tmp/testfile_runtime && exit 1 +set -e + +while systemctl show -P SubState testsuite-57-non-namespaced.service | grep -q running +do + sleep 0.1 +done + +set +e +systemctl is-active testsuite-57-non-namespaced.service && exit 1 +set -e + +echo OK > /testok + +exit 0 From a9d34376e659deb44b99875ea35764de9cf802f0 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Thu, 14 Jan 2021 16:48:13 +0000 Subject: [PATCH 8/9] test: skip missing optional libraries in image install Not all optional libraries might be available on developers machines, so log and skip. Also some pkg-config files are broken (eg: tss2 on Debian Stable) so skip if the required variables are missing, and improve logs. --- test/test-functions | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/test/test-functions b/test/test-functions index 06dea3c7be..efa2989bff 100644 --- a/test/test-functions +++ b/test/test-functions @@ -698,14 +698,25 @@ install_missing_libraries() { # A number of dependencies is now optional via dlopen, so the install # script will not pick them up, since it looks at linkage. for lib in libcryptsetup libidn libidn2 pwquality libqrencode tss2-esys tss2-rc tss2-mu libfido2; do - if pkg-config --exists ${lib}; then - path=$(pkg-config --variable=libdir ${lib}) - if ! [[ ${lib} =~ ^lib ]]; then - lib="lib${lib}" - fi - inst_libs "${path}/${lib}.so" - inst_library "${path}/${lib}.so" - fi + ddebug "Searching for $lib via pkg-config" + if pkg-config --exists ${lib}; then + path=$(pkg-config --variable=libdir ${lib}) + if [ -z "${path}" ]; then + ddebug "$lib.pc does not contain a libdir variable, skipping" + continue + fi + + if ! [[ ${lib} =~ ^lib ]]; then + lib="lib${lib}" + fi + # Some pkg-config files are broken and give out the wrong paths + # (eg: libcryptsetup), so just ignore them + inst_libs "${path}/${lib}.so" || true + inst_library "${path}/${lib}.so" || true + else + ddebug "$lib.pc not found, skipping" + continue + fi done } From fa7a3cd00e844736ef5132c458bad1b240902504 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Thu, 14 Jan 2021 22:11:14 +0000 Subject: [PATCH 9/9] test: run strace with -f and copy log out --- test/test-functions | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/test-functions b/test/test-functions index efa2989bff..7012e8a8f3 100644 --- a/test/test-functions +++ b/test/test-functions @@ -629,7 +629,7 @@ create_strace_wrapper() { cat >$_strace_wrapper <