diff --git a/src/core/exec-credential.c b/src/core/exec-credential.c index 5fb49899fd..7321868d5d 100644 --- a/src/core/exec-credential.c +++ b/src/core/exec-credential.c @@ -29,7 +29,6 @@ #include "siphash24.h" #include "stat-util.h" #include "strv.h" -#include "tmpfile-util.h" #include "user-util.h" ExecSetCredential* exec_set_credential_free(ExecSetCredential *sc) { @@ -320,7 +319,6 @@ static int write_credential( gid_t gid, bool ownership_ok) { - _cleanup_free_ char *tmp = NULL; _cleanup_close_ int fd = -EBADF; int r; @@ -328,16 +326,10 @@ static int write_credential( assert(id); assert(data || size == 0); - r = tempfn_random_child("", "cred", &tmp); - if (r < 0) - return r; - - fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600); + fd = openat(dfd, id, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC, 0600); if (fd < 0) return -errno; - CLEANUP_TMPFILE_AT(dfd, tmp); - r = loop_write(fd, data, size); if (r < 0) return r; @@ -359,11 +351,6 @@ static int write_credential( return r; } - r = RET_NERRNO(renameat(dfd, tmp, dfd, id)); - if (r < 0) - return r; - - tmp = mfree(tmp); /* disarm CLEANUP_TMPFILE_AT() */ return 0; } @@ -426,36 +413,20 @@ static int credential_search_path(const ExecParameters *params, CredentialSearch return 0; } -static bool device_nodes_restricted( - const ExecContext *c, - const CGroupContext *cgroup_context) { - - assert(c); - assert(cgroup_context); - - /* Returns true if we have any reason to believe we might not be able to access the TPM device - * directly, even if we run as root/PID 1. This could be because /dev/ is replaced by a private - * version, or because a device node access list is configured. 
*/ - - if (c->private_devices) - return true; - - if (cgroup_context_has_device_policy(cgroup_context)) - return true; - - return false; -} - struct load_cred_args { const ExecContext *context; - const CGroupContext *cgroup_context; const ExecParameters *params; const char *unit; + + bool always_ipc; + bool encrypted; + int write_dfd; uid_t uid; gid_t gid; bool ownership_ok; + uint64_t left; }; @@ -486,7 +457,7 @@ static int maybe_decrypt_and_write_credential( flags |= CREDENTIAL_ANY_SCOPE; - if (!device_nodes_restricted(args->context, args->cgroup_context)) { + if (!args->always_ipc) { r = decrypt_credential_and_warn( id, now(CLOCK_REALTIME), @@ -787,38 +758,49 @@ static int load_cred_recurse_dir_cb( return RECURSE_DIR_CONTINUE; } +static bool device_nodes_restricted( + const ExecContext *c, + const CGroupContext *cgroup_context) { + + assert(c); + assert(cgroup_context); + + /* Returns true if we have any reason to believe we might not be able to access the TPM device + * directly, even if we run as root/PID 1. This could be because /dev/ is replaced by a private + * version, or because a device node access list is configured. 
*/ + + if (c->private_devices) + return true; + + if (cgroup_context_has_device_policy(cgroup_context)) + return true; + + return false; +} + static int acquire_credentials( const ExecContext *context, const CGroupContext *cgroup_context, const ExecParameters *params, const char *unit, - const char *p, + int dfd, uid_t uid, gid_t gid, bool ownership_ok) { - _cleanup_close_ int dfd = -EBADF; int r; assert(context); assert(cgroup_context); assert(params); assert(unit); - assert(p); - - dfd = open(p, O_DIRECTORY|O_CLOEXEC); - if (dfd < 0) - return -errno; - - r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */ - if (r < 0) - return r; + assert(dfd >= 0); struct load_cred_args args = { .context = context, - .cgroup_context = cgroup_context, .params = params, .unit = unit, + .always_ipc = device_nodes_restricted(context, cgroup_context), .write_dfd = dfd, .uid = uid, .gid = gid, @@ -919,7 +901,15 @@ static int acquire_credentials( return r; } - r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */ + return 0; +} + +static int credentials_dir_finalize_permissions(int dfd, uid_t uid, gid_t gid, bool ownership_ok) { + int r; + + assert(dfd >= 0); + + r = fd_acl_make_read_only(dfd); /* Take away the "w" bit */ if (r < 0) return r; @@ -943,157 +933,159 @@ static int acquire_credentials( return 0; } +static int setup_credentials_plain_dir( + const ExecContext *context, + const CGroupContext *cgroup_context, + const ExecParameters *params, + const char *unit, + const char *cred_dir, + uid_t uid, + gid_t gid) { + + _cleanup_free_ char *t = NULL, *workspace = NULL; + _cleanup_(rm_rf_safep) const char *workspace_rm = NULL; + _cleanup_close_ int dfd = -EBADF; + int r; + + assert(context); + assert(params); + assert(unit); + assert(cred_dir); + + /* Temporary workspace, that remains inaccessible all the time. 
We prepare stuff there before moving + * it into place, so that users can't access half-initialized credential stores. */ + t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials"); + if (!t) + return -ENOMEM; + + r = mkdir_label(t, 0700); + if (r < 0 && r != -EEXIST) + return r; + + workspace = path_join(t, unit); + if (!workspace) + return -ENOMEM; + + dfd = open_mkdir(workspace, O_CLOEXEC|O_EXCL, 0700); + if (dfd < 0) + return log_debug_errno(dfd, "Failed to create workspace for credentials: %m"); + workspace_rm = workspace; + + (void) label_fix_full(dfd, /* inode_path = */ NULL, cred_dir, /* flags = */ 0); + + r = acquire_credentials(context, cgroup_context, params, unit, dfd, uid, gid, /* ownership_ok = */ false); + if (r < 0) + return r; + + r = RET_NERRNO(rename(workspace, cred_dir)); + if (r >= 0) + workspace_rm = NULL; + if (IN_SET(r, -EEXIST, -ENOTEMPTY)) { /* A non-empty target dir yields either errno; Linux file systems typically report ENOTEMPTY */ + log_debug_errno(r, "Credential dir '%s' already populated, exchanging with workspace.", cred_dir); + r = RET_NERRNO(renameat2(AT_FDCWD, workspace, AT_FDCWD, cred_dir, RENAME_EXCHANGE)); + } + if (r < 0) + return log_debug_errno(r, "Failed to move credentials workspace into place: %m"); + + /* rename() requires both the source and target to be writable, hence lock down write permission + * as last step. 
*/ + r = credentials_dir_finalize_permissions(dfd, uid, gid, /* ownership_ok = */ false); + if (r < 0) + return log_debug_errno(r, "Failed to adjust ACLs of credentials dir: %m"); + + return 0; +} + static int setup_credentials_internal( const ExecContext *context, const CGroupContext *cgroup_context, const ExecParameters *params, const char *unit, - const char *final, /* This is where the credential store shall eventually end up at */ - const char *workspace, /* This is where we can prepare it before moving it to the final place */ - bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */ - bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */ + const char *cred_dir, uid_t uid, gid_t gid) { - bool final_mounted; - int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true - * if we mounted something; false if we definitely can't mount anything */ + _cleanup_close_ int fs_fd = -EBADF, mfd = -EBADF, dfd = -EBADF; + bool dir_mounted; + int r; assert(context); assert(params); assert(unit); - assert(final); - assert(workspace); + assert(cred_dir); - r = path_is_mount_point(final); - if (r < 0) - return log_debug_errno(r, "Failed to determine if '%s' is a mountpoint: %m", final); - final_mounted = r > 0; - - if (final_mounted) { - if (FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) { - r = umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW); - if (r < 0) - return r; - - final_mounted = false; - } else { - /* We can reuse the previous credential dir */ - r = dir_is_empty(final, /* ignore_hidden_or_backup = */ false); - if (r < 0) - return r; - if (r == 0) { - log_debug("Credential dir for unit '%s' already set up, skipping.", unit); - return 0; - } + if (!FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) { + /* We may reuse the previous credential dir */ + r = dir_is_empty(cred_dir, /* 
ignore_hidden_or_backup = */ false); + if (r < 0) + return r; + if (r == 0) { + log_debug("Credential dir for unit '%s' already set up, skipping.", unit); + return 0; } } - if (reuse_workspace) { - r = path_is_mount_point(workspace); - if (r < 0) - return r; - if (r > 0) - workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse - * it, let's keep this in mind */ - else - workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */ - } else - workspace_mounted = -1; /* ditto */ + r = path_is_mount_point(cred_dir); + if (r < 0) + return log_debug_errno(r, "Failed to determine if '%s' is a mountpoint: %m", cred_dir); + dir_mounted = r > 0; - /* If both the final place and the workspace are mounted, we have no mounts to set up, based on - * the assumption that they're actually the same tmpfs (but the latter with MS_RDONLY different). - * If the workspace is not mounted, we just bind the final place over and make it writable. */ - must_mount = must_mount || final_mounted; - - if (workspace_mounted < 0) { - if (!final_mounted) - /* Nothing is mounted on the workspace yet, let's try to mount a new tmpfs if - * not using the final place. */ - r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false); - if (final_mounted || r < 0) { - /* If using final place or failed to mount new tmpfs, make a bind mount from - * the final to the workspace, so that we can make it writable there. */ - r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL); - if (r < 0) { - if (!ERRNO_IS_PRIVILEGE(r)) - /* Propagate anything that isn't a permission problem. */ - return r; - - if (must_mount) - /* If it's not OK to use the plain directory fallback, propagate all - * errors too. */ - return r; - - /* If we lack privileges to bind mount stuff, then let's gracefully proceed - * for compat with container envs, and just use the final dir as is. 
- * Final place must not be mounted in this case (refused by must_mount - * above) */ - - workspace_mounted = false; - } else { - /* Make the new bind mount writable (i.e. drop MS_RDONLY) */ - r = mount_nofollow_verbose(LOG_DEBUG, - NULL, - workspace, - NULL, - MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), - NULL); - if (r < 0) - return r; - - workspace_mounted = true; - } - } else - workspace_mounted = true; + mfd = fsmount_credentials_fs(&fs_fd); + if (ERRNO_IS_NEG_PRIVILEGE(mfd) && !dir_mounted) { + log_debug_errno(mfd, "Lacking privilege to mount credentials fs, falling back to plain directory."); + return setup_credentials_plain_dir(context, cgroup_context, params, unit, cred_dir, uid, gid); } + if (mfd < 0) + return log_debug_errno(mfd, "Failed to mount credentials fs: %m"); - assert(workspace_mounted >= 0); - assert(!must_mount || workspace_mounted); + dfd = fd_reopen(mfd, O_DIRECTORY|O_CLOEXEC); + if (dfd < 0) + return dfd; - const char *where = workspace_mounted ? workspace : final; - - (void) label_fix_full(AT_FDCWD, where, final, 0); - - r = acquire_credentials(context, cgroup_context, params, unit, where, uid, gid, workspace_mounted); - if (r < 0) { - /* If we're using final place as workspace, and failed to acquire credentials, we might - * have left half-written creds there. Let's get rid of the whole mount, so future - * calls won't reuse it. 
*/ - if (final_mounted) - (void) umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW); + (void) label_fix_full(dfd, /* inode_path = */ NULL, cred_dir, /* flags = */ 0); + r = acquire_credentials(context, cgroup_context, params, unit, dfd, uid, gid, /* ownership_ok = */ true); + if (r < 0) return r; - } - if (workspace_mounted) { - if (!final_mounted) { - /* Make workspace read-only now, so that any bind mount we make from it defaults to - * read-only too */ - r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL); - if (r < 0) - return r; + r = credentials_dir_finalize_permissions(dfd, uid, gid, /* ownership_ok = */ true); + if (r < 0) + return log_debug_errno(r, "Failed to adjust ACLs of credentials dir: %m"); - /* And mount it to the final place, read-only */ - r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL); - } else - /* Otherwise we just get rid of the bind mount of final place */ - r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW); + // Work around a kernel bug that results in tmpfs reconfiguration failure. + // FIXME: drop this once https://lore.kernel.org/linux-fsdevel/20251108190930.440685-1-me@yhndnzj.com/ + // is merged and hits the distro kernels. 
+ (void) fsconfig(fs_fd, FSCONFIG_SET_FLAG, "noswap", NULL, 0); + + if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, "ro", NULL, 0) < 0) + return -errno; + + if (fsconfig(fs_fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0) < 0) + return -errno; + + log_debug("Successfully reconfigured credentials fs to be read only."); + + if (dir_mounted) { + /* Firstly, try to move beneath the existing mount, which guarantees strictly atomic replacement + * (needs kernel >= 6.5) */ + r = move_mount(mfd, "", AT_FDCWD, cred_dir, MOVE_MOUNT_F_EMPTY_PATH|MOVE_MOUNT_BENEATH); + if (r >= 0) + return umount_verbose(LOG_DEBUG, cred_dir, MNT_DETACH|UMOUNT_NOFOLLOW); + if (errno != EINVAL) + return log_debug_errno(errno, "Failed to move credentials fs into place: %m"); + + log_debug_errno(errno, "Unable to move credentials fs beneath existing mount '%s', unmounting instead: %m", + cred_dir); + + r = umount_verbose(LOG_DEBUG, cred_dir, MNT_DETACH|UMOUNT_NOFOLLOW); if (r < 0) return r; - } else { - _cleanup_free_ char *parent = NULL; - - /* If we do not have our own mount put used the plain directory fallback, then we need to - * open access to the top-level credential directory and the per-service directory now */ - - r = path_extract_directory(final, &parent); - if (r < 0) - return r; - if (chmod(parent, 0755) < 0) - return -errno; } + r = move_mount(mfd, "", AT_FDCWD, cred_dir, MOVE_MOUNT_F_EMPTY_PATH); + if (r < 0) + return log_debug_errno(errno, "Failed to move credentials fs into place: %m"); + return 0; } @@ -1136,96 +1128,12 @@ int exec_setup_credentials( if (r < 0 && r != -EEXIST) return r; - r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS, NULL); - if (r < 0) { - _cleanup_(rmdir_and_freep) char *u = NULL; /* remove the temporary workspace if we can */ - _cleanup_free_ char *t = NULL; - - /* If this is not a privilege or support issue then propagate the error */ - if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) - return r; - - /* Temporary workspace, that 
remains inaccessible all the time. We prepare stuff there before moving - * it into place, so that users can't access half-initialized credential stores. */ - t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials"); - if (!t) - return -ENOMEM; - - /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit - * directory outside of /run/credentials/ first, and then move it over to /run/credentials/ - * after it is fully set up */ - u = path_join(t, unit); - if (!u) - return -ENOMEM; - - FOREACH_STRING(i, t, u) { - r = mkdir_label(i, 0700); - if (r < 0 && r != -EEXIST) - return log_debug_errno(r, "Failed to make directory '%s': %m", i); - } - - r = setup_credentials_internal( - context, - cgroup_context, - params, - unit, - p, /* final mount point */ - u, /* temporary workspace to overmount */ - true, /* reuse the workspace if it is already a mount */ - false, /* it's OK to fall back to a plain directory if we can't mount anything */ - uid, - gid); - if (r < 0) - return r; - - } else if (r == 0) { - - /* We managed to set up a mount namespace, and are now in a child. That's great. In this case - * we can use the same directory for all cases, after turning off propagation. Question - * though is: where do we turn off propagation exactly, and where do we place the workspace - * directory? We need some place that is guaranteed to be a mount point in the host, and - * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this, - * since we ultimately want to move the resulting file system there, i.e. we need propagation - * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that - * would be visible in the host mount table all the time, which we want to avoid. Hence, what - * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that - * /dev/ is a mount point and we now for sure that /dev/shm/ exists. 
Hence we can turn off - * propagation on the former, and then overmount the latter. - * - * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist - * for this purpose, but there are few other candidates that work equally well for us, and - * given that we do this in a privately namespaced short-lived single-threaded process that - * no one else sees this should be OK to do. */ - - /* Turn off propagation from our namespace to host */ - r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); - if (r < 0) - goto child_fail; - - r = setup_credentials_internal( - context, - cgroup_context, - params, - unit, - p, /* final mount point */ - "/dev/shm", /* temporary workspace to overmount */ - false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */ - true, /* insist that something is mounted, do not allow fallback to plain directory */ - uid, - gid); - if (r < 0) - goto child_fail; - - _exit(EXIT_SUCCESS); - - child_fail: - _exit(EXIT_FAILURE); - } + r = setup_credentials_internal(context, cgroup_context, params, unit, p, uid, gid); /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's * try to remove it. This matters in particular if we created the dir as mount point but then didn't * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being * seen by users when trying access this inode. 
*/ (void) rmdir(p); - return 0; + return r; } diff --git a/src/core/import-creds.c b/src/core/import-creds.c index 0f35514079..e0df078448 100644 --- a/src/core/import-creds.c +++ b/src/core/import-creds.c @@ -103,7 +103,7 @@ static int acquire_credential_directory(ImportCredentialsContext *c, const char (void) mount_nofollow_verbose(LOG_WARNING, NULL, path, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL); else if (with_mount) /* If not a mount point yet, and the credentials are not encrypted, then let's try to mount a no-swap fs there */ - (void) mount_credentials_fs(path, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false); + (void) mount_credentials_fs(path); c->target_dir_fd = open(path, O_RDONLY|O_DIRECTORY|O_CLOEXEC); if (c->target_dir_fd < 0) diff --git a/src/shared/acl-util.c b/src/shared/acl-util.c index d16c165ed7..1dff9646cc 100644 --- a/src/shared/acl-util.c +++ b/src/shared/acl-util.c @@ -682,16 +682,12 @@ int fd_acl_make_read_only(int fd) { r = dlopen_libacl(); if (r < 0) - return r; + goto maybe_fallback; acl = sym_acl_get_fd(fd); if (!acl) { - - if (!ERRNO_IS_NOT_SUPPORTED(errno)) - return -errno; - - /* No ACLs? Then just update the regular mode_t */ - return fd_acl_make_read_only_fallback(fd); + r = -errno; + goto maybe_fallback; } for (r = sym_acl_get_entry(acl, ACL_FIRST_ENTRY, &i); @@ -729,75 +725,18 @@ int fd_acl_make_read_only(int fd) { return 0; if (sym_acl_set_fd(fd, acl) < 0) { - if (!ERRNO_IS_NOT_SUPPORTED(errno)) - return -errno; - - return fd_acl_make_read_only_fallback(fd); + r = -errno; + goto maybe_fallback; } return 1; -} -int fd_acl_make_writable(int fd) { - _cleanup_(acl_freep) acl_t acl = NULL; - acl_entry_t i; - int r; - - /* Safely adds the writable bit to the owner's ACL entry of this inode. (And only the owner's! – This - * not the obvious inverse of fd_acl_make_read_only() hence!) 
*/ - - r = dlopen_libacl(); - if (r < 0) +maybe_fallback: + if (!ERRNO_IS_NEG_NOT_SUPPORTED(r)) return r; - acl = sym_acl_get_fd(fd); - if (!acl) { - if (!ERRNO_IS_NOT_SUPPORTED(errno)) - return -errno; - - /* No ACLs? Then just update the regular mode_t */ - return fd_acl_make_writable_fallback(fd); - } - - for (r = sym_acl_get_entry(acl, ACL_FIRST_ENTRY, &i); - r > 0; - r = sym_acl_get_entry(acl, ACL_NEXT_ENTRY, &i)) { - acl_permset_t permset; - acl_tag_t tag; - int b; - - if (sym_acl_get_tag_type(i, &tag) < 0) - return -errno; - - if (tag != ACL_USER_OBJ) - continue; - - if (sym_acl_get_permset(i, &permset) < 0) - return -errno; - - b = sym_acl_get_perm(permset, ACL_WRITE); - if (b < 0) - return -errno; - - if (b) - return 0; /* Already set? Then there's nothing to do. */ - - if (sym_acl_add_perm(permset, ACL_WRITE) < 0) - return -errno; - - break; - } - if (r < 0) - return -errno; - - if (sym_acl_set_fd(fd, acl) < 0) { - if (!ERRNO_IS_NOT_SUPPORTED(errno)) - return -errno; - - return fd_acl_make_writable_fallback(fd); - } - - return 1; + /* No ACLs? 
Then just update the regular mode_t */ + return fd_acl_make_read_only_fallback(fd); } #endif @@ -818,23 +757,6 @@ int fd_acl_make_read_only_fallback(int fd) { return 1; } -int fd_acl_make_writable_fallback(int fd) { - struct stat st; - - assert(fd >= 0); - - if (fstat(fd, &st) < 0) - return -errno; - - if ((st.st_mode & 0200) != 0) /* already set */ - return 0; - - if (fchmod(fd, (st.st_mode & 07777) | 0200) < 0) - return -errno; - - return 1; -} - int inode_type_can_acl(mode_t mode) { return IN_SET(mode & S_IFMT, S_IFSOCK, S_IFREG, S_IFBLK, S_IFCHR, S_IFDIR, S_IFIFO); } diff --git a/src/shared/acl-util.h b/src/shared/acl-util.h index 7f6650013b..5ff7cc47bc 100644 --- a/src/shared/acl-util.h +++ b/src/shared/acl-util.h @@ -4,7 +4,6 @@ #include "shared-forward.h" int fd_acl_make_read_only_fallback(int fd); -int fd_acl_make_writable_fallback(int fd); #if HAVE_ACL #include /* IWYU pragma: export */ @@ -56,7 +55,6 @@ int acls_for_file(const char *path, acl_type_t type, acl_t new, acl_t *ret); int fd_add_uid_acl_permission(int fd, uid_t uid, unsigned mask); int fd_acl_make_read_only(int fd); -int fd_acl_make_writable(int fd); /* acl_free() takes multiple argument types. Multiple cleanup functions are necessary. 
*/ DEFINE_TRIVIAL_CLEANUP_FUNC_FULL_RENAME(acl_t, sym_acl_free, acl_freep, NULL); @@ -89,10 +87,6 @@ static inline int fd_add_uid_acl_permission(int fd, uid_t uid, unsigned mask) { static inline int fd_acl_make_read_only(int fd) { return fd_acl_make_read_only_fallback(fd); } - -static inline int fd_acl_make_writable(int fd) { - return fd_acl_make_writable_fallback(fd); -} #endif int inode_type_can_acl(mode_t mode); diff --git a/src/shared/creds-util.h b/src/shared/creds-util.h index 7b2fc76eb4..32c5a0ba88 100644 --- a/src/shared/creds-util.h +++ b/src/shared/creds-util.h @@ -9,7 +9,7 @@ #define CREDENTIAL_NAME_MAX FDNAME_MAX /* Put a size limit on the individual credential */ -#define CREDENTIAL_SIZE_MAX (1024U*1024U) +#define CREDENTIAL_SIZE_MAX (1U * U64_MB) /* Refuse to store more than 1M per service, after all this is unswappable memory. Note that for now we put * this to the same limit as the per-credential limit, i.e. if the user has n > 1 credentials instead of 1 it @@ -18,7 +18,7 @@ /* Put a size limit on encrypted credentials (which is the same as the unencrypted size plus a spacious 128K of extra * space for headers, IVs, exported TPM2 key material and so on. */ -#define CREDENTIAL_ENCRYPTED_SIZE_MAX (CREDENTIAL_SIZE_MAX + 128U*1024U) +#define CREDENTIAL_ENCRYPTED_SIZE_MAX (CREDENTIAL_SIZE_MAX + 128U * U64_KB) bool credential_name_valid(const char *s); bool credential_glob_valid(const char *s); diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c index dcdc42b00a..fb3f5b5206 100644 --- a/src/shared/mount-util.c +++ b/src/shared/mount-util.c @@ -7,6 +7,7 @@ #include "alloc-util.h" #include "chase.h" +#include "creds-util.h" #include "dissect-image.h" #include "errno-util.h" #include "extract-word.h" @@ -1831,58 +1832,65 @@ unsigned long credentials_fs_mount_flags(bool ro) { return MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported()|(ro ? 
MS_RDONLY : 0); } -int mount_credentials_fs(const char *path, size_t size, bool ro) { - _cleanup_free_ char *opts = NULL; - int r, noswap_supported; +int fsmount_credentials_fs(int *ret_fsfd) { + _cleanup_close_ int fs_fd = -EBADF; + char size_str[DECIMAL_STR_MAX(uint64_t)]; /* Mounts a file system we can place credentials in, i.e. with tight access modes right from the * beginning, and ideally swapping turned off. In order of preference: * - * 1. tmpfs if it supports "noswap" + * 1. tmpfs if it supports "noswap" (needs kernel >= 6.3) * 2. ramfs - * 3. tmpfs if it doesn't support "noswap" + * 3. tmpfs without "noswap" */ - noswap_supported = mount_option_supported("tmpfs", "noswap", NULL); /* Check explicitly to avoid kmsg noise */ - if (noswap_supported > 0) { - _cleanup_free_ char *noswap_opts = NULL; + fs_fd = fsopen("tmpfs", FSOPEN_CLOEXEC); + if (fs_fd < 0) + return -errno; - if (asprintf(&noswap_opts, "mode=0700,nr_inodes=1024,size=%zu,noswap", size) < 0) - return -ENOMEM; + if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "nr_inodes", "1024", 0) < 0) + return -errno; - /* Best case: tmpfs with noswap (needs kernel >= 6.3) */ + xsprintf(size_str, "%" PRIu64, CREDENTIALS_TOTAL_SIZE_MAX); + if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "size", size_str, 0) < 0) + return -errno; - r = mount_nofollow_verbose( - LOG_DEBUG, - "tmpfs", - path, - "tmpfs", - credentials_fs_mount_flags(ro), - noswap_opts); - if (r >= 0) - return r; + if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, "noswap", NULL, 0) < 0) { + if (errno != EINVAL) + return -errno; + + int ramfs_fd = fsopen("ramfs", FSOPEN_CLOEXEC); + if (ramfs_fd >= 0) + close_and_replace(fs_fd, ramfs_fd); } - r = mount_nofollow_verbose( - LOG_DEBUG, - "ramfs", - path, - "ramfs", - credentials_fs_mount_flags(ro), - "mode=0700"); - if (r >= 0) - return r; + if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "mode", "0700", 0) < 0) + return -errno; - if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", size) < 0) - return -ENOMEM; + if 
(fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) + return -errno; - return mount_nofollow_verbose( - LOG_DEBUG, - "tmpfs", - path, - "tmpfs", - credentials_fs_mount_flags(ro), - opts); + int mfd = fsmount(fs_fd, FSMOUNT_CLOEXEC, + ms_flags_to_mount_attr(credentials_fs_mount_flags(/* ro = */ false))); + if (mfd < 0) + return -errno; + + if (ret_fsfd) + *ret_fsfd = TAKE_FD(fs_fd); + + return mfd; +} + +int mount_credentials_fs(const char *path) { + _cleanup_close_ int mfd = -EBADF; + + assert(path); + + mfd = fsmount_credentials_fs(/* ret_fsfd = */ NULL); + if (mfd < 0) + return mfd; + + return RET_NERRNO(move_mount(mfd, "", AT_FDCWD, path, MOVE_MOUNT_F_EMPTY_PATH)); } int make_fsmount( diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h index c5968dc565..c607875375 100644 --- a/src/shared/mount-util.h +++ b/src/shared/mount-util.h @@ -159,7 +159,8 @@ int make_mount_point_inode_from_path(const char *source, const char *dest, mode_ int trigger_automount_at(int dir_fd, const char *path); unsigned long credentials_fs_mount_flags(bool ro); -int mount_credentials_fs(const char *path, size_t size, bool ro); +int fsmount_credentials_fs(int *ret_fsfd); +int mount_credentials_fs(const char *path); int make_fsmount(int error_log_level, const char *what, const char *type, unsigned long flags, const char *options, int userns_fd); diff --git a/src/test/test-acl-util.c b/src/test/test-acl-util.c index 5967c4b406..bf65d38b37 100644 --- a/src/test/test-acl-util.c +++ b/src/test/test-acl-util.c @@ -112,30 +112,6 @@ TEST_RET(fd_acl_make_read_only) { cmd = strjoina("stat ", fn); ASSERT_OK_ZERO_ERRNO(system(cmd)); - log_info("writable"); - ASSERT_OK_POSITIVE(fd_acl_make_writable(fd)); - - ASSERT_OK_ERRNO(fstat(fd, &st)); - ASSERT_EQ(st.st_mode & 0222, 0200u); - - cmd = strjoina("getfacl -p ", fn); - ASSERT_OK_ZERO_ERRNO(system(cmd)); - - cmd = strjoina("stat ", fn); - ASSERT_OK_ZERO_ERRNO(system(cmd)); - - log_info("read-only"); - 
ASSERT_OK_POSITIVE(fd_acl_make_read_only(fd)); - - ASSERT_OK_ERRNO(fstat(fd, &st)); - ASSERT_EQ(st.st_mode & 0222, 0000u); - - cmd = strjoina("getfacl -p ", fn); - ASSERT_OK_ZERO_ERRNO(system(cmd)); - - cmd = strjoina("stat ", fn); - ASSERT_OK_ZERO_ERRNO(system(cmd)); - return 0; }