core/exec-credentials: port to new mount API, ensure atomicity for creds installation (#39637)

This commit is contained in:
Yu Watanabe
2025-11-12 09:00:23 +09:00
committed by GitHub
8 changed files with 226 additions and 417 deletions

View File

@@ -29,7 +29,6 @@
#include "siphash24.h"
#include "stat-util.h"
#include "strv.h"
#include "tmpfile-util.h"
#include "user-util.h"
ExecSetCredential* exec_set_credential_free(ExecSetCredential *sc) {
@@ -320,7 +319,6 @@ static int write_credential(
gid_t gid,
bool ownership_ok) {
_cleanup_free_ char *tmp = NULL;
_cleanup_close_ int fd = -EBADF;
int r;
@@ -328,16 +326,10 @@ static int write_credential(
assert(id);
assert(data || size == 0);
r = tempfn_random_child("", "cred", &tmp);
if (r < 0)
return r;
fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
fd = openat(dfd, id, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC, 0600);
if (fd < 0)
return -errno;
CLEANUP_TMPFILE_AT(dfd, tmp);
r = loop_write(fd, data, size);
if (r < 0)
return r;
@@ -359,11 +351,6 @@ static int write_credential(
return r;
}
r = RET_NERRNO(renameat(dfd, tmp, dfd, id));
if (r < 0)
return r;
tmp = mfree(tmp); /* disarm CLEANUP_TMPFILE_AT() */
return 0;
}
@@ -426,36 +413,20 @@ static int credential_search_path(const ExecParameters *params, CredentialSearch
return 0;
}
static bool device_nodes_restricted(
const ExecContext *c,
const CGroupContext *cgroup_context) {
assert(c);
assert(cgroup_context);
/* Returns true if we have any reason to believe we might not be able to access the TPM device
* directly, even if we run as root/PID 1. This could be because /dev/ is replaced by a private
* version, or because a device node access list is configured. */
if (c->private_devices)
return true;
if (cgroup_context_has_device_policy(cgroup_context))
return true;
return false;
}
struct load_cred_args {
const ExecContext *context;
const CGroupContext *cgroup_context;
const ExecParameters *params;
const char *unit;
bool always_ipc;
bool encrypted;
int write_dfd;
uid_t uid;
gid_t gid;
bool ownership_ok;
uint64_t left;
};
@@ -486,7 +457,7 @@ static int maybe_decrypt_and_write_credential(
flags |= CREDENTIAL_ANY_SCOPE;
if (!device_nodes_restricted(args->context, args->cgroup_context)) {
if (!args->always_ipc) {
r = decrypt_credential_and_warn(
id,
now(CLOCK_REALTIME),
@@ -787,38 +758,49 @@ static int load_cred_recurse_dir_cb(
return RECURSE_DIR_CONTINUE;
}
static bool device_nodes_restricted(
const ExecContext *c,
const CGroupContext *cgroup_context) {
assert(c);
assert(cgroup_context);
/* Returns true if we have any reason to believe we might not be able to access the TPM device
* directly, even if we run as root/PID 1. This could be because /dev/ is replaced by a private
* version, or because a device node access list is configured. */
if (c->private_devices)
return true;
if (cgroup_context_has_device_policy(cgroup_context))
return true;
return false;
}
static int acquire_credentials(
const ExecContext *context,
const CGroupContext *cgroup_context,
const ExecParameters *params,
const char *unit,
const char *p,
int dfd,
uid_t uid,
gid_t gid,
bool ownership_ok) {
_cleanup_close_ int dfd = -EBADF;
int r;
assert(context);
assert(cgroup_context);
assert(params);
assert(unit);
assert(p);
dfd = open(p, O_DIRECTORY|O_CLOEXEC);
if (dfd < 0)
return -errno;
r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
if (r < 0)
return r;
assert(dfd >= 0);
struct load_cred_args args = {
.context = context,
.cgroup_context = cgroup_context,
.params = params,
.unit = unit,
.always_ipc = device_nodes_restricted(context, cgroup_context),
.write_dfd = dfd,
.uid = uid,
.gid = gid,
@@ -919,7 +901,15 @@ static int acquire_credentials(
return r;
}
r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
return 0;
}
static int credentials_dir_finalize_permissions(int dfd, uid_t uid, gid_t gid, bool ownership_ok) {
int r;
assert(dfd >= 0);
r = fd_acl_make_read_only(dfd); /* Take away the "w" bit */
if (r < 0)
return r;
@@ -943,157 +933,159 @@ static int acquire_credentials(
return 0;
}
static int setup_credentials_plain_dir(
const ExecContext *context,
const CGroupContext *cgroup_context,
const ExecParameters *params,
const char *unit,
const char *cred_dir,
uid_t uid,
gid_t gid) {
_cleanup_free_ char *t = NULL, *workspace = NULL;
_cleanup_(rm_rf_safep) const char *workspace_rm = NULL;
_cleanup_close_ int dfd = -EBADF;
int r;
assert(context);
assert(params);
assert(unit);
assert(cred_dir);
/* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
* it into place, so that users can't access half-initialized credential stores. */
t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
if (!t)
return -ENOMEM;
r = mkdir_label(t, 0700);
if (r < 0 && r != -EEXIST)
return r;
workspace = path_join(t, unit);
if (!workspace)
return -ENOMEM;
dfd = open_mkdir(workspace, O_CLOEXEC|O_EXCL, 0700);
if (dfd < 0)
return log_debug_errno(dfd, "Failed to create workspace for credentials: %m");
workspace_rm = workspace;
(void) label_fix_full(dfd, /* inode_path = */ NULL, cred_dir, /* flags = */ 0);
r = acquire_credentials(context, cgroup_context, params, unit, dfd, uid, gid, /* ownership_ok = */ false);
if (r < 0)
return r;
r = RET_NERRNO(rename(workspace, cred_dir));
if (r >= 0)
workspace_rm = NULL;
if (r == -EEXIST) {
log_debug_errno(r, "Credential dir '%s' already populated, exchanging with workspace.", cred_dir);
r = RET_NERRNO(renameat2(AT_FDCWD, workspace, AT_FDCWD, cred_dir, RENAME_EXCHANGE));
}
if (r < 0)
return log_debug_errno(r, "Failed to move credentials workspace into place: %m");
/* rename() requires both the source and target to be writable, hence lock down write permission
* as last step. */
r = credentials_dir_finalize_permissions(dfd, uid, gid, /* ownership_ok = */ false);
if (r < 0)
return log_debug_errno(r, "Failed to adjust ACLs of credentials dir: %m");
return 0;
}
static int setup_credentials_internal(
const ExecContext *context,
const CGroupContext *cgroup_context,
const ExecParameters *params,
const char *unit,
const char *final, /* This is where the credential store shall eventually end up at */
const char *workspace, /* This is where we can prepare it before moving it to the final place */
bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
const char *cred_dir,
uid_t uid,
gid_t gid) {
bool final_mounted;
int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
* if we mounted something; false if we definitely can't mount anything */
_cleanup_close_ int fs_fd = -EBADF, mfd = -EBADF, dfd = -EBADF;
bool dir_mounted;
int r;
assert(context);
assert(params);
assert(unit);
assert(final);
assert(workspace);
assert(cred_dir);
r = path_is_mount_point(final);
if (r < 0)
return log_debug_errno(r, "Failed to determine if '%s' is a mountpoint: %m", final);
final_mounted = r > 0;
if (final_mounted) {
if (FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) {
r = umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
if (r < 0)
return r;
final_mounted = false;
} else {
/* We can reuse the previous credential dir */
r = dir_is_empty(final, /* ignore_hidden_or_backup = */ false);
if (r < 0)
return r;
if (r == 0) {
log_debug("Credential dir for unit '%s' already set up, skipping.", unit);
return 0;
}
if (!FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) {
/* We may reuse the previous credential dir */
r = dir_is_empty(cred_dir, /* ignore_hidden_or_backup = */ false);
if (r < 0)
return r;
if (r == 0) {
log_debug("Credential dir for unit '%s' already set up, skipping.", unit);
return 0;
}
}
if (reuse_workspace) {
r = path_is_mount_point(workspace);
if (r < 0)
return r;
if (r > 0)
workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse
* it, let's keep this in mind */
else
workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
} else
workspace_mounted = -1; /* ditto */
r = path_is_mount_point(cred_dir);
if (r < 0)
return log_debug_errno(r, "Failed to determine if '%s' is a mountpoint: %m", cred_dir);
dir_mounted = r > 0;
/* If both the final place and the workspace are mounted, we have no mounts to set up, based on
* the assumption that they're actually the same tmpfs (but the latter with MS_RDONLY different).
* If the workspace is not mounted, we just bind the final place over and make it writable. */
must_mount = must_mount || final_mounted;
if (workspace_mounted < 0) {
if (!final_mounted)
/* Nothing is mounted on the workspace yet, let's try to mount a new tmpfs if
* not using the final place. */
r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
if (final_mounted || r < 0) {
/* If using final place or failed to mount new tmpfs, make a bind mount from
* the final to the workspace, so that we can make it writable there. */
r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
if (r < 0) {
if (!ERRNO_IS_PRIVILEGE(r))
/* Propagate anything that isn't a permission problem. */
return r;
if (must_mount)
/* If it's not OK to use the plain directory fallback, propagate all
* errors too. */
return r;
/* If we lack privileges to bind mount stuff, then let's gracefully proceed
* for compat with container envs, and just use the final dir as is.
* Final place must not be mounted in this case (refused by must_mount
* above) */
workspace_mounted = false;
} else {
/* Make the new bind mount writable (i.e. drop MS_RDONLY) */
r = mount_nofollow_verbose(LOG_DEBUG,
NULL,
workspace,
NULL,
MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false),
NULL);
if (r < 0)
return r;
workspace_mounted = true;
}
} else
workspace_mounted = true;
mfd = fsmount_credentials_fs(&fs_fd);
if (ERRNO_IS_NEG_PRIVILEGE(mfd) && !dir_mounted) {
log_debug_errno(mfd, "Lacking privilege to mount credentials fs, falling back to plain directory.");
return setup_credentials_plain_dir(context, cgroup_context, params, unit, cred_dir, uid, gid);
}
if (mfd < 0)
return log_debug_errno(mfd, "Failed to mount credentials fs: %m");
assert(workspace_mounted >= 0);
assert(!must_mount || workspace_mounted);
dfd = fd_reopen(mfd, O_DIRECTORY|O_CLOEXEC);
if (dfd < 0)
return dfd;
const char *where = workspace_mounted ? workspace : final;
(void) label_fix_full(AT_FDCWD, where, final, 0);
r = acquire_credentials(context, cgroup_context, params, unit, where, uid, gid, workspace_mounted);
if (r < 0) {
/* If we're using final place as workspace, and failed to acquire credentials, we might
* have left half-written creds there. Let's get rid of the whole mount, so future
* calls won't reuse it. */
if (final_mounted)
(void) umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
(void) label_fix_full(dfd, /* inode_path = */ NULL, cred_dir, /* flags = */ 0);
r = acquire_credentials(context, cgroup_context, params, unit, dfd, uid, gid, /* ownership_ok = */ true);
if (r < 0)
return r;
}
if (workspace_mounted) {
if (!final_mounted) {
/* Make workspace read-only now, so that any bind mount we make from it defaults to
* read-only too */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
if (r < 0)
return r;
r = credentials_dir_finalize_permissions(dfd, uid, gid, /* ownership_ok = */ true);
if (r < 0)
return log_debug_errno(r, "Failed to adjust ACLs of credentials dir: %m");
/* And mount it to the final place, read-only */
r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
} else
/* Otherwise we just get rid of the bind mount of final place */
r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
// Work around a kernel bug that results in tmpfs reconfiguration failure.
// FIXME: drop this once https://lore.kernel.org/linux-fsdevel/20251108190930.440685-1-me@yhndnzj.com/
// is merged and hits the distro kernels.
(void) fsconfig(fs_fd, FSCONFIG_SET_FLAG, "noswap", NULL, 0);
if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, "ro", NULL, 0) < 0)
return -errno;
if (fsconfig(fs_fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0) < 0)
return -errno;
log_debug("Successfully reconfigured credentials fs to be read only.");
if (dir_mounted) {
/* Firstly, try to move beneath the existing mount, which guarantees strictly atomic replacement
* (needs kernel >= 6.5) */
r = move_mount(mfd, "", AT_FDCWD, cred_dir, MOVE_MOUNT_F_EMPTY_PATH|MOVE_MOUNT_BENEATH);
if (r >= 0)
return umount_verbose(LOG_DEBUG, cred_dir, MNT_DETACH|UMOUNT_NOFOLLOW);
if (errno != EINVAL)
return log_debug_errno(errno, "Failed to move credentials fs into place: %m");
log_debug_errno(errno, "Unable to move credentials fs beneath existing mount '%s', unmounting instead: %m",
cred_dir);
r = umount_verbose(LOG_DEBUG, cred_dir, MNT_DETACH|UMOUNT_NOFOLLOW);
if (r < 0)
return r;
} else {
_cleanup_free_ char *parent = NULL;
/* If we do not have our own mount put used the plain directory fallback, then we need to
* open access to the top-level credential directory and the per-service directory now */
r = path_extract_directory(final, &parent);
if (r < 0)
return r;
if (chmod(parent, 0755) < 0)
return -errno;
}
r = move_mount(mfd, "", AT_FDCWD, cred_dir, MOVE_MOUNT_F_EMPTY_PATH);
if (r < 0)
return log_debug_errno(errno, "Failed to move credentials fs into place: %m");
return 0;
}
@@ -1136,96 +1128,12 @@ int exec_setup_credentials(
if (r < 0 && r != -EEXIST)
return r;
r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
if (r < 0) {
_cleanup_(rmdir_and_freep) char *u = NULL; /* remove the temporary workspace if we can */
_cleanup_free_ char *t = NULL;
/* If this is not a privilege or support issue then propagate the error */
if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
return r;
/* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
* it into place, so that users can't access half-initialized credential stores. */
t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
if (!t)
return -ENOMEM;
/* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
* directory outside of /run/credentials/ first, and then move it over to /run/credentials/
* after it is fully set up */
u = path_join(t, unit);
if (!u)
return -ENOMEM;
FOREACH_STRING(i, t, u) {
r = mkdir_label(i, 0700);
if (r < 0 && r != -EEXIST)
return log_debug_errno(r, "Failed to make directory '%s': %m", i);
}
r = setup_credentials_internal(
context,
cgroup_context,
params,
unit,
p, /* final mount point */
u, /* temporary workspace to overmount */
true, /* reuse the workspace if it is already a mount */
false, /* it's OK to fall back to a plain directory if we can't mount anything */
uid,
gid);
if (r < 0)
return r;
} else if (r == 0) {
/* We managed to set up a mount namespace, and are now in a child. That's great. In this case
* we can use the same directory for all cases, after turning off propagation. Question
* though is: where do we turn off propagation exactly, and where do we place the workspace
* directory? We need some place that is guaranteed to be a mount point in the host, and
* which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
* since we ultimately want to move the resulting file system there, i.e. we need propagation
* for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
* would be visible in the host mount table all the time, which we want to avoid. Hence, what
* we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
* /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
* propagation on the former, and then overmount the latter.
*
* Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
* for this purpose, but there are few other candidates that work equally well for us, and
* given that we do this in a privately namespaced short-lived single-threaded process that
* no one else sees this should be OK to do. */
/* Turn off propagation from our namespace to host */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
if (r < 0)
goto child_fail;
r = setup_credentials_internal(
context,
cgroup_context,
params,
unit,
p, /* final mount point */
"/dev/shm", /* temporary workspace to overmount */
false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
true, /* insist that something is mounted, do not allow fallback to plain directory */
uid,
gid);
if (r < 0)
goto child_fail;
_exit(EXIT_SUCCESS);
child_fail:
_exit(EXIT_FAILURE);
}
r = setup_credentials_internal(context, cgroup_context, params, unit, p, uid, gid);
/* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
* try to remove it. This matters in particular if we created the dir as mount point but then didn't
* actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
* seen by users when trying access this inode. */
(void) rmdir(p);
return 0;
return r;
}

View File

@@ -103,7 +103,7 @@ static int acquire_credential_directory(ImportCredentialsContext *c, const char
(void) mount_nofollow_verbose(LOG_WARNING, NULL, path, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
else if (with_mount)
/* If not a mount point yet, and the credentials are not encrypted, then let's try to mount a no-swap fs there */
(void) mount_credentials_fs(path, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
(void) mount_credentials_fs(path);
c->target_dir_fd = open(path, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
if (c->target_dir_fd < 0)

View File

@@ -682,16 +682,12 @@ int fd_acl_make_read_only(int fd) {
r = dlopen_libacl();
if (r < 0)
return r;
goto maybe_fallback;
acl = sym_acl_get_fd(fd);
if (!acl) {
if (!ERRNO_IS_NOT_SUPPORTED(errno))
return -errno;
/* No ACLs? Then just update the regular mode_t */
return fd_acl_make_read_only_fallback(fd);
r = -errno;
goto maybe_fallback;
}
for (r = sym_acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
@@ -729,75 +725,18 @@ int fd_acl_make_read_only(int fd) {
return 0;
if (sym_acl_set_fd(fd, acl) < 0) {
if (!ERRNO_IS_NOT_SUPPORTED(errno))
return -errno;
return fd_acl_make_read_only_fallback(fd);
r = -errno;
goto maybe_fallback;
}
return 1;
}
int fd_acl_make_writable(int fd) {
_cleanup_(acl_freep) acl_t acl = NULL;
acl_entry_t i;
int r;
/* Safely adds the writable bit to the owner's ACL entry of this inode. (And only the owner's! This
* not the obvious inverse of fd_acl_make_read_only() hence!) */
r = dlopen_libacl();
if (r < 0)
maybe_fallback:
if (!ERRNO_IS_NEG_NOT_SUPPORTED(r))
return r;
acl = sym_acl_get_fd(fd);
if (!acl) {
if (!ERRNO_IS_NOT_SUPPORTED(errno))
return -errno;
/* No ACLs? Then just update the regular mode_t */
return fd_acl_make_writable_fallback(fd);
}
for (r = sym_acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
r > 0;
r = sym_acl_get_entry(acl, ACL_NEXT_ENTRY, &i)) {
acl_permset_t permset;
acl_tag_t tag;
int b;
if (sym_acl_get_tag_type(i, &tag) < 0)
return -errno;
if (tag != ACL_USER_OBJ)
continue;
if (sym_acl_get_permset(i, &permset) < 0)
return -errno;
b = sym_acl_get_perm(permset, ACL_WRITE);
if (b < 0)
return -errno;
if (b)
return 0; /* Already set? Then there's nothing to do. */
if (sym_acl_add_perm(permset, ACL_WRITE) < 0)
return -errno;
break;
}
if (r < 0)
return -errno;
if (sym_acl_set_fd(fd, acl) < 0) {
if (!ERRNO_IS_NOT_SUPPORTED(errno))
return -errno;
return fd_acl_make_writable_fallback(fd);
}
return 1;
/* No ACLs? Then just update the regular mode_t */
return fd_acl_make_read_only_fallback(fd);
}
#endif
@@ -818,23 +757,6 @@ int fd_acl_make_read_only_fallback(int fd) {
return 1;
}
int fd_acl_make_writable_fallback(int fd) {
struct stat st;
assert(fd >= 0);
if (fstat(fd, &st) < 0)
return -errno;
if ((st.st_mode & 0200) != 0) /* already set */
return 0;
if (fchmod(fd, (st.st_mode & 07777) | 0200) < 0)
return -errno;
return 1;
}
int inode_type_can_acl(mode_t mode) {
return IN_SET(mode & S_IFMT, S_IFSOCK, S_IFREG, S_IFBLK, S_IFCHR, S_IFDIR, S_IFIFO);
}

View File

@@ -4,7 +4,6 @@
#include "shared-forward.h"
int fd_acl_make_read_only_fallback(int fd);
int fd_acl_make_writable_fallback(int fd);
#if HAVE_ACL
#include <acl/libacl.h> /* IWYU pragma: export */
@@ -56,7 +55,6 @@ int acls_for_file(const char *path, acl_type_t type, acl_t new, acl_t *ret);
int fd_add_uid_acl_permission(int fd, uid_t uid, unsigned mask);
int fd_acl_make_read_only(int fd);
int fd_acl_make_writable(int fd);
/* acl_free() takes multiple argument types. Multiple cleanup functions are necessary. */
DEFINE_TRIVIAL_CLEANUP_FUNC_FULL_RENAME(acl_t, sym_acl_free, acl_freep, NULL);
@@ -89,10 +87,6 @@ static inline int fd_add_uid_acl_permission(int fd, uid_t uid, unsigned mask) {
static inline int fd_acl_make_read_only(int fd) {
return fd_acl_make_read_only_fallback(fd);
}
static inline int fd_acl_make_writable(int fd) {
return fd_acl_make_writable_fallback(fd);
}
#endif
int inode_type_can_acl(mode_t mode);

View File

@@ -9,7 +9,7 @@
#define CREDENTIAL_NAME_MAX FDNAME_MAX
/* Put a size limit on the individual credential */
#define CREDENTIAL_SIZE_MAX (1024U*1024U)
#define CREDENTIAL_SIZE_MAX (1U * U64_MB)
/* Refuse to store more than 1M per service, after all this is unswappable memory. Note that for now we put
* this to the same limit as the per-credential limit, i.e. if the user has n > 1 credentials instead of 1 it
@@ -18,7 +18,7 @@
/* Put a size limit on encrypted credentials (which is the same as the unencrypted size plus a spacious 128K of extra
* space for headers, IVs, exported TPM2 key material and so on. */
#define CREDENTIAL_ENCRYPTED_SIZE_MAX (CREDENTIAL_SIZE_MAX + 128U*1024U)
#define CREDENTIAL_ENCRYPTED_SIZE_MAX (CREDENTIAL_SIZE_MAX + 128U * U64_KB)
bool credential_name_valid(const char *s);
bool credential_glob_valid(const char *s);

View File

@@ -7,6 +7,7 @@
#include "alloc-util.h"
#include "chase.h"
#include "creds-util.h"
#include "dissect-image.h"
#include "errno-util.h"
#include "extract-word.h"
@@ -1831,58 +1832,65 @@ unsigned long credentials_fs_mount_flags(bool ro) {
return MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported()|(ro ? MS_RDONLY : 0);
}
int mount_credentials_fs(const char *path, size_t size, bool ro) {
_cleanup_free_ char *opts = NULL;
int r, noswap_supported;
int fsmount_credentials_fs(int *ret_fsfd) {
_cleanup_close_ int fs_fd = -EBADF;
char size_str[DECIMAL_STR_MAX(uint64_t)];
/* Mounts a file system we can place credentials in, i.e. with tight access modes right from the
* beginning, and ideally swapping turned off. In order of preference:
*
* 1. tmpfs if it supports "noswap"
* 1. tmpfs if it supports "noswap" (needs kernel >= 6.3)
* 2. ramfs
* 3. tmpfs if it doesn't support "noswap"
* 3. tmpfs without "noswap"
*/
noswap_supported = mount_option_supported("tmpfs", "noswap", NULL); /* Check explicitly to avoid kmsg noise */
if (noswap_supported > 0) {
_cleanup_free_ char *noswap_opts = NULL;
fs_fd = fsopen("tmpfs", FSOPEN_CLOEXEC);
if (fs_fd < 0)
return -errno;
if (asprintf(&noswap_opts, "mode=0700,nr_inodes=1024,size=%zu,noswap", size) < 0)
return -ENOMEM;
if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "nr_inodes", "1024", 0) < 0)
return -errno;
/* Best case: tmpfs with noswap (needs kernel >= 6.3) */
xsprintf(size_str, "%" PRIu64, CREDENTIALS_TOTAL_SIZE_MAX);
if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "size", size_str, 0) < 0)
return -errno;
r = mount_nofollow_verbose(
LOG_DEBUG,
"tmpfs",
path,
"tmpfs",
credentials_fs_mount_flags(ro),
noswap_opts);
if (r >= 0)
return r;
if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, "noswap", NULL, 0) < 0) {
if (errno != EINVAL)
return -errno;
int ramfs_fd = fsopen("ramfs", FSOPEN_CLOEXEC);
if (ramfs_fd >= 0)
close_and_replace(fs_fd, ramfs_fd);
}
r = mount_nofollow_verbose(
LOG_DEBUG,
"ramfs",
path,
"ramfs",
credentials_fs_mount_flags(ro),
"mode=0700");
if (r >= 0)
return r;
if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "mode", "0700", 0) < 0)
return -errno;
if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", size) < 0)
return -ENOMEM;
if (fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
return -errno;
return mount_nofollow_verbose(
LOG_DEBUG,
"tmpfs",
path,
"tmpfs",
credentials_fs_mount_flags(ro),
opts);
int mfd = fsmount(fs_fd, FSMOUNT_CLOEXEC,
ms_flags_to_mount_attr(credentials_fs_mount_flags(/* ro = */ false)));
if (mfd < 0)
return -errno;
if (ret_fsfd)
*ret_fsfd = TAKE_FD(fs_fd);
return mfd;
}
int mount_credentials_fs(const char *path) {
_cleanup_close_ int mfd = -EBADF;
assert(path);
mfd = fsmount_credentials_fs(/* ret_fsfd = */ NULL);
if (mfd < 0)
return mfd;
return RET_NERRNO(move_mount(mfd, "", AT_FDCWD, path, MOVE_MOUNT_F_EMPTY_PATH));
}
int make_fsmount(

View File

@@ -159,7 +159,8 @@ int make_mount_point_inode_from_path(const char *source, const char *dest, mode_
int trigger_automount_at(int dir_fd, const char *path);
unsigned long credentials_fs_mount_flags(bool ro);
int mount_credentials_fs(const char *path, size_t size, bool ro);
int fsmount_credentials_fs(int *ret_fsfd);
int mount_credentials_fs(const char *path);
int make_fsmount(int error_log_level, const char *what, const char *type, unsigned long flags, const char *options, int userns_fd);

View File

@@ -112,30 +112,6 @@ TEST_RET(fd_acl_make_read_only) {
cmd = strjoina("stat ", fn);
ASSERT_OK_ZERO_ERRNO(system(cmd));
log_info("writable");
ASSERT_OK_POSITIVE(fd_acl_make_writable(fd));
ASSERT_OK_ERRNO(fstat(fd, &st));
ASSERT_EQ(st.st_mode & 0222, 0200u);
cmd = strjoina("getfacl -p ", fn);
ASSERT_OK_ZERO_ERRNO(system(cmd));
cmd = strjoina("stat ", fn);
ASSERT_OK_ZERO_ERRNO(system(cmd));
log_info("read-only");
ASSERT_OK_POSITIVE(fd_acl_make_read_only(fd));
ASSERT_OK_ERRNO(fstat(fd, &st));
ASSERT_EQ(st.st_mode & 0222, 0000u);
cmd = strjoina("getfacl -p ", fn);
ASSERT_OK_ZERO_ERRNO(system(cmd));
cmd = strjoina("stat ", fn);
ASSERT_OK_ZERO_ERRNO(system(cmd));
return 0;
}