mirror of
https://github.com/morgan9e/systemd
synced 2026-04-15 00:47:10 +09:00
core/exec-credentials: port to new mount API, ensure atomicity for creds installation (#39637)
This commit is contained in:
@@ -29,7 +29,6 @@
|
||||
#include "siphash24.h"
|
||||
#include "stat-util.h"
|
||||
#include "strv.h"
|
||||
#include "tmpfile-util.h"
|
||||
#include "user-util.h"
|
||||
|
||||
ExecSetCredential* exec_set_credential_free(ExecSetCredential *sc) {
|
||||
@@ -320,7 +319,6 @@ static int write_credential(
|
||||
gid_t gid,
|
||||
bool ownership_ok) {
|
||||
|
||||
_cleanup_free_ char *tmp = NULL;
|
||||
_cleanup_close_ int fd = -EBADF;
|
||||
int r;
|
||||
|
||||
@@ -328,16 +326,10 @@ static int write_credential(
|
||||
assert(id);
|
||||
assert(data || size == 0);
|
||||
|
||||
r = tempfn_random_child("", "cred", &tmp);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
|
||||
fd = openat(dfd, id, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC, 0600);
|
||||
if (fd < 0)
|
||||
return -errno;
|
||||
|
||||
CLEANUP_TMPFILE_AT(dfd, tmp);
|
||||
|
||||
r = loop_write(fd, data, size);
|
||||
if (r < 0)
|
||||
return r;
|
||||
@@ -359,11 +351,6 @@ static int write_credential(
|
||||
return r;
|
||||
}
|
||||
|
||||
r = RET_NERRNO(renameat(dfd, tmp, dfd, id));
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
tmp = mfree(tmp); /* disarm CLEANUP_TMPFILE_AT() */
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -426,36 +413,20 @@ static int credential_search_path(const ExecParameters *params, CredentialSearch
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool device_nodes_restricted(
|
||||
const ExecContext *c,
|
||||
const CGroupContext *cgroup_context) {
|
||||
|
||||
assert(c);
|
||||
assert(cgroup_context);
|
||||
|
||||
/* Returns true if we have any reason to believe we might not be able to access the TPM device
|
||||
* directly, even if we run as root/PID 1. This could be because /dev/ is replaced by a private
|
||||
* version, or because a device node access list is configured. */
|
||||
|
||||
if (c->private_devices)
|
||||
return true;
|
||||
|
||||
if (cgroup_context_has_device_policy(cgroup_context))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
struct load_cred_args {
|
||||
const ExecContext *context;
|
||||
const CGroupContext *cgroup_context;
|
||||
const ExecParameters *params;
|
||||
const char *unit;
|
||||
|
||||
bool always_ipc;
|
||||
|
||||
bool encrypted;
|
||||
|
||||
int write_dfd;
|
||||
uid_t uid;
|
||||
gid_t gid;
|
||||
bool ownership_ok;
|
||||
|
||||
uint64_t left;
|
||||
};
|
||||
|
||||
@@ -486,7 +457,7 @@ static int maybe_decrypt_and_write_credential(
|
||||
|
||||
flags |= CREDENTIAL_ANY_SCOPE;
|
||||
|
||||
if (!device_nodes_restricted(args->context, args->cgroup_context)) {
|
||||
if (!args->always_ipc) {
|
||||
r = decrypt_credential_and_warn(
|
||||
id,
|
||||
now(CLOCK_REALTIME),
|
||||
@@ -787,38 +758,49 @@ static int load_cred_recurse_dir_cb(
|
||||
return RECURSE_DIR_CONTINUE;
|
||||
}
|
||||
|
||||
static bool device_nodes_restricted(
|
||||
const ExecContext *c,
|
||||
const CGroupContext *cgroup_context) {
|
||||
|
||||
assert(c);
|
||||
assert(cgroup_context);
|
||||
|
||||
/* Returns true if we have any reason to believe we might not be able to access the TPM device
|
||||
* directly, even if we run as root/PID 1. This could be because /dev/ is replaced by a private
|
||||
* version, or because a device node access list is configured. */
|
||||
|
||||
if (c->private_devices)
|
||||
return true;
|
||||
|
||||
if (cgroup_context_has_device_policy(cgroup_context))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static int acquire_credentials(
|
||||
const ExecContext *context,
|
||||
const CGroupContext *cgroup_context,
|
||||
const ExecParameters *params,
|
||||
const char *unit,
|
||||
const char *p,
|
||||
int dfd,
|
||||
uid_t uid,
|
||||
gid_t gid,
|
||||
bool ownership_ok) {
|
||||
|
||||
_cleanup_close_ int dfd = -EBADF;
|
||||
int r;
|
||||
|
||||
assert(context);
|
||||
assert(cgroup_context);
|
||||
assert(params);
|
||||
assert(unit);
|
||||
assert(p);
|
||||
|
||||
dfd = open(p, O_DIRECTORY|O_CLOEXEC);
|
||||
if (dfd < 0)
|
||||
return -errno;
|
||||
|
||||
r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
|
||||
if (r < 0)
|
||||
return r;
|
||||
assert(dfd >= 0);
|
||||
|
||||
struct load_cred_args args = {
|
||||
.context = context,
|
||||
.cgroup_context = cgroup_context,
|
||||
.params = params,
|
||||
.unit = unit,
|
||||
.always_ipc = device_nodes_restricted(context, cgroup_context),
|
||||
.write_dfd = dfd,
|
||||
.uid = uid,
|
||||
.gid = gid,
|
||||
@@ -919,7 +901,15 @@ static int acquire_credentials(
|
||||
return r;
|
||||
}
|
||||
|
||||
r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int credentials_dir_finalize_permissions(int dfd, uid_t uid, gid_t gid, bool ownership_ok) {
|
||||
int r;
|
||||
|
||||
assert(dfd >= 0);
|
||||
|
||||
r = fd_acl_make_read_only(dfd); /* Take away the "w" bit */
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
@@ -943,157 +933,159 @@ static int acquire_credentials(
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int setup_credentials_plain_dir(
|
||||
const ExecContext *context,
|
||||
const CGroupContext *cgroup_context,
|
||||
const ExecParameters *params,
|
||||
const char *unit,
|
||||
const char *cred_dir,
|
||||
uid_t uid,
|
||||
gid_t gid) {
|
||||
|
||||
_cleanup_free_ char *t = NULL, *workspace = NULL;
|
||||
_cleanup_(rm_rf_safep) const char *workspace_rm = NULL;
|
||||
_cleanup_close_ int dfd = -EBADF;
|
||||
int r;
|
||||
|
||||
assert(context);
|
||||
assert(params);
|
||||
assert(unit);
|
||||
assert(cred_dir);
|
||||
|
||||
/* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
|
||||
* it into place, so that users can't access half-initialized credential stores. */
|
||||
t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
|
||||
if (!t)
|
||||
return -ENOMEM;
|
||||
|
||||
r = mkdir_label(t, 0700);
|
||||
if (r < 0 && r != -EEXIST)
|
||||
return r;
|
||||
|
||||
workspace = path_join(t, unit);
|
||||
if (!workspace)
|
||||
return -ENOMEM;
|
||||
|
||||
dfd = open_mkdir(workspace, O_CLOEXEC|O_EXCL, 0700);
|
||||
if (dfd < 0)
|
||||
return log_debug_errno(dfd, "Failed to create workspace for credentials: %m");
|
||||
workspace_rm = workspace;
|
||||
|
||||
(void) label_fix_full(dfd, /* inode_path = */ NULL, cred_dir, /* flags = */ 0);
|
||||
|
||||
r = acquire_credentials(context, cgroup_context, params, unit, dfd, uid, gid, /* ownership_ok = */ false);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = RET_NERRNO(rename(workspace, cred_dir));
|
||||
if (r >= 0)
|
||||
workspace_rm = NULL;
|
||||
if (r == -EEXIST) {
|
||||
log_debug_errno(r, "Credential dir '%s' already populated, exchanging with workspace.", cred_dir);
|
||||
r = RET_NERRNO(renameat2(AT_FDCWD, workspace, AT_FDCWD, cred_dir, RENAME_EXCHANGE));
|
||||
}
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to move credentials workspace into place: %m");
|
||||
|
||||
/* rename() requires both the source and target to be writable, hence lock down write permission
|
||||
* as last step. */
|
||||
r = credentials_dir_finalize_permissions(dfd, uid, gid, /* ownership_ok = */ false);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to adjust ACLs of credentials dir: %m");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int setup_credentials_internal(
|
||||
const ExecContext *context,
|
||||
const CGroupContext *cgroup_context,
|
||||
const ExecParameters *params,
|
||||
const char *unit,
|
||||
const char *final, /* This is where the credential store shall eventually end up at */
|
||||
const char *workspace, /* This is where we can prepare it before moving it to the final place */
|
||||
bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
|
||||
bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
|
||||
const char *cred_dir,
|
||||
uid_t uid,
|
||||
gid_t gid) {
|
||||
|
||||
bool final_mounted;
|
||||
int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
|
||||
* if we mounted something; false if we definitely can't mount anything */
|
||||
_cleanup_close_ int fs_fd = -EBADF, mfd = -EBADF, dfd = -EBADF;
|
||||
bool dir_mounted;
|
||||
int r;
|
||||
|
||||
assert(context);
|
||||
assert(params);
|
||||
assert(unit);
|
||||
assert(final);
|
||||
assert(workspace);
|
||||
assert(cred_dir);
|
||||
|
||||
r = path_is_mount_point(final);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to determine if '%s' is a mountpoint: %m", final);
|
||||
final_mounted = r > 0;
|
||||
|
||||
if (final_mounted) {
|
||||
if (FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) {
|
||||
r = umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
final_mounted = false;
|
||||
} else {
|
||||
/* We can reuse the previous credential dir */
|
||||
r = dir_is_empty(final, /* ignore_hidden_or_backup = */ false);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0) {
|
||||
log_debug("Credential dir for unit '%s' already set up, skipping.", unit);
|
||||
return 0;
|
||||
}
|
||||
if (!FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) {
|
||||
/* We may reuse the previous credential dir */
|
||||
r = dir_is_empty(cred_dir, /* ignore_hidden_or_backup = */ false);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0) {
|
||||
log_debug("Credential dir for unit '%s' already set up, skipping.", unit);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (reuse_workspace) {
|
||||
r = path_is_mount_point(workspace);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r > 0)
|
||||
workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse
|
||||
* it, let's keep this in mind */
|
||||
else
|
||||
workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
|
||||
} else
|
||||
workspace_mounted = -1; /* ditto */
|
||||
r = path_is_mount_point(cred_dir);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to determine if '%s' is a mountpoint: %m", cred_dir);
|
||||
dir_mounted = r > 0;
|
||||
|
||||
/* If both the final place and the workspace are mounted, we have no mounts to set up, based on
|
||||
* the assumption that they're actually the same tmpfs (but the latter with MS_RDONLY different).
|
||||
* If the workspace is not mounted, we just bind the final place over and make it writable. */
|
||||
must_mount = must_mount || final_mounted;
|
||||
|
||||
if (workspace_mounted < 0) {
|
||||
if (!final_mounted)
|
||||
/* Nothing is mounted on the workspace yet, let's try to mount a new tmpfs if
|
||||
* not using the final place. */
|
||||
r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
|
||||
if (final_mounted || r < 0) {
|
||||
/* If using final place or failed to mount new tmpfs, make a bind mount from
|
||||
* the final to the workspace, so that we can make it writable there. */
|
||||
r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
|
||||
if (r < 0) {
|
||||
if (!ERRNO_IS_PRIVILEGE(r))
|
||||
/* Propagate anything that isn't a permission problem. */
|
||||
return r;
|
||||
|
||||
if (must_mount)
|
||||
/* If it's not OK to use the plain directory fallback, propagate all
|
||||
* errors too. */
|
||||
return r;
|
||||
|
||||
/* If we lack privileges to bind mount stuff, then let's gracefully proceed
|
||||
* for compat with container envs, and just use the final dir as is.
|
||||
* Final place must not be mounted in this case (refused by must_mount
|
||||
* above) */
|
||||
|
||||
workspace_mounted = false;
|
||||
} else {
|
||||
/* Make the new bind mount writable (i.e. drop MS_RDONLY) */
|
||||
r = mount_nofollow_verbose(LOG_DEBUG,
|
||||
NULL,
|
||||
workspace,
|
||||
NULL,
|
||||
MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false),
|
||||
NULL);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
workspace_mounted = true;
|
||||
}
|
||||
} else
|
||||
workspace_mounted = true;
|
||||
mfd = fsmount_credentials_fs(&fs_fd);
|
||||
if (ERRNO_IS_NEG_PRIVILEGE(mfd) && !dir_mounted) {
|
||||
log_debug_errno(mfd, "Lacking privilege to mount credentials fs, falling back to plain directory.");
|
||||
return setup_credentials_plain_dir(context, cgroup_context, params, unit, cred_dir, uid, gid);
|
||||
}
|
||||
if (mfd < 0)
|
||||
return log_debug_errno(mfd, "Failed to mount credentials fs: %m");
|
||||
|
||||
assert(workspace_mounted >= 0);
|
||||
assert(!must_mount || workspace_mounted);
|
||||
dfd = fd_reopen(mfd, O_DIRECTORY|O_CLOEXEC);
|
||||
if (dfd < 0)
|
||||
return dfd;
|
||||
|
||||
const char *where = workspace_mounted ? workspace : final;
|
||||
|
||||
(void) label_fix_full(AT_FDCWD, where, final, 0);
|
||||
|
||||
r = acquire_credentials(context, cgroup_context, params, unit, where, uid, gid, workspace_mounted);
|
||||
if (r < 0) {
|
||||
/* If we're using final place as workspace, and failed to acquire credentials, we might
|
||||
* have left half-written creds there. Let's get rid of the whole mount, so future
|
||||
* calls won't reuse it. */
|
||||
if (final_mounted)
|
||||
(void) umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
|
||||
(void) label_fix_full(dfd, /* inode_path = */ NULL, cred_dir, /* flags = */ 0);
|
||||
|
||||
r = acquire_credentials(context, cgroup_context, params, unit, dfd, uid, gid, /* ownership_ok = */ true);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
if (workspace_mounted) {
|
||||
if (!final_mounted) {
|
||||
/* Make workspace read-only now, so that any bind mount we make from it defaults to
|
||||
* read-only too */
|
||||
r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
|
||||
if (r < 0)
|
||||
return r;
|
||||
r = credentials_dir_finalize_permissions(dfd, uid, gid, /* ownership_ok = */ true);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to adjust ACLs of credentials dir: %m");
|
||||
|
||||
/* And mount it to the final place, read-only */
|
||||
r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
|
||||
} else
|
||||
/* Otherwise we just get rid of the bind mount of final place */
|
||||
r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
|
||||
// Work around a kernel bug that results in tmpfs reconfiguration failure.
|
||||
// FIXME: drop this once https://lore.kernel.org/linux-fsdevel/20251108190930.440685-1-me@yhndnzj.com/
|
||||
// is merged and hits the distro kernels.
|
||||
(void) fsconfig(fs_fd, FSCONFIG_SET_FLAG, "noswap", NULL, 0);
|
||||
|
||||
if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, "ro", NULL, 0) < 0)
|
||||
return -errno;
|
||||
|
||||
if (fsconfig(fs_fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0) < 0)
|
||||
return -errno;
|
||||
|
||||
log_debug("Successfully reconfigured credentials fs to be read only.");
|
||||
|
||||
if (dir_mounted) {
|
||||
/* Firstly, try to move beneath the existing mount, which guarantees strictly atomic replacement
|
||||
* (needs kernel >= 6.5) */
|
||||
r = move_mount(mfd, "", AT_FDCWD, cred_dir, MOVE_MOUNT_F_EMPTY_PATH|MOVE_MOUNT_BENEATH);
|
||||
if (r >= 0)
|
||||
return umount_verbose(LOG_DEBUG, cred_dir, MNT_DETACH|UMOUNT_NOFOLLOW);
|
||||
if (errno != EINVAL)
|
||||
return log_debug_errno(errno, "Failed to move credentials fs into place: %m");
|
||||
|
||||
log_debug_errno(errno, "Unable to move credentials fs beneath existing mount '%s', unmounting instead: %m",
|
||||
cred_dir);
|
||||
|
||||
r = umount_verbose(LOG_DEBUG, cred_dir, MNT_DETACH|UMOUNT_NOFOLLOW);
|
||||
if (r < 0)
|
||||
return r;
|
||||
} else {
|
||||
_cleanup_free_ char *parent = NULL;
|
||||
|
||||
/* If we do not have our own mount but used the plain directory fallback, then we need to
|
||||
* open access to the top-level credential directory and the per-service directory now */
|
||||
|
||||
r = path_extract_directory(final, &parent);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (chmod(parent, 0755) < 0)
|
||||
return -errno;
|
||||
}
|
||||
|
||||
r = move_mount(mfd, "", AT_FDCWD, cred_dir, MOVE_MOUNT_F_EMPTY_PATH);
|
||||
if (r < 0)
|
||||
return log_debug_errno(errno, "Failed to move credentials fs into place: %m");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1136,96 +1128,12 @@ int exec_setup_credentials(
|
||||
if (r < 0 && r != -EEXIST)
|
||||
return r;
|
||||
|
||||
r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
|
||||
if (r < 0) {
|
||||
_cleanup_(rmdir_and_freep) char *u = NULL; /* remove the temporary workspace if we can */
|
||||
_cleanup_free_ char *t = NULL;
|
||||
|
||||
/* If this is not a privilege or support issue then propagate the error */
|
||||
if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
|
||||
return r;
|
||||
|
||||
/* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
|
||||
* it into place, so that users can't access half-initialized credential stores. */
|
||||
t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
|
||||
if (!t)
|
||||
return -ENOMEM;
|
||||
|
||||
/* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
|
||||
* directory outside of /run/credentials/ first, and then move it over to /run/credentials/
|
||||
* after it is fully set up */
|
||||
u = path_join(t, unit);
|
||||
if (!u)
|
||||
return -ENOMEM;
|
||||
|
||||
FOREACH_STRING(i, t, u) {
|
||||
r = mkdir_label(i, 0700);
|
||||
if (r < 0 && r != -EEXIST)
|
||||
return log_debug_errno(r, "Failed to make directory '%s': %m", i);
|
||||
}
|
||||
|
||||
r = setup_credentials_internal(
|
||||
context,
|
||||
cgroup_context,
|
||||
params,
|
||||
unit,
|
||||
p, /* final mount point */
|
||||
u, /* temporary workspace to overmount */
|
||||
true, /* reuse the workspace if it is already a mount */
|
||||
false, /* it's OK to fall back to a plain directory if we can't mount anything */
|
||||
uid,
|
||||
gid);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
} else if (r == 0) {
|
||||
|
||||
/* We managed to set up a mount namespace, and are now in a child. That's great. In this case
|
||||
* we can use the same directory for all cases, after turning off propagation. Question
|
||||
* though is: where do we turn off propagation exactly, and where do we place the workspace
|
||||
* directory? We need some place that is guaranteed to be a mount point in the host, and
|
||||
* which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
|
||||
* since we ultimately want to move the resulting file system there, i.e. we need propagation
|
||||
* for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
|
||||
* would be visible in the host mount table all the time, which we want to avoid. Hence, what
|
||||
* we do here instead is use /dev/ and /dev/shm/ for our purposes. We know for sure that
|
||||
* /dev/ is a mount point and we know for sure that /dev/shm/ exists. Hence we can turn off
|
||||
* propagation on the former, and then overmount the latter.
|
||||
*
|
||||
* Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
|
||||
* for this purpose, but there are few other candidates that work equally well for us, and
|
||||
* given that we do this in a privately namespaced short-lived single-threaded process that
|
||||
* no one else sees this should be OK to do. */
|
||||
|
||||
/* Turn off propagation from our namespace to host */
|
||||
r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
|
||||
if (r < 0)
|
||||
goto child_fail;
|
||||
|
||||
r = setup_credentials_internal(
|
||||
context,
|
||||
cgroup_context,
|
||||
params,
|
||||
unit,
|
||||
p, /* final mount point */
|
||||
"/dev/shm", /* temporary workspace to overmount */
|
||||
false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
|
||||
true, /* insist that something is mounted, do not allow fallback to plain directory */
|
||||
uid,
|
||||
gid);
|
||||
if (r < 0)
|
||||
goto child_fail;
|
||||
|
||||
_exit(EXIT_SUCCESS);
|
||||
|
||||
child_fail:
|
||||
_exit(EXIT_FAILURE);
|
||||
}
|
||||
r = setup_credentials_internal(context, cgroup_context, params, unit, p, uid, gid);
|
||||
|
||||
/* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
|
||||
* try to remove it. This matters in particular if we created the dir as mount point but then didn't
|
||||
* actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCES being
|
||||
* seen by users when trying to access this inode. */
|
||||
(void) rmdir(p);
|
||||
return 0;
|
||||
return r;
|
||||
}
|
||||
|
||||
@@ -103,7 +103,7 @@ static int acquire_credential_directory(ImportCredentialsContext *c, const char
|
||||
(void) mount_nofollow_verbose(LOG_WARNING, NULL, path, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
|
||||
else if (with_mount)
|
||||
/* If not a mount point yet, and the credentials are not encrypted, then let's try to mount a no-swap fs there */
|
||||
(void) mount_credentials_fs(path, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
|
||||
(void) mount_credentials_fs(path);
|
||||
|
||||
c->target_dir_fd = open(path, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
|
||||
if (c->target_dir_fd < 0)
|
||||
|
||||
@@ -682,16 +682,12 @@ int fd_acl_make_read_only(int fd) {
|
||||
|
||||
r = dlopen_libacl();
|
||||
if (r < 0)
|
||||
return r;
|
||||
goto maybe_fallback;
|
||||
|
||||
acl = sym_acl_get_fd(fd);
|
||||
if (!acl) {
|
||||
|
||||
if (!ERRNO_IS_NOT_SUPPORTED(errno))
|
||||
return -errno;
|
||||
|
||||
/* No ACLs? Then just update the regular mode_t */
|
||||
return fd_acl_make_read_only_fallback(fd);
|
||||
r = -errno;
|
||||
goto maybe_fallback;
|
||||
}
|
||||
|
||||
for (r = sym_acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
|
||||
@@ -729,75 +725,18 @@ int fd_acl_make_read_only(int fd) {
|
||||
return 0;
|
||||
|
||||
if (sym_acl_set_fd(fd, acl) < 0) {
|
||||
if (!ERRNO_IS_NOT_SUPPORTED(errno))
|
||||
return -errno;
|
||||
|
||||
return fd_acl_make_read_only_fallback(fd);
|
||||
r = -errno;
|
||||
goto maybe_fallback;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int fd_acl_make_writable(int fd) {
|
||||
_cleanup_(acl_freep) acl_t acl = NULL;
|
||||
acl_entry_t i;
|
||||
int r;
|
||||
|
||||
/* Safely adds the writable bit to the owner's ACL entry of this inode. (And only the owner's! – This
|
||||
* is not the obvious inverse of fd_acl_make_read_only() hence!) */
|
||||
|
||||
r = dlopen_libacl();
|
||||
if (r < 0)
|
||||
maybe_fallback:
|
||||
if (!ERRNO_IS_NEG_NOT_SUPPORTED(r))
|
||||
return r;
|
||||
|
||||
acl = sym_acl_get_fd(fd);
|
||||
if (!acl) {
|
||||
if (!ERRNO_IS_NOT_SUPPORTED(errno))
|
||||
return -errno;
|
||||
|
||||
/* No ACLs? Then just update the regular mode_t */
|
||||
return fd_acl_make_writable_fallback(fd);
|
||||
}
|
||||
|
||||
for (r = sym_acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
|
||||
r > 0;
|
||||
r = sym_acl_get_entry(acl, ACL_NEXT_ENTRY, &i)) {
|
||||
acl_permset_t permset;
|
||||
acl_tag_t tag;
|
||||
int b;
|
||||
|
||||
if (sym_acl_get_tag_type(i, &tag) < 0)
|
||||
return -errno;
|
||||
|
||||
if (tag != ACL_USER_OBJ)
|
||||
continue;
|
||||
|
||||
if (sym_acl_get_permset(i, &permset) < 0)
|
||||
return -errno;
|
||||
|
||||
b = sym_acl_get_perm(permset, ACL_WRITE);
|
||||
if (b < 0)
|
||||
return -errno;
|
||||
|
||||
if (b)
|
||||
return 0; /* Already set? Then there's nothing to do. */
|
||||
|
||||
if (sym_acl_add_perm(permset, ACL_WRITE) < 0)
|
||||
return -errno;
|
||||
|
||||
break;
|
||||
}
|
||||
if (r < 0)
|
||||
return -errno;
|
||||
|
||||
if (sym_acl_set_fd(fd, acl) < 0) {
|
||||
if (!ERRNO_IS_NOT_SUPPORTED(errno))
|
||||
return -errno;
|
||||
|
||||
return fd_acl_make_writable_fallback(fd);
|
||||
}
|
||||
|
||||
return 1;
|
||||
/* No ACLs? Then just update the regular mode_t */
|
||||
return fd_acl_make_read_only_fallback(fd);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -818,23 +757,6 @@ int fd_acl_make_read_only_fallback(int fd) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int fd_acl_make_writable_fallback(int fd) {
|
||||
struct stat st;
|
||||
|
||||
assert(fd >= 0);
|
||||
|
||||
if (fstat(fd, &st) < 0)
|
||||
return -errno;
|
||||
|
||||
if ((st.st_mode & 0200) != 0) /* already set */
|
||||
return 0;
|
||||
|
||||
if (fchmod(fd, (st.st_mode & 07777) | 0200) < 0)
|
||||
return -errno;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int inode_type_can_acl(mode_t mode) {
|
||||
return IN_SET(mode & S_IFMT, S_IFSOCK, S_IFREG, S_IFBLK, S_IFCHR, S_IFDIR, S_IFIFO);
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
#include "shared-forward.h"
|
||||
|
||||
int fd_acl_make_read_only_fallback(int fd);
|
||||
int fd_acl_make_writable_fallback(int fd);
|
||||
|
||||
#if HAVE_ACL
|
||||
#include <acl/libacl.h> /* IWYU pragma: export */
|
||||
@@ -56,7 +55,6 @@ int acls_for_file(const char *path, acl_type_t type, acl_t new, acl_t *ret);
|
||||
int fd_add_uid_acl_permission(int fd, uid_t uid, unsigned mask);
|
||||
|
||||
int fd_acl_make_read_only(int fd);
|
||||
int fd_acl_make_writable(int fd);
|
||||
|
||||
/* acl_free() takes multiple argument types. Multiple cleanup functions are necessary. */
|
||||
DEFINE_TRIVIAL_CLEANUP_FUNC_FULL_RENAME(acl_t, sym_acl_free, acl_freep, NULL);
|
||||
@@ -89,10 +87,6 @@ static inline int fd_add_uid_acl_permission(int fd, uid_t uid, unsigned mask) {
|
||||
/* Inline variant that forwards straight to fd_acl_make_read_only_fallback() and returns its result.
 * NOTE(review): this appears to be the definition used when HAVE_ACL is off; the matching #if/#else
 * is not visible from here, confirm. */
static inline int fd_acl_make_read_only(int fd) {
|
||||
return fd_acl_make_read_only_fallback(fd);
|
||||
}
|
||||
|
||||
/* Inline variant that forwards straight to fd_acl_make_writable_fallback() and returns its result.
 * NOTE(review): this appears to be the definition used when HAVE_ACL is off; the matching #if/#else
 * is not visible from here, confirm. */
static inline int fd_acl_make_writable(int fd) {
|
||||
return fd_acl_make_writable_fallback(fd);
|
||||
}
|
||||
#endif
|
||||
|
||||
int inode_type_can_acl(mode_t mode);
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#define CREDENTIAL_NAME_MAX FDNAME_MAX
|
||||
|
||||
/* Put a size limit on the individual credential */
|
||||
#define CREDENTIAL_SIZE_MAX (1024U*1024U)
|
||||
#define CREDENTIAL_SIZE_MAX (1U * U64_MB)
|
||||
|
||||
/* Refuse to store more than 1M per service, after all this is unswappable memory. Note that for now we put
|
||||
* this to the same limit as the per-credential limit, i.e. if the user has n > 1 credentials instead of 1 it
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
/* Put a size limit on encrypted credentials (which is the same as the unencrypted size plus a spacious 128K of extra
|
||||
* space for headers, IVs, exported TPM2 key material and so on). */
|
||||
#define CREDENTIAL_ENCRYPTED_SIZE_MAX (CREDENTIAL_SIZE_MAX + 128U*1024U)
|
||||
#define CREDENTIAL_ENCRYPTED_SIZE_MAX (CREDENTIAL_SIZE_MAX + 128U * U64_KB)
|
||||
|
||||
bool credential_name_valid(const char *s);
|
||||
bool credential_glob_valid(const char *s);
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "alloc-util.h"
|
||||
#include "chase.h"
|
||||
#include "creds-util.h"
|
||||
#include "dissect-image.h"
|
||||
#include "errno-util.h"
|
||||
#include "extract-word.h"
|
||||
@@ -1831,58 +1832,65 @@ unsigned long credentials_fs_mount_flags(bool ro) {
|
||||
return MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported()|(ro ? MS_RDONLY : 0);
|
||||
}
|
||||
|
||||
int mount_credentials_fs(const char *path, size_t size, bool ro) {
|
||||
_cleanup_free_ char *opts = NULL;
|
||||
int r, noswap_supported;
|
||||
int fsmount_credentials_fs(int *ret_fsfd) {
|
||||
_cleanup_close_ int fs_fd = -EBADF;
|
||||
char size_str[DECIMAL_STR_MAX(uint64_t)];
|
||||
|
||||
/* Mounts a file system we can place credentials in, i.e. with tight access modes right from the
|
||||
* beginning, and ideally swapping turned off. In order of preference:
|
||||
*
|
||||
* 1. tmpfs if it supports "noswap"
|
||||
* 1. tmpfs if it supports "noswap" (needs kernel >= 6.3)
|
||||
* 2. ramfs
|
||||
* 3. tmpfs if it doesn't support "noswap"
|
||||
* 3. tmpfs without "noswap"
|
||||
*/
|
||||
|
||||
noswap_supported = mount_option_supported("tmpfs", "noswap", NULL); /* Check explicitly to avoid kmsg noise */
|
||||
if (noswap_supported > 0) {
|
||||
_cleanup_free_ char *noswap_opts = NULL;
|
||||
fs_fd = fsopen("tmpfs", FSOPEN_CLOEXEC);
|
||||
if (fs_fd < 0)
|
||||
return -errno;
|
||||
|
||||
if (asprintf(&noswap_opts, "mode=0700,nr_inodes=1024,size=%zu,noswap", size) < 0)
|
||||
return -ENOMEM;
|
||||
if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "nr_inodes", "1024", 0) < 0)
|
||||
return -errno;
|
||||
|
||||
/* Best case: tmpfs with noswap (needs kernel >= 6.3) */
|
||||
xsprintf(size_str, "%" PRIu64, CREDENTIALS_TOTAL_SIZE_MAX);
|
||||
if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "size", size_str, 0) < 0)
|
||||
return -errno;
|
||||
|
||||
r = mount_nofollow_verbose(
|
||||
LOG_DEBUG,
|
||||
"tmpfs",
|
||||
path,
|
||||
"tmpfs",
|
||||
credentials_fs_mount_flags(ro),
|
||||
noswap_opts);
|
||||
if (r >= 0)
|
||||
return r;
|
||||
if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, "noswap", NULL, 0) < 0) {
|
||||
if (errno != EINVAL)
|
||||
return -errno;
|
||||
|
||||
int ramfs_fd = fsopen("ramfs", FSOPEN_CLOEXEC);
|
||||
if (ramfs_fd >= 0)
|
||||
close_and_replace(fs_fd, ramfs_fd);
|
||||
}
|
||||
|
||||
r = mount_nofollow_verbose(
|
||||
LOG_DEBUG,
|
||||
"ramfs",
|
||||
path,
|
||||
"ramfs",
|
||||
credentials_fs_mount_flags(ro),
|
||||
"mode=0700");
|
||||
if (r >= 0)
|
||||
return r;
|
||||
if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "mode", "0700", 0) < 0)
|
||||
return -errno;
|
||||
|
||||
if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", size) < 0)
|
||||
return -ENOMEM;
|
||||
if (fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
|
||||
return -errno;
|
||||
|
||||
return mount_nofollow_verbose(
|
||||
LOG_DEBUG,
|
||||
"tmpfs",
|
||||
path,
|
||||
"tmpfs",
|
||||
credentials_fs_mount_flags(ro),
|
||||
opts);
|
||||
int mfd = fsmount(fs_fd, FSMOUNT_CLOEXEC,
|
||||
ms_flags_to_mount_attr(credentials_fs_mount_flags(/* ro = */ false)));
|
||||
if (mfd < 0)
|
||||
return -errno;
|
||||
|
||||
if (ret_fsfd)
|
||||
*ret_fsfd = TAKE_FD(fs_fd);
|
||||
|
||||
return mfd;
|
||||
}
|
||||
|
||||
int mount_credentials_fs(const char *path) {
|
||||
_cleanup_close_ int mfd = -EBADF;
|
||||
|
||||
assert(path);
|
||||
|
||||
mfd = fsmount_credentials_fs(/* ret_fsfd = */ NULL);
|
||||
if (mfd < 0)
|
||||
return mfd;
|
||||
|
||||
return RET_NERRNO(move_mount(mfd, "", AT_FDCWD, path, MOVE_MOUNT_F_EMPTY_PATH));
|
||||
}
|
||||
|
||||
int make_fsmount(
|
||||
|
||||
@@ -159,7 +159,8 @@ int make_mount_point_inode_from_path(const char *source, const char *dest, mode_
|
||||
int trigger_automount_at(int dir_fd, const char *path);
|
||||
|
||||
unsigned long credentials_fs_mount_flags(bool ro);
|
||||
int mount_credentials_fs(const char *path, size_t size, bool ro);
|
||||
int fsmount_credentials_fs(int *ret_fsfd);
|
||||
int mount_credentials_fs(const char *path);
|
||||
|
||||
int make_fsmount(int error_log_level, const char *what, const char *type, unsigned long flags, const char *options, int userns_fd);
|
||||
|
||||
|
||||
@@ -112,30 +112,6 @@ TEST_RET(fd_acl_make_read_only) {
|
||||
cmd = strjoina("stat ", fn);
|
||||
ASSERT_OK_ZERO_ERRNO(system(cmd));
|
||||
|
||||
log_info("writable");
|
||||
ASSERT_OK_POSITIVE(fd_acl_make_writable(fd));
|
||||
|
||||
ASSERT_OK_ERRNO(fstat(fd, &st));
|
||||
ASSERT_EQ(st.st_mode & 0222, 0200u);
|
||||
|
||||
cmd = strjoina("getfacl -p ", fn);
|
||||
ASSERT_OK_ZERO_ERRNO(system(cmd));
|
||||
|
||||
cmd = strjoina("stat ", fn);
|
||||
ASSERT_OK_ZERO_ERRNO(system(cmd));
|
||||
|
||||
log_info("read-only");
|
||||
ASSERT_OK_POSITIVE(fd_acl_make_read_only(fd));
|
||||
|
||||
ASSERT_OK_ERRNO(fstat(fd, &st));
|
||||
ASSERT_EQ(st.st_mode & 0222, 0000u);
|
||||
|
||||
cmd = strjoina("getfacl -p ", fn);
|
||||
ASSERT_OK_ZERO_ERRNO(system(cmd));
|
||||
|
||||
cmd = strjoina("stat ", fn);
|
||||
ASSERT_OK_ZERO_ERRNO(system(cmd));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user