Merge pull request #27450 from poettering/switch-root-modernize

pid1: modernize switch root logic a bit
This commit is contained in:
Lennart Poettering 2023-05-03 20:12:20 +02:00 committed by GitHub
commit 5d63c7eb83
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 79 additions and 75 deletions

4
TODO
View file

@ -1224,10 +1224,6 @@ Features:
* Add service setting to run a service within the specified VRF. i.e. do the
equivalent of "ip vrf exec".
* change SwitchRoot() implementation in PID 1 to use pivot_root(".", "."), as
documented in the pivot_root(2) man page, so that we can drop the /oldroot
temporary dir.
* special case some calls of chase() to use openat2() internally, so
that the kernel does what we otherwise do.

View file

@ -1797,7 +1797,7 @@ static int do_reexecute(
broadcast_signal(SIGTERM, false, true, arg_default_timeout_stop_usec);
/* And switch root with MS_MOVE, because we remove the old directory afterwards and detach it. */
r = switch_root(switch_root_dir, "/mnt", true, MS_MOVE);
r = switch_root(switch_root_dir, /* old_root_after= */ NULL, MS_MOVE);
if (r < 0)
log_error_errno(r, "Failed to switch root, trying to continue: %m");
}

View file

@ -11,6 +11,7 @@
#include "alloc-util.h"
#include "architecture.h"
#include "base-filesystem.h"
#include "errno-util.h"
#include "fd-util.h"
#include "log.h"
#include "macro.h"
@ -130,19 +131,19 @@ static const BaseFilesystem table[] = {
# pragma message "Please add an entry above specifying whether your architecture uses /lib64/, /lib32/, or no such links."
#endif
int base_filesystem_create(const char *root, uid_t uid, gid_t gid) {
_cleanup_close_ int fd = -EBADF;
int base_filesystem_create_fd(int fd, const char *root, uid_t uid, gid_t gid) {
int r;
fd = open(root, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
if (fd < 0)
return log_error_errno(errno, "Failed to open root file system: %m");
assert(fd >= 0);
assert(root);
/* The "root" parameter is decoration only – it's only used as part of log messages */
for (size_t i = 0; i < ELEMENTSOF(table); i++) {
if (faccessat(fd, table[i].dir, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
continue;
if (table[i].target) {
if (table[i].target) { /* Create as symlink? */
const char *target = NULL;
/* check if one of the targets exists */
@ -169,39 +170,36 @@ int base_filesystem_create(const char *root, uid_t uid, gid_t gid) {
if (!target)
continue;
if (symlinkat(target, fd, table[i].dir) < 0) {
log_full_errno(IN_SET(errno, EEXIST, EROFS) || table[i].ignore_failure ? LOG_DEBUG : LOG_ERR, errno,
"Failed to create symlink at %s/%s: %m", root, table[i].dir);
if (IN_SET(errno, EEXIST, EROFS) || table[i].ignore_failure)
continue;
return -errno;
}
if (uid_is_valid(uid) || gid_is_valid(gid))
if (fchownat(fd, table[i].dir, uid, gid, AT_SYMLINK_NOFOLLOW) < 0)
return log_error_errno(errno, "Failed to chown symlink at %s/%s: %m", root, table[i].dir);
continue;
r = RET_NERRNO(symlinkat(target, fd, table[i].dir));
} else {
/* Create as directory. */
WITH_UMASK(0000)
r = RET_NERRNO(mkdirat(fd, table[i].dir, table[i].mode));
}
WITH_UMASK(0000)
r = mkdirat(fd, table[i].dir, table[i].mode);
if (r < 0) {
log_full_errno(IN_SET(errno, EEXIST, EROFS) || table[i].ignore_failure ? LOG_DEBUG : LOG_ERR, errno,
"Failed to create directory at %s/%s: %m", root, table[i].dir);
if (IN_SET(errno, EEXIST, EROFS) || table[i].ignore_failure)
bool ignore = IN_SET(r, -EEXIST, -EROFS) || table[i].ignore_failure;
log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
"Failed to create %s/%s: %m", root, table[i].dir);
if (ignore)
continue;
return -errno;
return r;
}
if (uid_is_valid(uid) || gid_is_valid(gid))
if (fchownat(fd, table[i].dir, uid, gid, AT_SYMLINK_NOFOLLOW) < 0)
return log_error_errno(errno, "Failed to chown directory at %s/%s: %m", root, table[i].dir);
return log_error_errno(errno, "Failed to chown %s/%s: %m", root, table[i].dir);
}
return 0;
}
int base_filesystem_create(const char *root, uid_t uid, gid_t gid) {
_cleanup_close_ int fd = -EBADF;
fd = open(ASSERT_PTR(root), O_DIRECTORY|O_CLOEXEC);
if (fd < 0)
return log_error_errno(errno, "Failed to open root file system: %m");
return base_filesystem_create_fd(fd, root, uid, gid);
}

View file

@ -3,4 +3,5 @@
#include <sys/types.h>
int base_filesystem_create_fd(int fd, const char *root, uid_t uid, gid_t gid);
int base_filesystem_create(const char *root, uid_t uid, gid_t gid);

View file

@ -26,41 +26,45 @@
#include "user-util.h"
int switch_root(const char *new_root,
const char *old_root_after, /* path below the new root, where to place the old root after the transition */
bool unmount_old_root,
unsigned long mount_flags) { /* MS_MOVE or MS_BIND */
const char *old_root_after, /* path below the new root, where to place the old root after the transition; may be NULL to unmount it */
unsigned long mount_flags) { /* MS_MOVE or MS_BIND used for /proc/, /dev/, /run/, /sys/ */
_cleanup_close_ int old_root_fd = -EBADF, new_root_fd = -EBADF;
_cleanup_free_ char *resolved_old_root_after = NULL;
_cleanup_close_ int old_root_fd = -EBADF;
int r;
int r, istmp;
assert(new_root);
assert(old_root_after);
assert(IN_SET(mount_flags, MS_MOVE, MS_BIND));
if (path_equal(new_root, "/"))
return 0;
/* Check if we shall remove the contents of the old root */
old_root_fd = open("/", O_RDONLY | O_CLOEXEC | O_DIRECTORY);
old_root_fd = open("/", O_DIRECTORY|O_CLOEXEC);
if (old_root_fd < 0)
return log_error_errno(errno, "Failed to open root directory: %m");
r = fd_is_temporary_fs(old_root_fd);
if (r < 0)
return log_error_errno(r, "Failed to stat root directory: %m");
if (r > 0)
istmp = fd_is_temporary_fs(old_root_fd);
if (istmp < 0)
return log_error_errno(istmp, "Failed to stat root directory: %m");
if (istmp > 0)
log_debug("Root directory is on tmpfs, will do cleanup later.");
else
old_root_fd = safe_close(old_root_fd);
/* Determine where we shall place the old root after the transition */
r = chase(old_root_after, new_root, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &resolved_old_root_after, NULL);
if (r < 0)
return log_error_errno(r, "Failed to resolve %s/%s: %m", new_root, old_root_after);
if (r == 0) /* Doesn't exist yet. Let's create it */
(void) mkdir_p_label(resolved_old_root_after, 0755);
new_root_fd = open(new_root, O_DIRECTORY|O_CLOEXEC);
if (new_root_fd < 0)
return log_error_errno(errno, "Failed to open target directory '%s': %m", new_root);
/* Work-around for kernel design: the kernel refuses MS_MOVE if any file systems are mounted MS_SHARED. Hence
* remount them MS_PRIVATE here as a work-around.
if (old_root_after) {
/* Determine where we shall place the old root after the transition */
r = chase(old_root_after, new_root, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &resolved_old_root_after, NULL);
if (r < 0)
return log_error_errno(r, "Failed to resolve %s/%s: %m", new_root, old_root_after);
if (r == 0) /* Doesn't exist yet. Let's create it */
(void) mkdir_p_label(resolved_old_root_after, 0755);
}
/* Work-around for kernel design: the kernel refuses MS_MOVE if any file systems are mounted
* MS_SHARED. Hence remount them MS_PRIVATE here as a work-around.
*
* https://bugzilla.redhat.com/show_bug.cgi?id=847418 */
if (mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL) < 0)
@ -90,37 +94,42 @@ int switch_root(const char *new_root,
/* Do not fail if base_filesystem_create() fails. Not all switch roots look the way base_filesystem_create()
* expects them to. They might even boot, if they are RO and don't have the FS layout. Just ignore the error
* and switch_root() nevertheless. */
(void) base_filesystem_create(new_root, UID_INVALID, GID_INVALID);
(void) base_filesystem_create_fd(new_root_fd, new_root, UID_INVALID, GID_INVALID);
if (chdir(new_root) < 0)
if (fchdir(new_root_fd) < 0)
return log_error_errno(errno, "Failed to change directory to %s: %m", new_root);
/* We first try a pivot_root() so that we can umount the old root dir. In many cases (i.e. where rootfs is /),
* that's not possible however, and hence we simply overmount root */
if (pivot_root(new_root, resolved_old_root_after) >= 0) {
/* Immediately get rid of the old root, if detach_oldroot is set.
* Since we are running off it we need to do this lazily. */
if (unmount_old_root) {
r = umount_recursive(old_root_after, MNT_DETACH);
if (r < 0)
log_warning_errno(r, "Failed to unmount old root directory tree, ignoring: %m");
if (resolved_old_root_after)
r = RET_NERRNO(pivot_root(".", resolved_old_root_after));
else {
r = RET_NERRNO(pivot_root(".", "."));
if (r >= 0) {
/* Now unmount the upper of the two stacked file systems */
if (umount2(".", MNT_DETACH) < 0)
return log_error_errno(errno, "Failed to unmount the old root: %m");
}
}
if (r < 0) {
log_debug_errno(r, "Pivoting root file system failed, moving mounts instead: %m");
} else if (mount(new_root, "/", NULL, MS_MOVE, NULL) < 0)
return log_error_errno(errno, "Failed to move %s to /: %m", new_root);
if (mount(".", "/", NULL, MS_MOVE, NULL) < 0)
return log_error_errno(errno, "Failed to move %s to /: %m", new_root);
if (chroot(".") < 0)
return log_error_errno(errno, "Failed to change root: %m");
if (chroot(".") < 0)
return log_error_errno(errno, "Failed to change root: %m");
if (chdir("/") < 0)
return log_error_errno(errno, "Failed to change directory: %m");
if (chdir(".") < 0)
return log_error_errno(errno, "Failed to change directory: %m");
}
if (old_root_fd >= 0) {
if (istmp) {
struct stat rb;
if (fstat(old_root_fd, &rb) < 0)
return log_error_errno(errno, "Failed to stat old root directory: %m");
(void) rm_rf_children(TAKE_FD(old_root_fd), 0, &rb); /* takes possession of the dir fd, even on failure */
}

View file

@ -3,4 +3,4 @@
#include <stdbool.h>
int switch_root(const char *new_root, const char *oldroot, bool detach_oldroot, unsigned long mountflags);
int switch_root(const char *new_root, const char *old_root_after, unsigned long mount_flags);

View file

@ -169,7 +169,7 @@ static int switch_root_initramfs(void) {
* /run/initramfs/shutdown will take care of these.
* Also do not detach the old root, because /run/initramfs/shutdown needs to access it.
*/
return switch_root("/run/initramfs", "/oldroot", false, MS_BIND);
return switch_root("/run/initramfs", "/oldroot", MS_BIND);
}
/* Read the following fields from /proc/meminfo: