From 23cc6ef6fd453b13502caae23130844e7d6ed0fe Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 13 May 2024 17:50:30 +0000 Subject: [PATCH 01/27] fs: remove accidental overflow during wraparound check Running syzkaller with the newly enabled signed integer overflow sanitizer produces this report: [ 195.401651] ------------[ cut here ]------------ [ 195.404808] UBSAN: signed-integer-overflow in ../fs/open.c:321:15 [ 195.408739] 9223372036854775807 + 562984447377399 cannot be represented in type 'loff_t' (aka 'long long') [ 195.414683] CPU: 1 PID: 703 Comm: syz-executor.0 Not tainted 6.8.0-rc2-00039-g14de58dbe653-dirty #11 [ 195.420138] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 195.425804] Call Trace: [ 195.427360] [ 195.428791] dump_stack_lvl+0x93/0xd0 [ 195.431150] handle_overflow+0x171/0x1b0 [ 195.433640] vfs_fallocate+0x459/0x4f0 ... [ 195.490053] ------------[ cut here ]------------ [ 195.493146] UBSAN: signed-integer-overflow in ../fs/open.c:321:61 [ 195.497030] 9223372036854775807 + 562984447377399 cannot be represented in type 'loff_t' (aka 'long long) [ 195.502940] CPU: 1 PID: 703 Comm: syz-executor.0 Not tainted 6.8.0-rc2-00039-g14de58dbe653-dirty #11 [ 195.508395] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 195.514075] Call Trace: [ 195.515636] [ 195.517000] dump_stack_lvl+0x93/0xd0 [ 195.519255] handle_overflow+0x171/0x1b0 [ 195.521677] vfs_fallocate+0x4cb/0x4f0 [ 195.524033] __x64_sys_fallocate+0xb2/0xf0 Historically, the signed integer overflow sanitizer did not work in the kernel due to its interaction with `-fwrapv` but this has since been changed [1] in the newest version of Clang. It was re-enabled in the kernel with Commit 557f8c582a9ba8ab ("ubsan: Reintroduce signed overflow sanitizer"). Let's use the check_add_overflow helper to first verify the addition stays within the bounds of its type (long long); then we can use that sum for the following check. Link: https://github.com/llvm/llvm-project/pull/82432 [1] Closes: https://github.com/KSPP/linux/issues/356 Cc: linux-hardening@vger.kernel.org Reviewed-by: Kees Cook Signed-off-by: Justin Stitt Link: https://lore.kernel.org/r/20240513-b4-sio-vfs_fallocate-v2-1-db415872fb16@google.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/open.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/open.c b/fs/open.c index 89cafb572061..a5c4f8a0f143 100644 --- a/fs/open.c +++ b/fs/open.c @@ -247,6 +247,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); long ret; + loff_t sum; if (offset < 0 || len <= 0) return -EINVAL; @@ -319,8 +320,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; - /* Check for wrap through zero too */ - if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) + /* Check for wraparound */ + if (check_add_overflow(offset, len, &sum)) + return -EFBIG; + + if (sum > inode->i_sb->s_maxbytes) return -EFBIG; if (!file->f_op->fallocate) From 3aa63a569c64e708df547a8913c84e64a06e7853 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 17 May 2024 20:08:40 -0400 Subject: [PATCH 02/27] fs: switch timespec64 fields in inode to discrete integers Adjacent struct timespec64's pack poorly. Switch them to discrete integer fields for the seconds and nanoseconds. This shaves 8 bytes off struct inode with a garden-variety Fedora Kconfig on x86_64, but that also moves the i_lock into the previous cacheline, away from the fields it protects. To remedy that, move i_generation above the i_lock, which moves the new 4-byte hole to just after the i_fsnotify_mask in my setup. Amir has plans to use that to expand the i_fsnotify_mask, so add a comment to that effect as well. Cc: Amir Goldstein Suggested-by: Linus Torvalds Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240517-amtime-v1-1-7b804ca4be8f@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 48 ++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a..639885621608 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -660,9 +660,13 @@ struct inode { }; dev_t i_rdev; loff_t i_size; - struct timespec64 __i_atime; - struct timespec64 __i_mtime; - struct timespec64 __i_ctime; /* use inode_*_ctime accessors! */ + time64_t i_atime_sec; + time64_t i_mtime_sec; + time64_t i_ctime_sec; + u32 i_atime_nsec; + u32 i_mtime_nsec; + u32 i_ctime_nsec; + u32 i_generation; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; @@ -719,10 +723,10 @@ struct inode { unsigned i_dir_seq; }; - __u32 i_generation; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ + /* 32-bit hole reserved for expanding i_fsnotify_mask */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif @@ -1538,23 +1542,27 @@ struct timespec64 inode_set_ctime_current(struct inode *inode); static inline time64_t inode_get_atime_sec(const struct inode *inode) { - return inode->__i_atime.tv_sec; + return inode->i_atime_sec; } static inline long inode_get_atime_nsec(const struct inode *inode) { - return inode->__i_atime.tv_nsec; + return inode->i_atime_nsec; } static inline struct timespec64 inode_get_atime(const struct inode *inode) { - return inode->__i_atime; + struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode), + .tv_nsec = inode_get_atime_nsec(inode) }; + + return ts; } static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode, struct timespec64 ts) { - inode->__i_atime = ts; + inode->i_atime_sec = ts.tv_sec; + inode->i_atime_nsec = ts.tv_nsec; return ts; } @@ -1563,28 +1571,32 @@ static inline struct timespec64 inode_set_atime(struct inode *inode, { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; + return inode_set_atime_to_ts(inode, ts); } static inline time64_t inode_get_mtime_sec(const struct inode *inode) { - return inode->__i_mtime.tv_sec; + return inode->i_mtime_sec; } static inline long inode_get_mtime_nsec(const struct inode *inode) { - return inode->__i_mtime.tv_nsec; + return inode->i_mtime_nsec; } static inline struct timespec64 inode_get_mtime(const struct inode *inode) { - return inode->__i_mtime; + struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode), + .tv_nsec = inode_get_mtime_nsec(inode) }; + return ts; } static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode, struct timespec64 ts) { - inode->__i_mtime = ts; + inode->i_mtime_sec = ts.tv_sec; + inode->i_mtime_nsec = ts.tv_nsec; return ts; } @@ -1598,23 +1610,27 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode, static inline time64_t inode_get_ctime_sec(const struct inode *inode) { - return inode->__i_ctime.tv_sec; + return inode->i_ctime_sec; } static inline long inode_get_ctime_nsec(const struct inode *inode) { - return inode->__i_ctime.tv_nsec; + return inode->i_ctime_nsec; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) { - return inode->__i_ctime; + struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode), + .tv_nsec = inode_get_ctime_nsec(inode) }; + + return ts; } static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { - inode->__i_ctime = ts; + inode->i_ctime_sec = ts.tv_sec; + inode->i_ctime_nsec = ts.tv_nsec; return ts; } From ef44c8ab06b300a5b9b30e5b630f491ac7bc4d3e Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 22 May 2024 11:04:22 +0800 Subject: [PATCH 03/27] fs: fsconfig: intercept non-new mount API in advance for FSCONFIG_CMD_CREATE_EXCL command fsconfig with FSCONFIG_CMD_CREATE_EXCL command requires the new mount api, here we should return -EOPNOTSUPP in advance to avoid extra procedure. Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240522030422.315892-1-lihongbo22@huawei.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fsopen.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/fsopen.c b/fs/fsopen.c index 6593ae518115..18fe979da7e2 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -220,10 +220,6 @@ static int vfs_cmd_create(struct fs_context *fc, bool exclusive) if (!mount_capable(fc)) return -EPERM; - /* require the new mount api */ - if (exclusive && fc->ops == &legacy_fs_context_ops) - return -EOPNOTSUPP; - fc->phase = FS_CONTEXT_CREATING; fc->exclusive = exclusive; @@ -411,6 +407,7 @@ SYSCALL_DEFINE5(fsconfig, case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: case FSCONFIG_SET_FD: + case FSCONFIG_CMD_CREATE_EXCL: ret = -EOPNOTSUPP; goto out_f; } From 620c266f394932e5decc4b34683a75dfc59dc2f4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 24 May 2024 12:19:39 +0200 Subject: [PATCH 04/27] fhandle: relax open_by_handle_at() permission checks A current limitation of open_by_handle_at() is that it's currently not possible to use it from within containers at all because we require CAP_DAC_READ_SEARCH in the initial namespace. That's unfortunate because there are scenarios where using open_by_handle_at() from within containers. Two examples: (1) cgroupfs allows to encode cgroups to file handles and reopen them with open_by_handle_at(). (2) Fanotify allows placing filesystem watches they currently aren't usable in containers because the returned file handles cannot be used. Here's a proposal for relaxing the permission check for open_by_handle_at(). (1) Opening file handles when the caller has privileges over the filesystem (1.1) The caller has an unobstructed view of the filesystem. (1.2) The caller has permissions to follow a path to the file handle. This doesn't address the problem of opening a file handle when only a portion of a filesystem is exposed as is common in containers by e.g., bind-mounting a subtree. The proposal to solve this use-case is: (2) Opening file handles when the caller has privileges over a subtree (2.1) The caller is able to reach the file from the provided mount fd. (2.2) The caller has permissions to construct an unobstructed path to the file handle. (2.3) The caller has permissions to follow a path to the file handle. The relaxed permission checks are currently restricted to directory file handles which are what both cgroupfs and fanotify need. Handling disconnected non-directory file handles would lead to a potentially non-deterministic api. If a disconnected non-directory file handle is provided we may fail to decode a valid path that we could use for permission checking. That in itself isn't a problem as we would just return EACCES in that case. However, confusion may arise if a non-disconnected dentry ends up in the cache later and those opening the file handle would suddenly succeed. * It's potentially possible to use timing information (side-channel) to infer whether a given inode exists. I don't think that's particularly problematic. Thanks to Jann for bringing this to my attention. * An unrelated note (IOW, these are thoughts that apply to open_by_handle_at() generically and are unrelated to the changes here): Jann pointed out that we should verify whether deleted files could potentially be reopened through open_by_handle_at(). I don't think that's possible though. Another potential thing to check is whether open_by_handle_at() could be abused to open internal stuff like memfds or gpu stuff. I don't think so but I haven't had the time to completely verify this. This dates back to discussions Amir and I had quite some time ago and thanks to him for providing a lot of details around the export code and related patches! Link: https://lore.kernel.org/r/20240524-vfs-open_by_handle_at-v1-1-3d4b7d22736b@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/exportfs/expfs.c | 9 +- fs/fhandle.c | 192 ++++++++++++++++++++++++++++++--------- fs/mount.h | 1 + fs/namespace.c | 2 +- fs/nfsd/nfsfh.c | 2 +- include/linux/exportfs.h | 2 + 6 files changed, 159 insertions(+), 49 deletions(-) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 07ea3d62b298..4f2dd4ab4486 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -427,7 +427,7 @@ EXPORT_SYMBOL_GPL(exportfs_encode_fh); struct dentry * exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, - int fileid_type, + int fileid_type, unsigned int flags, int (*acceptable)(void *, struct dentry *), void *context) { @@ -445,6 +445,11 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, if (IS_ERR_OR_NULL(result)) return result; + if ((flags & EXPORT_FH_DIR_ONLY) && !d_is_dir(result)) { + err = -ENOTDIR; + goto err_result; + } + /* * If no acceptance criteria was specified by caller, a disconnected * dentry is also accepatable. Callers may use this mode to query if @@ -581,7 +586,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, { struct dentry *ret; - ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, + ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, 0, acceptable, context); if (IS_ERR_OR_NULL(ret)) { if (ret == ERR_PTR(-ENOMEM)) diff --git a/fs/fhandle.c b/fs/fhandle.c index 8a7f86c2139a..6e8cea16790e 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -115,88 +115,188 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, return err; } -static struct vfsmount *get_vfsmount_from_fd(int fd) +static int get_path_from_fd(int fd, struct path *root) { - struct vfsmount *mnt; - if (fd == AT_FDCWD) { struct fs_struct *fs = current->fs; spin_lock(&fs->lock); - mnt = mntget(fs->pwd.mnt); + *root = fs->pwd; + path_get(root); spin_unlock(&fs->lock); } else { struct fd f = fdget(fd); if (!f.file) - return ERR_PTR(-EBADF); - mnt = mntget(f.file->f_path.mnt); + return -EBADF; + *root = f.file->f_path; + path_get(root); fdput(f); } - return mnt; + + return 0; } +enum handle_to_path_flags { + HANDLE_CHECK_PERMS = (1 << 0), + HANDLE_CHECK_SUBTREE = (1 << 1), +}; + +struct handle_to_path_ctx { + struct path root; + enum handle_to_path_flags flags; + unsigned int fh_flags; +}; + static int vfs_dentry_acceptable(void *context, struct dentry *dentry) { - return 1; -} - -static int do_handle_to_path(int mountdirfd, struct file_handle *handle, - struct path *path) -{ + struct handle_to_path_ctx *ctx = context; + struct user_namespace *user_ns = current_user_ns(); + struct dentry *d, *root = ctx->root.dentry; + struct mnt_idmap *idmap = mnt_idmap(ctx->root.mnt); int retval = 0; - int handle_dwords; - path->mnt = get_vfsmount_from_fd(mountdirfd); - if (IS_ERR(path->mnt)) { - retval = PTR_ERR(path->mnt); - goto out_err; + if (!root) + return 1; + + /* Old permission model with global CAP_DAC_READ_SEARCH. */ + if (!ctx->flags) + return 1; + + /* + * It's racy as we're not taking rename_lock but we're able to ignore + * permissions and we just need an approximation whether we were able + * to follow a path to the file. + * + * It's also potentially expensive on some filesystems especially if + * there is a deep path. + */ + d = dget(dentry); + while (d != root && !IS_ROOT(d)) { + struct dentry *parent = dget_parent(d); + + /* + * We know that we have the ability to override DAC permissions + * as we've verified this earlier via CAP_DAC_READ_SEARCH. But + * we also need to make sure that there aren't any unmapped + * inodes in the path that would prevent us from reaching the + * file. + */ + if (!privileged_wrt_inode_uidgid(user_ns, idmap, + d_inode(parent))) { + dput(d); + dput(parent); + return retval; + } + + dput(d); + d = parent; } - /* change the handle size to multiple of sizeof(u32) */ - handle_dwords = handle->handle_bytes >> 2; - path->dentry = exportfs_decode_fh(path->mnt, - (struct fid *)handle->f_handle, - handle_dwords, handle->handle_type, - vfs_dentry_acceptable, NULL); - if (IS_ERR(path->dentry)) { - retval = PTR_ERR(path->dentry); - goto out_mnt; - } - return 0; -out_mnt: - mntput(path->mnt); -out_err: + + if (!(ctx->flags & HANDLE_CHECK_SUBTREE) || d == root) + retval = 1; + WARN_ON_ONCE(d != root && d != root->d_sb->s_root); + dput(d); return retval; } +static int do_handle_to_path(struct file_handle *handle, struct path *path, + struct handle_to_path_ctx *ctx) +{ + int handle_dwords; + struct vfsmount *mnt = ctx->root.mnt; + + /* change the handle size to multiple of sizeof(u32) */ + handle_dwords = handle->handle_bytes >> 2; + path->dentry = exportfs_decode_fh_raw(mnt, + (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + ctx->fh_flags, + vfs_dentry_acceptable, ctx); + if (IS_ERR_OR_NULL(path->dentry)) { + if (path->dentry == ERR_PTR(-ENOMEM)) + return -ENOMEM; + return -ESTALE; + } + path->mnt = mntget(mnt); + return 0; +} + +/* + * Allow relaxed permissions of file handles if the caller has the + * ability to mount the filesystem or create a bind-mount of the + * provided @mountdirfd. + * + * In both cases the caller may be able to get an unobstructed way to + * the encoded file handle. If the caller is only able to create a + * bind-mount we need to verify that there are no locked mounts on top + * of it that could prevent us from getting to the encoded file. + * + * In principle, locked mounts can prevent the caller from mounting the + * filesystem but that only applies to procfs and sysfs neither of which + * support decoding file handles. + */ +static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, + unsigned int o_flags) +{ + struct path *root = &ctx->root; + + /* + * Restrict to O_DIRECTORY to provide a deterministic API that avoids a + * confusing api in the face of disconnected non-dir dentries. + * + * There's only one dentry for each directory inode (VFS rule)... + */ + if (!(o_flags & O_DIRECTORY)) + return false; + + if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) + ctx->flags = HANDLE_CHECK_PERMS; + else if (is_mounted(root->mnt) && + ns_capable(real_mount(root->mnt)->mnt_ns->user_ns, + CAP_SYS_ADMIN) && + !has_locked_children(real_mount(root->mnt), root->dentry)) + ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; + else + return false; + + /* Are we able to override DAC permissions? */ + if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH)) + return false; + + ctx->fh_flags = EXPORT_FH_DIR_ONLY; + return true; +} + static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, - struct path *path) + struct path *path, unsigned int o_flags) { int retval = 0; struct file_handle f_handle; struct file_handle *handle = NULL; + struct handle_to_path_ctx ctx = {}; - /* - * With handle we don't look at the execute bit on the - * directory. Ideally we would like CAP_DAC_SEARCH. - * But we don't have that - */ - if (!capable(CAP_DAC_READ_SEARCH)) { - retval = -EPERM; + retval = get_path_from_fd(mountdirfd, &ctx.root); + if (retval) goto out_err; + + if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) { + retval = -EPERM; + goto out_path; } + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { retval = -EFAULT; - goto out_err; + goto out_path; } if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || (f_handle.handle_bytes == 0)) { retval = -EINVAL; - goto out_err; + goto out_path; } handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { retval = -ENOMEM; - goto out_err; + goto out_path; } /* copy the full handle */ *handle = f_handle; @@ -207,10 +307,12 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, goto out_handle; } - retval = do_handle_to_path(mountdirfd, handle, path); + retval = do_handle_to_path(handle, path, &ctx); out_handle: kfree(handle); +out_path: + path_put(&ctx.root); out_err: return retval; } @@ -223,7 +325,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, struct file *file; int fd; - retval = handle_to_path(mountdirfd, ufh, &path); + retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) return retval; diff --git a/fs/mount.h b/fs/mount.h index 4a42fc68f4cc..4adce73211ae 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -152,3 +152,4 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) } extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); +bool has_locked_children(struct mount *mnt, struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index 5a51315c6678..4386787210c7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2078,7 +2078,7 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } -static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +bool has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 0b75305fb5f5..dd4e11a703aa 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -247,7 +247,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) dentry = dget(exp->ex_path.dentry); else { dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid, - data_left, fileid_type, + data_left, fileid_type, 0, nfsd_acceptable, exp); if (IS_ERR_OR_NULL(dentry)) { trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp, diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index bb37ad5cc954..893a1d21dc1c 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -158,6 +158,7 @@ struct fid { #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ +#define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ /** * struct export_operations - for nfsd to communicate with file systems @@ -305,6 +306,7 @@ static inline int exportfs_encode_fid(struct inode *inode, struct fid *fid, extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, + unsigned int flags, int (*acceptable)(void *, struct dentry *), void *context); extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, From 54018131e663a1df05021fcb22a18d6c5ebef734 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 2 Jun 2024 14:37:19 +0200 Subject: [PATCH 05/27] vfs: replace WARN(down_read_trylock, ...) abuse with proper asserts Note the macro used here works regardless of LOCKDEP. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240602123720.775702-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/dcache.c | 2 +- fs/quota/dquot.c | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 1ee6404b430b..58b89c9e9b0c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1548,7 +1548,7 @@ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; - WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); + rwsem_assert_held_write(&sb->s_umount); dentry = sb->s_root; sb->s_root = NULL; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 627eb2f72ef3..a2b256dac36e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2246,9 +2246,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) int cnt; struct quota_info *dqopt = sb_dqopt(sb); - /* s_umount should be held in exclusive mode */ - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) - up_read(&sb->s_umount); + rwsem_assert_held_write(&sb->s_umount); /* Cannot turn off usage accounting without turning off limits, or * suspend quotas and simultaneously turn quotas off. */ @@ -2510,9 +2508,7 @@ int dquot_resume(struct super_block *sb, int type) int ret = 0, cnt; unsigned int flags; - /* s_umount should be held in exclusive mode */ - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) - up_read(&sb->s_umount); + rwsem_assert_held_write(&sb->s_umount); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) From c12c0bb031cbaccf4f7d375db466b6457453bfa8 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 2 Jun 2024 12:15:35 +0200 Subject: [PATCH 06/27] readdir: Remove unused header include Since commit c512c6918719 ("uaccess: implement a proper unsafe_copy_to_user() and switch filldir over to it") the header file is no longer needed. Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20240602101534.348159-2-thorsten.blum@toblux.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/readdir.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/readdir.c b/fs/readdir.c index 278bc0254732..12a5f4567360 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -22,8 +22,6 @@ #include #include -#include - /* * Some filesystems were never converted to '->iterate_shared()' * and their directory iterators want the inode lock held for From 992f03ff8661bb0427f1db59c283f3fa63182b09 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 2 Jun 2024 02:47:30 +0200 Subject: [PATCH 07/27] readdir: Add missing quote in macro comment Add a missing double quote in the unsafe_copy_dirent_name() macro comment. Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20240602004729.229634-2-thorsten.blum@toblux.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/readdir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/readdir.c b/fs/readdir.c index 12a5f4567360..d6c82421902a 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -70,7 +70,7 @@ int wrap_directory_iterator(struct file *file, EXPORT_SYMBOL(wrap_directory_iterator); /* - * Note the "unsafe_put_user() semantics: we goto a + * Note the "unsafe_put_user()" semantics: we goto a * label for errors. */ #define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ From 2a010c41285345da60cece35575b4e0af7e7bf44 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 31 May 2024 15:01:43 +0200 Subject: [PATCH 08/27] fs: don't block i_writecount during exec Back in 2021 we already discussed removing deny_write_access() for executables. Back then I was hesistant because I thought that this might cause issues in userspace. But even back then I had started taking some notes on what could potentially depend on this and I didn't come up with a lot so I've changed my mind and I would like to try this. Here are some of the notes that I took: (1) The deny_write_access() mechanism is causing really pointless issues such as [1]. If a thread in a thread-group opens a file writable, then writes some stuff, then closing the file descriptor and then calling execve() they can fail the execve() with ETXTBUSY because another thread in the thread-group could have concurrently called fork(). Multi-threaded libraries such as go suffer from this. (2) There are userspace attacks that rely on overwriting the binary of a running process. These attacks are _mitigated_ but _not at all prevented_ from ocurring by the deny_write_access() mechanism. I'll go over some details. The clearest example of such attacks was the attack against runC in CVE-2019-5736 (cf. [3]). An attack could compromise the runC host binary from inside a _privileged_ runC container. The malicious binary could then be used to take over the host. (It is crucial to note that this attack is _not_ possible with unprivileged containers. IOW, the setup here is already insecure.) The attack can be made when attaching to a running container or when starting a container running a specially crafted image. For example, when runC attaches to a container the attacker can trick it into executing itself. This could be done by replacing the target binary inside the container with a custom binary pointing back at the runC binary itself. As an example, if the target binary was /bin/bash, this could be replaced with an executable script specifying the interpreter path #!/proc/self/exe. As such when /bin/bash is executed inside the container, instead the target of /proc/self/exe will be executed. That magic link will point to the runc binary on the host. The attacker can then proceed to write to the target of /proc/self/exe to try and overwrite the runC binary on the host. However, this will not succeed because of deny_write_access(). Now, one might think that this would prevent the attack but it doesn't. To overcome this, the attacker has multiple ways: * Open a file descriptor to /proc/self/exe using the O_PATH flag and then proceed to reopen the binary as O_WRONLY through /proc/self/fd/ and try to write to it in a busy loop from a separate process. Ultimately it will succeed when the runC binary exits. After this the runC binary is compromised and can be used to attack other containers or the host itself. * Use a malicious shared library annotating a function in there with the constructor attribute making the malicious function run as an initializor. The malicious library will then open /proc/self/exe for creating a new entry under /proc/self/fd/. It'll then call exec to a) force runC to exit and b) hand the file descriptor off to a program that then reopens /proc/self/fd/ for writing (which is now possible because runC has exited) and overwriting that binary. To sum up: the deny_write_access() mechanism doesn't prevent such attacks in insecure setups. It just makes them minimally harder. That's all. The only way back then to prevent this is to create a temporary copy of the calling binary itself when it starts or attaches to containers. So what I did back then for LXC (and Aleksa for runC) was to create an anonymous, in-memory file using the memfd_create() system call and to copy itself into the temporary in-memory file, which is then sealed to prevent further modifications. This sealed, in-memory file copy is then executed instead of the original on-disk binary. Any compromising write operations from a privileged container to the host binary will then write to the temporary in-memory binary and not to the host binary on-disk, preserving the integrity of the host binary. Also as the temporary, in-memory binary is sealed, writes to this will also fail. The point is that deny_write_access() is uselss to prevent these attacks. (3) Denying write access to an inode because it's currently used in an exec path could easily be done on an LSM level. It might need an additional hook but that should be about it. (4) The MAP_DENYWRITE flag for mmap() has been deprecated a long time ago so while we do protect the main executable the bigger portion of the things you'd think need protecting such as the shared libraries aren't. IOW, we let anyone happily overwrite shared libraries. (5) We removed all remaining uses of VM_DENYWRITE in [2]. That means: (5.1) We removed the legacy uselib() protection for preventing overwriting of shared libraries. Nobody cared in 3 years. (5.2) We allow write access to the elf interpreter after exec completed treating it on a par with shared libraries. Yes, someone in userspace could potentially be relying on this. It's not completely out of the realm of possibility but let's find out if that's actually the case and not guess. Link: https://github.com/golang/go/issues/22315 [1] Link: 49624efa65ac ("Merge tag 'denywrite-for-5.15' of git://github.com/davidhildenbrand/linux") [2] Link: https://unit42.paloaltonetworks.com/breaking-docker-via-runc-explaining-cve-2019-5736 [3] Link: https://lwn.net/Articles/866493 Link: https://github.com/golang/go/issues/22220 Link: https://github.com/golang/go/blob/5bf8c0cf09ee5c7e5a37ab90afcce154ab716a97/src/cmd/go/internal/work/buildid.go#L724 Link: https://github.com/golang/go/blob/5bf8c0cf09ee5c7e5a37ab90afcce154ab716a97/src/cmd/go/internal/work/exec.go#L1493 Link: https://github.com/golang/go/blob/5bf8c0cf09ee5c7e5a37ab90afcce154ab716a97/src/cmd/go/internal/script/cmds.go#L457 Link: https://github.com/golang/go/blob/5bf8c0cf09ee5c7e5a37ab90afcce154ab716a97/src/cmd/go/internal/test/test.go#L1557 Link: https://github.com/golang/go/blob/5bf8c0cf09ee5c7e5a37ab90afcce154ab716a97/src/os/exec/lp_linux_test.go#L61 Link: https://github.com/buildkite/agent/pull/2736 Link: https://github.com/rust-lang/rust/issues/114554 Link: https://bugs.openjdk.org/browse/JDK-8068370 Link: https://github.com/dotnet/runtime/issues/58964 Link: https://lore.kernel.org/r/20240531-vfs-i_writecount-v1-1-a17bea7ee36b@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/binfmt_elf.c | 2 -- fs/binfmt_elf_fdpic.c | 5 +---- fs/binfmt_misc.c | 7 ++----- fs/exec.c | 14 +++----------- kernel/fork.c | 26 +++----------------------- 5 files changed, 9 insertions(+), 45 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index a43897b03ce9..6fdec541f8bf 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1216,7 +1216,6 @@ static int load_elf_binary(struct linux_binprm *bprm) } reloc_func_desc = interp_load_addr; - allow_write_access(interpreter); fput(interpreter); kfree(interp_elf_ex); @@ -1308,7 +1307,6 @@ static int load_elf_binary(struct linux_binprm *bprm) kfree(interp_elf_ex); kfree(interp_elf_phdata); out_free_file: - allow_write_access(interpreter); if (interpreter) fput(interpreter); out_free_ph: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index b799701454a9..28a3439f163a 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -394,7 +394,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) goto error; } - allow_write_access(interpreter); fput(interpreter); interpreter = NULL; } @@ -466,10 +465,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) retval = 0; error: - if (interpreter) { - allow_write_access(interpreter); + if (interpreter) fput(interpreter); - } kfree(interpreter_name); kfree(exec_params.phdrs); kfree(exec_params.loadmap); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 68fa225f89e5..21ce5ec1ea76 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -247,13 +247,10 @@ static int load_misc_binary(struct linux_binprm *bprm) if (retval < 0) goto ret; - if (fmt->flags & MISC_FMT_OPEN_FILE) { + if (fmt->flags & MISC_FMT_OPEN_FILE) interp_file = file_clone_open(fmt->interp_file); - if (!IS_ERR(interp_file)) - deny_write_access(interp_file); - } else { + else interp_file = open_exec(fmt->interpreter); - } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) goto ret; diff --git a/fs/exec.c b/fs/exec.c index 40073142288f..4dee205452e2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -952,10 +952,6 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) path_noexec(&file->f_path))) goto exit; - err = deny_write_access(file); - if (err) - goto exit; - out: return file; @@ -971,8 +967,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) * * Returns ERR_PTR on failure or allocated struct file on success. * - * As this is a wrapper for the internal do_open_execat(), callers - * must call allow_write_access() before fput() on release. Also see + * As this is a wrapper for the internal do_open_execat(). Also see * do_close_execat(). */ struct file *open_exec(const char *name) @@ -1524,10 +1519,8 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) /* Matches do_open_execat() */ static void do_close_execat(struct file *file) { - if (!file) - return; - allow_write_access(file); - fput(file); + if (file) + fput(file); } static void free_bprm(struct linux_binprm *bprm) @@ -1846,7 +1839,6 @@ static int exec_binprm(struct linux_binprm *bprm) bprm->file = bprm->interpreter; bprm->interpreter = NULL; - allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); diff --git a/kernel/fork.c b/kernel/fork.c index 99076dbe27d8..763a042eef9c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -616,12 +616,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) exe_file = get_mm_exe_file(oldmm); RCU_INIT_POINTER(mm->exe_file, exe_file); - /* - * We depend on the oldmm having properly denied write access to the - * exe_file already. - */ - if (exe_file && deny_write_access(exe_file)) - pr_warn_once("deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU @@ -1412,20 +1406,11 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) */ old_exe_file = rcu_dereference_raw(mm->exe_file); - if (new_exe_file) { - /* - * We expect the caller (i.e., sys_execve) to already denied - * write access, so this is unlikely to fail. - */ - if (unlikely(deny_write_access(new_exe_file))) - return -EACCES; + if (new_exe_file) get_file(new_exe_file); - } rcu_assign_pointer(mm->exe_file, new_exe_file); - if (old_exe_file) { - allow_write_access(old_exe_file); + if (old_exe_file) fput(old_exe_file); - } return 0; } @@ -1464,9 +1449,6 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) return ret; } - ret = deny_write_access(new_exe_file); - if (ret) - return -EACCES; get_file(new_exe_file); /* set the new file */ @@ -1475,10 +1457,8 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) rcu_assign_pointer(mm->exe_file, new_exe_file); mmap_write_unlock(mm); - if (old_exe_file) { - allow_write_access(old_exe_file); + if (old_exe_file) fput(old_exe_file); - } return 0; } From ca86a5d2f9feed16e11023891c8c70aa57ced24e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 16 May 2024 00:10:44 +0200 Subject: [PATCH 09/27] tmpfs: don't interrupt fallocate with EINTR I have a program that sets up a periodic timer with 10ms interval. When the program attempts to call fallocate(2) on tmpfs, it goes into an infinite loop. fallocate(2) takes longer than 10ms, so it gets interrupted by a signal and it returns EINTR. On EINTR, the fallocate call is restarted, going into the same loop again. Let's change the signal_pending() check in shmem_fallocate() loop to fatal_signal_pending(). This solves the problem of shmem_fallocate() constantly restarting. Since most other filesystem's fallocate methods don't react to signals, it is unlikely userspace really relies on timely delivery of non-fatal signals while fallocate is running. Also the comment before the signal check: /* * Good, the fallocate(2) manpage permits EINTR: we may have * been interrupted because we are using up too much memory. */ indicates that the check was mainly added for OOM situations in which case the process will be sent a fatal signal so this change preserves the behavior in OOM situations. [JK: Update changelog and comment based on upstream discussion] Signed-off-by: Mikulas Patocka Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20240515221044.590-1-jack@suse.cz Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- mm/shmem.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f5d60436b604..ff7c756a7d02 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3166,10 +3166,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, struct folio *folio; /* - * Good, the fallocate(2) manpage permits EINTR: we may have - * been interrupted because we are using up too much memory. + * Check for fatal signal so that we abort early in OOM + * situations. We don't want to abort in case of non-fatal + * signals as large fallocate can take noticeable time and + * e.g. periodic timers may result in fallocate constantly + * restarting. */ - if (signal_pending(current)) + if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; From 969ce92da3112e05d1a70c344f8740a85e933f2e Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Jun 2024 17:52:55 +0200 Subject: [PATCH 10/27] vfs: stop using user_path_at_empty in do_readlinkat It is the only consumer and it saddles getname_flags with an argument set to NULL by everyone else. Instead the routine can do the empty check on its own. Then user_path_at_empty can get retired and getname_flags can lose the argument. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240604155257.109500-2-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/stat.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/fs/stat.c b/fs/stat.c index 70bd3e888cfa..7f7861544500 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -488,34 +488,39 @@ static int do_readlinkat(int dfd, const char __user *pathname, char __user *buf, int bufsiz) { struct path path; + struct filename *name; int error; - int empty = 0; unsigned int lookup_flags = LOOKUP_EMPTY; if (bufsiz <= 0) return -EINVAL; retry: - error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty); - if (!error) { - struct inode *inode = d_backing_inode(path.dentry); + name = getname_flags(pathname, lookup_flags, NULL); + error = filename_lookup(dfd, name, lookup_flags, &path, NULL); + if (unlikely(error)) { + putname(name); + return error; + } - error = empty ? -ENOENT : -EINVAL; - /* - * AFS mountpoints allow readlink(2) but are not symlinks - */ - if (d_is_symlink(path.dentry) || inode->i_op->readlink) { - error = security_inode_readlink(path.dentry); - if (!error) { - touch_atime(&path); - error = vfs_readlink(path.dentry, buf, bufsiz); - } - } - path_put(&path); - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; + /* + * AFS mountpoints allow readlink(2) but are not symlinks + */ + if (d_is_symlink(path.dentry) || + d_backing_inode(path.dentry)->i_op->readlink) { + error = security_inode_readlink(path.dentry); + if (!error) { + touch_atime(&path); + error = vfs_readlink(path.dentry, buf, bufsiz); } + } else { + error = (name->name[0] == '\0') ? -ENOENT : -EINVAL; + } + path_put(&path); + putname(name); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; } return error; } From dff60734fc7606fabde668ab6a26feacec8787cc Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Jun 2024 17:52:56 +0200 Subject: [PATCH 11/27] vfs: retire user_path_at_empty and drop empty arg from getname_flags No users after do_readlinkat started doing the job on its own. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240604155257.109500-3-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fsopen.c | 2 +- fs/namei.c | 16 +++++++--------- fs/stat.c | 6 +++--- include/linux/fs.h | 2 +- include/linux/namei.h | 8 +------- io_uring/statx.c | 3 +-- io_uring/xattr.c | 4 ++-- 7 files changed, 16 insertions(+), 25 deletions(-) diff --git a/fs/fsopen.c b/fs/fsopen.c index 18fe979da7e2..ed2dd000622e 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -448,7 +448,7 @@ SYSCALL_DEFINE5(fsconfig, fallthrough; case FSCONFIG_SET_PATH: param.type = fs_value_is_filename; - param.name = getname_flags(_value, lookup_flags, NULL); + param.name = getname_flags(_value, lookup_flags); if (IS_ERR(param.name)) { ret = PTR_ERR(param.name); goto out_key; diff --git a/fs/namei.c b/fs/namei.c index 37fb0a8aa09a..950ad6bdd9fe 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -126,7 +126,7 @@ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) struct filename * -getname_flags(const char __user *filename, int flags, int *empty) +getname_flags(const char __user *filename, int flags) { struct filename *result; char *kname; @@ -190,8 +190,6 @@ getname_flags(const char __user *filename, int flags, int *empty) atomic_set(&result->refcnt, 1); /* The empty path is special. */ if (unlikely(!len)) { - if (empty) - *empty = 1; if (!(flags & LOOKUP_EMPTY)) { putname(result); return ERR_PTR(-ENOENT); @@ -209,13 +207,13 @@ getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; - return getname_flags(filename, flags, NULL); + return getname_flags(filename, flags); } struct filename * getname(const char __user * filename) { - return getname_flags(filename, 0, NULL); + return getname_flags(filename, 0); } struct filename * @@ -2922,16 +2920,16 @@ int path_pts(struct path *path) } #endif -int user_path_at_empty(int dfd, const char __user *name, unsigned flags, - struct path *path, int *empty) +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) { - struct filename *filename = getname_flags(name, flags, empty); + struct filename *filename = getname_flags(name, flags); int ret = filename_lookup(dfd, filename, flags, path, NULL); putname(filename); return ret; } -EXPORT_SYMBOL(user_path_at_empty); +EXPORT_SYMBOL(user_path_at); int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) diff --git a/fs/stat.c b/fs/stat.c index 7f7861544500..16aa1f5ceec4 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -300,7 +300,7 @@ int vfs_fstatat(int dfd, const char __user *filename, return vfs_fstat(dfd, stat); } - name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL); + name = getname_flags(filename, getname_statx_lookup_flags(statx_flags)); ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); putname(name); @@ -496,7 +496,7 @@ static int do_readlinkat(int dfd, const char __user *pathname, return -EINVAL; retry: - name = getname_flags(pathname, lookup_flags, NULL); + name = getname_flags(pathname, lookup_flags); error = filename_lookup(dfd, name, lookup_flags, &path, NULL); if (unlikely(error)) { putname(name); @@ -710,7 +710,7 @@ SYSCALL_DEFINE5(statx, int ret; struct filename *name; - name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL); + name = getname_flags(filename, getname_statx_lookup_flags(flags)); ret = do_statx(dfd, name, flags, mask, buffer); putname(name); diff --git a/include/linux/fs.h b/include/linux/fs.h index 639885621608..bfc1e6407bf6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2701,7 +2701,7 @@ static inline struct file *file_clone_open(struct file *file) } extern int filp_close(struct file *, fl_owner_t id); -extern struct filename *getname_flags(const char __user *, int, int *); +extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); diff --git a/include/linux/namei.h b/include/linux/namei.h index 967aa9ea9f96..8ec8fed3bce8 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -50,13 +50,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; extern int path_pts(struct path *path); -extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); - -static inline int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) -{ - return user_path_at_empty(dfd, name, flags, path, NULL); -} +extern int user_path_at(int, const char __user *, unsigned, struct path *); struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, diff --git a/io_uring/statx.c b/io_uring/statx.c index abb874209caa..f7f9b202eec0 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -37,8 +37,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sx->flags = READ_ONCE(sqe->statx_flags); sx->filename = getname_flags(path, - getname_statx_lookup_flags(sx->flags), - NULL); + getname_statx_lookup_flags(sx->flags)); if (IS_ERR(sx->filename)) { int ret = PTR_ERR(sx->filename); diff --git a/io_uring/xattr.c b/io_uring/xattr.c index 44905b82eea8..6cf41c3bc369 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -96,7 +96,7 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + ix->filename = getname_flags(path, LOOKUP_FOLLOW); if (IS_ERR(ix->filename)) { ret = PTR_ERR(ix->filename); ix->filename = NULL; @@ -189,7 +189,7 @@ int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + ix->filename = getname_flags(path, LOOKUP_FOLLOW); if (IS_ERR(ix->filename)) { ret = PTR_ERR(ix->filename); ix->filename = NULL; From d4f50ea957cab6ea940cc072a142b1e964a10ee6 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Jun 2024 17:52:57 +0200 Subject: [PATCH 12/27] vfs: shave a branch in getname_flags Check for an error while copying and no path in one branch. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240604155257.109500-4-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namei.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 950ad6bdd9fe..3d3674c21d3c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -148,9 +148,20 @@ getname_flags(const char __user *filename, int flags) result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); - if (unlikely(len < 0)) { - __putname(result); - return ERR_PTR(len); + /* + * Handle both empty path and copy failure in one go. + */ + if (unlikely(len <= 0)) { + if (unlikely(len < 0)) { + __putname(result); + return ERR_PTR(len); + } + + /* The empty path is special. */ + if (!(flags & LOOKUP_EMPTY)) { + __putname(result); + return ERR_PTR(-ENOENT); + } } /* @@ -180,6 +191,12 @@ getname_flags(const char __user *filename, int flags) kfree(result); return ERR_PTR(len); } + /* The empty path is special. */ + if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { + __putname(kname); + kfree(result); + return ERR_PTR(-ENOENT); + } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); @@ -188,14 +205,6 @@ getname_flags(const char __user *filename, int flags) } atomic_set(&result->refcnt, 1); - /* The empty path is special. */ - if (unlikely(!len)) { - if (!(flags & LOOKUP_EMPTY)) { - putname(result); - return ERR_PTR(-ENOENT); - } - } - result->uptr = filename; result->aname = NULL; audit_getname(result); From deebbd505c7bf8913451095c3ac9dcec39c7a025 Mon Sep 17 00:00:00 2001 From: Jemmy Date: Fri, 7 Jun 2024 01:39:12 +0800 Subject: [PATCH 13/27] Improve readability of copy_tree by employing `copy mount tree from src to dst` concept. This involves renaming the opaque variables (e.g., p, q, r, s) to be more descriptive, aiming to make the code easier to understand. Changes: mnt -> src_root (root of the tree to copy) r -> src_root_child (direct child of the root being cloning) p -> src_parent (parent of src_mnt) s -> src_mnt (current mount being copying) parent -> dst_parent (parent of dst_child) q -> dst_mnt (freshly cloned mount) Signed-off-by: Jemmy Link: https://lore.kernel.org/r/20240606173912.99442-1-jemmywong512@gmail.com Signed-off-by: Christian Brauner --- fs/namespace.c | 59 ++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 4386787210c7..2b85d2f53df6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1966,69 +1966,72 @@ static bool mnt_ns_loop(struct dentry *dentry) return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } -struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, +struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, int flag) { - struct mount *res, *p, *q, *r, *parent; + struct mount *res, *src_parent, *src_root_child, *src_mnt, + *dst_parent, *dst_mnt; - if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) + if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root)) return ERR_PTR(-EINVAL); if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); - res = q = clone_mnt(mnt, dentry, flag); - if (IS_ERR(q)) - return q; + res = dst_mnt = clone_mnt(src_root, dentry, flag); + if (IS_ERR(dst_mnt)) + return dst_mnt; - q->mnt_mountpoint = mnt->mnt_mountpoint; + src_parent = src_root; + dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint; - p = mnt; - list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { - struct mount *s; - if (!is_subdir(r->mnt_mountpoint, dentry)) + list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { + if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) continue; - for (s = r; s; s = next_mnt(s, r)) { + for (src_mnt = src_root_child; src_mnt; + src_mnt = next_mnt(src_mnt, src_root_child)) { if (!(flag & CL_COPY_UNBINDABLE) && - IS_MNT_UNBINDABLE(s)) { - if (s->mnt.mnt_flags & MNT_LOCKED) { + IS_MNT_UNBINDABLE(src_mnt)) { + if (src_mnt->mnt.mnt_flags & MNT_LOCKED) { /* Both unbindable and locked. */ - q = ERR_PTR(-EPERM); + dst_mnt = ERR_PTR(-EPERM); goto out; } else { - s = skip_mnt_tree(s); + src_mnt = skip_mnt_tree(src_mnt); continue; } } if (!(flag & CL_COPY_MNT_NS_FILE) && - is_mnt_ns_file(s->mnt.mnt_root)) { - s = skip_mnt_tree(s); + is_mnt_ns_file(src_mnt->mnt.mnt_root)) { + src_mnt = skip_mnt_tree(src_mnt); continue; } - while (p != s->mnt_parent) { - p = p->mnt_parent; - q = q->mnt_parent; + while (src_parent != src_mnt->mnt_parent) { + src_parent = src_parent->mnt_parent; + dst_mnt = dst_mnt->mnt_parent; } - p = s; - parent = q; - q = clone_mnt(p, p->mnt.mnt_root, flag); - if (IS_ERR(q)) + + src_parent = src_mnt; + dst_parent = dst_mnt; + dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag); + if (IS_ERR(dst_mnt)) goto out; lock_mount_hash(); - list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, parent, p->mnt_mp, false); + list_add_tail(&dst_mnt->mnt_list, &res->mnt_list); + attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false); unlock_mount_hash(); } } return res; + out: if (res) { lock_mount_hash(); umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } - return q; + return dst_mnt; } /* Caller should check returned pointer for errors */ From be4edd1642ee205ed7bbf66edc0453b1be1fb8d7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Jun 2024 22:23:04 +0800 Subject: [PATCH 14/27] hfsplus: fix to avoid false alarm of circular locking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syzbot report potential ABBA deadlock as below: loop0: detected capacity change from 0 to 1024 ====================================================== WARNING: possible circular locking dependency detected 6.9.0-syzkaller-10323-g8f6a15f095a6 #0 Not tainted ------------------------------------------------------ syz-executor171/5344 is trying to acquire lock: ffff88807cb980b0 (&tree->tree_lock){+.+.}-{3:3}, at: hfsplus_file_truncate+0x811/0xb50 fs/hfsplus/extents.c:595 but task is already holding lock: ffff88807a930108 (&HFSPLUS_I(inode)->extents_lock){+.+.}-{3:3}, at: hfsplus_file_truncate+0x2da/0xb50 fs/hfsplus/extents.c:576 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&HFSPLUS_I(inode)->extents_lock){+.+.}-{3:3}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __mutex_lock_common kernel/locking/mutex.c:608 [inline] __mutex_lock+0x136/0xd70 kernel/locking/mutex.c:752 hfsplus_file_extend+0x21b/0x1b70 fs/hfsplus/extents.c:457 hfsplus_bmap_reserve+0x105/0x4e0 fs/hfsplus/btree.c:358 hfsplus_rename_cat+0x1d0/0x1050 fs/hfsplus/catalog.c:456 hfsplus_rename+0x12e/0x1c0 fs/hfsplus/dir.c:552 vfs_rename+0xbdb/0xf00 fs/namei.c:4887 do_renameat2+0xd94/0x13f0 fs/namei.c:5044 __do_sys_rename fs/namei.c:5091 [inline] __se_sys_rename fs/namei.c:5089 [inline] __x64_sys_rename+0x86/0xa0 fs/namei.c:5089 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (&tree->tree_lock){+.+.}-{3:3}: check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869 __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __mutex_lock_common kernel/locking/mutex.c:608 [inline] __mutex_lock+0x136/0xd70 kernel/locking/mutex.c:752 hfsplus_file_truncate+0x811/0xb50 fs/hfsplus/extents.c:595 hfsplus_setattr+0x1ce/0x280 fs/hfsplus/inode.c:265 notify_change+0xb9d/0xe70 fs/attr.c:497 do_truncate+0x220/0x310 fs/open.c:65 handle_truncate fs/namei.c:3308 [inline] do_open fs/namei.c:3654 [inline] path_openat+0x2a3d/0x3280 fs/namei.c:3807 do_filp_open+0x235/0x490 fs/namei.c:3834 do_sys_openat2+0x13e/0x1d0 fs/open.c:1406 do_sys_open fs/open.c:1421 [inline] __do_sys_creat fs/open.c:1497 [inline] __se_sys_creat fs/open.c:1491 [inline] __x64_sys_creat+0x123/0x170 fs/open.c:1491 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&HFSPLUS_I(inode)->extents_lock); lock(&tree->tree_lock); lock(&HFSPLUS_I(inode)->extents_lock); lock(&tree->tree_lock); This is a false alarm as tree_lock mutex are different, one is from sbi->cat_tree, and another is from sbi->ext_tree: Thread A Thread B - hfsplus_rename - hfsplus_rename_cat - hfs_find_init - mutext_lock(cat_tree->tree_lock) - hfsplus_setattr - hfsplus_file_truncate - mutex_lock(hip->extents_lock) - hfs_find_init - mutext_lock(ext_tree->tree_lock) - hfs_bmap_reserve - hfsplus_file_extend - mutex_lock(hip->extents_lock) So, let's call mutex_lock_nested for tree_lock mutex lock, and pass correct lock class for it. Fixes: 31651c607151 ("hfsplus: avoid deadlock on file truncation") Reported-by: syzbot+6030b3b1b9bf70e538c4@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-fsdevel/000000000000e37a4005ef129563@google.com Cc: Ernesto A. Fernández Signed-off-by: Chao Yu Link: https://lore.kernel.org/r/20240607142304.455441-1-chao@kernel.org Signed-off-by: Christian Brauner --- fs/hfsplus/bfind.c | 15 ++------------- fs/hfsplus/extents.c | 9 ++++++--- fs/hfsplus/hfsplus_fs.h | 21 +++++++++++++++++++++ 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index ca2ba8c9f82e..901e83d65d20 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -25,19 +25,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->key = ptr + tree->max_key_len + 2; hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - switch (tree->cnid) { - case HFSPLUS_CAT_CNID: - mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); - break; - case HFSPLUS_EXT_CNID: - mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); - break; - case HFSPLUS_ATTR_CNID: - mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); - break; - default: - BUG(); - } + mutex_lock_nested(&tree->tree_lock, + hfsplus_btree_lock_class(tree)); return 0; } diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 3c572e44f2ad..9c51867dddc5 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -430,7 +430,8 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, hfsplus_free_extents(sb, ext_entry, total_blocks - start, total_blocks); total_blocks = start; - mutex_lock(&fd.tree->tree_lock); + mutex_lock_nested(&fd.tree->tree_lock, + hfsplus_btree_lock_class(fd.tree)); } while (total_blocks > blocks); hfs_find_exit(&fd); @@ -592,7 +593,8 @@ void hfsplus_file_truncate(struct inode *inode) alloc_cnt, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->first_extents); hip->first_blocks = blk_cnt; - mutex_lock(&fd.tree->tree_lock); + mutex_lock_nested(&fd.tree->tree_lock, + hfsplus_btree_lock_class(fd.tree)); break; } res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); @@ -606,7 +608,8 @@ void hfsplus_file_truncate(struct inode *inode) hfsplus_free_extents(sb, hip->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->cached_extents); - mutex_lock(&fd.tree->tree_lock); + mutex_lock_nested(&fd.tree->tree_lock, + hfsplus_btree_lock_class(fd.tree)); if (blk_cnt > start) { hip->extent_state |= HFSPLUS_EXT_DIRTY; break; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 012a3d003fbe..9e78f181c24f 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -553,6 +553,27 @@ static inline __be32 __hfsp_ut2mt(time64_t ut) return cpu_to_be32(lower_32_bits(ut) + HFSPLUS_UTC_OFFSET); } +static inline enum hfsplus_btree_mutex_classes +hfsplus_btree_lock_class(struct hfs_btree *tree) +{ + enum hfsplus_btree_mutex_classes class; + + switch (tree->cnid) { + case HFSPLUS_CAT_CNID: + class = CATALOG_BTREE_MUTEX; + break; + case HFSPLUS_EXT_CNID: + class = EXTENTS_BTREE_MUTEX; + break; + case HFSPLUS_ATTR_CNID: + class = ATTR_BTREE_MUTEX; + break; + default: + BUG(); + } + return class; +} + /* compatibility */ #define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) } #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) From 7f07ee5a23a5ed3da7ed3574913aada6f4143a22 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 8 Jun 2024 16:34:20 +0200 Subject: [PATCH 15/27] proc: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). Note that the upper limit of ida_simple_get() is exclusive, but the one of ida_alloc_max() is inclusive. So a -1 has been added when needed. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/ae10003feb87d240163d0854de95f09e1f00be7d.1717855701.git.christophe.jaillet@wanadoo.fr Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/proc/generic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 775ce0bcf08c..c02f1e63f82d 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -202,8 +202,8 @@ int proc_alloc_inum(unsigned int *inum) { int i; - i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1, - GFP_KERNEL); + i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST, + GFP_KERNEL); if (i < 0) return i; @@ -213,7 +213,7 @@ int proc_alloc_inum(unsigned int *inum) void proc_free_inum(unsigned int inum) { - ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); + ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags) From 26a2ed107929a855155429b11e1293b83e6b2a8b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 16 Jun 2024 09:38:41 +0800 Subject: [PATCH 16/27] hfs: fix to initialize fields of hfs_inode_info after hfs_alloc_inode() Syzbot reports uninitialized value access issue as below: loop0: detected capacity change from 0 to 64 ===================================================== BUG: KMSAN: uninit-value in hfs_revalidate_dentry+0x307/0x3f0 fs/hfs/sysdep.c:30 hfs_revalidate_dentry+0x307/0x3f0 fs/hfs/sysdep.c:30 d_revalidate fs/namei.c:862 [inline] lookup_fast+0x89e/0x8e0 fs/namei.c:1649 walk_component fs/namei.c:2001 [inline] link_path_walk+0x817/0x1480 fs/namei.c:2332 path_lookupat+0xd9/0x6f0 fs/namei.c:2485 filename_lookup+0x22e/0x740 fs/namei.c:2515 user_path_at_empty+0x8b/0x390 fs/namei.c:2924 user_path_at include/linux/namei.h:57 [inline] do_mount fs/namespace.c:3689 [inline] __do_sys_mount fs/namespace.c:3898 [inline] __se_sys_mount+0x66b/0x810 fs/namespace.c:3875 __x64_sys_mount+0xe4/0x140 fs/namespace.c:3875 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b BUG: KMSAN: uninit-value in hfs_ext_read_extent fs/hfs/extent.c:196 [inline] BUG: KMSAN: uninit-value in hfs_get_block+0x92d/0x1620 fs/hfs/extent.c:366 hfs_ext_read_extent fs/hfs/extent.c:196 [inline] hfs_get_block+0x92d/0x1620 fs/hfs/extent.c:366 block_read_full_folio+0x4ff/0x11b0 fs/buffer.c:2271 hfs_read_folio+0x55/0x60 fs/hfs/inode.c:39 filemap_read_folio+0x148/0x4f0 mm/filemap.c:2426 do_read_cache_folio+0x7c8/0xd90 mm/filemap.c:3553 do_read_cache_page mm/filemap.c:3595 [inline] read_cache_page+0xfb/0x2f0 mm/filemap.c:3604 read_mapping_page include/linux/pagemap.h:755 [inline] hfs_btree_open+0x928/0x1ae0 fs/hfs/btree.c:78 hfs_mdb_get+0x260c/0x3000 fs/hfs/mdb.c:204 hfs_fill_super+0x1fb1/0x2790 fs/hfs/super.c:406 mount_bdev+0x628/0x920 fs/super.c:1359 hfs_mount+0xcd/0xe0 fs/hfs/super.c:456 legacy_get_tree+0x167/0x2e0 fs/fs_context.c:610 vfs_get_tree+0xdc/0x5d0 fs/super.c:1489 do_new_mount+0x7a9/0x16f0 fs/namespace.c:3145 path_mount+0xf98/0x26a0 fs/namespace.c:3475 do_mount fs/namespace.c:3488 [inline] __do_sys_mount fs/namespace.c:3697 [inline] __se_sys_mount+0x919/0x9e0 fs/namespace.c:3674 __ia32_sys_mount+0x15b/0x1b0 fs/namespace.c:3674 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0xa2/0x100 arch/x86/entry/common.c:178 do_fast_syscall_32+0x37/0x80 arch/x86/entry/common.c:203 do_SYSENTER_32+0x1f/0x30 arch/x86/entry/common.c:246 entry_SYSENTER_compat_after_hwframe+0x70/0x82 Uninit was created at: __alloc_pages+0x9a6/0xe00 mm/page_alloc.c:4590 __alloc_pages_node include/linux/gfp.h:238 [inline] alloc_pages_node include/linux/gfp.h:261 [inline] alloc_slab_page mm/slub.c:2190 [inline] allocate_slab mm/slub.c:2354 [inline] new_slab+0x2d7/0x1400 mm/slub.c:2407 ___slab_alloc+0x16b5/0x3970 mm/slub.c:3540 __slab_alloc mm/slub.c:3625 [inline] __slab_alloc_node mm/slub.c:3678 [inline] slab_alloc_node mm/slub.c:3850 [inline] kmem_cache_alloc_lru+0x64d/0xb30 mm/slub.c:3879 alloc_inode_sb include/linux/fs.h:3018 [inline] hfs_alloc_inode+0x5a/0xc0 fs/hfs/super.c:165 alloc_inode+0x83/0x440 fs/inode.c:260 new_inode_pseudo fs/inode.c:1005 [inline] new_inode+0x38/0x4f0 fs/inode.c:1031 hfs_new_inode+0x61/0x1010 fs/hfs/inode.c:186 hfs_mkdir+0x54/0x250 fs/hfs/dir.c:228 vfs_mkdir+0x49a/0x700 fs/namei.c:4126 do_mkdirat+0x529/0x810 fs/namei.c:4149 __do_sys_mkdirat fs/namei.c:4164 [inline] __se_sys_mkdirat fs/namei.c:4162 [inline] __x64_sys_mkdirat+0xc8/0x120 fs/namei.c:4162 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b It missed to initialize .tz_secondswest, .cached_start and .cached_blocks fields in struct hfs_inode_info after hfs_alloc_inode(), fix it. Cc: stable@vger.kernel.org Reported-by: syzbot+3ae6be33a50b5aae4dab@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-fsdevel/0000000000005ad04005ee48897f@google.com Signed-off-by: Chao Yu Link: https://lore.kernel.org/r/20240616013841.2217-1-chao@kernel.org Signed-off-by: Christian Brauner --- fs/hfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 8c34798a0715..744e10b46904 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -200,6 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; HFS_I(inode)->fs_blocks = 0; + HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; if (S_ISDIR(mode)) { inode->i_size = 2; HFS_SB(sb)->folder_count++; @@ -275,6 +276,8 @@ void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, for (count = 0, i = 0; i < 3; i++) count += be16_to_cpu(ext[i].count); HFS_I(inode)->first_blocks = count; + HFS_I(inode)->cached_start = 0; + HFS_I(inode)->cached_blocks = 0; inode->i_size = HFS_I(inode)->phys_size = log_size; HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; From 5e362bd5eecdd7ccafc9611a108a53423ccd5065 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Jun 2024 14:03:59 +0200 Subject: [PATCH 17/27] vfs: reorder checks in may_create_in_sticky The routine is called for all directories on file creation and weirdly postpones the check if the dir is sticky to begin with. Instead it first checks fifos and regular files (in that order), while avoidably pulling globals. No functional changes. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240620120359.151258-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 3d3674c21d3c..8b40991cb59e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1246,9 +1246,9 @@ static int may_create_in_sticky(struct mnt_idmap *idmap, umode_t dir_mode = nd->dir_mode; vfsuid_t dir_vfsuid = nd->dir_vfsuid; - if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || - (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || - likely(!(dir_mode & S_ISVTX)) || + if (likely(!(dir_mode & S_ISVTX)) || + (S_ISREG(inode->i_mode) && !sysctl_protected_regular) || + (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) || vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) || vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return 0; From 9b6a14f08b4875aa22ea0b5bc35042e2580b311b Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 20 Jun 2024 11:23:33 +0800 Subject: [PATCH 18/27] fs: Export in_group_or_capable() Export in_group_or_capable() as a VFS helper function. Signed-off-by: Youling Tang Link: https://lore.kernel.org/r/20240620032335.147136-1-youling.tang@linux.dev Signed-off-by: Christian Brauner --- fs/attr.c | 2 -- fs/inode.c | 1 + include/linux/fs.h | 2 ++ 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index 960a310581eb..825007d5cda4 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -17,8 +17,6 @@ #include #include -#include "internal.h" - /** * setattr_should_drop_sgid - determine whether the setgid bit needs to be * removed diff --git a/fs/inode.c b/fs/inode.c index 3a41f83a4ba5..e0815acc5abb 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2538,6 +2538,7 @@ bool in_group_or_capable(struct mnt_idmap *idmap, return true; return false; } +EXPORT_SYMBOL(in_group_or_capable); /** * mode_strip_sgid - handle the sgid bit for non-directories diff --git a/include/linux/fs.h b/include/linux/fs.h index bfc1e6407bf6..942cb11dba96 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1942,6 +1942,8 @@ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, extern bool may_open_dev(const struct path *path); umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); +bool in_group_or_capable(struct mnt_idmap *idmap, + const struct inode *inode, vfsgid_t vfsgid); /* * This is the "filldir" function type, used by readdir() to let From 8444ee22adb0c8b09ccf9c183bdf6d649d67ddda Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 20 Jun 2024 11:23:34 +0800 Subject: [PATCH 19/27] f2fs: Use in_group_or_capable() helper Use the in_group_or_capable() helper function to simplify the code. Signed-off-by: Youling Tang Link: https://lore.kernel.org/r/20240620032335.147136-2-youling.tang@linux.dev Signed-off-by: Christian Brauner --- fs/f2fs/acl.c | 3 +-- fs/f2fs/file.c | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index ec2aeccb69a3..8bffdeccdbc3 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -219,8 +219,7 @@ static int f2fs_acl_update_mode(struct mnt_idmap *idmap, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && - !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; *mode_p = mode; return 0; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c0b281a70f3..7a23434963d1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -923,10 +923,8 @@ static void __setattr_copy(struct mnt_idmap *idmap, inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); - if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; set_acl_inode(inode, mode); } From 153216cf7bd50842ffc3d500ea96adeb65db63ef Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 20 Jun 2024 11:23:35 +0800 Subject: [PATCH 20/27] fuse: Use in_group_or_capable() helper Use the in_group_or_capable() helper function to simplify the code. Signed-off-by: Youling Tang Link: https://lore.kernel.org/r/20240620032335.147136-3-youling.tang@linux.dev Signed-off-by: Christian Brauner --- fs/fuse/acl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 3d192b80a561..04cfd8fee992 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -146,8 +146,8 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, * be stripped. */ if (fc->posix_acl && - !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) && - !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) + !in_group_or_capable(&nop_mnt_idmap, inode, + i_gid_into_vfsgid(&nop_mnt_idmap, inode))) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); From 8e3447822d7d8c0f562c6851a7a31e24d1ede55e Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 24 Jun 2024 10:54:02 +0200 Subject: [PATCH 21/27] vfs: remove redundant smp_mb for thp handling in do_dentry_open opening for write performs: if (f->f_mode & FMODE_WRITE) { [snip] smp_mb(); if (filemap_nr_thps(inode->i_mapping)) { [snip] } } filemap_nr_thps on kernels built without CONFIG_READ_ONLY_THP_FOR expands to 0, allowing the compiler to eliminate the entire thing, with exception of the fence (and the branch leading there). So happens required synchronisation between i_writecount and nr_thps changes is already provided by the full fence coming from get_write_access -> atomic_inc_unless_negative, thus the smp_mb instance above can be removed regardless of CONFIG_READ_ONLY_THP_FOR. While I updated commentary in places claiming to match the now-removed fence, I did not try to patch them to act on the compile option. I did not bother benchmarking it, not issuing a spurious full fence in the fast path does not warrant justification from perf standpoint. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240624085402.493630-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/open.c | 9 ++++----- mm/khugepaged.c | 10 +++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/open.c b/fs/open.c index a5c4f8a0f143..c4e9b01aafd8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -986,12 +986,11 @@ static int do_dentry_open(struct file *f, */ if (f->f_mode & FMODE_WRITE) { /* - * Paired with smp_mb() in collapse_file() to ensure nr_thps - * is up to date and the update to i_writecount by - * get_write_access() is visible. Ensures subsequent insertion - * of THPs into the page cache will fail. + * Depends on full fence from get_write_access() to synchronize + * against collapse_file() regarding i_writecount and nr_thps + * updates. Ensures subsequent insertion of THPs into the page + * cache will fail. */ - smp_mb(); if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 774a97e6e2da..aab471791bd9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2000,9 +2000,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (!is_shmem) { filemap_nr_thps_inc(mapping); /* - * Paired with smp_mb() in do_dentry_open() to ensure - * i_writecount is up to date and the update to nr_thps is - * visible. Ensures the page cache will be truncated if the + * Paired with the fence in do_dentry_open() -> get_write_access() + * to ensure i_writecount is up to date and the update to nr_thps + * is visible. Ensures the page cache will be truncated if the * file is opened writable. */ smp_mb(); @@ -2190,8 +2190,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (!is_shmem && result == SCAN_COPY_MC) { filemap_nr_thps_dec(mapping); /* - * Paired with smp_mb() in do_dentry_open() to - * ensure the update to nr_thps is visible. + * Paired with the fence in do_dentry_open() -> get_write_access() + * to ensure the update to nr_thps is visible. */ smp_mb(); } From 9fb9ff7ed1656dbd2168ebfc21b8c33666a994fa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 26 Jun 2024 14:23:04 +0200 Subject: [PATCH 22/27] fs: reflow may_create_in_sticky() This function is so messy and hard to read, let's reflow it to make it more readable. Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240621-affekt-denkzettel-3c115f68355a@brauner Signed-off-by: Christian Brauner --- fs/namei.c | 51 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 8b40991cb59e..f10b6dde7dab 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1240,29 +1240,48 @@ int may_linkat(struct mnt_idmap *idmap, const struct path *link) * * Returns 0 if the open is allowed, -ve on error. */ -static int may_create_in_sticky(struct mnt_idmap *idmap, - struct nameidata *nd, struct inode *const inode) +static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, + struct inode *const inode) { umode_t dir_mode = nd->dir_mode; - vfsuid_t dir_vfsuid = nd->dir_vfsuid; + vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; - if (likely(!(dir_mode & S_ISVTX)) || - (S_ISREG(inode->i_mode) && !sysctl_protected_regular) || - (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) || - vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) || - vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) + if (likely(!(dir_mode & S_ISVTX))) return 0; - if (likely(dir_mode & 0002) || - (dir_mode & 0020 && - ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || - (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { - const char *operation = S_ISFIFO(inode->i_mode) ? - "sticky_create_fifo" : - "sticky_create_regular"; - audit_log_path_denied(AUDIT_ANOM_CREAT, operation); + if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) + return 0; + + if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) + return 0; + + i_vfsuid = i_uid_into_vfsuid(idmap, inode); + + if (vfsuid_eq(i_vfsuid, dir_vfsuid)) + return 0; + + if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) + return 0; + + if (likely(dir_mode & 0002)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); return -EACCES; } + + if (dir_mode & 0020) { + if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, + "sticky_create_fifo"); + return -EACCES; + } + + if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, + "sticky_create_regular"); + return -EACCES; + } + } + return 0; } From 1bc6d4452d5c91beb09e37a98a590808e1997b79 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 30 Apr 2024 13:54:46 +0200 Subject: [PATCH 23/27] fs: new helper vfs_empty_path() Make it possible to quickly check whether AT_EMPTY_PATH is valid. Note, after some discussion we decided to also allow NULL to be passed instead of requiring the empty string. Signed-off-by: Christian Brauner --- include/linux/fs.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 942cb11dba96..5ff362277834 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3631,4 +3631,21 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +static inline bool vfs_empty_path(int dfd, const char __user *path) +{ + char c; + + if (dfd < 0) + return false; + + /* We now allow NULL to be used for empty path. */ + if (!path) + return true; + + if (unlikely(get_user(c, path))) + return false; + + return !c; +} + #endif /* _LINUX_FS_H */ From 27a2d0cb2f38c67b58285e6124b14f7fff3fd1a8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 30 Apr 2024 13:57:58 +0200 Subject: [PATCH 24/27] stat: use vfs_empty_path() helper Use the newly added helper for this. Signed-off-by: Christian Brauner --- fs/stat.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/stat.c b/fs/stat.c index 16aa1f5ceec4..5039c34a385d 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -289,16 +289,8 @@ int vfs_fstatat(int dfd, const char __user *filename, * If AT_EMPTY_PATH is set, we expect the common case to be that * empty path, and avoid doing all the extra pathname work. */ - if (dfd >= 0 && flags == AT_EMPTY_PATH) { - char c; - - ret = get_user(c, filename); - if (unlikely(ret)) - return ret; - - if (likely(!c)) - return vfs_fstat(dfd, stat); - } + if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename)) + return vfs_fstat(dfd, stat); name = getname_flags(filename, getname_statx_lookup_flags(statx_flags)); ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); From 0ef625bba6fb2bc0c8ed2aab9524fdf423f67dd5 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 25 Jun 2024 17:18:06 +0200 Subject: [PATCH 25/27] vfs: support statx(..., NULL, AT_EMPTY_PATH, ...) The newly used helper also checks for empty ("") paths. NULL paths with any flag value other than AT_EMPTY_PATH go the usual route and end up with -EFAULT to retain compatibility (Rust is abusing calls of the sort to detect availability of statx). This avoids path lookup code, lockref management, memory allocation and in case of NULL path userspace memory access (which can be quite expensive with SMAP on x86_64). Benchmarked with statx(..., AT_EMPTY_PATH, ...) running on Sapphire Rapids, with the "" path for the first two cases and NULL for the last one. Results in ops/s: stock: 4231237 pre-check: 5944063 (+40%) NULL path: 6601619 (+11%/+56%) Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240625151807.620812-1-mjguzik@gmail.com Tested-by: Xi Ruoyao [brauner: use path_mounted() and other tweaks] Signed-off-by: Christian Brauner --- fs/internal.h | 14 +++++++ fs/namespace.c | 13 ------ fs/stat.c | 111 ++++++++++++++++++++++++++++++++++++------------- 3 files changed, 97 insertions(+), 41 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index ab2225136f60..f26454c60a98 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -247,6 +247,8 @@ extern const struct dentry_operations ns_dentry_operations; int getname_statx_lookup_flags(int flags); int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); +int do_statx_fd(int fd, unsigned int flags, unsigned int mask, + struct statx __user *buffer); /* * fs/splice.c: @@ -321,3 +323,15 @@ struct stashed_operations { int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); +/** + * path_mounted - check whether path is mounted + * @path: path to check + * + * Determine whether @path refers to the root of a mount. + * + * Return: true if @path is the root of a mount, false if not. + */ +static inline bool path_mounted(const struct path *path) +{ + return path->mnt->mnt_root == path->dentry; +} diff --git a/fs/namespace.c b/fs/namespace.c index 2b85d2f53df6..3aadd5361a18 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1846,19 +1846,6 @@ bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } -/** - * path_mounted - check whether path is mounted - * @path: path to check - * - * Determine whether @path refers to the root of a mount. - * - * Return: true if @path is the root of a mount, false if not. - */ -static inline bool path_mounted(const struct path *path) -{ - return path->mnt->mnt_root == path->dentry; -} - static void warn_mandlock(void) { pr_warn_once("=======================================================\n" diff --git a/fs/stat.c b/fs/stat.c index 5039c34a385d..6f65b3456cad 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -214,6 +214,43 @@ int getname_statx_lookup_flags(int flags) return lookup_flags; } +static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, + u32 request_mask) +{ + int error = vfs_getattr(path, stat, request_mask, flags); + + if (request_mask & STATX_MNT_ID_UNIQUE) { + stat->mnt_id = real_mount(path->mnt)->mnt_id_unique; + stat->result_mask |= STATX_MNT_ID_UNIQUE; + } else { + stat->mnt_id = real_mount(path->mnt)->mnt_id; + stat->result_mask |= STATX_MNT_ID; + } + + if (path_mounted(path)) + stat->attributes |= STATX_ATTR_MOUNT_ROOT; + stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; + + /* Handle STATX_DIOALIGN for block devices. */ + if (request_mask & STATX_DIOALIGN) { + struct inode *inode = d_backing_inode(path->dentry); + + if (S_ISBLK(inode->i_mode)) + bdev_statx_dioalign(inode, stat); + } + + return error; +} + +static int vfs_statx_fd(int fd, int flags, struct kstat *stat, + u32 request_mask) +{ + CLASS(fd_raw, f)(fd); + if (!f.file) + return -EBADF; + return vfs_statx_path(&f.file->f_path, flags, stat, request_mask); +} + /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename @@ -243,36 +280,13 @@ static int vfs_statx(int dfd, struct filename *filename, int flags, retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) - goto out; - - error = vfs_getattr(&path, stat, request_mask, flags); - - if (request_mask & STATX_MNT_ID_UNIQUE) { - stat->mnt_id = real_mount(path.mnt)->mnt_id_unique; - stat->result_mask |= STATX_MNT_ID_UNIQUE; - } else { - stat->mnt_id = real_mount(path.mnt)->mnt_id; - stat->result_mask |= STATX_MNT_ID; - } - - if (path.mnt->mnt_root == path.dentry) - stat->attributes |= STATX_ATTR_MOUNT_ROOT; - stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; - - /* Handle STATX_DIOALIGN for block devices. */ - if (request_mask & STATX_DIOALIGN) { - struct inode *inode = d_backing_inode(path.dentry); - - if (S_ISBLK(inode->i_mode)) - bdev_statx_dioalign(inode, stat); - } - + return error; + error = vfs_statx_path(&path, flags, stat, request_mask); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } -out: return error; } @@ -671,7 +685,8 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; - /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests + /* + * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests * from userland. */ mask &= ~STATX_CHANGE_COOKIE; @@ -683,16 +698,41 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, return cp_statx(&stat, buffer); } +int do_statx_fd(int fd, unsigned int flags, unsigned int mask, + struct statx __user *buffer) +{ + struct kstat stat; + int error; + + if (mask & STATX__RESERVED) + return -EINVAL; + if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) + return -EINVAL; + + /* + * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests + * from userland. + */ + mask &= ~STATX_CHANGE_COOKIE; + + error = vfs_statx_fd(fd, flags, &stat, mask); + if (error) + return error; + + return cp_statx(&stat, buffer); +} + /** * sys_statx - System call to get enhanced stats * @dfd: Base directory to pathwalk from *or* fd to stat. - * @filename: File to stat or "" with AT_EMPTY_PATH + * @filename: File to stat or either NULL or "" with AT_EMPTY_PATH * @flags: AT_* flags to control pathwalk. * @mask: Parts of statx struct actually required. * @buffer: Result buffer. * * Note that fstat() can be emulated by setting dfd to the fd of interest, - * supplying "" as the filename and setting AT_EMPTY_PATH in the flags. + * supplying "" (or preferably NULL) as the filename and setting AT_EMPTY_PATH + * in the flags. */ SYSCALL_DEFINE5(statx, int, dfd, const char __user *, filename, unsigned, flags, @@ -700,8 +740,23 @@ SYSCALL_DEFINE5(statx, struct statx __user *, buffer) { int ret; + unsigned lflags; struct filename *name; + /* + * Short-circuit handling of NULL and "" paths. + * + * For a NULL path we require and accept only the AT_EMPTY_PATH flag + * (possibly |'d with AT_STATX flags). + * + * However, glibc on 32-bit architectures implements fstatat as statx + * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags. + * Supporting this results in the uglification below. + */ + lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE); + if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename)) + return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer); + name = getname_flags(filename, getname_statx_lookup_flags(flags)); ret = do_statx(dfd, name, flags, mask, buffer); putname(name); From f378ec4eec8b7e607dbc0112913fd3d5d84eb1b8 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Jun 2024 18:11:52 +0200 Subject: [PATCH 26/27] vfs: rename parent_ino to d_parent_ino and make it use RCU The routine is used by procfs through dir_emit_dots. The combined RCU and lock fallback implementation is too big for an inline. Given that the routine takes a dentry argument fs/dcache.c seems like the place to put it in. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240627161152.802567-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/dcache.c | 28 ++++++++++++++++++++++++++++ fs/f2fs/file.c | 2 +- fs/hfsplus/ioctl.c | 4 ++-- include/linux/dcache.h | 2 ++ include/linux/fs.h | 16 +--------------- 5 files changed, 34 insertions(+), 18 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 58b89c9e9b0c..68e843e78337 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3099,6 +3099,34 @@ void d_tmpfile(struct file *file, struct inode *inode) } EXPORT_SYMBOL(d_tmpfile); +/* + * Obtain inode number of the parent dentry. + */ +ino_t d_parent_ino(struct dentry *dentry) +{ + struct dentry *parent; + struct inode *iparent; + unsigned seq; + ino_t ret; + + scoped_guard(rcu) { + seq = raw_seqcount_begin(&dentry->d_seq); + parent = READ_ONCE(dentry->d_parent); + iparent = d_inode_rcu(parent); + if (likely(iparent)) { + ret = iparent->i_ino; + if (!read_seqcount_retry(&dentry->d_seq, seq)) + return ret; + } + } + + spin_lock(&dentry->d_lock); + ret = dentry->d_parent->d_inode->i_ino; + spin_unlock(&dentry->d_lock); + return ret; +} +EXPORT_SYMBOL(d_parent_ino); + static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7a23434963d1..c1ad9b278c47 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -185,7 +185,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - *pino = parent_ino(dentry); + *pino = d_parent_ino(dentry); dput(dentry); return 1; } diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 5661a2e24d03..40d04dba13ac 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -40,7 +40,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) /* Directory containing the bootable system */ vh->finder_info[0] = bvh->finder_info[0] = - cpu_to_be32(parent_ino(dentry)); + cpu_to_be32(d_parent_ino(dentry)); /* * Bootloader. Just using the inode here breaks in the case of @@ -51,7 +51,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) /* Per spec, the OS X system folder - same as finder_info[0] here */ vh->finder_info[5] = bvh->finder_info[5] = - cpu_to_be32(parent_ino(dentry)); + cpu_to_be32(d_parent_ino(dentry)); mutex_unlock(&sbi->vh_mutex); return 0; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bf53e3894aae..ea58843942b9 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -278,6 +278,8 @@ static inline unsigned d_count(const struct dentry *dentry) return dentry->d_lockref.count; } +ino_t d_parent_ino(struct dentry *dentry); + /* * helper function for dentry_operations.d_dname() members */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 5ff362277834..2fa06a4d197a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3454,20 +3454,6 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return 0; } -static inline ino_t parent_ino(struct dentry *dentry) -{ - ino_t res; - - /* - * Don't strictly need d_lock here? If the parent ino could change - * then surely we'd have a deeper race in the caller? - */ - spin_lock(&dentry->d_lock); - res = dentry->d_parent->d_inode->i_ino; - spin_unlock(&dentry->d_lock); - return res; -} - /* Transaction based IO helpers */ /* @@ -3592,7 +3578,7 @@ static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, - parent_ino(file->f_path.dentry), DT_DIR); + d_parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { From b80cc4df1124702c600fd43b784e423a30919204 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Mon, 8 Jul 2024 16:04:04 +0800 Subject: [PATCH 27/27] ipc: mqueue: remove assignment from IS_ERR argument Remove assignment from IS_ERR() argument. This is detected by coccinelle. Signed-off-by: Chen Ni Link: https://lore.kernel.org/r/20240708080404.3859094-1-nichen@iscas.ac.cn Signed-off-by: Christian Brauner --- ipc/mqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 5eea4dc0509e..a7cbd69efbef 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -903,7 +903,8 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, audit_mq_open(oflag, mode, attr); - if (IS_ERR(name = getname(u_name))) + name = getname(u_name); + if (IS_ERR(name)) return PTR_ERR(name); fd = get_unused_fd_flags(O_CLOEXEC);