linux/fs/internal.h
Linus Torvalds 8c9440fea7 vfs-6.8.mount
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZZU0CgAKCRCRxhvAZXjc
 osncAQDSJK0frJL+72NqXxa4YNzivrnuw6fhp5iaDAEqxdm8ygEAoJWyh7Rmkt8G
 drAXWGyGnCYqv7UgC6axLyciid7TxQg=
 =vJuv
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.8.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs mount updates from Christian Brauner:
 "This contains the work to retrieve detailed information about mounts
  via two new system calls. This is hopefully the beginning of the end
  of the saga that started with fsinfo() years ago.

  The LWN articles in [1] and [2] can serve as a summary so we can avoid
  rehashing everything here.

  At LSFMM in May 2022 we got into a room and agreed on what we want to
  do about fsinfo(). Basically, split it into pieces. This is the first
  part of that agreement. Specifically, it is concerned with retrieving
  information about mounts. So this only concerns the mount information
  retrieval, not the mount table change notification, or the extended
  filesystem specific mount option work. That is separate work.

  Currently mounts have a 32bit id. Mount ids are already in heavy use
  by libmount and other low-level userspace but they can't be relied
  upon because they're recycled very quickly. We agreed that mounts
  should carry a unique 64bit id by which they can be referenced
  directly. This is now implemented as part of this work.

  The new 64bit mount id is exposed in statx() through the new
  STATX_MNT_ID_UNIQUE flag. If the flag isn't raised the old mount id is
  returned. If it is raised and the kernel supports the new 64bit mount
  id the flag is raised in the result mask and the new 64bit mount id is
  returned. New and old mount ids do not overlap so they cannot be
  conflated.

  Two new system calls are introduced that operate on the 64bit mount
  id: statmount() and listmount(). A summary of the api and usage can be
  found on LWN as well (cf. [3]) but of course, I'll provide a summary
  here as well.

  Both system calls rely on struct mnt_id_req. Which is the request
  struct used to pass the 64bit mount id identifying the mount to
  operate on. It is extensible to allow for the addition of new
  parameters and for future use in other apis that make use of mount
  ids.

  statmount() mimicks the semantics of statx() and exposes a set flags
  that userspace may raise in mnt_id_req to request specific information
  to be retrieved. A statmount() call returns a struct statmount filled
  in with information about the requested mount. Supported requests are
  indicated by raising the request flag passed in struct mnt_id_req in
  the @mask argument in struct statmount.

  Currently we do support:

   - STATMOUNT_SB_BASIC:
     Basic filesystem info

   - STATMOUNT_MNT_BASIC
     Mount information (mount id, parent mount id, mount attributes etc)

   - STATMOUNT_PROPAGATE_FROM
     Propagation from what mount in current namespace

   - STATMOUNT_MNT_ROOT
     Path of the root of the mount (e.g., mount --bind /bla /mnt returns /bla)

   - STATMOUNT_MNT_POINT
     Path of the mount point (e.g., mount --bind /bla /mnt returns /mnt)

   - STATMOUNT_FS_TYPE
     Name of the filesystem type as the magic number isn't enough due to submounts

  The string options STATMOUNT_MNT_{ROOT,POINT} and STATMOUNT_FS_TYPE
  are appended to the end of the struct. Userspace can use the offsets
  in @fs_type, @mnt_root, and @mnt_point to reference those strings
  easily.

  The struct statmount reserves quite a bit of space currently for
  future extensibility. This isn't really a problem and if this bothers
  us we can just send a follow-up pull request during this cycle.

  listmount() is given a 64bit mount id via mnt_id_req just as
  statmount(). It takes a buffer and a size to return an array of the
  64bit ids of the child mounts of the requested mount. Userspace can
  thus choose to either retrieve child mounts for a mount in batches or
  iterate through the child mounts. For most use-cases it will be
  sufficient to just leave space for a few child mounts. But for big
  mount tables having an iterator is really helpful. Iterating through a
  mount table works by setting @param in mnt_id_req to the mount id of
  the last child mount retrieved in the previous listmount() call"

Link: https://lwn.net/Articles/934469 [1]
Link: https://lwn.net/Articles/829212 [2]
Link: https://lwn.net/Articles/950569 [3]

* tag 'vfs-6.8.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  add selftest for statmount/listmount
  fs: keep struct mnt_id_req extensible
  wire up syscalls for statmount/listmount
  add listmount(2) syscall
  statmount: simplify string option retrieval
  statmount: simplify numeric option retrieval
  add statmount(2) syscall
  namespace: extract show_path() helper
  mounts: keep list of mounts in an rbtree
  add unique mount ID
2024-01-08 10:57:34 -08:00

310 lines
8.9 KiB
C

/* SPDX-License-Identifier: GPL-2.0-or-later */
/* fs/ internal definitions
*
* Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
struct super_block;
struct file_system_type;
struct iomap;
struct iomap_ops;
struct linux_binprm;
struct path;
struct mount;
struct shrink_control;
struct fs_context;
struct pipe_inode_info;
struct iov_iter;
struct mnt_idmap;
/*
* block/bdev.c
*/
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
#else
static inline void bdev_cache_init(void)
{
}
#endif /* CONFIG_BLOCK */
/*
* buffer.c
*/
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block, const struct iomap *iomap);
/*
* char_dev.c
*/
extern void __init chrdev_init(void);
/*
* fs_context.c
*/
extern const struct fs_context_operations legacy_fs_context_ops;
extern int parse_monolithic_mount_data(struct fs_context *, void *);
extern void vfs_clean_context(struct fs_context *fc);
extern int finish_clean_context(struct fs_context *fc);
/*
* namei.c
*/
extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
struct path *path, struct path *root);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct mnt_idmap *idmap, const struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
struct filename *newname, unsigned int flags);
int do_mkdirat(int dfd, struct filename *name, umode_t mode);
int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
int do_linkat(int olddfd, struct filename *old, int newdfd,
struct filename *new, int flags);
/*
* namespace.c
*/
extern struct vfsmount *lookup_mnt(const struct path *);
extern int finish_automount(struct vfsmount *, const struct path *);
extern int sb_prepare_remount_readonly(struct super_block *);
extern void __init mnt_init(void);
int mnt_get_write_access_file(struct file *file);
void mnt_put_write_access_file(struct file *file);
extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);
int path_mount(const char *dev_name, struct path *path,
const char *type_page, unsigned long flags, void *data_page);
int path_umount(struct path *path, int flags);
int show_path(struct seq_file *m, struct dentry *root);
/*
* fs_struct.c
*/
extern void chroot_fs_refs(const struct path *, const struct path *);
/*
* file_table.c
*/
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
static inline void file_put_write_access(struct file *file)
{
put_write_access(file->f_inode);
mnt_put_write_access(file->f_path.mnt);
if (unlikely(file->f_mode & FMODE_BACKING))
mnt_put_write_access(backing_file_user_path(file)->mnt);
}
static inline void put_file_access(struct file *file)
{
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_dec(file->f_inode);
} else if (file->f_mode & FMODE_WRITER) {
file_put_write_access(file);
}
}
/*
* super.c
*/
extern int reconfigure_super(struct fs_context *);
extern bool super_trylock_shared(struct super_block *sb);
struct super_block *user_get_super(dev_t, bool excl);
void put_super(struct super_block *sb);
extern bool mount_capable(struct fs_context *);
int sb_init_dio_done_wq(struct super_block *sb);
/*
* Prepare superblock for changing its read-only state (i.e., either remount
* read-write superblock read-only or vice versa). After this function returns
* mnt_is_readonly() will return true for any mount of the superblock if its
* caller is able to observe any changes done by the remount. This holds until
* sb_end_ro_state_change() is called.
*/
static inline void sb_start_ro_state_change(struct super_block *sb)
{
WRITE_ONCE(sb->s_readonly_remount, 1);
/*
* For RO->RW transition, the barrier pairs with the barrier in
* mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
* cleared, it will see s_readonly_remount set.
* For RW->RO transition, the barrier pairs with the barrier in
* mnt_get_write_access() before the mnt_is_readonly() check.
* The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
* already cleared, it will see s_readonly_remount set.
*/
smp_wmb();
}
/*
* Ends section changing read-only state of the superblock. After this function
* returns if mnt_is_readonly() returns false, the caller will be able to
* observe all the changes remount did to the superblock.
*/
static inline void sb_end_ro_state_change(struct super_block *sb)
{
/*
* This barrier provides release semantics that pairs with
* the smp_rmb() acquire semantics in mnt_is_readonly().
* This barrier pair ensure that when mnt_is_readonly() sees
* 0 for sb->s_readonly_remount, it will also see all the
* preceding flag changes that were made during the RO state
* change.
*/
smp_wmb();
WRITE_ONCE(sb->s_readonly_remount, 0);
}
/*
* open.c
*/
struct open_flags {
int open_flag;
umode_t mode;
int acc_mode;
int intent;
int lookup_flags;
};
extern struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op);
extern struct file *do_file_open_root(const struct path *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
int flag);
int chown_common(const struct path *path, uid_t user, gid_t group);
extern int vfs_open(const struct path *, struct file *);
/*
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry);
bool in_group_or_capable(struct mnt_idmap *idmap,
const struct inode *inode, vfsgid_t vfsgid);
void lock_two_inodes(struct inode *inode1, struct inode *inode2,
unsigned subclass1, unsigned subclass2);
/*
* fs-writeback.c
*/
extern long get_nr_dirty_inodes(void);
void invalidate_inodes(struct super_block *sb);
/*
* dcache.c
*/
extern int d_set_mounted(struct dentry *dentry);
extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
extern struct dentry *d_alloc_cursor(struct dentry *);
extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
extern char *simple_dname(struct dentry *, char *, int);
extern void dput_to_list(struct dentry *, struct list_head *);
extern void shrink_dentry_list(struct list_head *);
/*
* pipe.c
*/
extern const struct file_operations pipefifo_fops;
/*
* fs_pin.c
*/
extern void group_pin_kill(struct hlist_head *p);
extern void mnt_pin_kill(struct mount *m);
/*
* fs/nsfs.c
*/
extern const struct dentry_operations ns_dentry_operations;
/*
* fs/stat.c:
*/
int getname_statx_lookup_flags(int flags);
int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer);
/*
* fs/splice.c:
*/
long splice_file_to_pipe(struct file *in,
struct pipe_inode_info *opipe,
loff_t *offset,
size_t len, unsigned int flags);
/*
* fs/xattr.c:
*/
struct xattr_name {
char name[XATTR_NAME_MAX + 1];
};
struct xattr_ctx {
/* Value of attribute */
union {
const void __user *cvalue;
void __user *value;
};
void *kvalue;
size_t size;
/* Attribute name */
struct xattr_name *kname;
unsigned int flags;
};
ssize_t do_getxattr(struct mnt_idmap *idmap,
struct dentry *d,
struct xattr_ctx *ctx);
int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct xattr_ctx *ctx);
int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode);
#ifdef CONFIG_FS_POSIX_ACL
int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
const char *acl_name, const void *kvalue, size_t size);
ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
const char *acl_name, void *kvalue, size_t size);
#else
static inline int do_set_acl(struct mnt_idmap *idmap,
struct dentry *dentry, const char *acl_name,
const void *kvalue, size_t size)
{
return -EOPNOTSUPP;
}
static inline ssize_t do_get_acl(struct mnt_idmap *idmap,
struct dentry *dentry, const char *acl_name,
void *kvalue, size_t size)
{
return -EOPNOTSUPP;
}
#endif
ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
/*
* fs/attr.c
*/
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
void mnt_idmap_put(struct mnt_idmap *idmap);