linux/mm/fadvise.c
Matthew Wilcox (Oracle) cee9a0c4e8 mm: move readahead prototypes from mm.h
Patch series "Change readahead API", v11.

This series adds a readahead address_space operation to replace the
readpages operation.  The key difference is that pages are added to the
page cache as they are allocated (and then looked up by the filesystem)
instead of passing them on a list to the readpages operation and having
the filesystem add them to the page cache.  It's a net reduction in code
for each implementation, more efficient than walking a list, and solves
the direct-write vs buffered-read problem reported by yu kuai at
http://lkml.kernel.org/r/20200116063601.39201-1-yukuai3@huawei.com

The only unconverted filesystems are those which use fscache.  Their
conversion is pending Dave Howells' rewrite which will make the
conversion substantially easier.  This should be completed by the end of
the year.

I want to thank the reviewers/testers; Dave Chinner, John Hubbard, Eric
Biggers, Johannes Thumshirn, Dave Sterba, Zi Yan, Christoph Hellwig and
Miklos Szeredi have done a marvellous job of providing constructive
criticism.

These patches pass an xfstests run on ext4, xfs & btrfs with no
regressions that I can tell (some of the tests seem a little flaky
before and remain flaky afterwards).

This patch (of 25):

The readahead code is part of the page cache so should be found in the
pagemap.h file.  force_page_cache_readahead is only used within mm, so
move it to mm/internal.h instead.  Remove the parameter names where they
add no value, and rename the ones which were actively misleading.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-1-willy@infradead.org
Link: http://lkml.kernel.org/r/20200414150233.24495-2-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 10:59:06 -07:00

223 lines
5.4 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* mm/fadvise.c
*
* Copyright (C) 2002, Linus Torvalds
*
* 11Jan2003 Andrew Morton
* Initial version.
*/
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <asm/unistd.h>
#include "internal.h"
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
*/
int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
struct inode *inode;
struct address_space *mapping;
struct backing_dev_info *bdi;
loff_t endbyte; /* inclusive */
pgoff_t start_index;
pgoff_t end_index;
unsigned long nrpages;
inode = file_inode(file);
if (S_ISFIFO(inode->i_mode))
return -ESPIPE;
mapping = file->f_mapping;
if (!mapping || len < 0)
return -EINVAL;
bdi = inode_to_bdi(mapping->host);
if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) {
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
case POSIX_FADV_SEQUENTIAL:
case POSIX_FADV_WILLNEED:
case POSIX_FADV_NOREUSE:
case POSIX_FADV_DONTNEED:
/* no bad return value, but ignore advice */
break;
default:
return -EINVAL;
}
return 0;
}
/*
* Careful about overflows. Len == 0 means "as much as possible". Use
* unsigned math because signed overflows are undefined and UBSan
* complains.
*/
endbyte = (u64)offset + (u64)len;
if (!len || endbyte < len)
endbyte = -1;
else
endbyte--; /* inclusive */
switch (advice) {
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
spin_lock(&file->f_lock);
file->f_mode &= ~FMODE_RANDOM;
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_RANDOM:
spin_lock(&file->f_lock);
file->f_mode |= FMODE_RANDOM;
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_SEQUENTIAL:
file->f_ra.ra_pages = bdi->ra_pages * 2;
spin_lock(&file->f_lock);
file->f_mode &= ~FMODE_RANDOM;
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_WILLNEED:
/* First and last PARTIAL page! */
start_index = offset >> PAGE_SHIFT;
end_index = endbyte >> PAGE_SHIFT;
/* Careful about overflow on the "+1" */
nrpages = end_index - start_index + 1;
if (!nrpages)
nrpages = ~0UL;
/*
* Ignore return value because fadvise() shall return
* success even if filesystem can't retrieve a hint,
*/
force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
if (!inode_write_congested(mapping->host))
__filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
/*
* First and last FULL page! Partial pages are deliberately
* preserved on the expectation that it is better to preserve
* needed memory than to discard unneeded memory.
*/
start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
end_index = (endbyte >> PAGE_SHIFT);
/*
* The page at end_index will be inclusively discarded according
* by invalidate_mapping_pages(), so subtracting 1 from
* end_index means we will skip the last page. But if endbyte
* is page aligned or is at the end of file, we should not skip
* that page - discarding the last page is safe enough.
*/
if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK &&
endbyte != inode->i_size - 1) {
/* First page is tricky as 0 - 1 = -1, but pgoff_t
* is unsigned, so the end_index >= start_index
* check below would be true and we'll discard the whole
* file cache which is not what was asked.
*/
if (end_index == 0)
break;
end_index--;
}
if (end_index >= start_index) {
unsigned long count;
/*
* It's common to FADV_DONTNEED right after
* the read or write that instantiates the
* pages, in which case there will be some
* sitting on the local LRU cache. Try to
* avoid the expensive remote drain and the
* second cache tree walk below by flushing
* them out right away.
*/
lru_add_drain();
count = invalidate_mapping_pages(mapping,
start_index, end_index);
/*
* If fewer pages were invalidated than expected then
* it is possible that some of the pages were on
* a per-cpu pagevec for a remote CPU. Drain all
* pagevecs and try again.
*/
if (count < (end_index - start_index + 1)) {
lru_add_drain_all();
invalidate_mapping_pages(mapping, start_index,
end_index);
}
}
break;
default:
return -EINVAL;
}
return 0;
}
EXPORT_SYMBOL(generic_fadvise);
int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
if (file->f_op->fadvise)
return file->f_op->fadvise(file, offset, len, advice);
return generic_fadvise(file, offset, len, advice);
}
EXPORT_SYMBOL(vfs_fadvise);
#ifdef CONFIG_ADVISE_SYSCALLS
int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
struct fd f = fdget(fd);
int ret;
if (!f.file)
return -EBADF;
ret = vfs_fadvise(f.file, offset, len, advice);
fdput(f);
return ret;
}
SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
{
return ksys_fadvise64_64(fd, offset, len, advice);
}
#ifdef __ARCH_WANT_SYS_FADVISE64
SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
{
return ksys_fadvise64_64(fd, offset, len, advice);
}
#endif
#endif