linux/fs/nilfs2/file.c
Ryusuke Konishi 38296afe3c nilfs2: fix hang in nilfs_lookup_dirty_data_buffers()
Syzbot reported a hang issue in migrate_pages_batch() called by mbind()
and nilfs_lookup_dirty_data_buffers() called in the log writer of nilfs2.

While migrate_pages_batch() locks a folio and waits for the writeback to
complete, the log writer thread that should bring the writeback to
completion picks up the folio being written back in
nilfs_lookup_dirty_data_buffers(), which it calls for subsequent log
creation, and tries to lock the folio, thus causing a deadlock.

In the first place, it is unexpected that folios/pages in the middle of
writeback will be updated and become dirty.  Nilfs2 adds a checksum to
verify the validity of the log being written and uses it for recovery at
mount, so data changes during writeback are suppressed.  Since this is
broken, an unclean shutdown could potentially cause recovery to fail.

Investigation revealed that the root cause is that the wait for writeback
completion in nilfs_page_mkwrite() is conditional, and if the backing
device does not require stable writes, data may be modified without
waiting.

Fix these issues by making nilfs_page_mkwrite() wait for writeback to
finish regardless of the stable write requirement of the backing device.

Link: https://lkml.kernel.org/r/20240131145657.4209-1-konishi.ryusuke@gmail.com
Fixes: 1d1d1a7672 ("mm: only enforce stable page writes if the backing device requires it")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Reported-by: syzbot+ee2ae68da3b22d04cd8d@syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000047d819061004ad6c@google.com
Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-07 21:20:36 -08:00

164 lines
4 KiB
C

// SPDX-License-Identifier: GPL-2.0+
/*
* NILFS regular file handling primitives including fsync().
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
* Written by Amagai Yoshiji and Ryusuke Konishi.
*/
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/writeback.h>
#include "nilfs.h"
#include "segment.h"
/*
 * nilfs_sync_file - fsync handler for regular files
 * @file: file being synchronized
 * @start: start offset of the range, forwarded to the data-sync path
 * @end: end offset of the range, forwarded to the data-sync path
 * @datasync: non-zero to write out only this inode's data in the given range
 *
 * If the inode is dirty, a segment is constructed first: a data-sync
 * segment limited to [@start, @end] of this inode when @datasync is set,
 * or a regular segment for the whole super block otherwise.  When that
 * step succeeds (or was not needed), the device is flushed.
 *
 * Return: 0 on success, or the error code reported by segment construction
 * or by the device flush.
 */
int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct the_nilfs *nilfs;
	int err;

	if (nilfs_inode_dirty(inode)) {
		err = datasync ?
			nilfs_construct_dsync_segment(inode->i_sb, inode,
						      start, end) :
			nilfs_construct_segment(inode->i_sb);
		if (err)
			return err;	/* skip the flush on failure */
	}

	nilfs = inode->i_sb->s_fs_info;
	return nilfs_flush_device(nilfs);
}
/*
 * nilfs_page_mkwrite - handle a write fault on a shared file mapping
 * @vmf: fault descriptor; @vmf->vma gives the mapping and @vmf->page the
 *       page about to become writable
 *
 * Three outcomes are possible:
 *  - the folio no longer belongs to this inode, lies at or beyond i_size,
 *    or is not uptodate: the folio is unlocked and -EFAULT is reported;
 *  - every block backing the folio is already mapped: nothing is allocated
 *    and the function only waits for writeback;
 *  - otherwise the folio is unlocked and block_page_mkwrite() is run with
 *    nilfs_get_block() inside a nilfs transaction, after which the file is
 *    marked dirty and the function waits for writeback.
 *
 * On the already-mapped path the folio lock taken here is not released
 * before returning.
 * NOTE(review): presumably the fault core expects a locked folio on
 * success, and block_page_mkwrite() returns with the folio locked too -
 * confirm against vmf_fs_error() and block_page_mkwrite().
 *
 * Return: VM_FAULT_SIGBUS if nilfs_near_disk_full() reports true, otherwise
 * the internal error code (0 on success) translated by vmf_fs_error().
 */
static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *folio = page_folio(vmf->page);
struct inode *inode = file_inode(vma->vm_file);
struct nilfs_transaction_info ti;
struct buffer_head *bh, *head;
int ret = 0;
/*
 * Checked before sb_start_pagefault(), so this early return needs no
 * matching sb_end_pagefault().
 */
if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
return VM_FAULT_SIGBUS; /* -ENOSPC */
sb_start_pagefault(inode->i_sb);
folio_lock(folio);
/*
 * Re-validate under the folio lock: the folio must still belong to this
 * inode's mapping, start below i_size and be uptodate.
 */
if (folio->mapping != inode->i_mapping ||
folio_pos(folio) >= i_size_read(inode) ||
!folio_test_uptodate(folio)) {
folio_unlock(folio);
ret = -EFAULT; /* make the VM retry the fault */
goto out;
}
/*
 * Check whether the folio is already mapped to disk (no holes); if so,
 * no block allocation is needed.
 */
if (folio_test_mappedtodisk(folio))
goto mapped;
/*
 * The mappedtodisk flag is not set: walk the folio's circular buffer
 * list (linked through b_this_page) and stop at the first unmapped
 * buffer.  If none is found, set the flag and take the mapped path.
 */
head = folio_buffers(folio);
if (head) {
int fully_mapped = 1;
bh = head;
do {
if (!buffer_mapped(bh)) {
fully_mapped = 0;
break;
}
} while (bh = bh->b_this_page, bh != head);
if (fully_mapped) {
folio_set_mappedtodisk(folio);
goto mapped;
}
}
/*
 * The folio lock is dropped before the transaction is started.
 * NOTE(review): presumably to avoid holding the folio lock across
 * nilfs_transaction_begin() - confirm the intended lock ordering.
 */
folio_unlock(folio);
/*
 * Fill hole blocks: allocate the missing blocks inside a transaction.
 */
ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
/* never returns -ENOMEM, but may return -ENOSPC */
if (unlikely(ret))
goto out;
file_update_time(vma->vm_file);
ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
if (ret) {
/* Abort the transaction and report the error without waiting. */
nilfs_transaction_abort(inode->i_sb);
goto out;
}
/*
 * 1 << (PAGE_SHIFT - i_blkbits) is the number of filesystem blocks in
 * one page; it is passed as the second argument (presumably the count
 * of blocks being dirtied - confirm against nilfs_set_file_dirty()).
 */
nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits));
nilfs_transaction_commit(inode->i_sb);
mapped:
/*
 * Since checksumming including data blocks is performed to determine
 * the validity of the log to be written and used for recovery, it is
 * necessary to wait for writeback to finish here, regardless of the
 * stable write requirement of the backing device.
 */
folio_wait_writeback(folio);
out:
/* Pairs with sb_start_pagefault() above; reached by every later exit. */
sb_end_pagefault(inode->i_sb);
return vmf_fs_error(ret);
}
/*
 * VM operations installed on mappings of regular files by nilfs_file_mmap().
 * Write faults go through nilfs_page_mkwrite(); the remaining hooks use the
 * filemap_* helpers.
 */
static const struct vm_operations_struct nilfs_file_vm_ops = {
	.page_mkwrite	= nilfs_page_mkwrite,
	.map_pages	= filemap_map_pages,
	.fault		= filemap_fault,
};
/*
 * nilfs_file_mmap - mmap handler for regular files
 * @file: file being mapped
 * @vma: the new mapping
 *
 * Installs nilfs_file_vm_ops on @vma and marks @file as accessed.
 *
 * Return: always 0.
 */
static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &nilfs_file_vm_ops;
	file_accessed(file);

	return 0;
}
/*
 * File operations for regular files.
 *
 * Only the hooks listed below are set; every other member stays NULL
 * because the current defaults are fine for nilfs.
 */
const struct file_operations nilfs_file_operations = {
	/* open and positioning */
	.open		= generic_file_open,
	.llseek		= generic_file_llseek,

	/* data transfer */
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,

	/* memory mapping and synchronization */
	.mmap		= nilfs_file_mmap,
	.fsync		= nilfs_sync_file,

	/* ioctl entry points */
	.unlocked_ioctl	= nilfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= nilfs_compat_ioctl,
#endif	/* CONFIG_COMPAT */

	/* .release	= nilfs_release_file, */
};
/* Inode operations for regular files. */
const struct inode_operations nilfs_file_inode_operations = {
	.permission	= nilfs_permission,
	.setattr	= nilfs_setattr,
	.fileattr_get	= nilfs_fileattr_get,
	.fileattr_set	= nilfs_fileattr_set,
	.fiemap		= nilfs_fiemap,
};
/* end of file */