for-6.10-rc2-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmZggXMACgkQxWXV+ddt
 WDupkA/9Foo2OsWR6wIQyBqzmHnhgzBwJ67q0F6MO2/iFfMRW/YIJH3Fk+0+PP40
 BDK4xiz1DIl/qJvoSv4bpPNvy/lAovtVB/AV8rH+JaJNHP/fTjkqA3Ad6ZtZN45J
 KoHE4SoX4NT1v+zwJ2irrH1W2mPh8tNTYvZINPcLC/nX2UzYoNjiIFLRCMSe003M
 ybNjvv6VUHPk+9JAWsVt5pjDLu5E1EmXakXv5mvGaIVr0ljNUPCwhFip20YMpVfo
 17t6MezmeqwGbrJgMpJyPOSsghaA68lzuzVVyAFFoxqlGLZ5rgtXTmK4O4NsyZfr
 EMkwNR1IDt7fVXUkHy4X/8f9V8Wwmmwp8bSY4rTTgA4hg3w0w4FCX+uNOWHagkaS
 8vWWTJBSvJKJwLUfWhKVHIaiUEkFEhmnUQPjqlfSxc+mQgxJcK1djgdVkVxSudrp
 l0xdDG0WTWiO0zniIXbIlZ7tCeUgL1kcovZmDIA6em+HSipryvSFdYT+h7VKgzzv
 XTJvdXKMSiqMvXoT2BRYkmWVeuUBhJ1EptkGidZBgTZ7EFfuGnhBCRgq9YSaWnak
 2SBvgjxKQzyxVpqWllOsksRg2/fSl9vdlGK3KjyGW1pAwrZD/zbmG/ZqH2MVOfjt
 LdswuwKd25pYpamYZqrCyJtIZlTSUrWpasaX1P28gs0uRCuFaiY=
 =q3Ic
 -----END PGP SIGNATURE-----

Merge tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fix from David Sterba:
 "A fix for fast fsync that needs to handle errors during writes after
  some COW failure so it does not lead to an inconsistent state"

* tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: ensure fast fsync waits for ordered extents after a write failure
This commit is contained in:
Linus Torvalds 2024-06-05 11:28:25 -07:00
commit 19ca0d8a43
3 changed files with 57 additions and 0 deletions

View File

@ -89,6 +89,16 @@ enum {
BTRFS_INODE_FREE_SPACE_INODE,
/* Set when there are no capabilities in XATTRs for the inode. */
BTRFS_INODE_NO_CAP_XATTR,
/*
* Set if an error happened when doing a COW write before submitting a
* bio or during writeback. Used for both buffered writes and direct IO
* writes. This is to signal a fast fsync that it has to wait for
* ordered extents to complete and therefore not log extent maps that
* point to unwritten extents (when an ordered extent completes and it
* has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its
* range).
*/
BTRFS_INODE_COW_WRITE_ERROR,
};
/* in memory btrfs inode */

View File

@ -1885,6 +1885,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags);
} else {
/*
* Get our ordered extents as soon as possible to avoid doing
@ -1894,6 +1895,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
&ctx.ordered_extents);
ret = filemap_fdatawait_range(inode->i_mapping, start, end);
if (ret)
goto out_release_extents;
/*
* Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
* starting and waiting for writeback, because for buffered IO
* it may have been set during the end IO callback
* (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
* case an error happened and we need to wait for ordered
* extents to complete so that any extent maps that point to
* unwritten locations are dropped and we don't log them.
*/
if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
&BTRFS_I(inode)->runtime_flags))
ret = btrfs_wait_ordered_range(inode, start, len);
}
if (ret)

View File

@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
/*
* If this is a COW write it means we created new extent maps for the
* range and they point to unwritten locations if we got an error either
* before submitting a bio or during IO.
*
* We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
* are queuing its completion below. During completion, at
* btrfs_finish_one_ordered(), we will drop the extent maps for the
* unwritten extents.
*
* However, because completion runs in a work queue, we can end up having
* a fast fsync running before that. In the case of direct IO, once we
* unlock the inode the fsync might start, and we queue the completion
* before unlocking the inode. In the case of buffered IO when writeback
* finishes (end_bbio_data_write()) we queue the completion, so if the
* writeback was triggered by a fast fsync, the fsync might start
* logging before ordered extent completion runs in the work queue.
*
* The fast fsync will log file extent items based on the extent maps it
* finds, so if by the time it collects extent maps the ordered extent
* completion didn't happen yet, it will log file extent items that
* point to unwritten extents, resulting in a corruption if a crash
* happens and the log tree is replayed. Note that a fast fsync does not
* wait for completion of ordered extents in order to reduce latency.
*
* Set a flag in the inode so that the next fast fsync will wait for
* ordered extents to complete before starting to log.
*/
if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
if (ret)
btrfs_queue_ordered_fn(ordered);
return ret;