From 9b1677fb5a0824b5f4b425c0ee950aaecf252029 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Fri, 15 Dec 2023 12:51:41 -0500
Subject: [PATCH 1/2] dmu: Allow buffer fills to fail

When ZFS overwrites a whole block, it does not bother to read the
old content from disk.  It is a good optimization, but if the buffer
fill fails due to a page fault or something else, the buffer ends up
corrupted, neither keeping the old content nor getting the new one.

On FreeBSD this is additionally complicated by page faults being
blocked by the VFS layer, which always returns EFAULT on an attempt
to write from an mmap()'ed but not yet cached address range.
Normally it is not a big problem, since after the original failure
the VFS will retry the write after reading the required data.  The
problem becomes worse in the specific case when somebody tries to
write into a file its own mmap()'ed content from the same location.
In that situation the only copy of the data gets corrupted on the
page fault and the following retries only cement the status quo.
Block cloning makes this issue easier to reproduce, since it does
not read the old data, unlike a traditional file copy, which may
work by chance.

This patch provides the fill status to dmu_buf_fill_done(), which in
case of error can destroy the corrupted buffer as if no write
happened.  One more complication in the case of block cloning is
that if an error is possible during the fill, dmu_buf_will_fill()
must read the data via a fall-back to dmu_buf_will_dirty().  This is
required so that on error the buffer can be restored to its state
after the cloning, not before it, which is what would happen if we
just called dbuf_undirty().

Reviewed-by: Brian Behlendorf
Reviewed-by: Rob Norris
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15665
---
 include/os/freebsd/spl/sys/uio.h |  2 +-
 include/os/linux/spl/sys/uio.h   |  2 +-
 include/sys/dbuf.h               |  4 ++--
 lib/libspl/include/sys/uio.h     |  2 +-
 module/os/freebsd/zfs/dmu_os.c   |  4 ++--
 module/zfs/dbuf.c                | 33 +++++++++++++++++++++++---------
 module/zfs/dmu.c                 | 21 +++++++++-----------
 module/zfs/dmu_recv.c            |  2 +-
 module/zfs/dsl_bookmark.c        |  2 +-
 9 files changed, 42 insertions(+), 30 deletions(-)
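For reference below the cut line (not part of the commit): a minimal userland
sketch of the self-overwrite scenario described above.  It mmap()s a file and
then writes the mapping back over the same range, so the source pages may
fault inside the write path; the file handling and error reporting are
illustrative only.

/* build: cc -o selfwrite selfwrite.c */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return (1);
	}
	int fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return (1);
	}
	struct stat st;
	if (fstat(fd, &st) != 0) {
		perror("fstat");
		return (1);
	}

	/* Map the file, but do not touch the pages yet. */
	char *map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return (1);
	}

	/*
	 * Write the file's own mapping back to the same offset.  The copy
	 * inside write(2) may page-fault on the source, which is where a
	 * failed buffer fill used to corrupt the only copy of the data.
	 */
	ssize_t done = pwrite(fd, map, st.st_size, 0);
	if (done < 0)
		perror("pwrite");
	else
		printf("wrote %zd bytes from the file's own mapping\n", done);

	(void) munmap(map, st.st_size);
	(void) close(fd);
	return (0);
}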
diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h
index b71f2f2e5625..b9d41903ea63 100644
--- a/include/os/freebsd/spl/sys/uio.h
+++ b/include/os/freebsd/spl/sys/uio.h
@@ -62,7 +62,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
 }
 
 static inline void
-zfs_uio_advance(zfs_uio_t *uio, size_t size)
+zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
 {
 	zfs_uio_resid(uio) -= size;
 	zfs_uio_offset(uio) += size;
diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h
index a4b600004c9f..5e6ea8d3c221 100644
--- a/include/os/linux/spl/sys/uio.h
+++ b/include/os/linux/spl/sys/uio.h
@@ -95,7 +95,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
 }
 
 static inline void
-zfs_uio_advance(zfs_uio_t *uio, size_t size)
+zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
 {
 	uio->uio_resid -= size;
 	uio->uio_loffset += size;
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 2ff0bc72b270..3808a04cba80 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -380,8 +380,8 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
 void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
-void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
-void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
+boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h
index e9e21819d4f8..665bfc42301b 100644
--- a/lib/libspl/include/sys/uio.h
+++ b/lib/libspl/include/sys/uio.h
@@ -90,7 +90,7 @@ zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len)
 }
 
 static inline void
-zfs_uio_advance(zfs_uio_t *uio, size_t size)
+zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
 {
 	uio->uio_resid -= size;
 	uio->uio_loffset += size;
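The zfs_uio_advance() changes above widen the size argument from size_t to
ssize_t so that a negative delta can rewind a uio.  A small standalone model
of that behavior follows (simplified fields, not the real zfs_uio_t; the
mini_uio names are illustrative), mirroring the rewind the dmu.c hunk further
below performs after a reverted fill.

#include <assert.h>
#include <stdio.h>
#include <sys/types.h>

/* Simplified stand-in for zfs_uio_t: just offset and residual count. */
typedef struct {
	off_t	uio_loffset;
	ssize_t	uio_resid;
} mini_uio_t;

/* Mirrors zfs_uio_advance(): with ssize_t, a negative size rewinds. */
static void
mini_uio_advance(mini_uio_t *uio, ssize_t size)
{
	uio->uio_resid -= size;
	uio->uio_loffset += size;
}

int
main(void)
{
	mini_uio_t uio = { .uio_loffset = 4096, .uio_resid = 8192 };
	off_t off = uio.uio_loffset;

	/* Partial progress made by a fill that later fails. */
	mini_uio_advance(&uio, 3000);

	/* Undo it with a negative delta, as dmu_write_uio_dnode() does. */
	mini_uio_advance(&uio, off - uio.uio_loffset);

	assert(uio.uio_loffset == 4096 && uio.uio_resid == 8192);
	(void) printf("rewound to offset %lld, resid %zd\n",
	    (long long)uio.uio_loffset, uio.uio_resid);
	return (0);
}

Compile and run it standalone; the assert confirms that offset and residual
return exactly to their starting values.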
diff --git a/module/os/freebsd/zfs/dmu_os.c b/module/os/freebsd/zfs/dmu_os.c
index ee6fb2dc657b..48ea37cbad59 100644
--- a/module/os/freebsd/zfs/dmu_os.c
+++ b/module/os/freebsd/zfs/dmu_os.c
@@ -107,7 +107,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
-			dmu_buf_will_fill(db, tx);
+			dmu_buf_will_fill(db, tx, B_FALSE);
 		else
 			dmu_buf_will_dirty(db, tx);
 
@@ -123,7 +123,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 		}
 
 		if (tocpy == db->db_size)
-			dmu_buf_fill_done(db, tx);
+			dmu_buf_fill_done(db, tx, B_FALSE);
 
 		offset += tocpy;
 		size -= tocpy;
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 03c97941d6d3..e9d5abca3324 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -2751,7 +2751,7 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 }
 
 void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
@@ -2769,8 +2769,14 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 		 * Block cloning: We will be completely overwriting a block
 		 * cloned in this transaction group, so let's undirty the
 		 * pending clone and mark the block as uncached. This will be
-		 * as if the clone was never done.
+		 * as if the clone was never done.  But if the fill can fail
+		 * we should have a way to return back to the cloned data.
 		 */
+		if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+			mutex_exit(&db->db_mtx);
+			dmu_buf_will_dirty(db_fake, tx);
+			return;
+		}
 		VERIFY(!dbuf_undirty(db, tx));
 		db->db_state = DB_UNCACHED;
 	}
@@ -2831,32 +2837,41 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
 	dl->dr_overridden_by.blk_birth = dr->dr_txg;
 }
 
-void
-dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
+boolean_t
+dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
 {
 	(void) tx;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
-	dbuf_states_t old_state;
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 
-	old_state = db->db_state;
-	db->db_state = DB_CACHED;
-	if (old_state == DB_FILL) {
+	if (db->db_state == DB_FILL) {
 		if (db->db_level == 0 && db->db_freed_in_flight) {
 			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 			/* we were freed while filling */
 			/* XXX dbuf_undirty? */
 			memset(db->db.db_data, 0, db->db.db_size);
 			db->db_freed_in_flight = FALSE;
+			db->db_state = DB_CACHED;
 			DTRACE_SET_STATE(db,
 			    "fill done handling freed in flight");
+			failed = B_FALSE;
+		} else if (failed) {
+			VERIFY(!dbuf_undirty(db, tx));
+			db->db_buf = NULL;
+			dbuf_clear_data(db);
+			DTRACE_SET_STATE(db, "fill failed");
 		} else {
+			db->db_state = DB_CACHED;
 			DTRACE_SET_STATE(db, "fill done");
 		}
 		cv_broadcast(&db->db_changed);
+	} else {
+		db->db_state = DB_CACHED;
+		failed = B_FALSE;
 	}
 	mutex_exit(&db->db_mtx);
+	return (failed);
 }
 
 void
@@ -3001,7 +3016,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 	DTRACE_SET_STATE(db, "filling assigned arcbuf");
 	mutex_exit(&db->db_mtx);
 	(void) dbuf_dirty(db, tx);
-	dmu_buf_fill_done(&db->db, tx);
+	dmu_buf_fill_done(&db->db, tx, B_FALSE);
 }
 
 void
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index f5a5d0fc437f..d82211e6d4c7 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1134,14 +1134,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
-			dmu_buf_will_fill(db, tx);
+			dmu_buf_will_fill(db, tx, B_FALSE);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
 
 		if (tocpy == db->db_size)
-			dmu_buf_fill_done(db, tx);
+			dmu_buf_fill_done(db, tx, B_FALSE);
 
 		offset += tocpy;
 		size -= tocpy;
@@ -1349,27 +1349,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
 
 		ASSERT(size > 0);
 
-		bufoff = zfs_uio_offset(uio) - db->db_offset;
+		offset_t off = zfs_uio_offset(uio);
+		bufoff = off - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, size);
 
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
-			dmu_buf_will_fill(db, tx);
+			dmu_buf_will_fill(db, tx, B_TRUE);
 		else
 			dmu_buf_will_dirty(db, tx);
 
-		/*
-		 * XXX zfs_uiomove could block forever (eg.nfs-backed
-		 * pages).  There needs to be a uiolockdown() function
-		 * to lock the pages in memory, so that zfs_uiomove won't
-		 * block.
-		 */
 		err = zfs_uio_fault_move((char *)db->db_data + bufoff,
 		    tocpy, UIO_WRITE, uio);
 
-		if (tocpy == db->db_size)
-			dmu_buf_fill_done(db, tx);
+		if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) {
+			/* The fill was reverted.  Undo any uio progress. */
+			zfs_uio_advance(uio, off - zfs_uio_offset(uio));
+		}
 
 		if (err)
 			break;
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 05ca91717c2f..54aa60259ea1 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -2532,7 +2532,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
 	 * size of the provided arc_buf_t.
 	 */
 	if (db_spill->db_size != drrs->drr_length) {
-		dmu_buf_will_fill(db_spill, tx);
+		dmu_buf_will_fill(db_spill, tx, B_FALSE);
 		VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length,
 		    tx));
 	}
diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c
index 03d9420dbdb9..4faefecbadbb 100644
--- a/module/zfs/dsl_bookmark.c
+++ b/module/zfs/dsl_bookmark.c
@@ -490,7 +490,7 @@ dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
 		dmu_buf_t *db;
 		VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus,
 		    DB_RF_MUST_SUCCEED, FTAG, &db));
-		dmu_buf_will_fill(db, tx);
+		dmu_buf_will_fill(db, tx, B_FALSE);
 		VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen,
 		    SPA_MINBLOCKSIZE), tx));
 		local_rl->rl_phys = db->db_data;
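Taken together, the dbuf.c and dmu.c hunks above define the new caller
contract: request a fallible fill with canfail, then check the return value
of dmu_buf_fill_done() to learn whether the fill was reverted.  Below is a
condensed in-kernel sketch of that contract, not code from the patch;
copy_may_fault() is a hypothetical stand-in for zfs_uio_fault_move() or any
whole-block copy that can fail.

/* In-kernel sketch; dmu_buf_will_fill()/dmu_buf_fill_done() per dbuf.h above. */
#include <sys/dbuf.h>

/* Hypothetical copy routine that may fail, e.g. on a page fault. */
extern int copy_may_fault(void *dst, uint64_t size);

static int
overwrite_whole_block(dmu_buf_t *db, dmu_tx_t *tx)
{
	int err;

	/*
	 * canfail = B_TRUE: if the block was cloned in this txg, the
	 * dbuf layer falls back to dmu_buf_will_dirty() so the cloned
	 * data can be restored if the copy below fails.
	 */
	dmu_buf_will_fill(db, tx, B_TRUE);

	err = copy_may_fault(db->db_data, db->db_size);

	/*
	 * dmu_buf_fill_done() returns B_TRUE only if the fill failed and
	 * was reverted; the dbuf then looks as if this write never
	 * happened, so the caller undoes its own progress (e.g. rewinds
	 * the uio) and reports the error for the upper layer to retry.
	 */
	if (dmu_buf_fill_done(db, tx, err != 0)) {
		/* undo caller-side progress here */
	}

	return (err);
}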
From dbda45160ffa43e5ecf0498a609230f1afee7b3f Mon Sep 17 00:00:00 2001
From: Umer Saleem
Date: Sat, 16 Dec 2023 03:18:27 +0500
Subject: [PATCH 2/2] Test LWB buffer overflow for block cloning

PR#15634 removes the 128K into 2x68K LWB split optimization, since
it was found to cause an LWB buffer overflow while trying to write a
128KB TX_CLONE_RANGE record with 1022 block pointers into a 68KB
buffer, with a multiple-VDEV ZIL.

This commit adds a test for this particular scenario by writing a
maximum size TX_CLONE_RANGE record with 1022 block pointers into a
68KB buffer, with two SLOG devices.

Reviewed-by: Brian Behlendorf
Reviewed-by: Alexander Motin
Reviewed-by: Ameer Hamza
Signed-off-by: Umer Saleem
Closes #15672
---
 tests/runfiles/linux.run                   |  3 +-
 tests/test-runner/bin/zts-report.py.in     |  2 +
 tests/zfs-tests/tests/Makefile.am          |  1 +
 .../block_cloning_lwb_buffer_overflow.ksh  | 89 +++++++++++++++++++
 4 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh
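For reference (not part of the commit), the arithmetic behind the 1022 figure
used by the test below, taking the 130816-byte payload limit from the test's
own comment and the 128-byte on-disk block pointer size; a single 128KB record
built from these pointers cannot fit into one 68KB LWB, which is why the split
optimization had to go.

#include <stdio.h>

int
main(void)
{
	/* From the test comment: payload bytes one ZIL clone record can hold. */
	const long max_clone_payload = 130816;
	/* On-disk block pointer size, sizeof (blkptr_t). */
	const long blkptr_size = 128;
	/* recordsize the test sets on the dataset. */
	const long recordsize = 32 * 1024;

	long nbps = max_clone_payload / blkptr_size;	/* 1022 */
	long file_size = nbps * recordsize;		/* what dd writes */

	(void) printf("block pointers per record: %ld\n", nbps);
	(void) printf("file size cloned by the test: %ld bytes\n", file_size);
	return (0);
}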
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 17ba23352422..c7c17f271762 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -44,7 +44,8 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial',
     'block_cloning_copyfilerange_cross_dataset',
     'block_cloning_cross_enc_dataset',
     'block_cloning_copyfilerange_fallback_same_txg',
-    'block_cloning_replay', 'block_cloning_replay_encrypted']
+    'block_cloning_replay', 'block_cloning_replay_encrypted',
+    'block_cloning_lwb_buffer_overflow']
 tags = ['functional', 'block_cloning']
 
 [tests/functional/chattr:Linux]
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 3b5eeacb6bad..708b7be91767 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -305,6 +305,8 @@ elif sys.platform.startswith('linux'):
         ['SKIP', cfr_reason],
     'block_cloning/block_cloning_replay_encrypted':
         ['SKIP', cfr_reason],
+    'block_cloning/block_cloning_lwb_buffer_overflow':
+        ['SKIP', cfr_reason],
     'block_cloning/block_cloning_copyfilerange_cross_dataset':
         ['SKIP', cfr_cross_reason],
     'block_cloning/block_cloning_copyfilerange_fallback_same_txg':
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 3c9f09382424..f2e28b92f1a2 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -454,6 +454,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/block_cloning/block_cloning_cross_enc_dataset.ksh \
 	functional/block_cloning/block_cloning_replay.ksh \
 	functional/block_cloning/block_cloning_replay_encrypted.ksh \
+	functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \
 	functional/bootfs/bootfs_001_pos.ksh \
 	functional/bootfs/bootfs_002_neg.ksh \
 	functional/bootfs/bootfs_003_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh
new file mode 100755
index 000000000000..0ae76b7e54a5
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh
@@ -0,0 +1,89 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2023 by iXsystems, Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
+
+#
+# DESCRIPTION:
+#	Test for LWB buffer overflow with a multiple-VDEV ZIL when a 128KB
+#	block write is split into two 68KB ones, trying to write a maximum
+#	size 128KB TX_CLONE_RANGE record with 1022 block pointers into a
+#	68KB buffer.
+#
+# STRATEGY:
+#	1. Create a pool with a multiple-VDEV ZIL
+#	2. Write a maximum size TX_CLONE_RANGE record with 1022 block
+#	   pointers into a 68KB buffer
+#	3. Sync TXG
+#	4. Clone the file
+#	5. Synchronize cached writes
+#

+verify_runnable "global"
+
+if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
+	log_unsupported "copy_file_range not available before Linux 4.5"
+fi
+
+VDIR=$TEST_BASE_DIR/disk-bclone
+VDEV="$VDIR/a $VDIR/b $VDIR/c"
+LDEV="$VDIR/e $VDIR/f"
+
+function cleanup
+{
+	datasetexists $TESTPOOL && destroy_pool $TESTPOOL
+	rm -rf $VDIR
+}
+
+log_onexit cleanup
+
+log_assert "Test for LWB buffer overflow with a multiple-VDEV ZIL"
+
+log_must rm -rf $VDIR
+log_must mkdir -p $VDIR
+log_must truncate -s $MINVDEVSIZE $VDEV $LDEV
+
+log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV \
+	log mirror $LDEV
+log_must zfs create -o recordsize=32K $TESTPOOL/$TESTFS
+# Each ZIL log entry can fit 130816 bytes for a block cloning operation,
+# so it can store 1022 block pointers.  When LWB optimization is enabled,
+# an assert is hit when a 128KB block write is split into two 68KB ones
+# for 2 SLOG devices.
+log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file1 bs=32K count=1022 \
+	conv=fsync
+sync_pool $TESTPOOL
+log_must clonefile -c /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/file2
+log_must sync
+
+sync_pool $TESTPOOL
+log_must have_same_content /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/file2
+typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file1 $TESTPOOL/$TESTFS file2)
+log_must [ "$blocks" = "$(seq -s " " 0 1021)" ]
+
+log_pass "LWB buffer overflow is not triggered with a multiple-VDEV ZIL"
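For completeness (not part of the commit): the clone step above is gated on
Linux 4.5 because it relies on copy_file_range(2), which is what the test
suite's clonefile -c helper appears to exercise here.  A minimal standalone
equivalent of that step, with illustrative file names and error handling:

/* build: cc -o clone clone.c; usage: ./clone src dst */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return (1);
	}

	int in = open(argv[1], O_RDONLY);
	if (in < 0) {
		perror("open src");
		return (1);
	}
	int out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (out < 0) {
		perror("open dst");
		return (1);
	}

	off_t len = lseek(in, 0, SEEK_END);
	(void) lseek(in, 0, SEEK_SET);

	/* Let the kernel copy/clone; on ZFS this may become block cloning. */
	while (len > 0) {
		ssize_t done = copy_file_range(in, NULL, out, NULL, len, 0);
		if (done < 0) {
			perror("copy_file_range");
			return (1);
		}
		if (done == 0)
			break;
		len -= done;
	}

	(void) close(in);
	(void) close(out);
	return (0);
}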