Refactor dbuf_read() for safer decryption

In dbuf_read_verify_dnode_crypt():
 - We don't need original dbuf locked there. Instead take a lock
on a dnode dbuf, that is actually manipulated.
 - Block decryption for a dnode dbuf if it is currently being
written.  ARC hash lock does not protect anonymous buffers, so
arc_untransform() is unsafe when used on buffers being written,
that may happen in case of encrypted dnode buffers, since they
are not copied by dbuf_dirty()/dbuf_hold_copy().

In dbuf_read():
 - If the buffer is in flight, recheck its compression/encryption
status after it is cached, since it may need arc_untransform().

Tested-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16104
This commit is contained in:
Alexander Motin 2024-04-22 14:41:03 -04:00 committed by GitHub
parent c346068e5e
commit 4036b8d027
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -161,13 +161,13 @@ struct {
} dbuf_sums;
#define DBUF_STAT_INCR(stat, val) \
wmsum_add(&dbuf_sums.stat, val);
wmsum_add(&dbuf_sums.stat, val)
#define DBUF_STAT_DECR(stat, val) \
DBUF_STAT_INCR(stat, -(val));
DBUF_STAT_INCR(stat, -(val))
#define DBUF_STAT_BUMP(stat) \
DBUF_STAT_INCR(stat, 1);
DBUF_STAT_INCR(stat, 1)
#define DBUF_STAT_BUMPDOWN(stat) \
DBUF_STAT_INCR(stat, -1);
DBUF_STAT_INCR(stat, -1)
#define DBUF_STAT_MAX(stat, v) { \
uint64_t _m; \
while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
@ -177,7 +177,6 @@ struct {
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
/*
* Global data structures and functions for the dbuf cache.
@ -1418,13 +1417,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
* a decrypted block. Otherwise success.
*/
static int
dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
{
int bonuslen, max_bonuslen, err;
err = dbuf_read_verify_dnode_crypt(db, flags);
if (err)
return (err);
int bonuslen, max_bonuslen;
bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
@ -1509,32 +1504,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
*/
static int
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
{
int err = 0;
objset_t *os = db->db_objset;
arc_buf_t *dnode_abuf;
dnode_t *dn;
dmu_buf_impl_t *dndb;
arc_buf_t *dnbuf;
zbookmark_phys_t zb;
ASSERT(MUTEX_HELD(&db->db_mtx));
int err;
if ((flags & DB_RF_NO_DECRYPT) != 0 ||
!os->os_encrypted || os->os_raw_receive)
!os->os_encrypted || os->os_raw_receive ||
(dndb = dn->dn_dbuf) == NULL)
return (0);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
DB_DNODE_EXIT(db);
dnbuf = dndb->db_buf;
if (!arc_is_encrypted(dnbuf))
return (0);
}
mutex_enter(&dndb->db_mtx);
/*
* Since dnode buffer is modified by sync process, there can be only
* one copy of it. It means we can not modify (decrypt) it while it
* is being written. I don't see how this may happen now, since
* encrypted dnode writes by receive should be completed before any
* plain-text reads due to txg wait, but better be safe than sorry.
*/
while (1) {
if (!arc_is_encrypted(dnbuf)) {
mutex_exit(&dndb->db_mtx);
return (0);
}
dbuf_dirty_record_t *dr = dndb->db_data_pending;
if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
break;
cv_wait(&dndb->db_changed, &dndb->db_mtx);
};
SET_BOOKMARK(&zb, dmu_objset_id(os),
DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
/*
* An error code of EACCES tells us that the key is still not
@ -1547,7 +1556,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
!DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
err = 0;
DB_DNODE_EXIT(db);
mutex_exit(&dndb->db_mtx);
return (err);
}
@ -1573,7 +1582,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
RW_LOCK_HELD(&db->db_parent->db_rwlock));
if (db->db_blkid == DMU_BONUS_BLKID) {
err = dbuf_read_bonus(db, dn, flags);
err = dbuf_read_bonus(db, dn);
goto early_unlock;
}
@ -1635,10 +1644,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
goto early_unlock;
}
err = dbuf_read_verify_dnode_crypt(db, flags);
if (err != 0)
goto early_unlock;
db->db_state = DB_READ;
DTRACE_SET_STATE(db, "read issued");
mutex_exit(&db->db_mtx);
@ -1754,19 +1759,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
int
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
{
int err = 0;
boolean_t prefetch;
dnode_t *dn;
boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
int err;
/*
* We don't have to hold the mutex to check db_state because it
* can't be freed while we have a hold on the buffer.
*/
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
/*
* Ensure that this block's dnode has been decrypted if the caller
* has requested decrypted data.
*/
err = dbuf_read_verify_dnode_crypt(db, dn, flags);
if (err != 0)
goto done;
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
(flags & DB_RF_NOPREFETCH) == 0;
@ -1775,13 +1784,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
db->db_partial_read = B_TRUE;
else if (!(flags & DB_RF_PARTIAL_MORE))
db->db_partial_read = B_FALSE;
if (db->db_state == DB_CACHED) {
/*
* Ensure that this block's dnode has been decrypted if
* the caller has requested decrypted data.
*/
err = dbuf_read_verify_dnode_crypt(db, flags);
miss = (db->db_state != DB_CACHED);
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
/*
* Another reader came in while the dbuf was in flight between
* UNCACHED and CACHED. Either a writer will finish filling
* the buffer, sending the dbuf to CACHED, or the first reader's
* request will reach the read_done callback and send the dbuf
* to CACHED. Otherwise, a failure occurred and the dbuf will
* be sent to UNCACHED.
*/
if (flags & DB_RF_NEVERWAIT) {
mutex_exit(&db->db_mtx);
DB_DNODE_EXIT(db);
goto done;
}
do {
ASSERT(db->db_state == DB_READ ||
(flags & DB_RF_HAVESTRUCT) == 0);
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
zio_t *, pio);
cv_wait(&db->db_changed, &db->db_mtx);
} while (db->db_state == DB_READ || db->db_state == DB_FILL);
if (db->db_state == DB_UNCACHED) {
err = SET_ERROR(EIO);
mutex_exit(&db->db_mtx);
DB_DNODE_EXIT(db);
goto done;
}
}
if (db->db_state == DB_CACHED) {
/*
* If the arc buf is compressed or encrypted and the caller
* requested uncompressed data, we need to untransform it
@ -1789,8 +1823,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
* unauthenticated blocks, which will verify their MAC if
* the key is now available.
*/
if (err == 0 && db->db_buf != NULL &&
(flags & DB_RF_NO_DECRYPT) == 0 &&
if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
(arc_is_encrypted(db->db_buf) ||
arc_is_unauthenticated(db->db_buf) ||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@ -1804,17 +1837,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
dbuf_set_data(db, db->db_buf);
}
mutex_exit(&db->db_mtx);
if (err == 0 && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
B_FALSE, flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_hits);
} else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
boolean_t need_wait = B_FALSE;
} else {
ASSERT(db->db_state == DB_UNCACHED ||
db->db_state == DB_NOFILL);
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
if (pio == NULL && (db->db_state == DB_NOFILL ||
(db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
spa_t *spa = dn->dn_objset->os_spa;
@ -1822,65 +1848,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
need_wait = B_TRUE;
}
err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
/*
* dbuf_read_impl has dropped db_mtx and our parent's rwlock
* for us
*/
if (!err && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
db->db_state != DB_CACHED,
flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_misses);
/*
* If we created a zio_root we must execute it to avoid
* leaking it, even if it isn't attached to any work due
* to an error in dbuf_read_impl().
*/
if (need_wait) {
if (err == 0)
err = zio_wait(pio);
else
(void) zio_wait(pio);
pio = NULL;
}
} else {
/*
* Another reader came in while the dbuf was in flight
* between UNCACHED and CACHED. Either a writer will finish
* writing the buffer (sending the dbuf to CACHED) or the
* first reader's request will reach the read_done callback
* and send the dbuf to CACHED. Otherwise, a failure
* occurred and the dbuf went to UNCACHED.
*/
mutex_exit(&db->db_mtx);
if (prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
B_TRUE, flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_misses);
/* Skip the wait per the caller's request. */
if ((flags & DB_RF_NEVERWAIT) == 0) {
mutex_enter(&db->db_mtx);
while (db->db_state == DB_READ ||
db->db_state == DB_FILL) {
ASSERT(db->db_state == DB_READ ||
(flags & DB_RF_HAVESTRUCT) == 0);
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
db, zio_t *, pio);
cv_wait(&db->db_changed, &db->db_mtx);
}
if (db->db_state == DB_UNCACHED)
err = SET_ERROR(EIO);
mutex_exit(&db->db_mtx);
}
/* dbuf_read_impl drops db_mtx and parent's rwlock. */
miss = (db->db_state != DB_CACHED);
}
if (err == 0 && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
/*
* If we created a zio we must execute it to avoid leaking it, even if
* it isn't attached to any work due to an error in dbuf_read_impl().
*/
if (need_wait) {
if (err == 0)
err = zio_wait(pio);
else
(void) zio_wait(pio);
pio = NULL;
}
done:
if (miss)
DBUF_STAT_BUMP(hash_misses);
else
DBUF_STAT_BUMP(hash_hits);
if (pio && err != 0) {
zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
ZIO_FLAG_CANFAIL);