Illumos 5408 - managing ZFS cache devices requires lots of RAM

5408 managing ZFS cache devices requires lots of RAM
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Don Brady <dev.fs.zfs@gmail.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Approved by: Garrett D'Amore <garrett@damore.org>

Porting notes:

Due to the restructuring of the ARC-related structures, this
patch conflicts with at least the following existing ZoL commits:

    6e1d7276c9
    Fix inaccurate arcstat_l2_hdr_size calculations

        The ARC_SPACE_HDRS constant no longer exists; HDR_L2ONLY_SIZE is
        its closest equivalent.

    e0b0ca983d
    Add visibility in to cached dbufs

        The new layering of l{1,2}arc_buf_hdr_t within the arc_buf_hdr
        struct requires additional structure member names to be used
        when referencing the inner items.  Also, the presence of the L1
        or L2 inner member is indicated by flags, tested with the new
        HDR_HAS_L{1,2}HDR macros (see the sketch following these notes).
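
As a rough illustration of the second conflict, the hypothetical helper
below shows how an old-style field access is expected to change under the
new layout.  This is a sketch only, not part of the patch: HDR_HAS_L1HDR()
and the b_l1hdr member come from this change, while the surrounding
function is made up for illustration.

    /*
     * Hypothetical example (not from the patch): adapting a direct
     * field access to the split-header layout.
     */
    static uint32_t
    example_hdr_datacnt(arc_buf_hdr_t *hdr)
    {
        /* Pre-patch code read the count straight off the header: */
        /* return (hdr->b_datacnt); */

        /*
         * Post-patch, the field lives in the embedded L1 sub-header
         * and is only meaningful when that sub-header is present.
         */
        if (HDR_HAS_L1HDR(hdr))
            return (hdr->b_l1hdr.b_datacnt);
        return (0);
    }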

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Authored by Chris Williamson on 2014-12-29 19:12:23 -08:00; committed by Brian Behlendorf
parent 2a4324141f
commit b9541d6b7d
5 changed files with 947 additions and 631 deletions

@@ -4042,7 +4042,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
* assign an arcbuf to a dbuf.
*/
for (j = 0; j < s; j++) {
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bigbuf_arcbufs[j] =
dmu_request_arcbuf(bonus_db, chunksize);
} else {
@@ -4066,7 +4066,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
for (j = 0; j < s; j++) {
if (i != 5) {
if (i != 5 ||
chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_return_arcbuf(bigbuf_arcbufs[j]);
} else {
dmu_return_arcbuf(
@@ -4111,7 +4112,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
}
for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
dmu_buf_t *dbt;
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[j]->b_data, chunksize);
} else {
@@ -4128,7 +4129,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
VERIFY(dmu_buf_hold(os, bigobj, off,
FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
}
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[j], tx);
} else {

@@ -81,10 +81,29 @@ typedef enum arc_flags
ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */
ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */
ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */
ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */
ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */
ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */
/* indicates that the buffer contains metadata (otherwise, data) */
ARC_FLAG_BUFC_METADATA = 1 << 16,
/* Flags specifying whether optional hdr struct fields are defined */
ARC_FLAG_HAS_L1HDR = 1 << 17,
ARC_FLAG_HAS_L2HDR = 1 << 18,
/*
* The arc buffer's compression mode is stored in the top 7 bits of the
* flags field, so these dummy flags are included so that MDB can
* interpret the enum properly.
*/
ARC_FLAG_COMPRESS_0 = 1 << 24,
ARC_FLAG_COMPRESS_1 = 1 << 25,
ARC_FLAG_COMPRESS_2 = 1 << 26,
ARC_FLAG_COMPRESS_3 = 1 << 27,
ARC_FLAG_COMPRESS_4 = 1 << 28,
ARC_FLAG_COMPRESS_5 = 1 << 29,
ARC_FLAG_COMPRESS_6 = 1 << 30
} arc_flags_t;
struct arc_buf {

@@ -74,8 +74,6 @@ typedef struct arc_state {
arc_state_type_t arcs_state;
} arc_state_t;
typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
typedef struct arc_callback arc_callback_t;
struct arc_callback {
@@ -96,27 +94,45 @@ struct arc_write_callback {
arc_buf_t *awcb_buf;
};
struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
uint64_t b_cksum0;
/*
* ARC buffers are separated into multiple structs as a memory saving measure:
* - Common fields struct, always defined, and embedded within it:
* - L2-only fields, always allocated but undefined when not in L2ARC
* - L1-only fields, only allocated when in L1ARC
*
* Buffer in L1 Buffer only in L2
* +------------------------+ +------------------------+
* | arc_buf_hdr_t | | arc_buf_hdr_t |
* | | | |
* | | | |
* | | | |
* +------------------------+ +------------------------+
* | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
* | (undefined if L1-only) | | |
* +------------------------+ +------------------------+
* | l1arc_buf_hdr_t |
* | |
* | |
* | |
* | |
* +------------------------+
*
* Because it's possible for the L2ARC to become extremely large, we can wind
* up eating a lot of memory in L2ARC buffer headers, so the size of a header
* is minimized by only allocating the fields necessary for an L1-cached buffer
* when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
* l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
* words in pointers. arc_hdr_realloc() is used to switch a header between
* these two allocation states.
*/
typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
arc_flags_t b_flags;
uint32_t b_datacnt;
arc_callback_t *b_acb;
/* for waiting on writes to complete */
kcondvar_t b_cv;
/* immutable */
arc_buf_contents_t b_type;
uint64_t b_size;
uint64_t b_spa;
/* protected by arc state mutex */
arc_state_t *b_state;
@@ -133,9 +149,10 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
l2arc_buf_hdr_t *b_l2hdr;
list_node_t b_l2node;
};
arc_callback_t *b_acb;
/* temporary buffer holder for in-flight compressed data */
void *b_tmp_cdata;
} l1arc_buf_hdr_t;
typedef struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
@@ -146,15 +163,51 @@ typedef struct l2arc_dev {
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
list_t *l2ad_buflist; /* buffer list */
kmutex_t l2ad_mtx; /* lock for buffer list */
list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
} l2arc_dev_t;
typedef struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
uint64_t b_daddr; /* disk address, offset byte */
/* real alloc'd buffer size depending on b_compress applied */
uint32_t b_hits;
int32_t b_asize;
list_node_t b_l2node;
} l2arc_buf_hdr_t;
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;
struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
/*
* Even though this checksum is only set/verified when a buffer is in
* the L1 cache, it needs to be in the set of common fields because it
* must be preserved from the time before a buffer is written out to
* L2ARC until after it is read back in.
*/
zio_cksum_t *b_freeze_cksum;
arc_buf_hdr_t *b_hash_next;
arc_flags_t b_flags;
/* immutable */
int32_t b_size;
uint64_t b_spa;
/* L2ARC fields. Undefined when not in L2ARC. */
l2arc_buf_hdr_t b_l2hdr;
/* L1ARC fields. Undefined when in l2arc_only state */
l1arc_buf_hdr_t b_l1hdr;
};
#ifdef __cplusplus
}
#endif

@@ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
TP_STRUCT__entry(
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
__field(uint64_t, hdr_cksum0)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_datacnt)
__field(arc_buf_contents_t, hdr_type)
@@ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__entry->hdr_dva_word[0] = ab->b_dva.dva_word[0];
__entry->hdr_dva_word[1] = ab->b_dva.dva_word[1];
__entry->hdr_birth = ab->b_birth;
__entry->hdr_cksum0 = ab->b_cksum0;
__entry->hdr_flags = ab->b_flags;
__entry->hdr_datacnt = ab->b_datacnt;
__entry->hdr_type = ab->b_type;
__entry->hdr_datacnt = ab->b_l1hdr.b_datacnt;
__entry->hdr_size = ab->b_size;
__entry->hdr_spa = ab->b_spa;
__entry->hdr_state_type = ab->b_state->arcs_state;
__entry->hdr_access = ab->b_arc_access;
__entry->hdr_mru_hits = ab->b_mru_hits;
__entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits;
__entry->hdr_mfu_hits = ab->b_mfu_hits;
__entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits;
__entry->hdr_l2_hits = ab->b_l2_hits;
__entry->hdr_refcount = ab->b_refcnt.rc_count;
__entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state;
__entry->hdr_access = ab->b_l1hdr.b_arc_access;
__entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits;
__entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits;
__entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits;
__entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits;
__entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits;
__entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count;
),
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
"flags 0x%x datacnt %u type %u size %llu spa %llu "
"state_type %u access %lu mru_hits %u mru_ghost_hits %u "
"mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
__entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
__entry->hdr_birth, __entry->hdr_flags,
__entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
__entry->hdr_spa, __entry->hdr_state_type,
__entry->hdr_access, __entry->hdr_mru_hits,
@@ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
TP_STRUCT__entry(
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
__field(uint64_t, hdr_cksum0)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_datacnt)
__field(arc_buf_contents_t, hdr_type)
@@ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0];
__entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1];
__entry->hdr_birth = hdr->b_birth;
__entry->hdr_cksum0 = hdr->b_cksum0;
__entry->hdr_flags = hdr->b_flags;
__entry->hdr_datacnt = hdr->b_datacnt;
__entry->hdr_type = hdr->b_type;
__entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt;
__entry->hdr_size = hdr->b_size;
__entry->hdr_spa = hdr->b_spa;
__entry->hdr_state_type = hdr->b_state->arcs_state;
__entry->hdr_access = hdr->b_arc_access;
__entry->hdr_mru_hits = hdr->b_mru_hits;
__entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits;
__entry->hdr_mfu_hits = hdr->b_mfu_hits;
__entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
__entry->hdr_l2_hits = hdr->b_l2_hits;
__entry->hdr_refcount = hdr->b_refcnt.rc_count;
__entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state;
__entry->hdr_access = hdr->b_l1hdr.b_arc_access;
__entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits;
__entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
__entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
__entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
__entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits;
__entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count;
__entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0];
__entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1];
@@ -325,8 +319,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->zb_level = zb->zb_level;
__entry->zb_blkid = zb->zb_blkid;
),
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
"flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u "
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
"flags 0x%x datacnt %u size %llu spa %llu state_type %u "
"access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u "
"mfu_ghost_hits %u l2_hits %u refcount %lli } "
"bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 "
@@ -334,8 +328,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
"lsize %llu } zb { objset %llu object %llu level %lli "
"blkid %llu }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
__entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
__entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
__entry->hdr_birth, __entry->hdr_flags,
__entry->hdr_datacnt, __entry->hdr_size,
__entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access,
__entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits,
__entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits,

File diff suppressed because it is too large.