mirror of
https://github.com/freebsd/freebsd-src
synced 2024-10-05 08:00:30 +00:00
f9cdbaba8d
illumos/illumos-gate@770499e185
770499e185
https://www.illumos.org/issues/8021
The ARC buf data project (known simply as "ABD" since its genesis in the ZoL
community) changes the way the ARC allocates `b_pdata` memory from using linear
`void *` buffers to using scatter/gather lists of fixed-size 1KB chunks. This
improves ZFS's performance by helping to defragment the address space occupied
by the ARC, in particular for cases where compressed ARC is enabled. It could
also ease future work to allocate pages directly from `segkpm` for minimal-
overhead memory allocations, bypassing the `kmem` subsystem.
This is essentially the same change as the one which recently landed in ZFS on
Linux, although they made some platform-specific changes while adapting this
work to their codebase:
1. Implemented the equivalent of the `segkpm` suggestion for future work
mentioned above to bypass issues that they've had with the Linux kernel memory
allocator.
2. Changed the internal representation of the ABD's scatter/gather list so it
could be used to pass I/O directly into Linux block device drivers. (This
feature is not available in the illumos block device interface yet.)
FreeBSD notes:
- the actual (default) chunk size is 4KB (despite the text above saying 1KB)
- we can try to reimplement ABDs, so that they are not permanently
mapped into the KVA unless explicitly requested, especially on
platforms with scarce KVA
- we can try to use unmapped I/O and avoid intermediate allocation of a
linear, virtual memory mapped buffer
- we can try to avoid extra data copying by referring to chunks / pages
in the original ABD
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Prashanth Sreenivasa <pks@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Chris Williamson <chris.williamson@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Author: Dan Kimmel <dan.kimmel@delphix.com>
MFC after: 3 weeks
401 lines
10 KiB
C
401 lines
10 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
/*
|
|
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
|
|
* Use is subject to license terms.
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
|
|
*/
|
|
|
|
/*
|
|
* Print intent log header and statistics.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <ctype.h>
|
|
#include <sys/zfs_context.h>
|
|
#include <sys/spa.h>
|
|
#include <sys/dmu.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/resource.h>
|
|
#include <sys/zil.h>
|
|
#include <sys/zil_impl.h>
|
|
#include <sys/abd.h>
|
|
|
|
/*
 * Option table defined outside this file.  The 'd' and 'i' entries are read
 * here and the larger of the two is used as the verbosity level.
 */
extern uint8_t dump_opt[256];

/*
 * Indentation printed in front of every per-record detail line.  It is never
 * modified, so it is const and sized by its initializer.
 */
static const char prefix[] = "\t\t\t";
|
|
|
|
static void
|
|
print_log_bp(const blkptr_t *bp, const char *prefix)
|
|
{
|
|
char blkbuf[BP_SPRINTF_LEN];
|
|
|
|
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
|
|
(void) printf("%s%s\n", prefix, blkbuf);
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
static void
|
|
zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr)
|
|
{
|
|
time_t crtime = lr->lr_crtime[0];
|
|
char *name, *link;
|
|
lr_attr_t *lrattr;
|
|
|
|
name = (char *)(lr + 1);
|
|
|
|
if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR ||
|
|
lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) {
|
|
lrattr = (lr_attr_t *)(lr + 1);
|
|
name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
|
|
}
|
|
|
|
if (txtype == TX_SYMLINK) {
|
|
link = name + strlen(name) + 1;
|
|
(void) printf("%s%s -> %s\n", prefix, name, link);
|
|
} else if (txtype != TX_MKXATTR) {
|
|
(void) printf("%s%s\n", prefix, name);
|
|
}
|
|
|
|
(void) printf("%s%s", prefix, ctime(&crtime));
|
|
(void) printf("%sdoid %llu, foid %llu, mode %llo\n", prefix,
|
|
(u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid,
|
|
(longlong_t)lr->lr_mode);
|
|
(void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", prefix,
|
|
(u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
|
|
(u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
static void
|
|
zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
|
|
{
|
|
(void) printf("%sdoid %llu, name %s\n", prefix,
|
|
(u_longlong_t)lr->lr_doid, (char *)(lr + 1));
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
static void
|
|
zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr)
|
|
{
|
|
(void) printf("%sdoid %llu, link_obj %llu, name %s\n", prefix,
|
|
(u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
|
|
(char *)(lr + 1));
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
static void
|
|
zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
|
|
{
|
|
char *snm = (char *)(lr + 1);
|
|
char *tnm = snm + strlen(snm) + 1;
|
|
|
|
(void) printf("%ssdoid %llu, tdoid %llu\n", prefix,
|
|
(u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
|
|
(void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
|
|
}
|
|
|
|
/*
 * Callback handed to abd_iterate_func(): dump `len` bytes starting at `data`
 * to stdout.  A printable byte is written as the character followed by a
 * space; any other byte is written in upper-case hexadecimal, space-padded to
 * two columns.  `unused` is ignored.  Always returns 0.
 */
/* ARGSUSED */
static int
zil_prt_rec_write_cb(void *data, size_t len, void *unused)
{
	/*
	 * Walk the buffer as unsigned bytes.  Handing a negative plain char
	 * to isprint() is undefined, and a negative value promoted to int
	 * would be printed as a sign-extended eight-digit hex number.
	 */
	const unsigned char *bytes = data;

	for (size_t i = 0; i < len; i++) {
		if (isprint(bytes[i]))
			(void) printf("%c ", bytes[i]);
		else
			(void) printf("%2X", (unsigned int)bytes[i]);
	}
	return (0);
}
|
|
|
|
/* ARGSUSED */
|
|
static void
|
|
zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
|
|
{
|
|
abd_t *data;
|
|
blkptr_t *bp = &lr->lr_blkptr;
|
|
zbookmark_phys_t zb;
|
|
int verbose = MAX(dump_opt['d'], dump_opt['i']);
|
|
int error;
|
|
|
|
(void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix,
|
|
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
|
|
(u_longlong_t)lr->lr_length);
|
|
|
|
if (txtype == TX_WRITE2 || verbose < 5)
|
|
return;
|
|
|
|
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
|
|
(void) printf("%shas blkptr, %s\n", prefix,
|
|
!BP_IS_HOLE(bp) &&
|
|
bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
|
|
"will claim" : "won't claim");
|
|
print_log_bp(bp, prefix);
|
|
|
|
if (BP_IS_HOLE(bp)) {
|
|
(void) printf("\t\t\tLSIZE 0x%llx\n",
|
|
(u_longlong_t)BP_GET_LSIZE(bp));
|
|
(void) printf("%s<hole>\n", prefix);
|
|
return;
|
|
}
|
|
if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
|
|
(void) printf("%s<block already committed>\n", prefix);
|
|
return;
|
|
}
|
|
|
|
SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
|
|
lr->lr_foid, ZB_ZIL_LEVEL,
|
|
lr->lr_offset / BP_GET_LSIZE(bp));
|
|
|
|
data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
|
|
error = zio_wait(zio_read(NULL, zilog->zl_spa,
|
|
bp, data, BP_GET_LSIZE(bp), NULL, NULL,
|
|
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
|
|
if (error)
|
|
goto out;
|
|
} else {
|
|
/* data is stored after the end of the lr_write record */
|
|
data = abd_alloc(lr->lr_length, B_FALSE);
|
|
abd_copy_from_buf(data, lr + 1, lr->lr_length);
|
|
}
|
|
|
|
(void) printf("%s", prefix);
|
|
(void) abd_iterate_func(data,
|
|
0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
|
|
zil_prt_rec_write_cb, NULL);
|
|
(void) printf("\n");
|
|
|
|
out:
|
|
abd_free(data);
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
static void
|
|
zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr)
|
|
{
|
|
(void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", prefix,
|
|
(u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
|
|
(u_longlong_t)lr->lr_length);
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
static void
|
|
zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr)
|
|
{
|
|
time_t atime = (time_t)lr->lr_atime[0];
|
|
time_t mtime = (time_t)lr->lr_mtime[0];
|
|
|
|
(void) printf("%sfoid %llu, mask 0x%llx\n", prefix,
|
|
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
|
|
|
|
if (lr->lr_mask & AT_MODE) {
|
|
(void) printf("%sAT_MODE %llo\n", prefix,
|
|
(longlong_t)lr->lr_mode);
|
|
}
|
|
|
|
if (lr->lr_mask & AT_UID) {
|
|
(void) printf("%sAT_UID %llu\n", prefix,
|
|
(u_longlong_t)lr->lr_uid);
|
|
}
|
|
|
|
if (lr->lr_mask & AT_GID) {
|
|
(void) printf("%sAT_GID %llu\n", prefix,
|
|
(u_longlong_t)lr->lr_gid);
|
|
}
|
|
|
|
if (lr->lr_mask & AT_SIZE) {
|
|
(void) printf("%sAT_SIZE %llu\n", prefix,
|
|
(u_longlong_t)lr->lr_size);
|
|
}
|
|
|
|
if (lr->lr_mask & AT_ATIME) {
|
|
(void) printf("%sAT_ATIME %llu.%09llu %s", prefix,
|
|
(u_longlong_t)lr->lr_atime[0],
|
|
(u_longlong_t)lr->lr_atime[1],
|
|
ctime(&atime));
|
|
}
|
|
|
|
if (lr->lr_mask & AT_MTIME) {
|
|
(void) printf("%sAT_MTIME %llu.%09llu %s", prefix,
|
|
(u_longlong_t)lr->lr_mtime[0],
|
|
(u_longlong_t)lr->lr_mtime[1],
|
|
ctime(&mtime));
|
|
}
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
static void
|
|
zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr)
|
|
{
|
|
(void) printf("%sfoid %llu, aclcnt %llu\n", prefix,
|
|
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
|
|
}
|
|
|
|
/*
 * Pointer to a record print routine.  It is declared without a parameter
 * list because the routines in this file take differently typed record
 * pointers (lr_create_t *, lr_remove_t *, ...) as their third argument.
 */
typedef void (*zil_prt_rec_func_t)();

/*
 * One entry of the per-record-type table.
 */
typedef struct zil_rec_info {
	zil_prt_rec_func_t	zri_print;	/* detail printer, or NULL */
	const char		*zri_name;	/* label; a string literal */
	uint64_t		zri_count;	/* records seen in this dump */
} zil_rec_info_t;
|
|
|
|
static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
|
|
{ NULL, "Total " },
|
|
{ zil_prt_rec_create, "TX_CREATE " },
|
|
{ zil_prt_rec_create, "TX_MKDIR " },
|
|
{ zil_prt_rec_create, "TX_MKXATTR " },
|
|
{ zil_prt_rec_create, "TX_SYMLINK " },
|
|
{ zil_prt_rec_remove, "TX_REMOVE " },
|
|
{ zil_prt_rec_remove, "TX_RMDIR " },
|
|
{ zil_prt_rec_link, "TX_LINK " },
|
|
{ zil_prt_rec_rename, "TX_RENAME " },
|
|
{ zil_prt_rec_write, "TX_WRITE " },
|
|
{ zil_prt_rec_truncate, "TX_TRUNCATE " },
|
|
{ zil_prt_rec_setattr, "TX_SETATTR " },
|
|
{ zil_prt_rec_acl, "TX_ACL_V0 " },
|
|
{ zil_prt_rec_acl, "TX_ACL_ACL " },
|
|
{ zil_prt_rec_create, "TX_CREATE_ACL " },
|
|
{ zil_prt_rec_create, "TX_CREATE_ATTR " },
|
|
{ zil_prt_rec_create, "TX_CREATE_ACL_ATTR " },
|
|
{ zil_prt_rec_create, "TX_MKDIR_ACL " },
|
|
{ zil_prt_rec_create, "TX_MKDIR_ATTR " },
|
|
{ zil_prt_rec_create, "TX_MKDIR_ACL_ATTR " },
|
|
{ zil_prt_rec_write, "TX_WRITE2 " },
|
|
};
|
|
|
|
/* ARGSUSED */
|
|
static int
|
|
print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
|
|
{
|
|
int txtype;
|
|
int verbose = MAX(dump_opt['d'], dump_opt['i']);
|
|
|
|
/* reduce size of txtype to strip off TX_CI bit */
|
|
txtype = lr->lrc_txtype;
|
|
|
|
ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE);
|
|
ASSERT(lr->lrc_txg);
|
|
|
|
(void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n",
|
|
(lr->lrc_txtype & TX_CI) ? "CI-" : "",
|
|
zil_rec_info[txtype].zri_name,
|
|
(u_longlong_t)lr->lrc_reclen,
|
|
(u_longlong_t)lr->lrc_txg,
|
|
(u_longlong_t)lr->lrc_seq);
|
|
|
|
if (txtype && verbose >= 3)
|
|
zil_rec_info[txtype].zri_print(zilog, txtype, lr);
|
|
|
|
zil_rec_info[txtype].zri_count++;
|
|
zil_rec_info[0].zri_count++;
|
|
|
|
return (0);
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
static int
|
|
print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
|
|
{
|
|
char blkbuf[BP_SPRINTF_LEN + 10];
|
|
int verbose = MAX(dump_opt['d'], dump_opt['i']);
|
|
char *claim;
|
|
|
|
if (verbose <= 3)
|
|
return (0);
|
|
|
|
if (verbose >= 5) {
|
|
(void) strcpy(blkbuf, ", ");
|
|
snprintf_blkptr(blkbuf + strlen(blkbuf),
|
|
sizeof (blkbuf) - strlen(blkbuf), bp);
|
|
} else {
|
|
blkbuf[0] = '\0';
|
|
}
|
|
|
|
if (claim_txg != 0)
|
|
claim = "already claimed";
|
|
else if (bp->blk_birth >= spa_first_txg(zilog->zl_spa))
|
|
claim = "will claim";
|
|
else
|
|
claim = "won't claim";
|
|
|
|
(void) printf("\tBlock seqno %llu, %s%s\n",
|
|
(u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
|
|
|
|
return (0);
|
|
}
|
|
|
|
static void
|
|
print_log_stats(int verbose)
|
|
{
|
|
int i, w, p10;
|
|
|
|
if (verbose > 3)
|
|
(void) printf("\n");
|
|
|
|
if (zil_rec_info[0].zri_count == 0)
|
|
return;
|
|
|
|
for (w = 1, p10 = 10; zil_rec_info[0].zri_count >= p10; p10 *= 10)
|
|
w++;
|
|
|
|
for (i = 0; i < TX_MAX_TYPE; i++)
|
|
if (zil_rec_info[i].zri_count || verbose >= 3)
|
|
(void) printf("\t\t%s %*llu\n",
|
|
zil_rec_info[i].zri_name, w,
|
|
(u_longlong_t)zil_rec_info[i].zri_count);
|
|
(void) printf("\n");
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
void
|
|
dump_intent_log(zilog_t *zilog)
|
|
{
|
|
const zil_header_t *zh = zilog->zl_header;
|
|
int verbose = MAX(dump_opt['d'], dump_opt['i']);
|
|
int i;
|
|
|
|
if (BP_IS_HOLE(&zh->zh_log) || verbose < 1)
|
|
return;
|
|
|
|
(void) printf("\n ZIL header: claim_txg %llu, "
|
|
"claim_blk_seq %llu, claim_lr_seq %llu",
|
|
(u_longlong_t)zh->zh_claim_txg,
|
|
(u_longlong_t)zh->zh_claim_blk_seq,
|
|
(u_longlong_t)zh->zh_claim_lr_seq);
|
|
(void) printf(" replay_seq %llu, flags 0x%llx\n",
|
|
(u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
|
|
|
|
for (i = 0; i < TX_MAX_TYPE; i++)
|
|
zil_rec_info[i].zri_count = 0;
|
|
|
|
if (verbose >= 2) {
|
|
(void) printf("\n");
|
|
(void) zil_parse(zilog, print_log_block, print_log_record, NULL,
|
|
zh->zh_claim_txg);
|
|
print_log_stats(verbose);
|
|
}
|
|
}
|