Replace the TRIM consolidation framework originally added in -r337396

The replacement is driven by problems found with the algorithms being
tested for TRIM consolidation.

Reported by:  Peter Holm
Suggested by: kib
Reviewed by:  kib
Sponsored by: Netflix
Kirk McKusick 2018-08-18 22:21:59 +00:00
parent db7c2a4822
commit 7e038bc257
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=338031
9 changed files with 232 additions and 65 deletions
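For orientation: the rewrite replaces the single ffs_blkfree() entry point with a key-based protocol. A caller that is about to release a run of blocks obtains a key with ffs_blkrelease_start(), passes that key to every ffs_blkfree() call in the run, and retires it with ffs_blkrelease_finish(); isolated frees pass SINGLETON_KEY, and blocks whose contents were never written pass NOTRIM_KEY so that no BIO_DELETE is issued for them. A condensed sketch of the calling convention, modeled on the ffs_truncate() and sysctl_ffs_fsck() hunks below (the loop body is illustrative, not a literal excerpt):

	u_long key;
	int i;

	/*
	 * One key groups every free issued for this range of blocks,
	 * allowing the TRIM layer to consolidate the BIO_DELETEs.
	 */
	key = ffs_blkrelease_start(ump, ump->um_devvp, ip->i_number);
	for (i = UFS_NDADDR - 1; i > lastblock; i--) {
		/* ... look up the block number bn and its size bsize ... */
		ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize,
		    ip->i_number, vp->v_type, NULL, key);
	}
	/* No more frees will be issued under this key. */
	ffs_blkrelease_finish(ump, key);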

View file

@ -110,8 +110,6 @@ static ufs2_daddr_t
static void ffs_blkfree_cg(struct ufsmount *, struct fs *,
struct vnode *, ufs2_daddr_t, long, ino_t,
struct workhead *);
static void ffs_blkfree_trim_completed(struct buf *);
static void ffs_blkfree_trim_task(void *ctx, int pending __unused);
#ifdef INVARIANTS
static int ffs_checkblk(struct inode *, ufs2_daddr_t, long);
#endif
@ -395,8 +393,24 @@ ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp)
if (bno > 0) {
bp->b_blkno = fsbtodb(fs, bno);
if (!DOINGSOFTDEP(vp))
/*
* The usual case is that a smaller fragment that
* was just allocated has been replaced with a bigger
* fragment or a full-size block. If it is marked as
* B_DELWRI, the current contents have not been written
* to disk. It is possible that the block was written
* earlier, but very uncommon. If the block has never
* been written, there is no need to send a BIO_DELETE
* for it when it is freed. The gain from avoiding the
* TRIMs for the common case of unwritten blocks far
* exceeds the cost of the write amplification for the
* uncommon case of failing to send a TRIM for a block
* that had been written.
*/
ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
ip->i_number, vp->v_type, NULL);
ip->i_number, vp->v_type, NULL,
(bp->b_flags & B_DELWRI) != 0 ?
NOTRIM_KEY : SINGLETON_KEY);
delta = btodb(nsize - osize);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
if (flags & IO_EXT)
@ -521,7 +535,7 @@ ffs_reallocblks_ufs1(ap)
struct fs *fs;
struct inode *ip;
struct vnode *vp;
struct buf *sbp, *ebp;
struct buf *sbp, *ebp, *bp;
ufs1_daddr_t *bap, *sbap, *ebap;
struct cluster_save *buflist;
struct ufsmount *ump;
@ -730,14 +744,30 @@ ffs_reallocblks_ufs1(ap)
printf("\n\tnew:");
#endif
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
bp = buflist->bs_children[i];
if (!DOINGSOFTDEP(vp))
/*
* The usual case is that a set of N-contiguous blocks
* that was just allocated has been replaced with a
* set of N+1-contiguous blocks. If they are marked as
* B_DELWRI, the current contents have not been written
* to disk. It is possible that the blocks were written
* earlier, but very uncommon. If the blocks have never
* been written, there is no need to send a BIO_DELETE
* for them when they are freed. The gain from avoiding
* the TRIMs for the common case of unwritten blocks
* far exceeds the cost of the write amplification for
* the uncommon case of failing to send a TRIM for the
* blocks that had been written.
*/
ffs_blkfree(ump, fs, ump->um_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
fs->fs_bsize, ip->i_number, vp->v_type, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
dbtofsb(fs, bp->b_blkno),
fs->fs_bsize, ip->i_number, vp->v_type, NULL,
(bp->b_flags & B_DELWRI) != 0 ?
NOTRIM_KEY : SINGLETON_KEY);
bp->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
@ -771,7 +801,7 @@ ffs_reallocblks_ufs2(ap)
struct fs *fs;
struct inode *ip;
struct vnode *vp;
struct buf *sbp, *ebp;
struct buf *sbp, *ebp, *bp;
ufs2_daddr_t *bap, *sbap, *ebap;
struct cluster_save *buflist;
struct ufsmount *ump;
@ -978,14 +1008,30 @@ ffs_reallocblks_ufs2(ap)
printf("\n\tnew:");
#endif
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
bp = buflist->bs_children[i];
if (!DOINGSOFTDEP(vp))
/*
* The usual case is that a set of N-contiguous blocks
* that was just allocated has been replaced with a
* set of N+1-contiguous blocks. If they are marked as
* B_DELWRI, the current contents have not been written
* to disk. It is possible that the blocks were written
* earlier, but very uncommon. If the blocks have never
* been written, there is no need to send a BIO_DELETE
* for them when they are freed. The gain from avoiding
* the TRIMs for the common case of unwritten blocks
* far exceeds the cost of the write amplification for
* the uncommon case of failing to send a TRIM for the
* blocks that had been written.
*/
ffs_blkfree(ump, fs, ump->um_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
fs->fs_bsize, ip->i_number, vp->v_type, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
dbtofsb(fs, bp->b_blkno),
fs->fs_bsize, ip->i_number, vp->v_type, NULL,
(bp->b_flags & B_DELWRI) != 0 ?
NOTRIM_KEY : SINGLETON_KEY);
bp->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
@ -1823,8 +1869,7 @@ ffs_alloccgblk(ip, bp, bpref, size)
/* XXX Fixme. */
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
size, 0);
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0);
UFS_LOCK(ump);
return (blkno);
}
@ -2254,6 +2299,17 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
bdwrite(bp);
}
/*
* Structures and routines associated with trim management.
*/
MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");
#define TRIMLIST_HASH(ump, key) \
(&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize])
static void ffs_blkfree_trim_completed(struct buf *);
static void ffs_blkfree_trim_task(void *ctx, int pending __unused);
struct ffs_blkfree_trim_params {
struct task task;
struct ufsmount *ump;
@ -2277,7 +2333,7 @@ ffs_blkfree_trim_task(ctx, pending)
tp->inum, tp->pdephd);
vn_finished_secondary_write(UFSTOVFS(tp->ump));
atomic_add_int(&tp->ump->um_trim_inflight, -1);
free(tp, M_TEMP);
free(tp, M_TRIM);
}
static void
@ -2287,13 +2343,45 @@ ffs_blkfree_trim_completed(bp)
struct ffs_blkfree_trim_params *tp;
tp = bp->b_fsprivate1;
free(bp, M_TEMP);
free(bp, M_TRIM);
TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
}
/*
* Allocate a new key to use to identify a range of blocks.
*/
u_long
ffs_blkrelease_start(ump, devvp, inum)
struct ufsmount *ump;
struct vnode *devvp;
ino_t inum;
{
static u_long masterkey;
u_long key;
if ((ump->um_flags & UM_CANDELETE) == 0)
return (SINGLETON_KEY);
do {
key = atomic_fetchadd_long(&masterkey, 1);
} while (key < FIRST_VALID_KEY);
return (key);
}
/*
* Deallocate a key that has been used to identify a range of blocks.
*/
void
ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
ffs_blkrelease_finish(ump, key)
struct ufsmount *ump;
u_long key;
{
return;
}
void
ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, key)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
@ -2302,6 +2390,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
ino_t inum;
enum vtype vtype;
struct workhead *dephd;
u_long key;
{
struct mount *mp;
struct buf *bp;
@ -2319,10 +2408,11 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
return;
}
/*
* Nothing to delay if TRIM is disabled, or the operation is
* performed on the snapshot.
* Nothing to delay if TRIM is not required for this block or TRIM
* is disabled or the operation is performed on a snapshot.
*/
if (((ump->um_flags) & UM_CANDELETE) == 0 || devvp->v_type == VREG) {
if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) ||
devvp->v_type == VREG) {
ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
return;
}
@ -2334,7 +2424,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
* and write some new data into it.
*/
atomic_add_int(&ump->um_trim_inflight, 1);
tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK);
tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
tp->ump = ump;
tp->devvp = devvp;
tp->bno = bno;
@ -2347,7 +2437,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
} else
tp->pdephd = NULL;
bp = malloc(sizeof(*bp), M_TEMP, M_WAITOK | M_ZERO);
bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
bp->b_iocmd = BIO_DELETE;
bp->b_iooffset = dbtob(fsbtodb(fs, bno));
bp->b_iodone = ffs_blkfree_trim_completed;
@ -2822,6 +2912,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
struct fs *fs;
ufs2_daddr_t blkno;
long blkcnt, blksize;
u_long key;
struct file *fp, *vfp;
cap_rights_t rights;
int filetype, error;
@ -2956,15 +3047,18 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
blkno = cmd.value;
blkcnt = cmd.size;
blksize = fs->fs_frag - (blkno % fs->fs_frag);
key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO);
while (blkcnt > 0) {
if (blksize > blkcnt)
if (blkcnt < blksize)
blksize = blkcnt;
ffs_blkfree(ump, fs, ump->um_devvp, blkno,
blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL);
blksize * fs->fs_fsize, UFS_ROOTINO,
VDIR, NULL, key);
blkno += blksize;
blkcnt -= blksize;
blksize = fs->fs_frag;
}
ffs_blkrelease_finish(ump, key);
break;
/*

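The ffs_blkrelease_start() allocator above reserves everything below FIRST_VALID_KEY: NOTRIM_KEY and SINGLETON_KEY must never be handed out as range keys, so the do/while loop skips them both at startup (the static counter begins at zero) and again if the counter ever wraps. A minimal userland model of that behavior, with C11 atomics standing in for the kernel's atomic_fetchadd_long(); the key values are taken from the diff, the rest is illustrative:

	#include <stdatomic.h>
	#include <stdio.h>

	#define NOTRIM_KEY	1	/* never written, so don't call trim for it */
	#define SINGLETON_KEY	2	/* only block being freed, so trim it now */
	#define FIRST_VALID_KEY	3	/* first valid key describing a block range */

	static _Atomic unsigned long masterkey;

	/* Hand out the next key, skipping the reserved values. */
	static unsigned long
	key_alloc(void)
	{
		unsigned long key;

		do {
			key = atomic_fetch_add(&masterkey, 1);
		} while (key < FIRST_VALID_KEY);
		return (key);
	}

	int
	main(void)
	{
		/* Start at the wraparound point to show the reserved keys being skipped. */
		atomic_store(&masterkey, (unsigned long)-1);
		for (int i = 0; i < 5; i++)
			printf("allocated key %lu\n", key_alloc());
		return (0);
	}
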
View file

@ -553,7 +553,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
lbns_remfree++;
#endif
ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
ip->i_number, vp->v_type, NULL);
ip->i_number, vp->v_type, NULL, SINGLETON_KEY);
}
return (error);
}
@ -1147,7 +1147,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
lbns_remfree++;
#endif
ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
ip->i_number, vp->v_type, NULL);
ip->i_number, vp->v_type, NULL, SINGLETON_KEY);
}
return (error);
}

View file

@ -63,9 +63,11 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size,
struct ucred *a_cred, int a_flags, struct buf **a_bpp);
int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *);
ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *, u_long);
ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
void ffs_blkrelease_finish(struct ufsmount *, u_long);
u_long ffs_blkrelease_start(struct ufsmount *, struct vnode *, ino_t);
int ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int);
@ -111,11 +113,27 @@ vfs_vget_t ffs_vget;
int ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int);
void process_deferred_inactive(struct mount *mp);
/*
* Flags to ffs_vgetf
*/
#define FFSV_FORCEINSMQ 0x0001
/*
* Flags to ffs_reload
*/
#define FFSR_FORCE 0x0001
#define FFSR_UNSUSPEND 0x0002
/*
* Definitions for TRIM interface
*
* Special keys and recommended hash table size
*/
#define NOTRIM_KEY 1 /* never written, so don't call trim for it */
#define SINGLETON_KEY 2 /* only block being freed, so trim it now */
#define FIRST_VALID_KEY 3 /* first valid key describing a block range */
#define MAXTRIMIO 1024 /* maximum expected outstanding trim requests */
extern struct vop_vector ffs_vnodeops1;
extern struct vop_vector ffs_fifoops1;
extern struct vop_vector ffs_vnodeops2;

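All of the single-block call sites in this change make the same decision when freeing a block that was just replaced: if the buffer is still marked B_DELWRI its contents never reached the disk, so NOTRIM_KEY suppresses the BIO_DELETE; otherwise SINGLETON_KEY requests an immediate trim of that one block. A minimal sketch of the selection, assuming a hypothetical helper named blkfree_key_for_buf() in kernel context (<sys/buf.h> supplies struct buf and B_DELWRI); the call sites in the diff open-code this test:

	/* Hypothetical helper: choose the trim key for a single replaced block. */
	static u_long
	blkfree_key_for_buf(struct buf *bp)
	{

		/*
		 * B_DELWRI set: the current contents were never written, so a
		 * BIO_DELETE would only add write amplification for no benefit.
		 */
		return ((bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
	}
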
View file

@ -197,6 +197,7 @@ ffs_truncate(vp, length, flags, cred)
int needextclean, extblocks;
int offset, size, level, nblocks;
int i, error, allerror, indiroff, waitforupdate;
u_long key;
off_t osize;
ip = VTOI(vp);
@ -275,7 +276,7 @@ ffs_truncate(vp, length, flags, cred)
continue;
ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i],
sblksize(fs, osize, i), ip->i_number,
vp->v_type, NULL);
vp->v_type, NULL, SINGLETON_KEY);
}
}
}
@ -523,7 +524,7 @@ ffs_truncate(vp, length, flags, cred)
DIP_SET(ip, i_ib[level], 0);
ffs_blkfree(ump, fs, ump->um_devvp, bn,
fs->fs_bsize, ip->i_number,
vp->v_type, NULL);
vp->v_type, NULL, SINGLETON_KEY);
blocksreleased += nblocks;
}
}
@ -534,6 +535,7 @@ ffs_truncate(vp, length, flags, cred)
/*
* All whole direct blocks or frags.
*/
key = ffs_blkrelease_start(ump, ump->um_devvp, ip->i_number);
for (i = UFS_NDADDR - 1; i > lastblock; i--) {
long bsize;
@ -543,9 +545,10 @@ ffs_truncate(vp, length, flags, cred)
DIP_SET(ip, i_db[i], 0);
bsize = blksize(fs, ip, i);
ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number,
vp->v_type, NULL);
vp->v_type, NULL, key);
blocksreleased += btodb(bsize);
}
ffs_blkrelease_finish(ump, key);
if (lastblock < 0)
goto done;
@ -575,7 +578,8 @@ ffs_truncate(vp, length, flags, cred)
*/
bn += numfrags(fs, newspace);
ffs_blkfree(ump, fs, ump->um_devvp, bn,
oldspace - newspace, ip->i_number, vp->v_type, NULL);
oldspace - newspace, ip->i_number, vp->v_type,
NULL, SINGLETON_KEY);
blocksreleased += btodb(oldspace - newspace);
}
}
@ -634,8 +638,10 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
{
struct buf *bp;
struct fs *fs;
struct ufsmount *ump;
struct vnode *vp;
caddr_t copy = NULL;
u_long key;
int i, nblocks, error = 0, allerror = 0;
ufs2_daddr_t nb, nlbn, last;
ufs2_daddr_t blkcount, factor, blocksreleased = 0;
@ -644,6 +650,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
#define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i])
fs = ITOFS(ip);
ump = ITOUMP(ip);
/*
* Calculate index in current block of last
@ -719,6 +726,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
/*
* Recursively free totally unused blocks.
*/
key = ffs_blkrelease_start(ump, ITODEVVP(ip), ip->i_number);
for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
i--, nlbn += factor) {
nb = BAP(ip, i);
@ -730,10 +738,11 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
allerror = error;
blocksreleased += blkcount;
}
ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize,
ip->i_number, vp->v_type, NULL);
ffs_blkfree(ump, fs, ITODEVVP(ip), nb, fs->fs_bsize,
ip->i_number, vp->v_type, NULL, key);
blocksreleased += nblocks;
}
ffs_blkrelease_finish(ump, key);
/*
* Recursively free last partial block.

View file

@ -583,7 +583,7 @@ ffs_snapshot(mp, snapfile)
if (len != 0 && len < fs->fs_bsize) {
ffs_blkfree(ump, copy_fs, vp,
DIP(xp, i_db[loc]), len, xp->i_number,
xvp->v_type, NULL);
xvp->v_type, NULL, SINGLETON_KEY);
blkno = DIP(xp, i_db[loc]);
DIP_SET(xp, i_db[loc], 0);
}
@ -1265,7 +1265,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
if (blkno == BLK_SNAP)
blkno = blkstofrags(fs, lblkno);
ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
vp->v_type, NULL);
vp->v_type, NULL, SINGLETON_KEY);
}
return (0);
}
@ -1549,7 +1549,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
if (blkno == BLK_SNAP)
blkno = blkstofrags(fs, lblkno);
ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
vp->v_type, NULL);
vp->v_type, NULL, SINGLETON_KEY);
}
return (0);
}

View file

@ -869,7 +869,7 @@ static void cancel_allocdirect(struct allocdirectlst *,
struct allocdirect *, struct freeblks *);
static int check_inode_unwritten(struct inodedep *);
static int free_inodedep(struct inodedep *);
static void freework_freeblock(struct freework *);
static void freework_freeblock(struct freework *, u_long);
static void freework_enqueue(struct freework *);
static int handle_workitem_freeblocks(struct freeblks *, int);
static int handle_complete_freeblocks(struct freeblks *, int);
@ -884,7 +884,7 @@ static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
ufs2_daddr_t, ufs_lbn_t);
static void handle_workitem_freefrag(struct freefrag *);
static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
ufs_lbn_t);
ufs_lbn_t, u_long);
static void allocdirect_merge(struct allocdirectlst *,
struct allocdirect *, struct allocdirect *);
static struct freefrag *allocindir_merge(struct allocindir *,
@ -5289,7 +5289,22 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
KASSERT(MOUNTEDSOFTDEP(mp) != 0,
("softdep_setup_allocdirect called on non-softdep filesystem"));
if (oldblkno && oldblkno != newblkno)
freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
/*
* The usual case is that a smaller fragment that
* was just allocated has been replaced with a bigger
* fragment or a full-size block. If it is marked as
* B_DELWRI, the current contents have not been written
* to disk. It is possible that the block was written
* earlier, but very uncommon. If the block has never
* been written, there is no need to send a BIO_DELETE
* for it when it is freed. The gain from avoiding the
* TRIMs for the common case of unwritten blocks far
* exceeds the cost of the write amplification for the
* uncommon case of failing to send a TRIM for a block
* that had been written.
*/
freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
(bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
else
freefrag = NULL;
@ -5566,11 +5581,12 @@ newjfreefrag(freefrag, ip, blkno, size, lbn)
* Allocate a new freefrag structure.
*/
static struct freefrag *
newfreefrag(ip, blkno, size, lbn)
newfreefrag(ip, blkno, size, lbn, key)
struct inode *ip;
ufs2_daddr_t blkno;
long size;
ufs_lbn_t lbn;
u_long key;
{
struct freefrag *freefrag;
struct ufsmount *ump;
@ -5591,6 +5607,7 @@ newfreefrag(ip, blkno, size, lbn)
freefrag->ff_vtype = ITOV(ip)->v_type;
freefrag->ff_blkno = blkno;
freefrag->ff_fragsize = size;
freefrag->ff_key = key;
if (MOUNTEDSUJ(UFSTOVFS(ump))) {
freefrag->ff_jdep = (struct worklist *)
@ -5636,7 +5653,8 @@ handle_workitem_freefrag(freefrag)
}
FREE_LOCK(ump);
ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
&wkhd, freefrag->ff_key);
ACQUIRE_LOCK(ump);
WORKITEM_FREE(freefrag, D_FREEFRAG);
FREE_LOCK(ump);
@ -5676,7 +5694,22 @@ softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
lbn = bp->b_lblkno;
if (oldblkno && oldblkno != newblkno)
freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
/*
* The usual case is that a smaller fragment that
* was just allocated has been replaced with a bigger
* fragment or a full-size block. If it is marked as
* B_DELWRI, the current contents have not been written
* to disk. It is possible that the block was written
* earlier, but very uncommon. If the block has never
* been written, there is no need to send a BIO_DELETE
* for it when it is freed. The gain from avoiding the
* TRIMs for the common case of unwritten blocks far
* exceeds the cost of the write amplification for the
* uncommon case of failing to send a TRIM for a block
* that had been written.
*/
freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
(bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
else
freefrag = NULL;
@ -5789,7 +5822,8 @@ newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
struct jnewblk *jnewblk;
if (oldblkno)
freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn);
freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
SINGLETON_KEY);
else
freefrag = NULL;
ACQUIRE_LOCK(ITOUMP(ip));
@ -7724,8 +7758,9 @@ free_inodedep(inodedep)
* in memory immediately.
*/
static void
freework_freeblock(freework)
freework_freeblock(freework, key)
struct freework *freework;
u_long key;
{
struct freeblks *freeblks;
struct jnewblk *jnewblk;
@ -7779,10 +7814,10 @@ freework_freeblock(freework)
FREE_LOCK(ump);
freeblks_free(ump, freeblks, btodb(bsize));
CTR4(KTR_SUJ,
"freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
"freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key);
ACQUIRE_LOCK(ump);
/*
* The jnewblk will be discarded and the bits in the map never
@ -7835,7 +7870,7 @@ handle_workitem_indirblk(freework)
return;
}
if (freework->fw_off == NINDIR(fs)) {
freework_freeblock(freework);
freework_freeblock(freework, SINGLETON_KEY);
return;
}
freework->fw_state |= INPROGRESS;
@ -7894,10 +7929,12 @@ handle_workitem_freeblocks(freeblks, flags)
struct allocindir *aip;
struct ufsmount *ump;
struct worklist *wk;
u_long key;
KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
("handle_workitem_freeblocks: Journal entries not written."));
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
ACQUIRE_LOCK(ump);
while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
WORKLIST_REMOVE(wk);
@ -7935,7 +7972,7 @@ handle_workitem_freeblocks(freeblks, flags)
if (freework->fw_lbn <= -UFS_NDADDR)
handle_workitem_indirblk(freework);
else
freework_freeblock(freework);
freework_freeblock(freework, key);
continue;
default:
panic("handle_workitem_freeblocks: Unknown type %s",
@ -7948,6 +7985,7 @@ handle_workitem_freeblocks(freeblks, flags)
freeblks = NULL;
}
FREE_LOCK(ump);
ffs_blkrelease_finish(ump, key);
if (freeblks)
return handle_complete_freeblocks(freeblks, flags);
return (0);
@ -8080,13 +8118,9 @@ indir_trunc(freework, dbn, lbn)
ufs1_daddr_t *bap1;
ufs2_daddr_t nb, nnb, *bap2;
ufs_lbn_t lbnadd, nlbn;
int i, nblocks, ufs1fmt;
int freedblocks;
int goingaway;
int freedeps;
int needj;
int level;
int cnt;
u_long key;
int nblocks, ufs1fmt, freedblocks;
int goingaway, freedeps, needj, level, cnt, i;
freeblks = freework->fw_freeblks;
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
@ -8180,6 +8214,7 @@ indir_trunc(freework, dbn, lbn)
* arranges for the current level to be freed when subordinates
* are free when journaling.
*/
key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
if (i != NINDIR(fs) - 1) {
if (ufs1fmt)
@ -8215,13 +8250,14 @@ indir_trunc(freework, dbn, lbn)
freedeps++;
}
CTR3(KTR_SUJ,
"indir_trunc: ino %d blkno %jd size %ld",
"indir_trunc: ino %jd blkno %jd size %d",
freeblks->fb_inum, nb, fs->fs_bsize);
ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
fs->fs_bsize, freeblks->fb_inum,
freeblks->fb_vtype, &wkhd);
freeblks->fb_vtype, &wkhd, key);
}
}
ffs_blkrelease_finish(ump, key);
if (goingaway) {
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
@ -8244,7 +8280,7 @@ indir_trunc(freework, dbn, lbn)
if (level == 0)
freeblks->fb_cgwait += freedeps;
if (freework->fw_ref == 0)
freework_freeblock(freework);
freework_freeblock(freework, SINGLETON_KEY);
FREE_LOCK(ump);
return;
}
@ -8253,10 +8289,10 @@ indir_trunc(freework, dbn, lbn)
*/
dbn = dbtofsb(fs, dbn);
CTR3(KTR_SUJ,
"indir_trunc 2: ino %d blkno %jd size %ld",
"indir_trunc 2: ino %jd blkno %jd size %d",
freeblks->fb_inum, dbn, fs->fs_bsize);
ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
freeblks->fb_inum, freeblks->fb_vtype, NULL);
freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY);
/* Non SUJ softdep does single-threaded truncations. */
if (freework->fw_blkno == dbn) {
freework->fw_state |= ALLCOMPLETE;

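In the soft-updates code the trim decision has to be captured when the dependency is set up, because the block is not released until the workitem runs much later. The diff does this by recording the key in the freefrag and replaying it when the work item completes; a condensed juxtaposition of the two steps from the hunks above (the lines are not contiguous in the source):

	/* At dependency-setup time: remember whether the old block was ever written. */
	freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
	    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);

	/* Later, in handle_workitem_freefrag(): the saved key drives the trim decision. */
	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
	    freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
	    &wkhd, freefrag->ff_key);
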
View file

@ -978,6 +978,8 @@ ffs_mountfs(devvp, mp, td)
taskqueue_thread_enqueue, &ump->um_trim_tq);
taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS,
"%s trim", mp->mnt_stat.f_mntonname);
ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM,
&ump->um_trimlisthashsize);
}
}
@ -1256,6 +1258,7 @@ ffs_unmount(mp, mntflags)
pause("ufsutr", hz);
taskqueue_drain_all(ump->um_trim_tq);
taskqueue_free(ump->um_trim_tq);
free(ump->um_trimhash, M_TRIM);
}
g_topology_lock();
if (ump->um_fsckpid > 0) {

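The per-mount hash table created above gives the TRIM layer a place to look up pending requests by key. hashinit(MAXTRIMIO, M_TRIM, &ump->um_trimlisthashsize) returns a power-of-two-sized table and stores the size minus one in um_trimlisthashsize, which is why the TRIMLIST_HASH() macro in ffs_alloc.c can mask the key directly. The lookup side is not part of the hunks shown here; the sketch below uses only the macro and types introduced in this commit, and the list-entry and key field names (hashlist, key) are assumptions for illustration:

	struct trimlist_hashhead *thp;
	struct ffs_blkfree_trim_params *tp;

	/* Select the bucket for this key; um_trimlisthashsize is the mask. */
	thp = TRIMLIST_HASH(ump, key);
	LIST_FOREACH(tp, thp, hashlist)		/* "hashlist" link name assumed */
		if (tp->key == key)		/* "key" field name assumed */
			break;
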
View file

@ -557,6 +557,7 @@ struct freefrag {
long ff_fragsize; /* size of fragment being deleted */
ino_t ff_inum; /* owning inode number */
enum vtype ff_vtype; /* owning inode's file type */
u_long ff_key; /* trim key when deleted */
};
/*

View file

@ -47,6 +47,7 @@ struct ufs_args {
#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_UFSMNT);
MALLOC_DECLARE(M_TRIM);
#endif
struct buf;
@ -63,6 +64,7 @@ struct inodedep;
TAILQ_HEAD(inodedeplst, inodedep);
LIST_HEAD(bmsafemaphd, bmsafemap);
LIST_HEAD(trimlist_hashhead, ffs_blkfree_trim_params);
/*
* This structure describes the UFS specific mount structure data.
@ -70,7 +72,6 @@ LIST_HEAD(bmsafemaphd, bmsafemap);
* UFS (UFS1, UFS2, etc).
*
* Lock reference:
* a - atomic operations
* c - set at allocation then constant until freed
* i - ufsmount interlock (UFS_LOCK / UFS_UNLOCK)
* q - associated quota file is locked
@ -99,8 +100,13 @@ struct ufsmount {
char um_qflags[MAXQUOTAS]; /* (i) quota specific flags */
int64_t um_savedmaxfilesize; /* (c) track maxfilesize */
u_int um_flags; /* (i) filesystem flags */
u_int um_trim_inflight; /* (a) outstanding trim count */
u_int um_trim_inflight; /* (i) outstanding trim count */
u_int um_trim_inflight_blks; /* (i) outstanding trim blks */
u_long um_trim_total; /* (i) total trim count */
u_long um_trim_total_blks; /* (i) total trim block count */
struct taskqueue *um_trim_tq; /* (c) trim request queue */
struct trimlist_hashhead *um_trimhash; /* (i) trimlist hash table */
u_long um_trimlisthashsize; /* (i) trim hash table size-1 */
/* (c) - below function ptrs */
int (*um_balloc)(struct vnode *, off_t, int, struct ucred *,
int, struct buf **);