From 7e038bc257e9c5f7563695b88b481e493a33576f Mon Sep 17 00:00:00 2001 From: Kirk McKusick Date: Sat, 18 Aug 2018 22:21:59 +0000 Subject: [PATCH] Replace the TRIM consolodation framework originally added in -r337396 driven by problems found with the algorithms being tested for TRIM consolodation. Reported by: Peter Holm Suggested by: kib Reviewed by: kib Sponsored by: Netflix --- sys/ufs/ffs/ffs_alloc.c | 148 ++++++++++++++++++++++++++++++------- sys/ufs/ffs/ffs_balloc.c | 4 +- sys/ufs/ffs/ffs_extern.h | 20 ++++- sys/ufs/ffs/ffs_inode.c | 21 ++++-- sys/ufs/ffs/ffs_snapshot.c | 6 +- sys/ufs/ffs/ffs_softdep.c | 84 +++++++++++++++------ sys/ufs/ffs/ffs_vfsops.c | 3 + sys/ufs/ffs/softdep.h | 1 + sys/ufs/ufs/ufsmount.h | 10 ++- 9 files changed, 232 insertions(+), 65 deletions(-) diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 1e1b4f1350a1..1267de701dec 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -110,8 +110,6 @@ static ufs2_daddr_t static void ffs_blkfree_cg(struct ufsmount *, struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, struct workhead *); -static void ffs_blkfree_trim_completed(struct buf *); -static void ffs_blkfree_trim_task(void *ctx, int pending __unused); #ifdef INVARIANTS static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); #endif @@ -395,8 +393,24 @@ ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) + /* + * The usual case is that a smaller fragment that + * was just allocated has been replaced with a bigger + * fragment or a full-size block. If it is marked as + * B_DELWRI, the current contents have not been written + * to disk. It is possible that the block was written + * earlier, but very uncommon. If the block has never + * been written, there is no need to send a BIO_DELETE + * for it when it is freed. The gain from avoiding the + * TRIMs for the common case of unwritten blocks far + * exceeds the cost of the write amplification for the + * uncommon case of failing to send a TRIM for a block + * that had been written. + */ ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, - ip->i_number, vp->v_type, NULL); + ip->i_number, vp->v_type, NULL, + (bp->b_flags & B_DELWRI) != 0 ? + NOTRIM_KEY : SINGLETON_KEY); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) @@ -521,7 +535,7 @@ ffs_reallocblks_ufs1(ap) struct fs *fs; struct inode *ip; struct vnode *vp; - struct buf *sbp, *ebp; + struct buf *sbp, *ebp, *bp; ufs1_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; @@ -730,14 +744,30 @@ ffs_reallocblks_ufs1(ap) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { + bp = buflist->bs_children[i]; if (!DOINGSOFTDEP(vp)) + /* + * The usual case is that a set of N-contiguous blocks + * that was just allocated has been replaced with a + * set of N+1-contiguous blocks. If they are marked as + * B_DELWRI, the current contents have not been written + * to disk. It is possible that the blocks were written + * earlier, but very uncommon. If the blocks have never + * been written, there is no need to send a BIO_DELETE + * for them when they are freed. The gain from avoiding + * the TRIMs for the common case of unwritten blocks + * far exceeds the cost of the write amplification for + * the uncommon case of failing to send a TRIM for the + * blocks that had been written. + */ ffs_blkfree(ump, fs, ump->um_devvp, - dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number, vp->v_type, NULL); - buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); + dbtofsb(fs, bp->b_blkno), + fs->fs_bsize, ip->i_number, vp->v_type, NULL, + (bp->b_flags & B_DELWRI) != 0 ? + NOTRIM_KEY : SINGLETON_KEY); + bp->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS - if (!ffs_checkblk(ip, - dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) + if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DEBUG @@ -771,7 +801,7 @@ ffs_reallocblks_ufs2(ap) struct fs *fs; struct inode *ip; struct vnode *vp; - struct buf *sbp, *ebp; + struct buf *sbp, *ebp, *bp; ufs2_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; @@ -978,14 +1008,30 @@ ffs_reallocblks_ufs2(ap) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { + bp = buflist->bs_children[i]; if (!DOINGSOFTDEP(vp)) + /* + * The usual case is that a set of N-contiguous blocks + * that was just allocated has been replaced with a + * set of N+1-contiguous blocks. If they are marked as + * B_DELWRI, the current contents have not been written + * to disk. It is possible that the blocks were written + * earlier, but very uncommon. If the blocks have never + * been written, there is no need to send a BIO_DELETE + * for them when they are freed. The gain from avoiding + * the TRIMs for the common case of unwritten blocks + * far exceeds the cost of the write amplification for + * the uncommon case of failing to send a TRIM for the + * blocks that had been written. + */ ffs_blkfree(ump, fs, ump->um_devvp, - dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number, vp->v_type, NULL); - buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); + dbtofsb(fs, bp->b_blkno), + fs->fs_bsize, ip->i_number, vp->v_type, NULL, + (bp->b_flags & B_DELWRI) != 0 ? + NOTRIM_KEY : SINGLETON_KEY); + bp->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS - if (!ffs_checkblk(ip, - dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) + if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DEBUG @@ -1823,8 +1869,7 @@ ffs_alloccgblk(ip, bp, bpref, size) /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, - size, 0); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); UFS_LOCK(ump); return (blkno); } @@ -2254,6 +2299,17 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd) bdwrite(bp); } +/* + * Structures and routines associated with trim management. + */ +MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures"); + +#define TRIMLIST_HASH(ump, key) \ + (&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize]) + +static void ffs_blkfree_trim_completed(struct buf *); +static void ffs_blkfree_trim_task(void *ctx, int pending __unused); + struct ffs_blkfree_trim_params { struct task task; struct ufsmount *ump; @@ -2277,7 +2333,7 @@ ffs_blkfree_trim_task(ctx, pending) tp->inum, tp->pdephd); vn_finished_secondary_write(UFSTOVFS(tp->ump)); atomic_add_int(&tp->ump->um_trim_inflight, -1); - free(tp, M_TEMP); + free(tp, M_TRIM); } static void @@ -2287,13 +2343,45 @@ ffs_blkfree_trim_completed(bp) struct ffs_blkfree_trim_params *tp; tp = bp->b_fsprivate1; - free(bp, M_TEMP); + free(bp, M_TRIM); TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); } +/* + * Allocate a new key to use to identify a range of blocks. + */ +u_long +ffs_blkrelease_start(ump, devvp, inum) + struct ufsmount *ump; + struct vnode *devvp; + ino_t inum; +{ + static u_long masterkey; + u_long key; + + if ((ump->um_flags & UM_CANDELETE) == 0) + return (SINGLETON_KEY); + do { + key = atomic_fetchadd_long(&masterkey, 1); + } while (key < FIRST_VALID_KEY); + return (key); +} + +/* + * Deallocate a key that has been used to identify a range of blocks. + */ void -ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) +ffs_blkrelease_finish(ump, key) + struct ufsmount *ump; + u_long key; +{ + + return; +} + +void +ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, key) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; @@ -2302,6 +2390,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) ino_t inum; enum vtype vtype; struct workhead *dephd; + u_long key; { struct mount *mp; struct buf *bp; @@ -2319,10 +2408,11 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) return; } /* - * Nothing to delay if TRIM is disabled, or the operation is - * performed on the snapshot. + * Nothing to delay if TRIM is not required for this block or TRIM + * is disabled or the operation is performed on a snapshot. */ - if (((ump->um_flags) & UM_CANDELETE) == 0 || devvp->v_type == VREG) { + if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) || + devvp->v_type == VREG) { ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); return; } @@ -2334,7 +2424,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) * and write some new data into it. */ atomic_add_int(&ump->um_trim_inflight, 1); - tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK); + tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK); tp->ump = ump; tp->devvp = devvp; tp->bno = bno; @@ -2347,7 +2437,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) } else tp->pdephd = NULL; - bp = malloc(sizeof(*bp), M_TEMP, M_WAITOK | M_ZERO); + bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); bp->b_iocmd = BIO_DELETE; bp->b_iooffset = dbtob(fsbtodb(fs, bno)); bp->b_iodone = ffs_blkfree_trim_completed; @@ -2822,6 +2912,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) struct fs *fs; ufs2_daddr_t blkno; long blkcnt, blksize; + u_long key; struct file *fp, *vfp; cap_rights_t rights; int filetype, error; @@ -2956,15 +3047,18 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) blkno = cmd.value; blkcnt = cmd.size; blksize = fs->fs_frag - (blkno % fs->fs_frag); + key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO); while (blkcnt > 0) { - if (blksize > blkcnt) + if (blkcnt < blksize) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, - blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL); + blksize * fs->fs_fsize, UFS_ROOTINO, + VDIR, NULL, key); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; } + ffs_blkrelease_finish(ump, key); break; /* diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 6143b4fca8c0..ad41f316000a 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -553,7 +553,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, lbns_remfree++; #endif ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, - ip->i_number, vp->v_type, NULL); + ip->i_number, vp->v_type, NULL, SINGLETON_KEY); } return (error); } @@ -1147,7 +1147,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, lbns_remfree++; #endif ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, - ip->i_number, vp->v_type, NULL); + ip->i_number, vp->v_type, NULL, SINGLETON_KEY); } return (error); } diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index 2df48ec91de9..6b8e708f7371 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -63,9 +63,11 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size, struct ucred *a_cred, int a_flags, struct buf **a_bpp); int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **); void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *, - ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *); + ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *, u_long); ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *); ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *); +void ffs_blkrelease_finish(struct ufsmount *, u_long); +u_long ffs_blkrelease_start(struct ufsmount *, struct vnode *, ino_t); int ffs_checkfreefile(struct fs *, struct vnode *, ino_t); void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); @@ -111,11 +113,27 @@ vfs_vget_t ffs_vget; int ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int); void process_deferred_inactive(struct mount *mp); +/* + * Flags to ffs_vgetf + */ #define FFSV_FORCEINSMQ 0x0001 +/* + * Flags to ffs_reload + */ #define FFSR_FORCE 0x0001 #define FFSR_UNSUSPEND 0x0002 +/* + * Definitions for TRIM interface + * + * Special keys and recommended hash table size + */ +#define NOTRIM_KEY 1 /* never written, so don't call trim for it */ +#define SINGLETON_KEY 2 /* only block being freed, so trim it now */ +#define FIRST_VALID_KEY 3 /* first valid key describing a block range */ +#define MAXTRIMIO 1024 /* maximum expected outstanding trim requests */ + extern struct vop_vector ffs_vnodeops1; extern struct vop_vector ffs_fifoops1; extern struct vop_vector ffs_vnodeops2; diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 6a26ef97189a..0ec662179880 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -197,6 +197,7 @@ ffs_truncate(vp, length, flags, cred) int needextclean, extblocks; int offset, size, level, nblocks; int i, error, allerror, indiroff, waitforupdate; + u_long key; off_t osize; ip = VTOI(vp); @@ -275,7 +276,7 @@ ffs_truncate(vp, length, flags, cred) continue; ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i], sblksize(fs, osize, i), ip->i_number, - vp->v_type, NULL); + vp->v_type, NULL, SINGLETON_KEY); } } } @@ -523,7 +524,7 @@ ffs_truncate(vp, length, flags, cred) DIP_SET(ip, i_ib[level], 0); ffs_blkfree(ump, fs, ump->um_devvp, bn, fs->fs_bsize, ip->i_number, - vp->v_type, NULL); + vp->v_type, NULL, SINGLETON_KEY); blocksreleased += nblocks; } } @@ -534,6 +535,7 @@ ffs_truncate(vp, length, flags, cred) /* * All whole direct blocks or frags. */ + key = ffs_blkrelease_start(ump, ump->um_devvp, ip->i_number); for (i = UFS_NDADDR - 1; i > lastblock; i--) { long bsize; @@ -543,9 +545,10 @@ ffs_truncate(vp, length, flags, cred) DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number, - vp->v_type, NULL); + vp->v_type, NULL, key); blocksreleased += btodb(bsize); } + ffs_blkrelease_finish(ump, key); if (lastblock < 0) goto done; @@ -575,7 +578,8 @@ ffs_truncate(vp, length, flags, cred) */ bn += numfrags(fs, newspace); ffs_blkfree(ump, fs, ump->um_devvp, bn, - oldspace - newspace, ip->i_number, vp->v_type, NULL); + oldspace - newspace, ip->i_number, vp->v_type, + NULL, SINGLETON_KEY); blocksreleased += btodb(oldspace - newspace); } } @@ -634,8 +638,10 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) { struct buf *bp; struct fs *fs; + struct ufsmount *ump; struct vnode *vp; caddr_t copy = NULL; + u_long key; int i, nblocks, error = 0, allerror = 0; ufs2_daddr_t nb, nlbn, last; ufs2_daddr_t blkcount, factor, blocksreleased = 0; @@ -644,6 +650,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) #define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i]) fs = ITOFS(ip); + ump = ITOUMP(ip); /* * Calculate index in current block of last @@ -719,6 +726,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) /* * Recursively free totally unused blocks. */ + key = ffs_blkrelease_start(ump, ITODEVVP(ip), ip->i_number); for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = BAP(ip, i); @@ -730,10 +738,11 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) allerror = error; blocksreleased += blkcount; } - ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize, - ip->i_number, vp->v_type, NULL); + ffs_blkfree(ump, fs, ITODEVVP(ip), nb, fs->fs_bsize, + ip->i_number, vp->v_type, NULL, key); blocksreleased += nblocks; } + ffs_blkrelease_finish(ump, key); /* * Recursively free last partial block. diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c index fed0456b13cb..4453c59517df 100644 --- a/sys/ufs/ffs/ffs_snapshot.c +++ b/sys/ufs/ffs/ffs_snapshot.c @@ -583,7 +583,7 @@ ffs_snapshot(mp, snapfile) if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number, - xvp->v_type, NULL); + xvp->v_type, NULL, SINGLETON_KEY); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } @@ -1265,7 +1265,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, - vp->v_type, NULL); + vp->v_type, NULL, SINGLETON_KEY); } return (0); } @@ -1549,7 +1549,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, - vp->v_type, NULL); + vp->v_type, NULL, SINGLETON_KEY); } return (0); } diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 4943555198db..89d0b7382e4d 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -869,7 +869,7 @@ static void cancel_allocdirect(struct allocdirectlst *, struct allocdirect *, struct freeblks *); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); -static void freework_freeblock(struct freework *); +static void freework_freeblock(struct freework *, u_long); static void freework_enqueue(struct freework *); static int handle_workitem_freeblocks(struct freeblks *, int); static int handle_complete_freeblocks(struct freeblks *, int); @@ -884,7 +884,7 @@ static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, ufs2_daddr_t, ufs_lbn_t); static void handle_workitem_freefrag(struct freefrag *); static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, - ufs_lbn_t); + ufs_lbn_t, u_long); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); static struct freefrag *allocindir_merge(struct allocindir *, @@ -5289,7 +5289,22 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocdirect called on non-softdep filesystem")); if (oldblkno && oldblkno != newblkno) - freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); + /* + * The usual case is that a smaller fragment that + * was just allocated has been replaced with a bigger + * fragment or a full-size block. If it is marked as + * B_DELWRI, the current contents have not been written + * to disk. It is possible that the block was written + * earlier, but very uncommon. If the block has never + * been written, there is no need to send a BIO_DELETE + * for it when it is freed. The gain from avoiding the + * TRIMs for the common case of unwritten blocks far + * exceeds the cost of the write amplification for the + * uncommon case of failing to send a TRIM for a block + * that had been written. + */ + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn, + (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); else freefrag = NULL; @@ -5566,11 +5581,12 @@ newjfreefrag(freefrag, ip, blkno, size, lbn) * Allocate a new freefrag structure. */ static struct freefrag * -newfreefrag(ip, blkno, size, lbn) +newfreefrag(ip, blkno, size, lbn, key) struct inode *ip; ufs2_daddr_t blkno; long size; ufs_lbn_t lbn; + u_long key; { struct freefrag *freefrag; struct ufsmount *ump; @@ -5591,6 +5607,7 @@ newfreefrag(ip, blkno, size, lbn) freefrag->ff_vtype = ITOV(ip)->v_type; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; + freefrag->ff_key = key; if (MOUNTEDSUJ(UFSTOVFS(ump))) { freefrag->ff_jdep = (struct worklist *) @@ -5636,7 +5653,8 @@ handle_workitem_freefrag(freefrag) } FREE_LOCK(ump); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, - freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd); + freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, + &wkhd, freefrag->ff_key); ACQUIRE_LOCK(ump); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(ump); @@ -5676,7 +5694,22 @@ softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) lbn = bp->b_lblkno; if (oldblkno && oldblkno != newblkno) - freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); + /* + * The usual case is that a smaller fragment that + * was just allocated has been replaced with a bigger + * fragment or a full-size block. If it is marked as + * B_DELWRI, the current contents have not been written + * to disk. It is possible that the block was written + * earlier, but very uncommon. If the block has never + * been written, there is no need to send a BIO_DELETE + * for it when it is freed. The gain from avoiding the + * TRIMs for the common case of unwritten blocks far + * exceeds the cost of the write amplification for the + * uncommon case of failing to send a TRIM for a block + * that had been written. + */ + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn, + (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); else freefrag = NULL; @@ -5789,7 +5822,8 @@ newallocindir(ip, ptrno, newblkno, oldblkno, lbn) struct jnewblk *jnewblk; if (oldblkno) - freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn); + freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn, + SINGLETON_KEY); else freefrag = NULL; ACQUIRE_LOCK(ITOUMP(ip)); @@ -7724,8 +7758,9 @@ free_inodedep(inodedep) * in memory immediately. */ static void -freework_freeblock(freework) +freework_freeblock(freework, key) struct freework *freework; + u_long key; { struct freeblks *freeblks; struct jnewblk *jnewblk; @@ -7779,10 +7814,10 @@ freework_freeblock(freework) FREE_LOCK(ump); freeblks_free(ump, freeblks, btodb(bsize)); CTR4(KTR_SUJ, - "freework_freeblock: ino %d blkno %jd lbn %jd size %ld", + "freework_freeblock: ino %jd blkno %jd lbn %jd size %d", freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, - freeblks->fb_inum, freeblks->fb_vtype, &wkhd); + freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key); ACQUIRE_LOCK(ump); /* * The jnewblk will be discarded and the bits in the map never @@ -7835,7 +7870,7 @@ handle_workitem_indirblk(freework) return; } if (freework->fw_off == NINDIR(fs)) { - freework_freeblock(freework); + freework_freeblock(freework, SINGLETON_KEY); return; } freework->fw_state |= INPROGRESS; @@ -7894,10 +7929,12 @@ handle_workitem_freeblocks(freeblks, flags) struct allocindir *aip; struct ufsmount *ump; struct worklist *wk; + u_long key; KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), ("handle_workitem_freeblocks: Journal entries not written.")); ump = VFSTOUFS(freeblks->fb_list.wk_mp); + key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { WORKLIST_REMOVE(wk); @@ -7935,7 +7972,7 @@ handle_workitem_freeblocks(freeblks, flags) if (freework->fw_lbn <= -UFS_NDADDR) handle_workitem_indirblk(freework); else - freework_freeblock(freework); + freework_freeblock(freework, key); continue; default: panic("handle_workitem_freeblocks: Unknown type %s", @@ -7948,6 +7985,7 @@ handle_workitem_freeblocks(freeblks, flags) freeblks = NULL; } FREE_LOCK(ump); + ffs_blkrelease_finish(ump, key); if (freeblks) return handle_complete_freeblocks(freeblks, flags); return (0); @@ -8080,13 +8118,9 @@ indir_trunc(freework, dbn, lbn) ufs1_daddr_t *bap1; ufs2_daddr_t nb, nnb, *bap2; ufs_lbn_t lbnadd, nlbn; - int i, nblocks, ufs1fmt; - int freedblocks; - int goingaway; - int freedeps; - int needj; - int level; - int cnt; + u_long key; + int nblocks, ufs1fmt, freedblocks; + int goingaway, freedeps, needj, level, cnt, i; freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); @@ -8180,6 +8214,7 @@ indir_trunc(freework, dbn, lbn) * arranges for the current level to be freed when subordinates * are free when journaling. */ + key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum); for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { if (i != NINDIR(fs) - 1) { if (ufs1fmt) @@ -8215,13 +8250,14 @@ indir_trunc(freework, dbn, lbn) freedeps++; } CTR3(KTR_SUJ, - "indir_trunc: ino %d blkno %jd size %ld", + "indir_trunc: ino %jd blkno %jd size %d", freeblks->fb_inum, nb, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, freeblks->fb_inum, - freeblks->fb_vtype, &wkhd); + freeblks->fb_vtype, &wkhd, key); } } + ffs_blkrelease_finish(ump, key); if (goingaway) { bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); @@ -8244,7 +8280,7 @@ indir_trunc(freework, dbn, lbn) if (level == 0) freeblks->fb_cgwait += freedeps; if (freework->fw_ref == 0) - freework_freeblock(freework); + freework_freeblock(freework, SINGLETON_KEY); FREE_LOCK(ump); return; } @@ -8253,10 +8289,10 @@ indir_trunc(freework, dbn, lbn) */ dbn = dbtofsb(fs, dbn); CTR3(KTR_SUJ, - "indir_trunc 2: ino %d blkno %jd size %ld", + "indir_trunc 2: ino %jd blkno %jd size %d", freeblks->fb_inum, dbn, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, - freeblks->fb_inum, freeblks->fb_vtype, NULL); + freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY); /* Non SUJ softdep does single-threaded truncations. */ if (freework->fw_blkno == dbn) { freework->fw_state |= ALLCOMPLETE; diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 9ed5c58f7b0e..e3927327c79c 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -978,6 +978,8 @@ ffs_mountfs(devvp, mp, td) taskqueue_thread_enqueue, &ump->um_trim_tq); taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, "%s trim", mp->mnt_stat.f_mntonname); + ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM, + &ump->um_trimlisthashsize); } } @@ -1256,6 +1258,7 @@ ffs_unmount(mp, mntflags) pause("ufsutr", hz); taskqueue_drain_all(ump->um_trim_tq); taskqueue_free(ump->um_trim_tq); + free (ump->um_trimhash, M_TRIM); } g_topology_lock(); if (ump->um_fsckpid > 0) { diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h index 707429fe68c6..1e2946ab9fb9 100644 --- a/sys/ufs/ffs/softdep.h +++ b/sys/ufs/ffs/softdep.h @@ -557,6 +557,7 @@ struct freefrag { long ff_fragsize; /* size of fragment being deleted */ ino_t ff_inum; /* owning inode number */ enum vtype ff_vtype; /* owning inode's file type */ + int ff_key; /* trim key when deleted */ }; /* diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h index 1958b02f6abb..0652e6a72930 100644 --- a/sys/ufs/ufs/ufsmount.h +++ b/sys/ufs/ufs/ufsmount.h @@ -47,6 +47,7 @@ struct ufs_args { #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_UFSMNT); +MALLOC_DECLARE(M_TRIM); #endif struct buf; @@ -63,6 +64,7 @@ struct inodedep; TAILQ_HEAD(inodedeplst, inodedep); LIST_HEAD(bmsafemaphd, bmsafemap); +LIST_HEAD(trimlist_hashhead, ffs_blkfree_trim_params); /* * This structure describes the UFS specific mount structure data. @@ -70,7 +72,6 @@ LIST_HEAD(bmsafemaphd, bmsafemap); * UFS (UFS1, UFS2, etc). * * Lock reference: - * a - atomic operations * c - set at allocation then constant until freed * i - ufsmount interlock (UFS_LOCK / UFS_UNLOCK) * q - associated quota file is locked @@ -99,8 +100,13 @@ struct ufsmount { char um_qflags[MAXQUOTAS]; /* (i) quota specific flags */ int64_t um_savedmaxfilesize; /* (c) track maxfilesize */ u_int um_flags; /* (i) filesystem flags */ - u_int um_trim_inflight; /* (a) outstanding trim count */ + u_int um_trim_inflight; /* (i) outstanding trim count */ + u_int um_trim_inflight_blks; /* (i) outstanding trim blks */ + u_long um_trim_total; /* (i) total trim count */ + u_long um_trim_total_blks; /* (i) total trim block count */ struct taskqueue *um_trim_tq; /* (c) trim request queue */ + struct trimlist_hashhead *um_trimhash; /* (i) trimlist hash table */ + u_long um_trimlisthashsize; /* (i) trim hash table size-1 */ /* (c) - below function ptrs */ int (*um_balloc)(struct vnode *, off_t, int, struct ucred *, int, struct buf **);