Improvement in UFS/FFS directory placement when doing mkdir(2).

The algorithm for laying out new directories was devised in the 1980s
and markedly improved the performance of the filesystem. In those days
large disks had at most 100 cylinder groups and often as few as 10-20.
Modern multi-terabyte disks have thousands of cylinder groups. The
original algorithm does not handle these large sizes well. This change
attempts to expand the scope of the original algorithm to work well
with these much larger disks while still retaining the properties
of the original algorithm for small disks.

The filesystem implementation is divided into policy routines and
implementation routines. The policy routines can be changed in any
way desired without risk of corrupting the filesystem. The policy
requests are handled by the implementation layer. If the policy
asks for an available resource, it is granted. But if it asks for
an already in-use resource, then the implementation will provide
an available one near the request. Thus it is impossible for a
policy to double allocate. This change is limited to the policy
implementation.

This change updates the ffs_dirpref() routine which is responsible
for selecting the cylinder group into which a new directory should
be placed. If we are near the root of the filesystem we aim to
spread directories out as much as possible. As we descend deeper from the
root we cluster them closer together around their parent as we
expect them to be more closely interactive. Higher-level directories
like usr/src/sys and usr/src/bin should be separated while the
directories in these areas are more likely to be accessed together
so should be closer. And directories within commands or kernel
subsystems should be closer still.

We pick a range of cylinder groups around the cylinder group of the
directory in which we are being created. The size of the range for
our search is based on our depth from the root of our filesystem.
We then probe that range based on how many directories are already
present. The first new directory is at 1/2 (middle) of the range;
the second is in the first 1/4 of the range, then at 3/4, 1/8, 3/8,
5/8, 7/8, 1/16, 3/16, 5/16, etc.

It is desirable to store the depth of a directory in its on-disk
inode so that it is available when we need it. We add a new field
di_dirdepth to track the depth of each directory. Because there are
few spare fields left in the inode, we choose to share an existing
field in the inode rather than having one of our own. Specifically
we create a union with the di_freelink field. The di_freelink field
is used to track inodes that have been unlinked but remain referenced.
It is not needed until a rmdir(2) operation has been done on a
directory. At that point, the directory has no contents and, even
if it is kept active as a current directory, is no longer able to
have any new directories or files created in it. Thus the use of
di_dirdepth and di_freelink will never coincide.

Reported by:  Timo Voelker
Reviewed by:  kib
Tested by:    Peter Holm
MFC after:    2 weeks
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D39246
This commit is contained in:
Kirk McKusick 2023-03-29 21:09:39 -07:00
parent 1fb7d2cf99
commit fe5e6e2cc5
12 changed files with 203 additions and 61 deletions

View file

@ -88,12 +88,97 @@ propagate(void)
if (inoinfo(inp->i_parent)->ino_state == DFOUND &&
INO_IS_DUNFOUND(inp->i_number)) {
inoinfo(inp->i_number)->ino_state = DFOUND;
check_dirdepth(inp);
change++;
}
}
} while (change > 0);
}
/*
 * Check that the recorded depth of the directory is correct.
 *
 * The expected depth of a directory is the depth recorded for its
 * parent plus one. When the depth cached in inp differs from that,
 * the cached value and the di_dirdepth field of the directory's inode
 * are both set to the expected value and the inode is passed to
 * inodirty(). A directory with a recorded depth of 0 is taken to mean
 * that the filesystem has not been tracking depths; such directories
 * are only updated once the conversion has been approved.
 *
 * inp is the cached directory information for the directory to check;
 * its i_parent field must identify the parent directory.
 */
void
check_dirdepth(struct inoinfo *inp)
{
	struct inoinfo *parentinp;
	struct inode ip;
	union dinode *dp;
	int saveresolved;
	/*
	 * Static so that the conversion question is asked only once per
	 * run and its answer is remembered for every later directory.
	 */
	static int updateasked, dirdepthupdate;

	if ((parentinp = getinoinfo(inp->i_parent)) == NULL) {
		pfatal("check_dirdepth: UNKNOWN PARENT DIR");
		return;
	}
	/*
	 * If depth is correct, nothing to do.
	 */
	if (parentinp->i_depth + 1 == inp->i_depth)
		return;
	/*
	 * Only the root inode should have depth of 0, so if any other
	 * directory has a depth of 0 then this is an old filesystem
	 * that has not been tracking directory depth. Ask just once
	 * whether it should start tracking directory depth.
	 */
	if (inp->i_depth == 0 && updateasked == 0) {
		updateasked = 1;
		if (preen) {
			pwarn("UPDATING FILESYSTEM TO TRACK DIRECTORY DEPTH");
			dirdepthupdate = 1;
		} else {
			/*
			 * The file system can be marked clean even if
			 * a directory does not have the right depth.
			 * Hence, resolved should not be cleared when
			 * the filesystem does not update directory depths.
			 */
			saveresolved = resolved;
			dirdepthupdate =
			    reply("UPDATE FILESYSTEM TO TRACK DIRECTORY DEPTH");
			resolved = saveresolved;
		}
	}
	/*
	 * If we are not converting, nothing more to do.
	 */
	if (inp->i_depth == 0 && dirdepthupdate == 0)
		return;
	/*
	 * Individual directory at wrong depth. Report it and correct if
	 * in preen mode or ask if in interactive mode. Note that if a
	 * directory is renamed to a new location that is at a different
	 * level in the tree, its depth will be recalculated, but none of
	 * the directories that it contains will be updated. Thus it is
	 * not unexpected to find directories with incorrect depths. No
	 * operational harm will come from this though new directory
	 * placement in the subtree may not be as optimal until the depths
	 * of the affected directories are corrected.
	 *
	 * To avoid much spurious output on otherwise clean filesystems
	 * we only generate detailed output when the debug flag is given.
	 * Without the debug flag the depth is corrected silently.
	 */
	ginode(inp->i_number, &ip);
	dp = ip.i_dp;
	if (inp->i_depth != 0 && debug) {
		pwarn("DIRECTORY");
		prtinode(&ip);
		/*
		 * i_depth is unsigned, so print it with %u; a garbage
		 * value read from a damaged inode must not be shown as
		 * a negative depth.
		 */
		printf(" DEPTH %u SHOULD BE %u", inp->i_depth,
		    parentinp->i_depth + 1);
		if (preen == 0 && reply("ADJUST") == 0) {
			/* Operator declined: release the inode unchanged. */
			irelse(&ip);
			return;
		}
		if (preen)
			printf(" (ADJUSTED)\n");
	}
	/* Record the corrected depth both in the cache and in the inode. */
	inp->i_depth = parentinp->i_depth + 1;
	DIP_SET(dp, di_dirdepth, inp->i_depth);
	inodirty(&ip);
	irelse(&ip);
}
/*
* Scan each entry in a directory block.
*/
@ -471,7 +556,7 @@ linkup(ino_t orphan, ino_t parentdir, char *name)
{
struct inode ip;
union dinode *dp;
int lostdir;
int lostdir, depth;
ino_t oldlfdir;
struct inoinfo *inp;
struct inodesc idesc;
@ -546,7 +631,7 @@ linkup(ino_t orphan, ino_t parentdir, char *name)
irelse(&ip);
return (0);
}
if ((changeino(UFS_ROOTINO, lfname, lfdir) & ALTERED) == 0) {
if ((changeino(UFS_ROOTINO, lfname, lfdir, 1) & ALTERED) == 0) {
pfatal("SORRY. CANNOT CREATE lost+found DIRECTORY\n\n");
irelse(&ip);
return (0);
@ -575,7 +660,8 @@ linkup(ino_t orphan, ino_t parentdir, char *name)
}
inoinfo(orphan)->ino_linkcnt--;
if (lostdir) {
if ((changeino(orphan, "..", lfdir) & ALTERED) == 0 &&
depth = DIP(dp, di_dirdepth) + 1;
if ((changeino(orphan, "..", lfdir, depth) & ALTERED) == 0 &&
parentdir != (ino_t)-1)
(void)makeentry(orphan, lfdir, "..");
DIP_SET(dp, di_nlink, DIP(dp, di_nlink) + 1);
@ -607,7 +693,7 @@ linkup(ino_t orphan, ino_t parentdir, char *name)
* fix an entry in a directory.
*/
int
changeino(ino_t dir, const char *name, ino_t newnum)
changeino(ino_t dir, const char *name, ino_t newnum, int depth)
{
struct inodesc idesc;
struct inode ip;
@ -621,7 +707,10 @@ changeino(ino_t dir, const char *name, ino_t newnum)
idesc.id_name = strdup(name);
idesc.id_parent = newnum; /* new value for name */
ginode(dir, &ip);
error = ckinode(ip.i_dp, &idesc);
if (((error = ckinode(ip.i_dp, &idesc)) & ALTERED) && newnum != 0) {
DIP_SET(ip.i_dp, di_dirdepth, depth);
getinoinfo(dir)->i_depth = depth;
}
free(idesc.id_name);
irelse(&ip);
return (error);
@ -815,8 +904,8 @@ allocdir(ino_t parent, ino_t request, int mode)
struct inode ip;
union dinode *dp;
struct bufarea *bp;
struct inoinfo *inp;
struct dirtemplate *dirp;
struct inoinfo *inp, *parentinp;
ino = allocino(request, IFDIR|mode);
if (ino == 0)
@ -859,6 +948,12 @@ allocdir(ino_t parent, ino_t request, int mode)
inp->i_parent = parent;
inp->i_dotdot = parent;
inp->i_flags |= INFO_NEW;
if ((parentinp = getinoinfo(inp->i_parent)) == NULL) {
pfatal("allocdir: UNKNOWN PARENT DIR");
} else {
inp->i_depth = parentinp->i_depth + 1;
DIP_SET(dp, di_dirdepth, inp->i_depth);
}
inoinfo(ino)->ino_type = DT_DIR;
inoinfo(ino)->ino_state = inoinfo(parent)->ino_state;
if (inoinfo(ino)->ino_state == DSTATE) {

View file

@ -309,6 +309,7 @@ struct inoinfo {
ino_t i_parent; /* inode number of parent */
ino_t i_dotdot; /* inode number of `..' */
size_t i_isize; /* size of inode */
u_int i_depth; /* depth of directory from root */
u_int i_flags; /* flags, see below */
u_int i_numblks; /* size of block array in bytes */
ufs2_daddr_t i_blks[1]; /* actually longer */
@ -462,9 +463,10 @@ void catch(int);
void catchquit(int);
void cgdirty(struct bufarea *);
struct bufarea *cglookup(int cg);
int changeino(ino_t dir, const char *name, ino_t newnum);
int changeino(ino_t dir, const char *name, ino_t newnum, int depth);
void check_blkcnt(struct inode *ip);
int check_cgmagic(int cg, struct bufarea *cgbp, int requestrebuild);
void check_dirdepth(struct inoinfo *inp);
int chkrange(ufs2_daddr_t blk, int cnt);
void ckfini(int markclean);
int ckinode(union dinode *dp, struct inodesc *);

View file

@ -1135,6 +1135,7 @@ cacheino(union dinode *dp, ino_t inumber)
inp->i_dotdot = (ino_t)0;
inp->i_number = inumber;
inp->i_isize = DIP(dp, di_size);
inp->i_depth = DIP(dp, di_dirdepth);
inp->i_numblks = blks;
for (i = 0; i < MIN(blks, UFS_NDADDR); i++)
inp->i_blks[i] = DIP(dp, di_db[i]);

View file

@ -388,14 +388,15 @@ checkinode(ino_t inumber, struct inodesc *idesc, int rebuildcg)
n_files++;
inoinfo(inumber)->ino_linkcnt = DIP(dp, di_nlink);
if (mode == IFDIR) {
if (DIP(dp, di_size) == 0)
if (DIP(dp, di_size) == 0) {
inoinfo(inumber)->ino_state = DCLEAR;
else if (DIP(dp, di_nlink) <= 0)
} else if (DIP(dp, di_nlink) <= 0) {
inoinfo(inumber)->ino_state = DZLINK;
else
} else {
inoinfo(inumber)->ino_state = DSTATE;
cacheino(dp, inumber);
countdirs++;
cacheino(dp, inumber);
countdirs++;
}
} else if (DIP(dp, di_nlink) <= 0)
inoinfo(inumber)->ino_state = FZLINK;
else

View file

@ -210,8 +210,10 @@ pass2(void)
if (inp->i_parent == 0 || inp->i_isize == 0)
continue;
if (inoinfo(inp->i_parent)->ino_state == DFOUND &&
INO_IS_DUNFOUND(inp->i_number))
INO_IS_DUNFOUND(inp->i_number)) {
inoinfo(inp->i_number)->ino_state = DFOUND;
check_dirdepth(inp);
}
if (inp->i_dotdot == inp->i_parent ||
inp->i_dotdot == (ino_t)-1)
continue;
@ -271,7 +273,8 @@ pass2(void)
inoinfo(inp->i_dotdot)->ino_linkcnt++;
inoinfo(inp->i_parent)->ino_linkcnt--;
inp->i_dotdot = inp->i_parent;
(void)changeino(inp->i_number, "..", inp->i_parent);
(void)changeino(inp->i_number, "..", inp->i_parent,
getinoinfo(inp->i_parent)->i_depth + 1);
}
/*
* Mark all the directories that can be found from the root.
@ -548,10 +551,12 @@ pass2check(struct inodesc *idesc)
case DFOUND:
inp = getinoinfo(dirp->d_ino);
if (idesc->id_entryno > 2) {
if (inp->i_parent == 0)
if (inp->i_parent == 0) {
inp->i_parent = idesc->id_number;
else if ((n = fix_extraneous(inp, idesc)) == 1)
check_dirdepth(inp);
} else if ((n = fix_extraneous(inp, idesc))) {
break;
}
}
/* FALLTHROUGH */

View file

@ -74,7 +74,7 @@ pass3(void)
if (inp->i_number == UFS_ROOTINO ||
(inp->i_parent != 0 && !S_IS_DUNFOUND(state)))
continue;
if (state == DCLEAR)
if (state == DCLEAR || state == DZLINK)
continue;
/*
* If we are running with soft updates and we come
@ -102,6 +102,7 @@ pass3(void)
inoinfo(lfdir)->ino_linkcnt--;
}
inoinfo(orphan)->ino_state = DFOUND;
check_dirdepth(inp);
propagate();
continue;
}
@ -127,6 +128,7 @@ pass3(void)
}
irelse(&ip);
inoinfo(orphan)->ino_state = DFOUND;
check_dirdepth(inp);
propagate();
}
}

View file

@ -781,7 +781,7 @@ CMDFUNCSTART(rm)
if (!checkactivedir())
return 1;
rval = changeino(curinum, argv[1], 0);
rval = changeino(curinum, argv[1], 0, 0);
if (rval & ALTERED) {
printf("Name `%s' removed\n", argv[1]);
return 0;

View file

@ -915,8 +915,9 @@ fsinit(time_t utime)
alloc(sblock.fs_fsize, node.dp1.di_mode);
node.dp1.di_blocks =
btodb(fragroundup(&sblock, node.dp1.di_size));
wtfs(fsbtodb(&sblock, node.dp1.di_db[0]),
sblock.fs_fsize, iobuf);
node.dp1.di_dirdepth = 1;
wtfs(fsbtodb(&sblock, node.dp1.di_db[0]),
sblock.fs_fsize, iobuf);
iput(&node, UFS_ROOTINO + 1);
}
} else {
@ -951,8 +952,9 @@ fsinit(time_t utime)
alloc(sblock.fs_fsize, node.dp2.di_mode);
node.dp2.di_blocks =
btodb(fragroundup(&sblock, node.dp2.di_size));
wtfs(fsbtodb(&sblock, node.dp2.di_db[0]),
sblock.fs_fsize, iobuf);
node.dp2.di_dirdepth = 1;
wtfs(fsbtodb(&sblock, node.dp2.di_db[0]),
sblock.fs_fsize, iobuf);
iput(&node, UFS_ROOTINO + 1);
}
}

View file

@ -1179,6 +1179,8 @@ ffs_valloc(struct vnode *pvp,
}
ip->i_flags = 0;
DIP_SET(ip, i_flags, 0);
if ((mode & IFMT) == IFDIR)
DIP_SET(ip, i_dirdepth, DIP(pip, i_dirdepth) + 1);
/*
* Set up a new generation number for this inode.
*/
@ -1238,10 +1240,10 @@ static ino_t
ffs_dirpref(struct inode *pip)
{
struct fs *fs;
int cg, prefcg, dirsize, cgsize;
int cg, prefcg, curcg, dirsize, cgsize;
int depth, range, start, end, numdirs, power, numerator, denominator;
u_int avgifree, avgbfree, avgndir, curdirsize;
u_int minifree, minbfree, maxndir;
u_int mincg, minndir;
u_int maxcontigdirs;
mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED);
@ -1252,35 +1254,53 @@ ffs_dirpref(struct inode *pip)
avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
/*
* Force allocation in another cg if creating a first level dir.
* Select a preferred cylinder group to place a new directory.
* If we are near the root of the filesystem we aim to spread
* them out as much as possible. As we descend deeper from the
* root we cluster them closer together around their parent as
* we expect them to be more closely interactive. Higher-level
* directories like usr/src/sys and usr/src/bin should be
* separated while the directories in these areas are more
* likely to be accessed together so should be closer.
*
* We pick a range of cylinder groups around the cylinder group
* of the directory in which we are being created. The size of
* the range for our search is based on our depth from the root
* of our filesystem. We then probe that range based on how many
* directories are already present. The first new directory is at
* 1/2 (middle) of the range; the second is in the first 1/4 of the
* range, then at 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, 3/16, 5/16, etc.
*/
ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
if (ITOV(pip)->v_vflag & VV_ROOT) {
prefcg = arc4random() % fs->fs_ncg;
mincg = prefcg;
minndir = fs->fs_ipg;
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
for (cg = 0; cg < prefcg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
return ((ino_t)(fs->fs_ipg * mincg));
}
depth = DIP(pip, i_dirdepth);
range = fs->fs_ncg / (1 << depth);
curcg = ino_to_cg(fs, pip->i_number);
start = curcg - (range / 2);
if (start < 0)
start += fs->fs_ncg;
end = curcg + (range / 2);
if (end >= fs->fs_ncg)
end -= fs->fs_ncg;
numdirs = pip->i_effnlink - 1;
power = fls(numdirs);
numerator = (numdirs & ~(1 << (power - 1))) * 2 + 1;
denominator = 1 << power;
prefcg = (curcg - (range / 2) + (range * numerator / denominator));
if (prefcg < 0)
prefcg += fs->fs_ncg;
if (prefcg >= fs->fs_ncg)
prefcg -= fs->fs_ncg;
/*
* If this filesystem is not tracking directory depths,
* revert to the old algorithm.
*/
if (depth == 0 && pip->i_number != UFS_ROOTINO)
prefcg = curcg;
/*
* Count various limits which are used for
* optimal allocation of a directory inode.
*/
maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
maxndir = min(avgndir + (1 << depth), fs->fs_ipg);
minifree = avgifree - avgifree / 4;
if (minifree < 1)
minifree = 1;
@ -1324,7 +1344,6 @@ ffs_dirpref(struct inode *pip)
* in new cylinder groups so finds every possible block after
* one pass over the filesystem.
*/
prefcg = ino_to_cg(fs, pip->i_number);
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
fs->fs_cs(fs, cg).cs_nifree >= minifree &&

View file

@ -12485,17 +12485,6 @@ softdep_update_inodeblock(
KASSERT(MOUNTEDSOFTDEP(mp) != 0,
("softdep_update_inodeblock called on non-softdep filesystem"));
fs = ump->um_fs;
/*
* Preserve the freelink that is on disk. clear_unlinked_inodedep()
* does not have access to the in-core ip so must write directly into
* the inode block buffer when setting freelink.
*/
if (fs->fs_magic == FS_UFS1_MAGIC)
DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, ip->i_number))->di_freelink);
else
DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, ip->i_number))->di_freelink);
/*
* If the effective link count is not equal to the actual link
* count, then we must track the difference in an inodedep while
@ -12511,6 +12500,21 @@ softdep_update_inodeblock(
panic("softdep_update_inodeblock: bad link count");
return;
}
/*
* Preserve the freelink that is on disk. clear_unlinked_inodedep()
* does not have access to the in-core ip so must write directly into
* the inode block buffer when setting freelink.
*/
if ((inodedep->id_state & UNLINKED) != 0) {
if (fs->fs_magic == FS_UFS1_MAGIC)
DIP_SET(ip, i_freelink,
((struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, ip->i_number))->di_freelink);
else
DIP_SET(ip, i_freelink,
((struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, ip->i_number))->di_freelink);
}
KASSERT(ip->i_nlink >= inodedep->id_nlinkdelta,
("softdep_update_inodeblock inconsistent ip %p i_nlink %d "
"inodedep %p id_nlinkdelta %jd",

View file

@ -156,7 +156,10 @@ struct ufs2_dinode {
[(UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)];
};
u_int64_t di_modrev; /* 232: i_modrev for NFSv4 */
uint32_t di_freelink; /* 240: SUJ: Next unlinked inode. */
union {
uint32_t di_freelink; /* 240: SUJ: Next unlinked inode. */
uint32_t di_dirdepth; /* 240: IFDIR: depth from root dir */
};
uint32_t di_ckhash; /* 244: if CK_INODE, its check-hash */
uint32_t di_spare[2]; /* 248: Reserved; currently unused */
};
@ -179,7 +182,10 @@ struct ufs2_dinode {
struct ufs1_dinode {
u_int16_t di_mode; /* 0: IFMT, permissions; see below. */
int16_t di_nlink; /* 2: File link count. */
uint32_t di_freelink; /* 4: SUJ: Next unlinked inode. */
union {
uint32_t di_freelink; /* 4: SUJ: Next unlinked inode. */
uint32_t di_dirdepth; /* 4: IFDIR: depth from root dir */
};
u_int64_t di_size; /* 8: File byte count. */
int32_t di_atime; /* 16: Last access time. */
int32_t di_atimensec; /* 20: Last access time. */

View file

@ -1710,6 +1710,10 @@ ufs_rename(
* and ".." set to point to the new parent.
*/
if (doingdirectory && newparent) {
/*
* Set the directory depth based on its new parent.
*/
DIP_SET(fip, i_dirdepth, DIP(tdp, i_dirdepth) + 1);
/*
* If tip exists we simply use its link, otherwise we must
* add a new one.
@ -2121,6 +2125,7 @@ ufs_mkdir(
ip->i_effnlink = 2;
ip->i_nlink = 2;
DIP_SET(ip, i_nlink, 2);
DIP_SET(ip, i_dirdepth, DIP(dp,i_dirdepth) + 1);
if (cnp->cn_flags & ISWHITEOUT) {
ip->i_flags |= UF_OPAQUE;