When closing the last reference to an unlinked file, it is freed
by the inactive routine. Because the freeing causes the filesystem
to be modified, the close must be held up during periods when the
filesystem is suspended.
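
The ufs_close() hunk below implements this; as a condensed sketch
(all identifiers come from that hunk), the final reference to an
unlinked regular file is released under the filesystem write gate:

	/*
	 * Condensed from the ufs_close() change: vn_start_write()
	 * blocks while the filesystem is suspended, so the vrele()
	 * that enters the inactive routine (and frees the inode)
	 * cannot modify a suspended filesystem. EAGAIN tells
	 * vn_close() that the vrele() has already been done.
	 */
	if (vp->v_type == VREG && VTOI(vp)->i_effnlink == 0) {
		(void) vn_start_write(vp, &mp, V_WAIT);
		vrele(vp);
		vn_finished_write(mp);
		return (EAGAIN);
	}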

For snapshots to be consistent across crashes, they must write
blocks that they copy and claim those written blocks in their
on-disk block pointers before the old blocks that they referenced
can be allowed to be written.
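
This ordering is enforced in the ffs_snapblkfree() and
ffs_copyonwrite() hunks below: after copying into the new block, a
snapshot that is still linked (i_effnlink > 0), and hence will be
visible after a crash, is synchronously flushed. A condensed sketch
using only identifiers from those hunks:

	bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
	bawrite(cbp);			/* start writing the copy */
	if (ip->i_effnlink > 0)		/* snapshot visible after a crash */
		/*
		 * Wait for the copy and the snapshot's block pointers
		 * to reach disk before the old block may be rewritten.
		 */
		(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
	VOP_UNLOCK(vp, 0, p);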

Close a loophole that allowed unwritten blocks to be skipped when
doing ffs_sync with a request to wait for all I/O activity to be
completed.
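
The loophole is closed in the ffs_sync() hunk below: the device vnode
is flushed whenever it still has output in progress or dirty buffers,
and a MNT_WAIT sync loops back to rescan until nothing remains
unwritten. Condensed from that hunk:

	mtx_lock(&devvp->v_interlock);
	if (waitfor != MNT_LAZY &&
	    (devvp->v_numoutput > 0 || TAILQ_FIRST(&devvp->v_dirtyblkhd))) {
		mtx_unlock(&devvp->v_interlock);
		/* ... vn_lock(); VOP_FSYNC(devvp, cred, waitfor, p); ... */
		if (waitfor == MNT_WAIT) {
			mtx_lock(&mntvnode_mtx);
			goto loop;	/* rescan until nothing is left unwritten */
		}
	} else
		mtx_unlock(&devvp->v_interlock);
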
Kirk McKusick, 2001-04-25 08:11:18 +00:00
commit 112f737245 (parent e69b2bc11c)
Notes (svn2git, 2020-12-20 02:59:44 +00:00): svn path=/head/; revision=75943
4 changed files with 113 additions and 26 deletions

--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c

@@ -235,6 +235,15 @@ vn_close(vp, flags, cred, p)
 	if (flags & FWRITE)
 		vp->v_writecount--;
 	error = VOP_CLOSE(vp, flags, cred, p);
+	/*
+	 * XXX - In certain instances VOP_CLOSE has to do the vrele
+	 * itself. If the vrele has been done, it will return EAGAIN
+	 * to indicate that the vrele should not be done again. When
+	 * this happens, we just return success. The correct thing to
+	 * do would be to have all VOP_CLOSE instances do the vrele.
+	 */
+	if (error == EAGAIN)
+		return (0);
 	vrele(vp);
 	return (error);
 }

--- a/sys/ufs/ffs/ffs_snapshot.c
+++ b/sys/ufs/ffs/ffs_snapshot.c

@@ -198,10 +198,14 @@ ffs_snapshot(mp, snapfile)
 	}
 	/*
 	 * Allocate shadow blocks to copy all of the other snapshot inodes
-	 * so that we will be able to expunge them from this snapshot.
+	 * so that we will be able to expunge them from this snapshot. Also
+	 * include a copy of ourselves so that we do not deadlock trying
+	 * to copyonwrite ourselves when VOP_FSYNC'ing below.
 	 */
-	for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) {
+	fs->fs_snapinum[snaploc] = ip->i_number;
+	for (loc = snaploc, inoblkcnt = 0; loc >= 0; loc--) {
 		blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc]));
+		fs->fs_snapinum[snaploc] = 0;
 		for (i = 0; i < inoblkcnt; i++)
 			if (inoblks[i] == blkno)
 				break;
@@ -652,14 +656,14 @@ ffs_snapremove(vp)
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	/*
-	 * Delete from incore list.
+	 * If active, delete from incore list (this snapshot may
+	 * already have been in the process of being deleted, so
+	 * would not have been active).
+	 *
+	 * Clear copy-on-write flag if last snapshot.
 	 */
-	devvp = ip->i_devvp;
-	if (ip->i_nextsnap.tqe_prev == 0) {
-		printf("ffs_snapremove: lost snapshot vnode %d\n",
-		    ip->i_number);
-	} else {
+	if (ip->i_nextsnap.tqe_prev != 0) {
+		devvp = ip->i_devvp;
 		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
 		ip->i_nextsnap.tqe_prev = 0;
 		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
@@ -832,9 +836,10 @@ ffs_snapblkfree(freeip, bno, size)
 		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		p->p_flag &= ~P_COWINPROGRESS;
-		VOP_UNLOCK(vp, 0, p);
-		if (error)
+		if (error) {
+			VOP_UNLOCK(vp, 0, p);
 			break;
+		}
 #ifdef DEBUG
 		if (snapdebug)
 			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
@@ -843,22 +848,44 @@ ffs_snapblkfree(freeip, bno, size)
 #endif
 		/*
 		 * If we have already read the old block contents, then
-		 * simply copy them to the new block.
+		 * simply copy them to the new block. Note that we need
+		 * to synchronously write snapshots that have not been
+		 * unlinked, and hence will be visible after a crash,
+		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
-		if ((error = readblock(cbp, lbn)) != 0)
+		if ((error = readblock(cbp, lbn)) != 0) {
+			bzero(cbp->b_data, fs->fs_bsize);
+			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			break;
+		}
 		savedcbp = cbp;
 	}
-	if (savedcbp)
+	/*
+	 * Note that we need to synchronously write snapshots that
+	 * have not been unlinked, and hence will be visible after
+	 * a crash, to ensure their integrity.
+	 */
+	if (savedcbp) {
+		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
+		if (VTOI(vp)->i_effnlink > 0)
+			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+		VOP_UNLOCK(vp, 0, p);
+	}
 	/*
 	 * If we have been unable to allocate a block in which to do
 	 * the copy, then return non-zero so that the fragment will
@@ -1014,8 +1041,8 @@ ffs_copyonwrite(devvp, bp)
 		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
 		p->p_flag &= ~P_COWINPROGRESS;
-		VOP_UNLOCK(vp, 0, p);
 		if (error) {
+			VOP_UNLOCK(vp, 0, p);
 			if (error != EWOULDBLOCK)
 				break;
 			tsleep(vp, p->p_pri.pri_user, "nap", 1);
@@ -1035,22 +1062,44 @@ ffs_copyonwrite(devvp, bp)
 #endif
 		/*
 		 * If we have already read the old block contents, then
-		 * simply copy them to the new block.
+		 * simply copy them to the new block. Note that we need
+		 * to synchronously write snapshots that have not been
+		 * unlinked, and hence will be visible after a crash,
+		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
-		if ((error = readblock(cbp, lbn)) != 0)
+		if ((error = readblock(cbp, lbn)) != 0) {
+			bzero(cbp->b_data, fs->fs_bsize);
+			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			break;
+		}
 		savedcbp = cbp;
 	}
-	if (savedcbp)
+	/*
+	 * Note that we need to synchronously write snapshots that
+	 * have not been unlinked, and hence will be visible after
+	 * a crash, to ensure their integrity.
+	 */
+	if (savedcbp) {
+		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
+		if (VTOI(vp)->i_effnlink > 0)
+			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+		VOP_UNLOCK(vp, 0, p);
+	}
 	return (error);
 }

--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c

@@ -952,7 +952,7 @@ ffs_sync(mp, waitfor, cred, p)
 	struct ucred *cred;
 	struct proc *p;
 {
-	struct vnode *nvp, *vp;
+	struct vnode *nvp, *vp, *devvp;
 	struct inode *ip;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs;
@@ -1026,12 +1026,21 @@ ffs_sync(mp, waitfor, cred, p)
 #ifdef QUOTA
 	qsync(mp);
 #endif
-	if (waitfor != MNT_LAZY) {
-		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p);
-		if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) != 0)
+	devvp = ump->um_devvp;
+	mtx_lock(&devvp->v_interlock);
+	if (waitfor != MNT_LAZY &&
+	    (devvp->v_numoutput > 0 || TAILQ_FIRST(&devvp->v_dirtyblkhd))) {
+		mtx_unlock(&devvp->v_interlock);
+		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
+		if ((error = VOP_FSYNC(devvp, cred, waitfor, p)) != 0)
 			allerror = error;
-		VOP_UNLOCK(ump->um_devvp, 0, p);
-	}
+		VOP_UNLOCK(devvp, 0, p);
+		if (waitfor == MNT_WAIT) {
+			mtx_lock(&mntvnode_mtx);
+			goto loop;
+		}
+	} else
+		mtx_unlock(&devvp->v_interlock);
 	/*
 	 * Write back modified superblock.
 	 */

--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c

@@ -292,12 +292,32 @@ ufs_close(ap)
 		struct proc *a_p;
 	} */ *ap;
 {
-	register struct vnode *vp = ap->a_vp;
+	struct vnode *vp = ap->a_vp;
+	struct mount *mp;
 
 	mtx_lock(&vp->v_interlock);
-	if (vp->v_usecount > 1)
+	if (vp->v_usecount > 1) {
 		ufs_itimes(vp);
-	mtx_unlock(&vp->v_interlock);
+		mtx_unlock(&vp->v_interlock);
+	} else {
+		mtx_unlock(&vp->v_interlock);
+		/*
+		 * If we are closing the last reference to an unlinked
+		 * file, then it will be freed by the inactive routine.
+		 * Because the freeing causes the filesystem to be
+		 * modified, it must be held up during periods when the
+		 * filesystem is suspended.
+		 *
+		 * XXX - EAGAIN is returned to prevent vn_close from
+		 * repeating the vrele operation.
+		 */
+		if (vp->v_type == VREG && VTOI(vp)->i_effnlink == 0) {
+			(void) vn_start_write(vp, &mp, V_WAIT);
+			vrele(vp);
+			vn_finished_write(mp);
+			return (EAGAIN);
+		}
+	}
 	return (0);
 }