unionfs: accommodate underlying FS calls that may re-lock

Since non-doomed unionfs vnodes always share their primary lock with
either the lower or upper vnode, any forwarded call to the base FS
which transiently drops that upper or lower vnode lock may result in
the unionfs vnode becoming completely unlocked during that transient
window.  The unionfs vnode may then become doomed by a concurrent
forced unmount, which can lead to either or both of the following:

--Complete loss of the unionfs lock: in the process of being
  doomed, the unionfs vnode switches back to the default vnode lock,
  so even if the base FS VOP reacquires the upper/lower vnode lock,
  that no longer translates into the unionfs vnode being relocked.
  This will then violate that caller's locking assumptions as well
  as various assertions that are enabled with DEBUG_VFS_LOCKS.

--Complete loss of reference on the upper/lower vnode: the caller
  normally holds a reference on the unionfs vnode, while the unionfs
  vnode in turn holds references on the upper/lower vnodes.  But in
  the course of being doomed, the unionfs vnode will drop the latter
  set of references, which can effectively lead to the base FS VOP
  executing with no references at all on its vnode, violating the
  assumption that vnodes can't be recycled during these calls and
  (if lucky) violating various assertions in the base FS.

Fix this by adding two new functions, unionfs_forward_vop_start_pair()
and unionfs_forward_vop_finish_pair(), which are intended to bookend
any forwarded VOP which may transiently unlock the relevant vnode(s).
These functions are currently only applied to VOPs that modify file
state (and require vnode reference and lock state to be identical at
call entry and exit), as the common reason for transiently dropping
locks is to update filesystem metadata.

Reviewed by:	olce
Tested by:	pho
MFC after:	2 weeks
Differential Revision: https://reviews.freebsd.org/D44076
This commit is contained in:
Jason A. Harmening 2024-01-02 15:22:24 -06:00
parent d56c175ac9
commit 6c8ded0015
3 changed files with 289 additions and 60 deletions

View file

@ -144,8 +144,8 @@ int unionfs_create_uppervattr(struct unionfs_mount *, struct vnode *,
struct vattr *, struct ucred *, struct thread *);
int unionfs_mkshadowdir(struct unionfs_mount *, struct vnode *,
struct unionfs_node *, struct componentname *, struct thread *);
int unionfs_mkwhiteout(struct vnode *, struct componentname *,
struct thread *, char *, int);
int unionfs_mkwhiteout(struct vnode *, struct vnode *,
struct componentname *, struct thread *, char *, int);
int unionfs_relookup(struct vnode *, struct vnode **,
struct componentname *, struct componentname *, struct thread *,
char *, int, u_long);
@ -155,6 +155,24 @@ int unionfs_relookup_for_delete(struct vnode *, struct componentname *,
struct thread *);
int unionfs_relookup_for_rename(struct vnode *, struct componentname *,
struct thread *);
void unionfs_forward_vop_start_pair(struct vnode *, int *,
struct vnode *, int *);
bool unionfs_forward_vop_finish_pair(struct vnode *, struct vnode *, int,
struct vnode *, struct vnode *, int);
static inline void
unionfs_forward_vop_start(struct vnode *basevp, int *lkflags)
{
unionfs_forward_vop_start_pair(basevp, lkflags, NULL, NULL);
}
static inline bool
unionfs_forward_vop_finish(struct vnode *unionvp, struct vnode *basevp,
int lkflags)
{
return (unionfs_forward_vop_finish_pair(unionvp, basevp, lkflags,
NULL, NULL, 0));
}
#define UNIONFSVPTOLOWERVP(vp) (VTOUNIONFS(vp)->un_lowervp)
#define UNIONFSVPTOUPPERVP(vp) (VTOUNIONFS(vp)->un_uppervp)

View file

@ -936,14 +936,21 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
*pathend = pathterm;
if (!error) {
unionfs_node_update(unp, uvp, td);
/*
* XXX The bug which cannot set uid/gid was corrected.
* Ignore errors.
*/
va.va_type = VNON;
VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);
/*
* VOP_SETATTR() may transiently drop uvp's lock, so it's
* important to call it before unionfs_node_update() transfers
* the unionfs vnode's lock from lvp to uvp; otherwise the
* unionfs vnode itself would be transiently unlocked and
* potentially doomed.
*/
unionfs_node_update(unp, uvp, td);
}
vn_finished_write(mp);
@ -955,28 +962,180 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
return (error);
}
static inline void
unionfs_forward_vop_ref(struct vnode *basevp, int *lkflags)
{
ASSERT_VOP_LOCKED(basevp, __func__);
*lkflags = VOP_ISLOCKED(basevp);
vref(basevp);
}
/*
* Prepare unionfs to issue a forwarded VOP to either the upper or lower
* FS. This should be used for any VOP which may drop the vnode lock;
* it is not required otherwise.
* The unionfs vnode shares its lock with the base-layer vnode(s); if the
* base FS must transiently drop its vnode lock, the unionfs vnode may
* effectively become unlocked. During that window, a concurrent forced
* unmount may doom the unionfs vnode, which leads to two significant
* issues:
* 1) Completion of, and return from, the unionfs VOP with the unionfs
* vnode completely unlocked. When the unionfs vnode becomes doomed
* it stops sharing its lock with the base vnode, so even if the
* forwarded VOP reacquires the base vnode lock the unionfs vnode
* lock will no longer be held. This can lead to violation of the
* caller's sychronization requirements as well as various failed
* locking assertions when DEBUG_VFS_LOCKS is enabled.
* 2) Loss of reference on the base vnode. The caller is expected to
* hold a v_usecount reference on the unionfs vnode, while the
* unionfs vnode holds a reference on the base-layer vnode(s). But
* these references are released when the unionfs vnode becomes
* doomed, violating the base layer's expectation that its caller
* must hold a reference to prevent vnode recycling.
*
* basevp1 and basevp2 represent two base-layer vnodes which are
* expected to be locked when this function is called. basevp2
* may be NULL, but if not NULL basevp1 and basevp2 should represent
* a parent directory and a filed linked to it, respectively.
* lkflags1 and lkflags2 are output parameters that will store the
* current lock status of basevp1 and basevp2, respectively. They
* are intended to be passed as the lkflags1 and lkflags2 parameters
* in the subsequent call to unionfs_forward_vop_finish_pair().
* lkflags2 may be NULL iff basevp2 is NULL.
*/
void
unionfs_forward_vop_start_pair(struct vnode *basevp1, int *lkflags1,
struct vnode *basevp2, int *lkflags2)
{
/*
* Take an additional reference on the base-layer vnodes to
* avoid loss of reference if the unionfs vnodes are doomed.
*/
unionfs_forward_vop_ref(basevp1, lkflags1);
if (basevp2 != NULL)
unionfs_forward_vop_ref(basevp2, lkflags2);
}
static inline bool
unionfs_forward_vop_rele(struct vnode *unionvp, struct vnode *basevp,
int lkflags)
{
bool unionvp_doomed;
if (__predict_false(VTOUNIONFS(unionvp) == NULL)) {
if ((lkflags & LK_EXCLUSIVE) != 0)
ASSERT_VOP_ELOCKED(basevp, __func__);
else
ASSERT_VOP_LOCKED(basevp, __func__);
unionvp_doomed = true;
} else {
vrele(basevp);
unionvp_doomed = false;
}
return (unionvp_doomed);
}
/*
* Indicate completion of a forwarded VOP previously prepared by
* unionfs_forward_vop_start_pair().
* basevp1 and basevp2 must be the same values passed to the prior
* call to unionfs_forward_vop_start_pair(). unionvp1 and unionvp2
* must be the unionfs vnodes that were initially above basevp1 and
* basevp2, respectively.
* basevp1 and basevp2 (if not NULL) must be locked when this function
* is called, while unionvp1 and/or unionvp2 may be unlocked if either
* unionfs vnode has become doomed.
* lkflags1 and lkflag2 represent the locking flags that should be
* used to re-lock unionvp1 and unionvp2, respectively, if either
* vnode has become doomed.
*
* Returns true if any unionfs vnode was found to be doomed, false
* otherwise.
*/
bool
unionfs_forward_vop_finish_pair(
struct vnode *unionvp1, struct vnode *basevp1, int lkflags1,
struct vnode *unionvp2, struct vnode *basevp2, int lkflags2)
{
bool vp1_doomed, vp2_doomed;
/*
* If either vnode is found to have been doomed, set
* a flag indicating that it needs to be re-locked.
* Otherwise, simply drop the base-vnode reference that
* was taken in unionfs_forward_vop_start().
*/
vp1_doomed = unionfs_forward_vop_rele(unionvp1, basevp1, lkflags1);
if (unionvp2 != NULL)
vp2_doomed = unionfs_forward_vop_rele(unionvp2, basevp2, lkflags2);
else
vp2_doomed = false;
/*
* If any of the unionfs vnodes need to be re-locked, that
* means the unionfs vnode's lock is now de-coupled from the
* corresponding base vnode. We therefore need to drop the
* base vnode lock (since nothing else will after this point),
* and also release the reference taken in
* unionfs_forward_vop_start_pair().
*/
if (__predict_false(vp1_doomed && vp2_doomed))
VOP_VPUT_PAIR(basevp1, &basevp2, true);
else if (__predict_false(vp1_doomed)) {
/*
* If basevp1 needs to be unlocked, then we may not
* be able to safely unlock it with basevp2 still locked,
* for the same reason that an ordinary VFS call would
* need to use VOP_VPUT_PAIR() here. We might be able
* to use VOP_VPUT_PAIR(..., false) here, but then we
* would need to deal with the possibility of basevp2
* changing out from under us, which could result in
* either the unionfs vnode becoming doomed or its
* upper/lower vp no longer matching basevp2. Either
* scenario would require at least re-locking the unionfs
* vnode anyway.
*/
if (unionvp2 != NULL) {
VOP_UNLOCK(unionvp2);
vp2_doomed = true;
}
vput(basevp1);
} else if (__predict_false(vp2_doomed))
vput(basevp2);
if (__predict_false(vp1_doomed || vp2_doomed))
vn_lock_pair(unionvp1, !vp1_doomed, lkflags1,
unionvp2, !vp2_doomed, lkflags2);
return (vp1_doomed || vp2_doomed);
}
/*
* Create a new whiteout.
*
* dvp should be locked on entry and will be locked on return.
* udvp and dvp should be locked on entry and will be locked on return.
*/
int
unionfs_mkwhiteout(struct vnode *dvp, struct componentname *cnp,
struct thread *td, char *path, int pathlen)
unionfs_mkwhiteout(struct vnode *dvp, struct vnode *udvp,
struct componentname *cnp, struct thread *td, char *path, int pathlen)
{
struct vnode *wvp;
struct nameidata nd;
struct mount *mp;
int error;
int lkflags;
wvp = NULLVP;
NDPREINIT(&nd);
if ((error = unionfs_relookup(dvp, &wvp, cnp, &nd.ni_cnd, td, path,
if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path,
pathlen, CREATE))) {
return (error);
}
if (wvp != NULLVP) {
if (dvp == wvp)
if (udvp == wvp)
vrele(wvp);
else
vput(wvp);
@ -984,9 +1143,11 @@ unionfs_mkwhiteout(struct vnode *dvp, struct componentname *cnp,
return (EEXIST);
}
if ((error = vn_start_write(dvp, &mp, V_WAIT | V_PCATCH)))
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
goto unionfs_mkwhiteout_free_out;
error = VOP_WHITEOUT(dvp, &nd.ni_cnd, CREATE);
unionfs_forward_vop_start(udvp, &lkflags);
error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE);
unionfs_forward_vop_finish(dvp, udvp, lkflags);
vn_finished_write(mp);

View file

@ -369,21 +369,27 @@ unionfs_create(struct vop_create_args *ap)
error = EROFS;
if (udvp != NULLVP) {
int lkflags;
bool vp_created = false;
unionfs_forward_vop_start(udvp, &lkflags);
error = VOP_CREATE(udvp, &vp, cnp, ap->a_vap);
if (error != 0)
goto unionfs_create_abort;
if (vp->v_type == VSOCK)
if (error == 0)
vp_created = true;
if (__predict_false(unionfs_forward_vop_finish(ap->a_dvp, udvp,
lkflags)) && error == 0) {
error = ENOENT;
}
if (error == 0 && vp->v_type == VSOCK)
*(ap->a_vpp) = vp;
else {
else if (error == 0) {
VOP_UNLOCK(vp);
error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP,
ap->a_dvp, ap->a_vpp, cnp);
vrele(vp);
}
} else if (vp_created)
vput(vp);
}
unionfs_create_abort:
UNIONFS_INTERNAL_DEBUG("unionfs_create: leave (%d)\n", error);
return (error);
@ -407,11 +413,14 @@ unionfs_whiteout(struct vop_whiteout_args *ap)
error = EOPNOTSUPP;
if (udvp != NULLVP) {
int lkflags;
switch (ap->a_flags) {
case CREATE:
case DELETE:
case LOOKUP:
unionfs_forward_vop_start(udvp, &lkflags);
error = VOP_WHITEOUT(udvp, cnp, ap->a_flags);
unionfs_forward_vop_finish(ap->a_dvp, udvp, lkflags);
break;
default:
error = EINVAL;
@ -443,21 +452,27 @@ unionfs_mknod(struct vop_mknod_args *ap)
error = EROFS;
if (udvp != NULLVP) {
int lkflags;
bool vp_created = false;
unionfs_forward_vop_start(udvp, &lkflags);
error = VOP_MKNOD(udvp, &vp, cnp, ap->a_vap);
if (error != 0)
goto unionfs_mknod_abort;
if (vp->v_type == VSOCK)
if (error == 0)
vp_created = true;
if (__predict_false(unionfs_forward_vop_finish(ap->a_dvp, udvp,
lkflags)) && error == 0) {
error = ENOENT;
}
if (error == 0 && vp->v_type == VSOCK)
*(ap->a_vpp) = vp;
else {
else if (error == 0) {
VOP_UNLOCK(vp);
error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP,
ap->a_dvp, ap->a_vpp, cnp);
vrele(vp);
}
} else if (vp_created)
vput(vp);
}
unionfs_mknod_abort:
UNIONFS_INTERNAL_DEBUG("unionfs_mknod: leave (%d)\n", error);
return (error);
@ -890,8 +905,12 @@ unionfs_setattr(struct vop_setattr_args *ap)
uvp = unp->un_uppervp;
}
if (uvp != NULLVP)
if (uvp != NULLVP) {
int lkflags;
unionfs_forward_vop_start(uvp, &lkflags);
error = VOP_SETATTR(uvp, vap, ap->a_cred);
unionfs_forward_vop_finish(ap->a_vp, uvp, lkflags);
}
UNIONFS_INTERNAL_DEBUG("unionfs_setattr: leave (%d)\n", error);
@ -925,6 +944,7 @@ unionfs_write(struct vop_write_args *ap)
struct unionfs_node *unp;
struct vnode *tvp;
int error;
int lkflags;
/* UNIONFS_INTERNAL_DEBUG("unionfs_write: enter\n"); */
@ -933,7 +953,9 @@ unionfs_write(struct vop_write_args *ap)
unp = VTOUNIONFS(ap->a_vp);
tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
unionfs_forward_vop_start(tvp, &lkflags);
error = VOP_WRITE(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred);
unionfs_forward_vop_finish(ap->a_vp, tvp, lkflags);
/* UNIONFS_INTERNAL_DEBUG("unionfs_write: leave (%d)\n", error); */
@ -999,6 +1021,7 @@ unionfs_fsync(struct vop_fsync_args *ap)
struct unionfs_node_status *unsp;
struct vnode *ovp;
enum unionfs_lkupgrade lkstatus;
int error, lkflags;
KASSERT_UNIONFS_VNODE(ap->a_vp);
@ -1017,7 +1040,11 @@ unionfs_fsync(struct vop_fsync_args *ap)
if (ovp == NULLVP)
return (EBADF);
return (VOP_FSYNC(ovp, ap->a_waitfor, ap->a_td));
unionfs_forward_vop_start(ovp, &lkflags);
error = VOP_FSYNC(ovp, ap->a_waitfor, ap->a_td);
unionfs_forward_vop_finish(ap->a_vp, ovp, lkflags);
return (error);
}
static int
@ -1046,6 +1073,7 @@ unionfs_remove(struct vop_remove_args *ap)
udvp = dunp->un_uppervp;
cnp = ap->a_cnp;
td = curthread;
unp = NULL;
if (ap->a_vp->v_op != &unionfs_vnodeops) {
if (ap->a_vp->v_type != VSOCK)
@ -1095,12 +1123,17 @@ unionfs_remove(struct vop_remove_args *ap)
* XXX: if the vnode type is VSOCK, it will create whiteout
* after remove.
*/
int udvp_lkflags, uvp_lkflags;
if (ump == NULL || ump->um_whitemode == UNIONFS_WHITE_ALWAYS ||
lvp != NULLVP)
cnp->cn_flags |= DOWHITEOUT;
unionfs_forward_vop_start_pair(udvp, &udvp_lkflags,
((unp == NULL) ? NULL : uvp), &uvp_lkflags);
error = VOP_REMOVE(udvp, uvp, cnp);
unionfs_forward_vop_finish_pair(ap->a_dvp, udvp, udvp_lkflags,
((unp == NULL) ? NULL : ap->a_vp), uvp, uvp_lkflags);
} else if (lvp != NULLVP)
error = unionfs_mkwhiteout(udvp, cnp, td, path, pathlen);
error = unionfs_mkwhiteout(ap->a_dvp, udvp, cnp, td, path, pathlen);
UNIONFS_INTERNAL_DEBUG("unionfs_remove: leave (%d)\n", error);
@ -1156,8 +1189,14 @@ unionfs_link(struct vop_link_args *ap)
if (needrelookup != 0)
error = unionfs_relookup_for_create(ap->a_tdvp, cnp, td);
if (error == 0)
if (error == 0) {
int udvp_lkflags, uvp_lkflags;
unionfs_forward_vop_start_pair(udvp, &udvp_lkflags,
uvp, &uvp_lkflags);
error = VOP_LINK(udvp, uvp, cnp);
unionfs_forward_vop_finish_pair(ap->a_tdvp, udvp, udvp_lkflags,
ap->a_vp, uvp, uvp_lkflags);
}
UNIONFS_INTERNAL_DEBUG("unionfs_link: leave (%d)\n", error);
@ -1413,7 +1452,6 @@ unionfs_mkdir(struct vop_mkdir_args *ap)
udvp = dunp->un_uppervp;
if (udvp != NULLVP) {
vref(udvp);
/* check opaque */
if (!(cnp->cn_flags & ISWHITEOUT)) {
error = VOP_GETATTR(udvp, &va, cnp->cn_cred);
@ -1423,38 +1461,27 @@ unionfs_mkdir(struct vop_mkdir_args *ap)
cnp->cn_flags |= ISWHITEOUT;
}
if ((error = VOP_MKDIR(udvp, &uvp, cnp, ap->a_vap)) == 0) {
int udvp_lkflags;
bool uvp_created = false;
unionfs_forward_vop_start(udvp, &udvp_lkflags);
error = VOP_MKDIR(udvp, &uvp, cnp, ap->a_vap);
if (error == 0)
uvp_created = true;
if (__predict_false(unionfs_forward_vop_finish(dvp, udvp,
udvp_lkflags)) && error == 0)
error = ENOENT;
if (error == 0) {
VOP_UNLOCK(uvp);
cnp->cn_lkflags = LK_EXCLUSIVE;
/*
* The underlying VOP_MKDIR() implementation may have
* temporarily dropped the parent directory vnode lock.
* Because the unionfs vnode ordinarily shares that
* lock, this may allow the unionfs vnode to be reclaimed
* and its lock field reset. In that case, the unionfs
* vnode is effectively no longer locked, and we must
* explicitly lock it before returning in order to meet
* the locking requirements of VOP_MKDIR().
*/
if (__predict_false(VTOUNIONFS(dvp) == NULL)) {
error = ENOENT;
goto unionfs_mkdir_cleanup;
}
error = unionfs_nodeget(dvp->v_mount, uvp, NULLVP,
dvp, ap->a_vpp, cnp);
cnp->cn_lkflags = lkflags;
vrele(uvp);
}
cnp->cn_lkflags = lkflags;
} else if (uvp_created)
vput(uvp);
}
unionfs_mkdir_cleanup:
if (__predict_false(VTOUNIONFS(dvp) == NULL)) {
vput(udvp);
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
} else if (udvp != NULLVP)
vrele(udvp);
UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: leave (%d)\n", error);
return (error);
@ -1521,10 +1548,16 @@ unionfs_rmdir(struct vop_rmdir_args *ap)
*/
if (error == 0 && VN_IS_DOOMED(uvp))
error = ENOENT;
if (error == 0)
if (error == 0) {
int udvp_lkflags, uvp_lkflags;
unionfs_forward_vop_start_pair(udvp, &udvp_lkflags,
uvp, &uvp_lkflags);
error = VOP_RMDIR(udvp, uvp, cnp);
unionfs_forward_vop_finish_pair(ap->a_dvp, udvp, udvp_lkflags,
ap->a_vp, uvp, uvp_lkflags);
}
} else if (lvp != NULLVP)
error = unionfs_mkwhiteout(udvp, cnp, td,
error = unionfs_mkwhiteout(ap->a_dvp, udvp, cnp, td,
unp->un_path, unp->un_pathlen);
if (error == 0) {
@ -1558,15 +1591,24 @@ unionfs_symlink(struct vop_symlink_args *ap)
udvp = dunp->un_uppervp;
if (udvp != NULLVP) {
int udvp_lkflags;
bool uvp_created = false;
unionfs_forward_vop_start(udvp, &udvp_lkflags);
error = VOP_SYMLINK(udvp, &uvp, cnp, ap->a_vap, ap->a_target);
if (error == 0)
uvp_created = true;
if (__predict_false(unionfs_forward_vop_finish(ap->a_dvp, udvp,
udvp_lkflags)) && error == 0)
error = ENOENT;
if (error == 0) {
VOP_UNLOCK(uvp);
cnp->cn_lkflags = LK_EXCLUSIVE;
error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP,
ap->a_dvp, ap->a_vpp, cnp);
cnp->cn_lkflags = lkflags;
vrele(uvp);
}
cnp->cn_lkflags = lkflags;
} else if (uvp_created)
vput(uvp);
}
UNIONFS_INTERNAL_DEBUG("unionfs_symlink: leave (%d)\n", error);
@ -2275,8 +2317,12 @@ unionfs_setacl(struct vop_setacl_args *ap)
uvp = unp->un_uppervp;
}
if (uvp != NULLVP)
if (uvp != NULLVP) {
int lkflags;
unionfs_forward_vop_start(uvp, &lkflags);
error = VOP_SETACL(uvp, ap->a_type, ap->a_aclp, ap->a_cred, td);
unionfs_forward_vop_finish(ap->a_vp, uvp, lkflags);
}
UNIONFS_INTERNAL_DEBUG("unionfs_setacl: leave (%d)\n", error);
@ -2458,9 +2504,13 @@ unionfs_setextattr(struct vop_setextattr_args *ap)
ovp = uvp;
}
if (ovp == uvp)
if (ovp == uvp) {
int lkflags;
unionfs_forward_vop_start(ovp, &lkflags);
error = VOP_SETEXTATTR(ovp, ap->a_attrnamespace, ap->a_name,
ap->a_uio, cred, td);
unionfs_forward_vop_finish(ap->a_vp, ovp, lkflags);
}
unionfs_setextattr_abort:
UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: leave (%d)\n", error);