When filt_proc() removes an event from the knlist because the process
is exiting (NOTE_EXIT -> knlist_remove_inevent()), two things happen:
- the knote's kn_knlist pointer is reset
- the INFLUX knote is removed from the process knlist.
There are two consequences:
- KN_LIST_UNLOCK() on such a knote is a nop
- nothing blocks exit1() from proceeding past knlist_destroy() (and
  knlist_destroy() resets the knlist lock pointers).
Both consequences result in either a leaked process lock or a
dereference of NULL locking function pointers.
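
For context, here is the pre-change unlock macro (it also appears in
the kern_event.c hunk below), with an annotation of mine that only
restates the two consequences above:

    /*
     * Old macro, annotated.  Once knlist_remove_inevent() has cleared
     * kn->kn_knlist, this expands to a nop, so the process lock taken
     * by the matching KN_LIST_LOCK() is leaked.  And because the knote
     * is already off the list, nothing keeps exit1() from reaching
     * knlist_destroy(), which resets the lock pointers, so any later
     * locking through the list dereferences NULL function pointers.
     */
    #define KN_LIST_UNLOCK(kn) do {					\
	    if (kn->kn_knlist != NULL)					\
		    kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \
    } while (0)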

Handle this by no longer embedding the process knlist into struct
proc.  Instead, the knlist is allocated together with struct proc, but
is marked as autodestroy at zombie reap time by the new
knlist_detach() function.  The knlist is freed when the last knote is
removed from it; in particular, this happens at zombie reap time if
the list is already empty.  As a result, knlist_remove_inevent() is no
longer needed and is removed.
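
As a sketch of the resulting lifecycle (assuming only the helpers
introduced in the diff below: knlist_alloc(), knlist_detach() and
kn_list_unlock(); this shows the intended sequence across fork1(),
exit1() and proc_reap(), not literal kernel code):

    /* fork1() / proc0_init(): the list is now allocated, not embedded. */
    p->p_klist = knlist_alloc(&p->p_mtx);

    /*
     * exit1(): NOTE_EXIT is still delivered, but the list is no
     * longer cleared or destroyed here.
     */
    KNOTE_LOCKED(p->p_klist, NOTE_EXIT);

    /*
     * proc_reap(): mark the list as autodestroy.  It is freed
     * immediately if already empty, otherwise kn_list_unlock() frees
     * it when the last knote is removed.
     */
    PROC_LOCK(p);
    knlist_detach(p->p_klist);
    p->p_klist = NULL;
    PROC_UNLOCK(p);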

Other changes:

In filt_procattach(), clear the NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for a knote registered by the kernel solely to get
NOTE_CHILD notifications.  The leaked flags resulted in excessive
NOTE_EXEC/NOTE_FORK reports.

Fix immediate note activation in filt_procattach().  The condition
should be either immediate activation of the child note, or an
immediate NOTE_EXIT report for an exiting process.

In knote_fork(), do not perform the racy check for KN_INFLUX before
the kq lock is taken.  Besides being racy, it did not account for
knotes just added by a scan (KN_SCAN).

Some minor and incomplete style fixes.

Analyzed and tested by:	Eric Badger <eric@badgerio.us>
Reviewed by:	jhb
Sponsored by:	The FreeBSD Foundation
MFC after:	2 weeks
Approved by:	re (gjb)
Differential revision:	https://reviews.freebsd.org/D6859
Konstantin Belousov, 2016-06-27 21:52:17 +00:00
commit 9e590ff04b (parent 424340f2c7)
Notes (svn2git, 2020-12-20): svn path=/head/; revision=302235
8 changed files with 92 additions and 72 deletions

sys/kern/init_main.c

@ -482,7 +482,7 @@ proc0_init(void *dummy __unused)
p->p_flag = P_SYSTEM | P_INMEM | P_KPROC;
p->p_flag2 = 0;
p->p_state = PRS_NORMAL;
knlist_init_mtx(&p->p_klist, &p->p_mtx);
p->p_klist = knlist_alloc(&p->p_mtx);
STAILQ_INIT(&p->p_ktr);
p->p_nice = NZERO;
/* pid_max cannot be greater than PID_MAX */

sys/kern/kern_event.c

@ -227,14 +227,33 @@ SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
#define KQ_NOTOWNED(kq) do { \
mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
} while (0)
#define KN_LIST_LOCK(kn) do { \
if (kn->kn_knlist != NULL) \
kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \
} while (0)
#define KN_LIST_UNLOCK(kn) do { \
if (kn->kn_knlist != NULL) \
kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \
} while (0)
static struct knlist *
kn_list_lock(struct knote *kn)
{
struct knlist *knl;
knl = kn->kn_knlist;
if (knl != NULL)
knl->kl_lock(knl->kl_lockarg);
return (knl);
}
static void
kn_list_unlock(struct knlist *knl)
{
bool do_free;
if (knl == NULL)
return;
do_free = knl->kl_autodestroy && knlist_empty(knl);
knl->kl_unlock(knl->kl_lockarg);
if (do_free) {
knlist_destroy(knl);
free(knl, M_KQUEUE);
}
}
#define KNL_ASSERT_LOCK(knl, islocked) do { \
if (islocked) \
KNL_ASSERT_LOCKED(knl); \
@ -350,16 +369,16 @@ static int
filt_procattach(struct knote *kn)
{
struct proc *p;
int immediate;
int error;
bool exiting, immediate;
immediate = 0;
exiting = immediate = false;
p = pfind(kn->kn_id);
if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
p = zpfind(kn->kn_id);
immediate = 1;
exiting = true;
} else if (p != NULL && (p->p_flag & P_WEXIT)) {
immediate = 1;
exiting = true;
}
if (p == NULL)
@ -380,8 +399,8 @@ filt_procattach(struct knote *kn)
kn->kn_flags &= ~EV_FLAG2;
kn->kn_data = kn->kn_sdata; /* ppid */
kn->kn_fflags = NOTE_CHILD;
kn->kn_sfflags &= ~NOTE_EXIT;
immediate = 1; /* Force immediate activation of child note. */
kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
immediate = true; /* Force immediate activation of child note. */
}
/*
* Internal flag indicating registration done by kernel (for other than
@ -391,8 +410,7 @@ filt_procattach(struct knote *kn)
kn->kn_flags &= ~EV_FLAG1;
}
if (immediate == 0)
knlist_add(&p->p_klist, kn, 1);
knlist_add(p->p_klist, kn, 1);
/*
* Immediately activate any child notes or, in the case of a zombie
@ -400,7 +418,7 @@ filt_procattach(struct knote *kn)
* case where the target process, e.g. a child, dies before the kevent
* is registered.
*/
if (immediate && filt_proc(kn, NOTE_EXIT))
if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
KNOTE_ACTIVATE(kn, 0);
PROC_UNLOCK(p);
@ -420,10 +438,8 @@ filt_procattach(struct knote *kn)
static void
filt_procdetach(struct knote *kn)
{
struct proc *p;
p = kn->kn_ptr.p_proc;
knlist_remove(&p->p_klist, kn, 0);
knlist_remove(kn->kn_knlist, kn, 0);
kn->kn_ptr.p_proc = NULL;
}
@ -444,8 +460,6 @@ filt_proc(struct knote *kn, long hint)
/* Process is gone, so flag the event as finished. */
if (event == NOTE_EXIT) {
if (!(kn->kn_status & KN_DETACHED))
knlist_remove_inevent(&p->p_klist, kn);
kn->kn_flags |= EV_EOF | EV_ONESHOT;
kn->kn_ptr.p_proc = NULL;
if (kn->kn_fflags & NOTE_EXIT)
@ -479,12 +493,6 @@ knote_fork(struct knlist *list, int pid)
list->kl_lock(list->kl_lockarg);
SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
/*
* XXX - Why do we skip the kn if it is _INFLUX? Does this
* mean we will not properly wake up some notes?
*/
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
continue;
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
@ -525,7 +533,8 @@ knote_fork(struct knlist *list, int pid)
*/
kev.ident = pid;
kev.filter = kn->kn_filter;
kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2;
kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
EV_FLAG2;
kev.fflags = kn->kn_sfflags;
kev.data = kn->kn_id; /* parent */
kev.udata = kn->kn_kevent.udata;/* preserve udata */
@ -1137,6 +1146,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa
struct filterops *fops;
struct file *fp;
struct knote *kn, *tkn;
struct knlist *knl;
cap_rights_t rights;
int error, filt, event;
int haskqglobal, filedesc_unlock;
@ -1146,6 +1156,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa
fp = NULL;
kn = NULL;
knl = NULL;
error = 0;
haskqglobal = 0;
filedesc_unlock = 0;
@ -1300,7 +1311,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa
knote_drop(kn, td);
goto done;
}
KN_LIST_LOCK(kn);
knl = kn_list_lock(kn);
goto done_ev_add;
} else {
/* No matching knote and the EV_ADD flag is not set. */
@ -1331,7 +1342,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa
*/
kn->kn_status |= KN_INFLUX | KN_SCAN;
KQ_UNLOCK(kq);
KN_LIST_LOCK(kn);
knl = kn_list_lock(kn);
kn->kn_kevent.udata = kev->udata;
if (!fops->f_isfd && fops->f_touch != NULL) {
fops->f_touch(kn, kev, EVENT_REGISTER);
@ -1365,7 +1376,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa
KN_ACTIVE)
knote_enqueue(kn);
kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
KN_LIST_UNLOCK(kn);
kn_list_unlock(knl);
KQ_UNLOCK_FLUX(kq);
done:
@ -1535,6 +1546,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
{
struct kevent *kevp;
struct knote *kn, *marker;
struct knlist *knl;
sbintime_t asbt, rsbt;
int count, error, haskqglobal, influx, nkev, touch;
@ -1660,7 +1672,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
KQ_UNLOCK(kq);
if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
KN_LIST_LOCK(kn);
knl = kn_list_lock(kn);
if (kn->kn_fop->f_event(kn, 0) == 0) {
KQ_LOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
@ -1668,7 +1680,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
KN_SCAN);
kq->kq_count--;
KN_LIST_UNLOCK(kn);
kn_list_unlock(knl);
influx = 1;
continue;
}
@ -1697,7 +1709,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
KN_LIST_UNLOCK(kn);
kn_list_unlock(knl);
influx = 1;
}
@ -2062,7 +2074,8 @@ knlist_add(struct knlist *knl, struct knote *kn, int islocked)
}
static void
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
int kqislocked)
{
KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
KNL_ASSERT_LOCK(knl, knlislocked);
@ -2075,7 +2088,7 @@ knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqis
SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
kn->kn_knlist = NULL;
if (!knlislocked)
knl->kl_unlock(knl->kl_lockarg);
kn_list_unlock(knl);
if (!kqislocked)
KQ_LOCK(kn->kn_kq);
kn->kn_status |= KN_DETACHED;
@ -2093,17 +2106,6 @@ knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
knlist_remove_kq(knl, kn, islocked, 0);
}
/*
* remove knote from the specified knlist while in f_event handler.
*/
void
knlist_remove_inevent(struct knlist *knl, struct knote *kn)
{
knlist_remove_kq(knl, kn, 1,
(kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
}
int
knlist_empty(struct knlist *knl)
{
@ -2202,6 +2204,7 @@ knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
else
knl->kl_assert_unlocked = kl_assert_unlocked;
knl->kl_autodestroy = false;
SLIST_INIT(&knl->kl_list);
}
@ -2212,6 +2215,16 @@ knlist_init_mtx(struct knlist *knl, struct mtx *lock)
knlist_init(knl, lock, NULL, NULL, NULL, NULL);
}
struct knlist *
knlist_alloc(struct mtx *lock)
{
struct knlist *knl;
knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
knlist_init_mtx(knl, lock);
return (knl);
}
void
knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
{
@ -2237,6 +2250,18 @@ knlist_destroy(struct knlist *knl)
SLIST_INIT(&knl->kl_list);
}
void
knlist_detach(struct knlist *knl)
{
KNL_ASSERT_LOCKED(knl);
knl->kl_autodestroy = true;
if (knlist_empty(knl)) {
knlist_destroy(knl);
free(knl, M_KQUEUE);
}
}
/*
* Even if we are locked, we may need to drop the lock to allow any influx
* knotes time to "settle".
@ -2247,6 +2272,7 @@ knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
struct knote *kn, *kn2;
struct kqueue *kq;
KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
if (islocked)
KNL_ASSERT_LOCKED(knl);
else {

sys/kern/kern_exec.c

@ -832,7 +832,7 @@ do_execve(td, args, mac_p)
* Notify others that we exec'd, and clear the P_INEXEC flag
* as we're now a bona fide freshly-execed process.
*/
KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
KNOTE_LOCKED(p->p_klist, NOTE_EXEC);
p->p_flag &= ~P_INEXEC;
/* clear "fork but no exec" flag, as we _are_ execing */

sys/kern/kern_exit.c

@ -512,7 +512,7 @@ exit1(struct thread *td, int rval, int signo)
/*
* Notify interested parties of our demise.
*/
KNOTE_LOCKED(&p->p_klist, NOTE_EXIT);
KNOTE_LOCKED(p->p_klist, NOTE_EXIT);
#ifdef KDTRACE_HOOKS
int reason = CLD_EXITED;
@ -523,13 +523,6 @@ exit1(struct thread *td, int rval, int signo)
SDT_PROBE1(proc, , , exit, reason);
#endif
/*
* Just delete all entries in the p_klist. At this point we won't
* report any more events, and there are nasty race conditions that
* can beat us if we don't.
*/
knlist_clear(&p->p_klist, 1);
/*
* If this is a process with a descriptor, we may not need to deliver
* a signal to the parent. proctree_lock is held over
@ -602,12 +595,6 @@ exit1(struct thread *td, int rval, int signo)
p->p_state = PRS_ZOMBIE;
PROC_UNLOCK(p->p_pptr);
/*
* Hopefully no one will try to deliver a signal to the process this
* late in the game.
*/
knlist_destroy(&p->p_klist);
/*
* Save our children's rusage information in our exit rusage.
*/
@ -853,6 +840,11 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options)
procdesc_reap(p);
sx_xunlock(&proctree_lock);
PROC_LOCK(p);
knlist_detach(p->p_klist);
p->p_klist = NULL;
PROC_UNLOCK(p);
/*
* Removal from allproc list and process group list paired with
* PROC_LOCK which was executed during that time should guarantee

View File

@ -748,7 +748,7 @@ do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *
/*
* Tell any interested parties about the new process.
*/
knote_fork(&p1->p_klist, p2->p_pid);
knote_fork(p1->p_klist, p2->p_pid);
SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags);
if (fr->fr_flags & RFPROCDESC) {
@ -950,7 +950,7 @@ fork1(struct thread *td, struct fork_req *fr)
#ifdef MAC
mac_proc_init(newproc);
#endif
knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
newproc->p_klist = knlist_alloc(&newproc->p_mtx);
STAILQ_INIT(&newproc->p_ktr);
/* We have to lock the process tree while we look for a pid. */

sys/kern/kern_sig.c

@ -2112,7 +2112,7 @@ tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
}
ps = p->p_sigacts;
KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig);
prop = sigprop(sig);
if (td == NULL) {
@ -3542,7 +3542,7 @@ filt_sigattach(struct knote *kn)
kn->kn_ptr.p_proc = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
knlist_add(&p->p_klist, kn, 0);
knlist_add(p->p_klist, kn, 0);
return (0);
}
@ -3552,7 +3552,7 @@ filt_sigdetach(struct knote *kn)
{
struct proc *p = kn->kn_ptr.p_proc;
knlist_remove(&p->p_klist, kn, 0);
knlist_remove(p->p_klist, kn, 0);
}
/*

sys/sys/event.h

@ -158,7 +158,8 @@ struct knlist {
void (*kl_unlock)(void *);
void (*kl_assert_locked)(void *);
void (*kl_assert_unlocked)(void *);
void *kl_lockarg; /* argument passed to kl_lockf() */
void *kl_lockarg; /* argument passed to lock functions */
bool kl_autodestroy;
};
@ -258,9 +259,10 @@ struct rwlock;
extern void knote(struct knlist *list, long hint, int lockflags);
extern void knote_fork(struct knlist *list, int pid);
extern struct knlist *knlist_alloc(struct mtx *lock);
extern void knlist_detach(struct knlist *knl);
extern void knlist_add(struct knlist *knl, struct knote *kn, int islocked);
extern void knlist_remove(struct knlist *knl, struct knote *kn, int islocked);
extern void knlist_remove_inevent(struct knlist *knl, struct knote *kn);
extern int knlist_empty(struct knlist *knl);
extern void knlist_init(struct knlist *knl, void *lock,
void (*kl_lock)(void *), void (*kl_unlock)(void *),

sys/sys/proc.h

@ -611,7 +611,7 @@ struct proc {
/* End area that is copied on creation. */
#define p_endcopy p_xsig
struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */
struct knlist p_klist; /* (c) Knotes attached to this proc. */
struct knlist *p_klist; /* (c) Knotes attached to this proc. */
int p_numthreads; /* (c) Number of threads. */
struct mdproc p_md; /* Any machine-dependent fields. */
struct callout p_itcallout; /* (h + c) Interval timer callout. */