From 9e590ff04b687e910579a5851d95cedf9eb10bfd Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Mon, 27 Jun 2016 21:52:17 +0000 Subject: [PATCH] When filt_proc() removes event from the knlist due to the process exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen: - knote kn_knlist pointer is reset - INFLUX knote is removed from the process knlist. And, there are two consequences: - KN_LIST_UNLOCK() on such knote is nop - there is nothing which would block exit1() from processing past the knlist_destroy() (and knlist_destroy() resets knlist lock pointers). Both consequences result either in leaked process lock, or dereferencing NULL function pointers for locking. Handle this by stopping embedding the process knlist into struct proc. Instead, the knlist is allocated together with struct proc, but marked as autodestroy on the zombie reap, by knlist_detach() function. The knlist is freed when last kevent is removed from the list, in particular, at the zombie reap time if the list is empty. As result, the knlist_remove_inevent() is no longer needed and removed. Other changes: In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events from kn_sfflags for knote registered by kernel to only get NOTE_CHILD notifications. The flags leak resulted in excessive NOTE_EXEC/NOTE_FORK reports. Fix immediate note activation in filt_procattach(). Condition should be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT report for the exiting process. In knote_fork(), do not perform racy check for KN_INFLUX before kq lock is taken. Besides being racy, it did not accounted for notes just added by scan (KN_SCAN). Some minor and incomplete style fixes. Analyzed and tested by: Eric Badger Reviewed by: jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks Approved by: re (gjb) Differential revision: https://reviews.freebsd.org/D6859 --- sys/kern/init_main.c | 2 +- sys/kern/kern_event.c | 122 +++++++++++++++++++++++++----------------- sys/kern/kern_exec.c | 2 +- sys/kern/kern_exit.c | 20 +++---- sys/kern/kern_fork.c | 4 +- sys/kern/kern_sig.c | 6 +-- sys/sys/event.h | 6 ++- sys/sys/proc.h | 2 +- 8 files changed, 92 insertions(+), 72 deletions(-) diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 7be38a66d023..85382cdd8c51 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -482,7 +482,7 @@ proc0_init(void *dummy __unused) p->p_flag = P_SYSTEM | P_INMEM | P_KPROC; p->p_flag2 = 0; p->p_state = PRS_NORMAL; - knlist_init_mtx(&p->p_klist, &p->p_mtx); + p->p_klist = knlist_alloc(&p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; /* pid_max cannot be greater than PID_MAX */ diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index 0614903d6d56..97137d860f8f 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -227,14 +227,33 @@ SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) -#define KN_LIST_LOCK(kn) do { \ - if (kn->kn_knlist != NULL) \ - kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \ -} while (0) -#define KN_LIST_UNLOCK(kn) do { \ - if (kn->kn_knlist != NULL) \ - kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \ -} while (0) + +static struct knlist * +kn_list_lock(struct knote *kn) +{ + struct knlist *knl; + + knl = kn->kn_knlist; + if (knl != NULL) + knl->kl_lock(knl->kl_lockarg); + return (knl); +} + +static void +kn_list_unlock(struct knlist *knl) +{ + bool do_free; + + if (knl == NULL) + return; + do_free = knl->kl_autodestroy && knlist_empty(knl); + knl->kl_unlock(knl->kl_lockarg); + if (do_free) { + knlist_destroy(knl); + free(knl, M_KQUEUE); + } +} + #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ @@ -350,16 +369,16 @@ static int filt_procattach(struct knote *kn) { struct proc *p; - int immediate; int error; + bool exiting, immediate; - immediate = 0; + exiting = immediate = false; p = pfind(kn->kn_id); if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { p = zpfind(kn->kn_id); - immediate = 1; + exiting = true; } else if (p != NULL && (p->p_flag & P_WEXIT)) { - immediate = 1; + exiting = true; } if (p == NULL) @@ -380,8 +399,8 @@ filt_procattach(struct knote *kn) kn->kn_flags &= ~EV_FLAG2; kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; - kn->kn_sfflags &= ~NOTE_EXIT; - immediate = 1; /* Force immediate activation of child note. */ + kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); + immediate = true; /* Force immediate activation of child note. */ } /* * Internal flag indicating registration done by kernel (for other than @@ -391,8 +410,7 @@ filt_procattach(struct knote *kn) kn->kn_flags &= ~EV_FLAG1; } - if (immediate == 0) - knlist_add(&p->p_klist, kn, 1); + knlist_add(p->p_klist, kn, 1); /* * Immediately activate any child notes or, in the case of a zombie @@ -400,7 +418,7 @@ filt_procattach(struct knote *kn) * case where the target process, e.g. a child, dies before the kevent * is registered. */ - if (immediate && filt_proc(kn, NOTE_EXIT)) + if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); @@ -420,10 +438,8 @@ filt_procattach(struct knote *kn) static void filt_procdetach(struct knote *kn) { - struct proc *p; - p = kn->kn_ptr.p_proc; - knlist_remove(&p->p_klist, kn, 0); + knlist_remove(kn->kn_knlist, kn, 0); kn->kn_ptr.p_proc = NULL; } @@ -444,8 +460,6 @@ filt_proc(struct knote *kn, long hint) /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { - if (!(kn->kn_status & KN_DETACHED)) - knlist_remove_inevent(&p->p_klist, kn); kn->kn_flags |= EV_EOF | EV_ONESHOT; kn->kn_ptr.p_proc = NULL; if (kn->kn_fflags & NOTE_EXIT) @@ -479,12 +493,6 @@ knote_fork(struct knlist *list, int pid) list->kl_lock(list->kl_lockarg); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { - /* - * XXX - Why do we skip the kn if it is _INFLUX? Does this - * mean we will not properly wake up some notes? - */ - if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) - continue; kq = kn->kn_kq; KQ_LOCK(kq); if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) { @@ -525,7 +533,8 @@ knote_fork(struct knlist *list, int pid) */ kev.ident = pid; kev.filter = kn->kn_filter; - kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; + kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | + EV_FLAG2; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ @@ -1137,6 +1146,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa struct filterops *fops; struct file *fp; struct knote *kn, *tkn; + struct knlist *knl; cap_rights_t rights; int error, filt, event; int haskqglobal, filedesc_unlock; @@ -1146,6 +1156,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa fp = NULL; kn = NULL; + knl = NULL; error = 0; haskqglobal = 0; filedesc_unlock = 0; @@ -1300,7 +1311,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa knote_drop(kn, td); goto done; } - KN_LIST_LOCK(kn); + knl = kn_list_lock(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ @@ -1331,7 +1342,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa */ kn->kn_status |= KN_INFLUX | KN_SCAN; KQ_UNLOCK(kq); - KN_LIST_LOCK(kn); + knl = kn_list_lock(kn); kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); @@ -1365,7 +1376,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa KN_ACTIVE) knote_enqueue(kn); kn->kn_status &= ~(KN_INFLUX | KN_SCAN); - KN_LIST_UNLOCK(kn); + kn_list_unlock(knl); KQ_UNLOCK_FLUX(kq); done: @@ -1535,6 +1546,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, { struct kevent *kevp; struct knote *kn, *marker; + struct knlist *knl; sbintime_t asbt, rsbt; int count, error, haskqglobal, influx, nkev, touch; @@ -1660,7 +1672,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); - KN_LIST_LOCK(kn); + knl = kn_list_lock(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); @@ -1668,7 +1680,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX | KN_SCAN); kq->kq_count--; - KN_LIST_UNLOCK(kn); + kn_list_unlock(knl); influx = 1; continue; } @@ -1697,7 +1709,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~(KN_INFLUX | KN_SCAN); - KN_LIST_UNLOCK(kn); + kn_list_unlock(knl); influx = 1; } @@ -2062,7 +2074,8 @@ knlist_add(struct knlist *knl, struct knote *kn, int islocked) } static void -knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) +knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, + int kqislocked) { KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); @@ -2075,7 +2088,7 @@ knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqis SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) - knl->kl_unlock(knl->kl_lockarg); + kn_list_unlock(knl); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; @@ -2093,17 +2106,6 @@ knlist_remove(struct knlist *knl, struct knote *kn, int islocked) knlist_remove_kq(knl, kn, islocked, 0); } -/* - * remove knote from the specified knlist while in f_event handler. - */ -void -knlist_remove_inevent(struct knlist *knl, struct knote *kn) -{ - - knlist_remove_kq(knl, kn, 1, - (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK); -} - int knlist_empty(struct knlist *knl) { @@ -2202,6 +2204,7 @@ knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), else knl->kl_assert_unlocked = kl_assert_unlocked; + knl->kl_autodestroy = false; SLIST_INIT(&knl->kl_list); } @@ -2212,6 +2215,16 @@ knlist_init_mtx(struct knlist *knl, struct mtx *lock) knlist_init(knl, lock, NULL, NULL, NULL, NULL); } +struct knlist * +knlist_alloc(struct mtx *lock) +{ + struct knlist *knl; + + knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); + knlist_init_mtx(knl, lock); + return (knl); +} + void knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock) { @@ -2237,6 +2250,18 @@ knlist_destroy(struct knlist *knl) SLIST_INIT(&knl->kl_list); } +void +knlist_detach(struct knlist *knl) +{ + + KNL_ASSERT_LOCKED(knl); + knl->kl_autodestroy = true; + if (knlist_empty(knl)) { + knlist_destroy(knl); + free(knl, M_KQUEUE); + } +} + /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". @@ -2247,6 +2272,7 @@ knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) struct knote *kn, *kn2; struct kqueue *kq; + KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); if (islocked) KNL_ASSERT_LOCKED(knl); else { diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 63d855d3ceea..844f1ed939cb 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -832,7 +832,7 @@ do_execve(td, args, mac_p) * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ - KNOTE_LOCKED(&p->p_klist, NOTE_EXEC); + KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 0a0659d017ce..7b7323649b80 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -512,7 +512,7 @@ exit1(struct thread *td, int rval, int signo) /* * Notify interested parties of our demise. */ - KNOTE_LOCKED(&p->p_klist, NOTE_EXIT); + KNOTE_LOCKED(p->p_klist, NOTE_EXIT); #ifdef KDTRACE_HOOKS int reason = CLD_EXITED; @@ -523,13 +523,6 @@ exit1(struct thread *td, int rval, int signo) SDT_PROBE1(proc, , , exit, reason); #endif - /* - * Just delete all entries in the p_klist. At this point we won't - * report any more events, and there are nasty race conditions that - * can beat us if we don't. - */ - knlist_clear(&p->p_klist, 1); - /* * If this is a process with a descriptor, we may not need to deliver * a signal to the parent. proctree_lock is held over @@ -602,12 +595,6 @@ exit1(struct thread *td, int rval, int signo) p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); - /* - * Hopefully no one will try to deliver a signal to the process this - * late in the game. - */ - knlist_destroy(&p->p_klist); - /* * Save our children's rusage information in our exit rusage. */ @@ -853,6 +840,11 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options) procdesc_reap(p); sx_xunlock(&proctree_lock); + PROC_LOCK(p); + knlist_detach(p->p_klist); + p->p_klist = NULL; + PROC_UNLOCK(p); + /* * Removal from allproc list and process group list paired with * PROC_LOCK which was executed during that time should guarantee diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 035a704c72fb..b46cb722ce4f 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -748,7 +748,7 @@ do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread * /* * Tell any interested parties about the new process. */ - knote_fork(&p1->p_klist, p2->p_pid); + knote_fork(p1->p_klist, p2->p_pid); SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags); if (fr->fr_flags & RFPROCDESC) { @@ -950,7 +950,7 @@ fork1(struct thread *td, struct fork_req *fr) #ifdef MAC mac_proc_init(newproc); #endif - knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx); + newproc->p_klist = knlist_alloc(&newproc->p_mtx); STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index c740848abd74..424f3161d995 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -2112,7 +2112,7 @@ tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) } ps = p->p_sigacts; - KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig); + KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { @@ -3542,7 +3542,7 @@ filt_sigattach(struct knote *kn) kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ - knlist_add(&p->p_klist, kn, 0); + knlist_add(p->p_klist, kn, 0); return (0); } @@ -3552,7 +3552,7 @@ filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; - knlist_remove(&p->p_klist, kn, 0); + knlist_remove(p->p_klist, kn, 0); } /* diff --git a/sys/sys/event.h b/sys/sys/event.h index 5b54150f4ee3..d019307108ef 100644 --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -158,7 +158,8 @@ struct knlist { void (*kl_unlock)(void *); void (*kl_assert_locked)(void *); void (*kl_assert_unlocked)(void *); - void *kl_lockarg; /* argument passed to kl_lockf() */ + void *kl_lockarg; /* argument passed to lock functions */ + bool kl_autodestroy; }; @@ -258,9 +259,10 @@ struct rwlock; extern void knote(struct knlist *list, long hint, int lockflags); extern void knote_fork(struct knlist *list, int pid); +extern struct knlist *knlist_alloc(struct mtx *lock); +extern void knlist_detach(struct knlist *knl); extern void knlist_add(struct knlist *knl, struct knote *kn, int islocked); extern void knlist_remove(struct knlist *knl, struct knote *kn, int islocked); -extern void knlist_remove_inevent(struct knlist *knl, struct knote *kn); extern int knlist_empty(struct knlist *knl); extern void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 7ae4a87ea40c..384c2f1ecf79 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -611,7 +611,7 @@ struct proc { /* End area that is copied on creation. */ #define p_endcopy p_xsig struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ - struct knlist p_klist; /* (c) Knotes attached to this proc. */ + struct knlist *p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */