epoch(9): Make epochs non-preemptible by default

There are risks associated with waiting on a preemptible epoch section.
Rename the functions so that preemptible sections are no longer the default,
and document the issue under CAVEATS.

Reported by:	markj
Matt Macy 2018-05-18 17:29:43 +00:00
parent 1696e1f2b8
commit 70398c2f86
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=333802
9 changed files with 85 additions and 71 deletions

View file

@ -49,15 +49,15 @@
.Ft void
.Fn epoch_enter "epoch_t epoch"
.Ft void
.Fn epoch_enter_critical "epoch_t epoch"
.Fn epoch_enter_preempt "epoch_t epoch"
.Ft void
.Fn epoch_exit "epoch_t epoch"
.Ft void
.Fn epoch_exit_critical "epoch_t epoch"
.Fn epoch_exit_preempt "epoch_t epoch"
.Ft void
.Fn epoch_wait "epoch_t epoch"
.Ft void
.Fn epoch_wait_critical "epoch_t epoch"
.Fn epoch_wait_preempt "epoch_t epoch"
.Ft void
.Fn epoch_call "epoch_t epoch" "epoch_context_t ctx" "void (*callback) (epoch_context_t)"
.Ft int
@ -73,20 +73,22 @@ Epochs are allocated with
and freed with
.Fn epoch_free .
The flags passed to epoch_alloc determine whether preemption is
allowed during a section (the default) or not, as specified by
EPOCH_CRITICAL.
allowed during a section or not (the default), as specified by
EPOCH_PREEMPT.
Threads indicate the start of an epoch critical section by calling
.Fn epoch_enter .
The end of a critical section is indicated by calling
.Fn epoch_exit .
The _critical variants can be used around code in which it is safe
to have preemption disabled.
The _preempt variants can be used around code which requires preemption.
A thread can wait until a grace period has elapsed
since any threads have entered
the epoch by calling
.Fn epoch_wait .
The use of a EPOCH_CRITICAL epoch type allows one to use
.Fn epoch_wait_critical
.Fn epoch_wait
or
.Fn epoch_wait_preempt ,
depending on the epoch_type.
The use of a default epoch type allows one to use
.Fn epoch_wait
which is guaranteed to have much shorter completion times, since
we know that none of the threads in an epoch section will be preempted
before completing their sections.
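
To make the distinction concrete, here is a minimal sketch of both epoch
types as described above; the epoch variables and helper functions are
illustrative and are not part of this commit.

	#include <sys/param.h>
	#include <sys/proc.h>
	#include <sys/epoch.h>

	static epoch_t obj_epoch;		/* default: sections run with preemption disabled */
	static epoch_t obj_epoch_preempt;	/* EPOCH_PREEMPT: sections may be preempted */

	static void
	obj_epoch_init(void)
	{
		obj_epoch = epoch_alloc(0);
		obj_epoch_preempt = epoch_alloc(EPOCH_PREEMPT);
	}

	static void
	obj_reader(void)
	{
		epoch_enter(obj_epoch);			/* non-preemptible section */
		/* short, non-blocking read-side work */
		epoch_exit(obj_epoch);
	}

	static void
	obj_reader_preempt(void)
	{
		epoch_enter_preempt(obj_epoch_preempt);	/* preemptible section */
		/* read-side work; non-sleepable locks may be taken */
		epoch_exit_preempt(obj_epoch_preempt);
	}

	static void
	obj_update_sync(void)
	{
		/* Unlink the stale object from the shared structure first. */
		epoch_wait(obj_epoch);			/* short, bounded wait */
		/* The unlinked object may now be freed or reused. */
	}
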
@ -95,14 +97,14 @@ path it can ensure that a grace period has elapsed by calling
.Fn epoch_call
with a callback with any work that needs to wait for an epoch to elapse.
Only non-sleepable locks can be acquired during a section protected by
.Fn epoch_enter
.Fn epoch_enter_preempt
and
.Fn epoch_exit .
.Fn epoch_exit_preempt .
INVARIANTS can assert that a thread is in an epoch by using
.Fn in_epoch .
.Pp
The epoch API currently does not support sleeping in epoch sections.
A caller cannot do epoch_enter recursively on different epochs. A
The epoch API currently does not support sleeping in epoch_preempt sections.
A caller cannot do epoch_enter recursively on different preemptible epochs. A
caller should never call
.Fn epoch_wait
in the middle of an epoch section as this will lead to a deadlock.
@ -113,10 +115,16 @@ When modifying a list referenced from an epoch section safe removal
routines must be used and the caller can no longer modify a list entry
in place. An item to be modified must be handled with copy on write
and frees must be deferred until after a grace period has elapsed.
.Sh RETURN VALUES
.Fn in_epoch
will return 1 if curthread is in an epoch, 0 otherwise.
.Sh CAVEATS
One must be cautious when using
.Fn epoch_wait_preempt .
Threads are pinned during epoch sections, so if a thread in a section is then
preempted by a higher priority compute-bound thread on that CPU, it can be
prevented from leaving the section. Thus the wait time for the waiter is
potentially unbounded.
.Sh EXAMPLES
Async free example:
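
The asynchronous free example itself is not shown in this hunk. As a hedged
illustration of the pattern that section describes, the object type, malloc
type, and helper names below are assumed, not taken from the manual page:

	#include <sys/param.h>
	#include <sys/malloc.h>
	#include <sys/epoch.h>

	struct foo {
		int			f_value;
		struct epoch_context	f_epoch_ctx;	/* embedded context for epoch_call */
	};

	static void
	foo_free_cb(epoch_context_t ctx)
	{
		struct foo *fp;

		/* Recover the object from its embedded context and free it. */
		fp = __containerof(ctx, struct foo, f_epoch_ctx);
		free(fp, M_TEMP);
	}

	static void
	foo_destroy(struct foo *fp)
	{
		/*
		 * Unlink fp from its epoch-protected structure, then defer
		 * the free until a grace period has elapsed.
		 */
		epoch_call(global_epoch_preempt, &fp->f_epoch_ctx, foo_free_cb);
	}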

View file

@ -1717,12 +1717,12 @@ pmc_process_mmap(struct thread *td, struct pmckern_map_in *pkm)
const struct pmc_process *pp;
freepath = fullpath = NULL;
epoch_exit(global_epoch);
epoch_exit_preempt(global_epoch_preempt);
pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath);
pid = td->td_proc->p_pid;
epoch_enter(global_epoch);
epoch_enter_preempt(global_epoch_preempt);
/* Inform owners of all system-wide sampling PMCs. */
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
@ -1761,12 +1761,12 @@ pmc_process_munmap(struct thread *td, struct pmckern_map_out *pkm)
pid = td->td_proc->p_pid;
epoch_enter(global_epoch);
epoch_enter_preempt(global_epoch_preempt);
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_out(po, pid, pkm->pm_address,
pkm->pm_address + pkm->pm_size);
epoch_exit(global_epoch);
epoch_exit_preempt(global_epoch_preempt);
if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL)
return;
@ -2065,13 +2065,13 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
pk = (struct pmckern_procexec *) arg;
epoch_enter(global_epoch);
epoch_enter_preempt(global_epoch_preempt);
/* Inform owners of SS mode PMCs of the exec event. */
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procexec(po, PMC_ID_INVALID,
p->p_pid, pk->pm_entryaddr, fullpath);
epoch_exit(global_epoch);
epoch_exit_preempt(global_epoch_preempt);
PROC_LOCK(p);
is_using_hwpmcs = p->p_flag & P_HWPMC;
@ -2749,7 +2749,7 @@ pmc_release_pmc_descriptor(struct pmc *pm)
if (po->po_sscount == 0) {
atomic_subtract_rel_int(&pmc_ss_count, 1);
CK_LIST_REMOVE(po, po_ssnext);
epoch_wait(global_epoch);
epoch_wait_preempt(global_epoch_preempt);
}
}
@ -3243,7 +3243,7 @@ pmc_stop(struct pmc *pm)
if (po->po_sscount == 0) {
atomic_subtract_rel_int(&pmc_ss_count, 1);
CK_LIST_REMOVE(po, po_ssnext);
epoch_wait(global_epoch);
epoch_wait_preempt(global_epoch_preempt);
PMCDBG1(PMC,OPS,2,"po=%p removed from global list", po);
}
}
@ -4873,11 +4873,11 @@ pmc_process_exit(void *arg __unused, struct proc *p)
/*
* Log a sysexit event to all SS PMC owners.
*/
epoch_enter(global_epoch);
epoch_enter_preempt(global_epoch_preempt);
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_sysexit(po, p->p_pid);
epoch_exit(global_epoch);
epoch_exit_preempt(global_epoch_preempt);
if (!is_using_hwpmcs)
return;
@ -5058,11 +5058,11 @@ pmc_process_fork(void *arg __unused, struct proc *p1, struct proc *newproc,
* If there are system-wide sampling PMCs active, we need to
* log all fork events to their owner's logs.
*/
epoch_enter(global_epoch);
epoch_enter_preempt(global_epoch_preempt);
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procfork(po, p1->p_pid, newproc->p_pid);
epoch_exit(global_epoch);
epoch_exit_preempt(global_epoch_preempt);
if (!is_using_hwpmcs)
return;
@ -5131,12 +5131,12 @@ pmc_kld_load(void *arg __unused, linker_file_t lf)
/*
* Notify owners of system sampling PMCs about KLD operations.
*/
epoch_enter(global_epoch);
epoch_enter_preempt(global_epoch_preempt);
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_in(po, (pid_t) -1,
(uintfptr_t) lf->address, lf->filename);
epoch_exit(global_epoch);
epoch_exit_preempt(global_epoch_preempt);
/*
* TODO: Notify owners of (all) process-sampling PMCs too.
@ -5149,12 +5149,12 @@ pmc_kld_unload(void *arg __unused, const char *filename __unused,
{
struct pmc_owner *po;
epoch_enter(global_epoch);
epoch_enter_preempt(global_epoch_preempt);
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_out(po, (pid_t) -1,
(uintfptr_t) address, (uintfptr_t) address + size);
epoch_exit(global_epoch);
epoch_exit_preempt(global_epoch_preempt);
/*
* TODO: Notify owners of process-sampling PMCs.
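
The hwpmc changes above all follow one reader/updater pattern: readers walk
pmc_ss_owners inside a preemptible section of global_epoch_preempt, while the
updater unlinks an entry and waits for a grace period before reusing it. A
hedged sketch of that pattern follows; the two function names are illustrative.

	static void
	ss_owners_reader(void)
	{
		struct pmc_owner *po;

		epoch_enter_preempt(global_epoch_preempt);
		CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) {
			/* read-side work; non-sleepable locks may be taken */
		}
		epoch_exit_preempt(global_epoch_preempt);
	}

	static void
	ss_owners_remove(struct pmc_owner *po)
	{
		CK_LIST_REMOVE(po, po_ssnext);
		/* Wait until no reader can still hold a reference to po. */
		epoch_wait_preempt(global_epoch_preempt);
		/* po may now be reused or freed. */
	}
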

View file

@ -124,7 +124,7 @@ static __read_mostly int domoffsets[MAXMEMDOM];
static __read_mostly int inited;
static __read_mostly int epoch_count;
__read_mostly epoch_t global_epoch;
__read_mostly epoch_t global_epoch_critical;
__read_mostly epoch_t global_epoch_preempt;
static void epoch_call_task(void *context __unused);
@ -169,7 +169,7 @@ epoch_init(void *arg __unused)
}
inited = 1;
global_epoch = epoch_alloc(0);
global_epoch_critical = epoch_alloc(EPOCH_CRITICAL);
global_epoch_preempt = epoch_alloc(EPOCH_PREEMPT);
}
SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_FIRST, epoch_init, NULL);
@ -247,7 +247,7 @@ epoch_free(epoch_t epoch)
}
#endif
allepochs[epoch->e_idx] = NULL;
epoch_wait_critical(global_epoch_critical);
epoch_wait(global_epoch);
if (usedomains)
for (domain = 0; domain < vm_ndomains; domain++)
free_domain(epoch->e_pcpu_dom[domain], M_EPOCH);
@ -263,10 +263,11 @@ epoch_free(epoch_t epoch)
} while (0)
void
epoch_enter_internal(epoch_t epoch, struct thread *td)
epoch_enter_preempt_internal(epoch_t epoch, struct thread *td)
{
struct epoch_pcpu_state *eps;
MPASS(epoch->e_flags & EPOCH_PREEMPT);
INIT_CHECK(epoch);
critical_enter();
td->td_pre_epoch_prio = td->td_priority;
@ -293,7 +294,7 @@ epoch_enter_internal(epoch_t epoch, struct thread *td)
void
epoch_enter_critical(epoch_t epoch)
epoch_enter(epoch_t epoch)
{
ck_epoch_record_t *record;
ck_epoch_section_t *section;
@ -310,7 +311,7 @@ epoch_enter_critical(epoch_t epoch)
}
void
epoch_exit_internal(epoch_t epoch, struct thread *td)
epoch_exit_preempt_internal(epoch_t epoch, struct thread *td)
{
struct epoch_pcpu_state *eps;
@ -319,6 +320,7 @@ epoch_exit_internal(epoch_t epoch, struct thread *td)
critical_enter();
eps = epoch->e_pcpu[curcpu];
MPASS(epoch->e_flags & EPOCH_PREEMPT);
ck_epoch_end(&eps->eps_record.er_record, (ck_epoch_section_t*)&td->td_epoch_section);
TAILQ_REMOVE(&eps->eps_record.er_tdlist, td, td_epochq);
eps->eps_record.er_gen++;
@ -332,7 +334,7 @@ epoch_exit_internal(epoch_t epoch, struct thread *td)
}
void
epoch_exit_critical(epoch_t epoch)
epoch_exit(epoch_t epoch)
{
ck_epoch_record_t *record;
ck_epoch_section_t *section;
@ -349,11 +351,11 @@ epoch_exit_critical(epoch_t epoch)
}
/*
* epoch_block_handler is a callback from the ck code when another thread is
* epoch_block_handler_preempt is a callback from the ck code when another thread is
* currently in an epoch section.
*/
static void
epoch_block_handler(struct ck_epoch *global __unused, ck_epoch_record_t *cr,
epoch_block_handler_preempt(struct ck_epoch *global __unused, ck_epoch_record_t *cr,
void *arg __unused)
{
epoch_record_t record;
@ -481,7 +483,7 @@ epoch_block_handler(struct ck_epoch *global __unused, ck_epoch_record_t *cr,
}
void
epoch_wait(epoch_t epoch)
epoch_wait_preempt(epoch_t epoch)
{
struct thread *td;
int was_bound;
@ -495,6 +497,7 @@ epoch_wait(epoch_t epoch)
#endif
INIT_CHECK(epoch);
MPASS(epoch->e_flags & EPOCH_PREEMPT);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"epoch_wait() can sleep");
@ -512,7 +515,7 @@ epoch_wait(epoch_t epoch)
td->td_pinned = 0;
sched_bind(td, old_cpu);
ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt, NULL);
/* restore CPU binding, if any */
if (was_bound != 0) {
@ -535,19 +538,19 @@ epoch_wait(epoch_t epoch)
}
static void
epoch_block_handler_critical(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
void *arg __unused)
{
cpu_spinwait();
}
void
epoch_wait_critical(epoch_t epoch)
epoch_wait(epoch_t epoch)
{
MPASS(epoch->e_flags & EPOCH_CRITICAL);
MPASS(epoch->e_flags == 0);
critical_enter();
ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_critical, NULL);
ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
critical_exit();
}
@ -585,7 +588,7 @@ epoch_call_task(void *arg __unused)
ck_stack_init(&cb_stack);
critical_enter();
epoch_enter_critical(global_epoch_critical);
epoch_enter(global_epoch);
for (total = i = 0; i < epoch_count; i++) {
if (__predict_false((epoch = allepochs[i]) == NULL))
continue;
@ -595,7 +598,7 @@ epoch_call_task(void *arg __unused)
ck_epoch_poll_deferred(record, &cb_stack);
total += npending - record->n_pending;
}
epoch_exit_critical(global_epoch_critical);
epoch_exit(global_epoch);
*DPCPU_PTR(epoch_cb_count) -= total;
critical_exit();

View file

@ -104,6 +104,7 @@
_Static_assert(sizeof(((struct ifreq *)0)->ifr_name) ==
offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru");
epoch_t net_epoch_preempt;
epoch_t net_epoch;
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
@ -903,6 +904,7 @@ if_attachdomain(void *dummy)
{
struct ifnet *ifp;
net_epoch_preempt = epoch_alloc(EPOCH_PREEMPT);
net_epoch = epoch_alloc(0);
TAILQ_FOREACH(ifp, &V_ifnet, if_link)
if_attachdomain1(ifp);

View file

@ -73,8 +73,8 @@ __FBSDID("$FreeBSD$");
#include <net/if_lagg.h>
#include <net/ieee8023ad_lacp.h>
#define LAGG_RLOCK() epoch_enter(net_epoch)
#define LAGG_RUNLOCK() epoch_exit(net_epoch)
#define LAGG_RLOCK() epoch_enter_preempt(net_epoch_preempt)
#define LAGG_RUNLOCK() epoch_exit_preempt(net_epoch_preempt)
#define LAGG_RLOCK_ASSERT() MPASS(in_epoch())
#define LAGG_UNLOCK_ASSERT() MPASS(!in_epoch())
@ -859,7 +859,7 @@ lagg_port_destroy(struct lagg_port *lp, int rundelport)
* free port and release its ifnet reference after a grace period has
* elapsed.
*/
epoch_call(net_epoch, &lp->lp_epoch_ctx, lagg_port_destroy_cb);
epoch_call(net_epoch_preempt, &lp->lp_epoch_ctx, lagg_port_destroy_cb);
/* Update lagg capabilities */
lagg_capabilities(sc);
lagg_linkstate(sc);

View file

@ -106,6 +106,7 @@ VNET_DECLARE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
VNET_DECLARE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);
#define V_ipsec_hhh_in VNET(ipsec_hhh_in)
#define V_ipsec_hhh_out VNET(ipsec_hhh_out)
extern epoch_t net_epoch_preempt;
extern epoch_t net_epoch;
#endif /* _KERNEL */

View file

@ -35,10 +35,10 @@
struct epoch;
typedef struct epoch *epoch_t;
#define EPOCH_CRITICAL 0x1
#define EPOCH_PREEMPT 0x1
extern epoch_t global_epoch;
extern epoch_t global_epoch_critical;
extern epoch_t global_epoch_preempt;
DPCPU_DECLARE(int, epoch_cb_count);
DPCPU_DECLARE(struct grouptask, epoch_cb_task);
@ -50,17 +50,17 @@ typedef struct epoch_context *epoch_context_t;
epoch_t epoch_alloc(int flags);
void epoch_free(epoch_t epoch);
void epoch_enter_critical(epoch_t epoch);
void epoch_enter_internal(epoch_t epoch, struct thread *td);
void epoch_exit_critical(epoch_t epoch);
void epoch_exit_internal(epoch_t epoch, struct thread *td);
void epoch_enter(epoch_t epoch);
void epoch_enter_preempt_internal(epoch_t epoch, struct thread *td);
void epoch_exit(epoch_t epoch);
void epoch_exit_preempt_internal(epoch_t epoch, struct thread *td);
void epoch_wait(epoch_t epoch);
void epoch_wait_critical(epoch_t epoch);
void epoch_wait_preempt(epoch_t epoch);
void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t));
int in_epoch(void);
static __inline void
epoch_enter(epoch_t epoch)
epoch_enter_preempt(epoch_t epoch)
{
struct thread *td;
int nesting;
@ -70,18 +70,18 @@ epoch_enter(epoch_t epoch)
#ifndef INVARIANTS
if (nesting == 0)
#endif
epoch_enter_internal(epoch, td);
epoch_enter_preempt_internal(epoch, td);
}
static __inline void
epoch_exit(epoch_t epoch)
epoch_exit_preempt(epoch_t epoch)
{
struct thread *td;
td = curthread;
MPASS(td->td_epochnest);
if (td->td_epochnest-- == 1)
epoch_exit_internal(epoch, td);
epoch_exit_preempt_internal(epoch, td);
}
#endif
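
The inline wrappers above count nesting in td_epochnest, so recursive entry
into the same preemptible epoch is cheap: in non-INVARIANTS builds only the
outermost enter/exit pair reaches the *_preempt_internal functions. A hedged
sketch of what that permits, with illustrative function names:

	static void
	leaf(void)
	{
		epoch_enter_preempt(net_epoch_preempt);	/* nested: only bumps td_epochnest */
		/* ... */
		epoch_exit_preempt(net_epoch_preempt);
	}

	static void
	caller(void)
	{
		epoch_enter_preempt(net_epoch_preempt);	/* outermost: records the section */
		leaf();
		epoch_exit_preempt(net_epoch_preempt);	/* outermost exit ends the section */
	}

Recursing into a different preemptible epoch remains unsupported, as the
manual page notes.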

View file

@ -196,10 +196,10 @@ extern struct pmc_domain_buffer_header *pmc_dom_hdrs[MAXMEMDOM];
/* Hook invocation; for use within the kernel */
#define PMC_CALL_HOOK(t, cmd, arg) \
do { \
epoch_enter(global_epoch); \
epoch_enter_preempt(global_epoch_preempt); \
if (pmc_hook != NULL) \
(pmc_hook)((t), (cmd), (arg)); \
epoch_exit(global_epoch); \
epoch_exit_preempt(global_epoch_preempt); \
} while (0)
/* Hook invocation that needs an exclusive lock */

View file

@ -76,12 +76,12 @@ epoch_testcase1(struct epoch_test_instance *eti)
mtxp = &mutexB;
while (i < iterations) {
epoch_enter(test_epoch);
epoch_enter_preempt(test_epoch);
mtx_lock(mtxp);
i++;
mtx_unlock(mtxp);
epoch_exit(test_epoch);
epoch_wait(test_epoch);
epoch_exit_preempt(test_epoch);
epoch_wait_preempt(test_epoch);
}
printf("test1: thread: %d took %d ticks to complete %d iterations\n",
eti->threadid, ticks - startticks, iterations);
@ -98,13 +98,13 @@ epoch_testcase2(struct epoch_test_instance *eti)
mtxp = &mutexA;
while (i < iterations) {
epoch_enter(test_epoch);
epoch_enter_preempt(test_epoch);
mtx_lock(mtxp);
DELAY(1);
i++;
mtx_unlock(mtxp);
epoch_exit(test_epoch);
epoch_wait(test_epoch);
epoch_exit_preempt(test_epoch);
epoch_wait_preempt(test_epoch);
}
printf("test2: thread: %d took %d ticks to complete %d iterations\n",
eti->threadid, ticks - startticks, iterations);
@ -139,7 +139,7 @@ test_modinit(void)
int i, error, pri_range, pri_off;
pri_range = PRI_MIN_TIMESHARE - PRI_MIN_REALTIME;
test_epoch = epoch_alloc(0);
test_epoch = epoch_alloc(EPOCH_PREEMPT);
for (i = 0; i < mp_ncpus*2; i++) {
etilist[i].threadid = i;
error = kthread_add(testloop, &etilist[i], NULL, &testthreads[i],