qemu/coroutine-ucontext.c
Paolo Bonzini d1d1b206b0 coroutine-ucontext: use __thread
ELF thread local storage is about 10% faster on tests/test-coroutine's
perf/cost test.  The timing on my machine is 190ns per iteration with
pthread TLS, 170 with ELF TLS.

Based on a patch by Kevin Wolf and Peter Lieven, but redone to follow
the model of coroutine-win32.c (including the important "noinline"
attribute!).

Platforms without thread-local storage (OpenBSD probably?) will need
a new-enough GCC for this to compile, in order to use the same emutls
support that Windows already relies on.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 1417518350-6167-2-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-01-13 13:43:28 +00:00

195 lines
5.1 KiB
C

/*
* ucontext coroutine initialization code
*
* Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws>
* Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.0 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */
#ifdef _FORTIFY_SOURCE
#undef _FORTIFY_SOURCE
#endif
#include <stdlib.h>
#include <setjmp.h>
#include <stdint.h>
#include <ucontext.h>
#include "qemu-common.h"
#include "block/coroutine_int.h"
#ifdef CONFIG_VALGRIND_H
#include <valgrind/valgrind.h>
#endif
typedef struct {
Coroutine base;
void *stack;
sigjmp_buf env;
#ifdef CONFIG_VALGRIND_H
unsigned int valgrind_stack_id;
#endif
} CoroutineUContext;
/**
* Per-thread coroutine bookkeeping
*/
static __thread CoroutineUContext leader;
static __thread Coroutine *current;
/*
* va_args to makecontext() must be type 'int', so passing
* the pointer we need may require several int args. This
* union is a quick hack to let us do that
*/
union cc_arg {
void *p;
int i[2];
};
static void coroutine_trampoline(int i0, int i1)
{
union cc_arg arg;
CoroutineUContext *self;
Coroutine *co;
arg.i[0] = i0;
arg.i[1] = i1;
self = arg.p;
co = &self->base;
/* Initialize longjmp environment and switch back the caller */
if (!sigsetjmp(self->env, 0)) {
siglongjmp(*(sigjmp_buf *)co->entry_arg, 1);
}
while (true) {
co->entry(co->entry_arg);
qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
}
}
Coroutine *qemu_coroutine_new(void)
{
const size_t stack_size = 1 << 20;
CoroutineUContext *co;
ucontext_t old_uc, uc;
sigjmp_buf old_env;
union cc_arg arg = {0};
/* The ucontext functions preserve signal masks which incurs a
* system call overhead. sigsetjmp(buf, 0)/siglongjmp() does not
* preserve signal masks but only works on the current stack.
* Since we need a way to create and switch to a new stack, use
* the ucontext functions for that but sigsetjmp()/siglongjmp() for
* everything else.
*/
if (getcontext(&uc) == -1) {
abort();
}
co = g_malloc0(sizeof(*co));
co->stack = g_malloc(stack_size);
co->base.entry_arg = &old_env; /* stash away our jmp_buf */
uc.uc_link = &old_uc;
uc.uc_stack.ss_sp = co->stack;
uc.uc_stack.ss_size = stack_size;
uc.uc_stack.ss_flags = 0;
#ifdef CONFIG_VALGRIND_H
co->valgrind_stack_id =
VALGRIND_STACK_REGISTER(co->stack, co->stack + stack_size);
#endif
arg.p = co;
makecontext(&uc, (void (*)(void))coroutine_trampoline,
2, arg.i[0], arg.i[1]);
/* swapcontext() in, siglongjmp() back out */
if (!sigsetjmp(old_env, 0)) {
swapcontext(&old_uc, &uc);
}
return &co->base;
}
#ifdef CONFIG_VALGRIND_H
#ifdef CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE
/* Work around an unused variable in the valgrind.h macro... */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
#endif
static inline void valgrind_stack_deregister(CoroutineUContext *co)
{
VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id);
}
#ifdef CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE
#pragma GCC diagnostic pop
#endif
#endif
void qemu_coroutine_delete(Coroutine *co_)
{
CoroutineUContext *co = DO_UPCAST(CoroutineUContext, base, co_);
#ifdef CONFIG_VALGRIND_H
valgrind_stack_deregister(co);
#endif
g_free(co->stack);
g_free(co);
}
/* This function is marked noinline to prevent GCC from inlining it
* into coroutine_trampoline(). If we allow it to do that then it
* hoists the code to get the address of the TLS variable "current"
* out of the while() loop. This is an invalid transformation because
* the sigsetjmp() call may be called when running thread A but
* return in thread B, and so we might be in a different thread
* context each time round the loop.
*/
CoroutineAction __attribute__((noinline))
qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
CoroutineAction action)
{
CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_);
CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_);
int ret;
current = to_;
ret = sigsetjmp(from->env, 0);
if (ret == 0) {
siglongjmp(to->env, action);
}
return ret;
}
Coroutine *qemu_coroutine_self(void)
{
if (!current) {
current = &leader.base;
}
return current;
}
bool qemu_in_coroutine(void)
{
return current && current->caller;
}