This is the MTTCG pull-request as posted yesterday.

-----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQEcBAABAgAGBQJYsBZfAAoJEPvQ2wlanipElJ0H+QGoStSPeHrvKu7Q07v4F9zM Pvf05gRsaxvXl7UbwmXC4oKhvZf9rVJ6ITk0x/y0WvmK0mHCmNBWkC0nn5UFL5IH cdxetLz21Q+Ghpc36tZvqn2HYwRQFoEznge2LdtBDG0TyVA4jwquHU3HCG2D51zi BaImI6lYW1e4ejjZHw8cEInSxsj/HJZE4pPas2Tkci+uAnrJroErwBVRRcE/y/Tn aupl9TJFs2JdyJFNDibIm0kjB+i+jvCiLgYjbKZ/dR/+GZt73TtiBk/q9ZOFjdmT 7YFPI3F46QbGHoZahtzh0Xt7WMj94SlQgQ9OJ3zmNMfpXrze6Yc78xo/nbQ33U0= =wR0/ -----END PGP SIGNATURE----- Merge remote-tracking branch 'remotes/stsquad/tags/pull-mttcg-240217-1' into staging This is the MTTCG pull-request as posted yesterday. # gpg: Signature made Fri 24 Feb 2017 11:17:51 GMT # gpg: using RSA key 0xFBD0DB095A9E2A44 # gpg: Good signature from "Alex Bennée (Master Work Key) <alex.bennee@linaro.org>" # Primary key fingerprint: 6685 AE99 E751 67BC AFC8 DF35 FBD0 DB09 5A9E 2A44 * remotes/stsquad/tags/pull-mttcg-240217-1: (24 commits) tcg: enable MTTCG by default for ARM on x86 hosts hw/misc/imx6_src: defer clearing of SRC_SCR reset bits target-arm: ensure all cross vCPUs TLB flushes complete target-arm: don't generate WFE/YIELD calls for MTTCG target-arm/powerctl: defer cpu reset work to CPU context cputlb: introduce tlb_flush_*_all_cpus[_synced] cputlb: atomically update tlb fields used by tlb_reset_dirty cputlb: add tlb_flush_by_mmuidx async routines cputlb and arm/sparc targets: convert mmuidx flushes from varg to bitmap cputlb: introduce tlb_flush_* async work. cputlb: tweak qemu_ram_addr_from_host_nofail reporting cputlb: add assert_cpu_is_self checks tcg: handle EXCP_ATOMIC exception for system emulation tcg: enable thread-per-vCPU tcg: enable tb_lock() for SoftMMU tcg: remove global exit_request tcg: drop global lock during TCG code execution tcg: rename tcg_current_cpu to tcg_current_rr_cpu tcg: add kick timer for single-threaded vCPU emulation tcg: add options for enabling MTTCG ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2024-07-21 10:24:33 +00:00 · 2017-02-25 18:43:52 +00:00 · 2017-02-25 18:43:52 +00:00 · 28f997a82c
parent 2421f381dc ca759f9e38
commit 28f997a82c
40 changed files with 1881 additions and 479 deletions
--- a/6
+++ b/6
@ -5879,6 +5879,7 @@ mkdir -p $target_dir
 echo "# Automatically generated by configure - do not modify" > $config_target_mak

 bflt="no"
+mttcg="no"
 interp_prefix1=$(echo "$interp_prefix" | sed "s/%M/$target_name/g")
 gdb_xml_files=""

@ -5897,11 +5898,13 @@ case "$target_name" in
  arm|armeb)
    TARGET_ARCH=arm
    bflt="yes"
+    mttcg="yes"
    gdb_xml_files="arm-core.xml arm-vfp.xml arm-vfp3.xml arm-neon.xml"
  ;;
  aarch64)
    TARGET_BASE_ARCH=arm
    bflt="yes"
+    mttcg="yes"
    gdb_xml_files="aarch64-core.xml aarch64-fpu.xml arm-core.xml arm-vfp.xml arm-vfp3.xml arm-neon.xml"
  ;;
  cris)
@ -6066,6 +6069,9 @@ if test "$target_bigendian" = "yes" ; then
 fi
 if test "$target_softmmu" = "yes" ; then
  echo "CONFIG_SOFTMMU=y" >> $config_target_mak
+  if test "$mttcg" = "yes" ; then
+    echo "TARGET_SUPPORTS_MTTCG=y" >> $config_target_mak
+  fi
 fi
 if test "$target_user_only" = "yes" ; then
  echo "CONFIG_USER_ONLY=y" >> $config_target_mak
--- a/cpu-exec-common.c
+++ b/cpu-exec-common.c
@ -23,9 +23,6 @@
 #include "exec/exec-all.h"
 #include "exec/memory-internal.h"

-bool exit_request;
-CPUState *tcg_current_cpu;
-
 /* exit the current TB, but without causing any exception to be raised */
 void cpu_loop_exit_noexc(CPUState *cpu)
 {
--- a/cpu-exec.c
+++ b/cpu-exec.c
@ -29,6 +29,7 @@
 #include "qemu/rcu.h"
 #include "exec/tb-hash.h"
 #include "exec/log.h"
+#include "qemu/main-loop.h"
 #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
 #include "hw/i386/apic.h"
 #endif
@ -227,20 +228,43 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,

 static void cpu_exec_step(CPUState *cpu)
 {
+    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
    TranslationBlock *tb;
    target_ulong cs_base, pc;
    uint32_t flags;

    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
-    tb = tb_gen_code(cpu, pc, cs_base, flags,
-                     1 | CF_NOCACHE | CF_IGNORE_ICOUNT);
-    tb->orig_tb = NULL;
-    /* execute the generated code */
-    trace_exec_tb_nocache(tb, pc);
-    cpu_tb_exec(cpu, tb);
-    tb_phys_invalidate(tb, -1);
-    tb_free(tb);
+    if (sigsetjmp(cpu->jmp_env, 0) == 0) {
+        mmap_lock();
+        tb_lock();
+        tb = tb_gen_code(cpu, pc, cs_base, flags,
+                         1 | CF_NOCACHE | CF_IGNORE_ICOUNT);
+        tb->orig_tb = NULL;
+        tb_unlock();
+        mmap_unlock();
+
+        cc->cpu_exec_enter(cpu);
+        /* execute the generated code */
+        trace_exec_tb_nocache(tb, pc);
+        cpu_tb_exec(cpu, tb);
+        cc->cpu_exec_exit(cpu);
+
+        tb_lock();
+        tb_phys_invalidate(tb, -1);
+        tb_free(tb);
+        tb_unlock();
+    } else {
+        /* We may have exited due to another problem here, so we need
+         * to reset any tb_locks we may have taken but didn't release.
+         * The mmap_lock is dropped by tb_gen_code if it runs out of
+         * memory.
+         */
+#ifndef CONFIG_SOFTMMU
+        tcg_debug_assert(!have_mmap_lock());
+#endif
+        tb_lock_reset();
+    }
 }

 void cpu_exec_step_atomic(CPUState *cpu)
@ -384,12 +408,13 @@ static inline bool cpu_handle_halt(CPUState *cpu)
        if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
            && replay_interrupt()) {
            X86CPU *x86_cpu = X86_CPU(cpu);
+            qemu_mutex_lock_iothread();
            apic_poll_irq(x86_cpu->apic_state);
            cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
+            qemu_mutex_unlock_iothread();
        }
 #endif
        if (!cpu_has_work(cpu)) {
-            current_cpu = NULL;
            return true;
        }

@ -439,7 +464,9 @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
 #else
            if (replay_exception()) {
                CPUClass *cc = CPU_GET_CLASS(cpu);
+                qemu_mutex_lock_iothread();
                cc->do_interrupt(cpu);
+                qemu_mutex_unlock_iothread();
                cpu->exception_index = -1;
            } else if (!replay_has_interrupt()) {
                /* give a chance to iothread in replay mode */
@ -465,9 +492,11 @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
                                        TranslationBlock **last_tb)
 {
    CPUClass *cc = CPU_GET_CLASS(cpu);
-    int interrupt_request = cpu->interrupt_request;

-    if (unlikely(interrupt_request)) {
+    if (unlikely(atomic_read(&cpu->interrupt_request))) {
+        int interrupt_request;
+        qemu_mutex_lock_iothread();
+        interrupt_request = cpu->interrupt_request;
        if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
            /* Mask out external interrupts for this step. */
            interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
@ -475,6 +504,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
        if (interrupt_request & CPU_INTERRUPT_DEBUG) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG;
            cpu->exception_index = EXCP_DEBUG;
+            qemu_mutex_unlock_iothread();
            return true;
        }
        if (replay_mode == REPLAY_MODE_PLAY && !replay_has_interrupt()) {
@ -484,6 +514,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
            cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
            cpu->halted = 1;
            cpu->exception_index = EXCP_HLT;
+            qemu_mutex_unlock_iothread();
            return true;
        }
 #if defined(TARGET_I386)
@ -494,12 +525,14 @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
            cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0, 0);
            do_cpu_init(x86_cpu);
            cpu->exception_index = EXCP_HALTED;
+            qemu_mutex_unlock_iothread();
            return true;
        }
 #else
        else if (interrupt_request & CPU_INTERRUPT_RESET) {
            replay_interrupt();
            cpu_reset(cpu);
+            qemu_mutex_unlock_iothread();
            return true;
        }
 #endif
@ -522,7 +555,12 @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
               the program flow was changed */
            *last_tb = NULL;
        }
+
+        /* If we exit via cpu_loop_exit/longjmp it is reset in cpu_exec */
+        qemu_mutex_unlock_iothread();
    }
+
+
    if (unlikely(atomic_read(&cpu->exit_request) || replay_has_interrupt())) {
        atomic_set(&cpu->exit_request, 0);
        cpu->exception_index = EXCP_INTERRUPT;
@ -548,15 +586,13 @@ static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
    *tb_exit = ret & TB_EXIT_MASK;
    switch (*tb_exit) {
    case TB_EXIT_REQUESTED:
-        /* Something asked us to stop executing
-         * chained TBs; just continue round the main
-         * loop. Whatever requested the exit will also
-         * have set something else (eg exit_request or
-         * interrupt_request) which we will handle
-         * next time around the loop.  But we need to
-         * ensure the zeroing of tcg_exit_req (see cpu_tb_exec)
-         * comes before the next read of cpu->exit_request
-         * or cpu->interrupt_request.
+        /* Something asked us to stop executing chained TBs; just
+         * continue round the main loop. Whatever requested the exit
+         * will also have set something else (eg interrupt_request)
+         * which we will handle next time around the loop.  But we
+         * need to ensure the tcg_exit_req read in generated code
+         * comes before the next read of cpu->exit_request or
+         * cpu->interrupt_request.
         */
        smp_mb();
        *last_tb = NULL;
@ -608,13 +644,8 @@ int cpu_exec(CPUState *cpu)
        return EXCP_HALTED;
    }

-    atomic_mb_set(&tcg_current_cpu, cpu);
    rcu_read_lock();

-    if (unlikely(atomic_mb_read(&exit_request))) {
-        cpu->exit_request = 1;
-    }
-
    cc->cpu_exec_enter(cpu);

    /* Calculate difference between guest clock and host clock.
@ -640,6 +671,9 @@ int cpu_exec(CPUState *cpu)
 #endif /* buggy compiler */
        cpu->can_do_io = 1;
        tb_lock_reset();
+        if (qemu_mutex_iothread_locked()) {
+            qemu_mutex_unlock_iothread();
+        }
    }

    /* if an exception is pending, we execute it here */
@ -659,10 +693,5 @@ int cpu_exec(CPUState *cpu)
    cc->cpu_exec_exit(cpu);
    rcu_read_unlock();

-    /* fail safe : never use current_cpu outside cpu_exec() */
-    current_cpu = NULL;
-
-    /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
-    atomic_set(&tcg_current_cpu, NULL);
    return ret;
 }
--- a/cpus.c
+++ b/cpus.c
@ -25,6 +25,7 @@
 /* Needed early for CONFIG_BSD etc. */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
+#include "qemu/config-file.h"
 #include "cpu.h"
 #include "monitor/monitor.h"
 #include "qapi/qmp/qerror.h"
@ -45,6 +46,7 @@
 #include "qemu/main-loop.h"
 #include "qemu/bitmap.h"
 #include "qemu/seqlock.h"
+#include "tcg.h"
 #include "qapi-event.h"
 #include "hw/nmi.h"
 #include "sysemu/replay.h"
@ -150,6 +152,77 @@ typedef struct TimersState {
 } TimersState;

 static TimersState timers_state;
+bool mttcg_enabled;
+
+/*
+ * We default to false if we know other options have been enabled
+ * which are currently incompatible with MTTCG. Otherwise when each
+ * guest (target) has been updated to support:
+ *   - atomic instructions
+ *   - memory ordering primitives (barriers)
+ * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
+ *
+ * Once a guest architecture has been converted to the new primitives
+ * there are two remaining limitations to check.
+ *
+ * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
+ * - The host must have a stronger memory order than the guest
+ *
+ * It may be possible in future to support strong guests on weak hosts
+ * but that will require tagging all load/stores in a guest with their
+ * implicit memory order requirements which would likely slow things
+ * down a lot.
+ */
+
+static bool check_tcg_memory_orders_compatible(void)
+{
+#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
+    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
+#else
+    return false;
+#endif
+}
+
+static bool default_mttcg_enabled(void)
+{
+    QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
+    const char *rr = qemu_opt_get(icount_opts, "rr");
+
+    if (rr || TCG_OVERSIZED_GUEST) {
+        return false;
+    } else {
+#ifdef TARGET_SUPPORTS_MTTCG
+        return check_tcg_memory_orders_compatible();
+#else
+        return false;
+#endif
+    }
+}
+
+void qemu_tcg_configure(QemuOpts *opts, Error **errp)
+{
+    const char *t = qemu_opt_get(opts, "thread");
+    if (t) {
+        if (strcmp(t, "multi") == 0) {
+            if (TCG_OVERSIZED_GUEST) {
+                error_setg(errp, "No MTTCG when guest word size > hosts");
+            } else {
+                if (!check_tcg_memory_orders_compatible()) {
+                    error_report("Guest expects a stronger memory ordering "
+                                 "than the host provides");
+                    error_printf("This may cause strange/hard to debug errors");
+                }
+                mttcg_enabled = true;
+            }
+        } else if (strcmp(t, "single") == 0) {
+            mttcg_enabled = false;
+        } else {
+            error_setg(errp, "Invalid 'thread' setting %s", t);
+        }
+    } else {
+        mttcg_enabled = default_mttcg_enabled();
+    }
+}

 int64_t cpu_get_icount_raw(void)
 {
@ -694,6 +767,63 @@ void configure_icount(QemuOpts *opts, Error **errp)
                   NANOSECONDS_PER_SECOND / 10);
 }

+/***********************************************************/
+/* TCG vCPU kick timer
+ *
+ * The kick timer is responsible for moving single threaded vCPU
+ * emulation on to the next vCPU. If more than one vCPU is running a
+ * timer event with force a cpu->exit so the next vCPU can get
+ * scheduled.
+ *
+ * The timer is removed if all vCPUs are idle and restarted again once
+ * idleness is complete.
+ */
+
+static QEMUTimer *tcg_kick_vcpu_timer;
+static CPUState *tcg_current_rr_cpu;
+
+#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
+
+static inline int64_t qemu_tcg_next_kick(void)
+{
+    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
+}
+
+/* Kick the currently round-robin scheduled vCPU */
+static void qemu_cpu_kick_rr_cpu(void)
+{
+    CPUState *cpu;
+    do {
+        cpu = atomic_mb_read(&tcg_current_rr_cpu);
+        if (cpu) {
+            cpu_exit(cpu);
+        }
+    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
+}
+
+static void kick_tcg_thread(void *opaque)
+{
+    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
+    qemu_cpu_kick_rr_cpu();
+}
+
+static void start_tcg_kick_timer(void)
+{
+    if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
+        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+                                           kick_tcg_thread, NULL);
+        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
+    }
+}
+
+static void stop_tcg_kick_timer(void)
+{
+    if (tcg_kick_vcpu_timer) {
+        timer_del(tcg_kick_vcpu_timer);
+        tcg_kick_vcpu_timer = NULL;
+    }
+}
+
 /***********************************************************/
 void hw_error(const char *fmt, ...)
 {
@ -896,8 +1026,6 @@ static void qemu_kvm_init_cpu_signals(CPUState *cpu)
 #endif /* _WIN32 */

 static QemuMutex qemu_global_mutex;
-static QemuCond qemu_io_proceeded_cond;
-static unsigned iothread_requesting_mutex;

 static QemuThread io_thread;

@ -911,7 +1039,6 @@ void qemu_init_cpu_loop(void)
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
-    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
@ -936,28 +1063,34 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu)

 static void qemu_wait_io_event_common(CPUState *cpu)
 {
+    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
-    cpu->thread_kicked = false;
+}
+
+static bool qemu_tcg_should_sleep(CPUState *cpu)
+{
+    if (mttcg_enabled) {
+        return cpu_thread_is_idle(cpu);
+    } else {
+        return all_cpu_threads_idle();
+    }
 }

 static void qemu_tcg_wait_io_event(CPUState *cpu)
 {
-    while (all_cpu_threads_idle()) {
+    while (qemu_tcg_should_sleep(cpu)) {
+        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

-    while (iothread_requesting_mutex) {
-        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
-    }
+    start_tcg_kick_timer();

-    CPU_FOREACH(cpu) {
-        qemu_wait_io_event_common(cpu);
-    }
+    qemu_wait_io_event_common(cpu);
 }

 static void qemu_kvm_wait_io_event(CPUState *cpu)
@ -1028,6 +1161,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
+    current_cpu = cpu;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
@ -1036,9 +1170,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

-    current_cpu = cpu;
    while (1) {
-        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
@ -1049,7 +1181,6 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
            exit(1);
        }
        qemu_mutex_lock_iothread();
-        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

@ -1115,9 +1246,11 @@ static int tcg_cpu_exec(CPUState *cpu)
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
    }
+    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
+    qemu_mutex_lock_iothread();
 #ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
 #endif
@ -1150,7 +1283,16 @@ static void deal_with_unplugged_cpus(void)
    }
 }

-static void *qemu_tcg_cpu_thread_fn(void *arg)
+/* Single-threaded TCG
+ *
+ * In the single-threaded case each vCPU is simulated in turn. If
+ * there is more than a single vCPU we create a simple timer to kick
+ * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
+ * This is done explicitly rather than relying on side-effects
+ * elsewhere.
+ */
+
+static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
 {
    CPUState *cpu = arg;

@ -1172,15 +1314,18 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)

        /* process any pending work */
        CPU_FOREACH(cpu) {
+            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

-    /* process any pending work */
-    atomic_mb_set(&exit_request, 1);
+    start_tcg_kick_timer();

    cpu = first_cpu;

+    /* process any pending work */
+    cpu->exit_request = 1;
+
    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();
@ -1189,7 +1334,10 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
            cpu = first_cpu;
        }

-        for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {
+        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
+
+            atomic_mb_set(&tcg_current_rr_cpu, cpu);
+            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
@ -1200,22 +1348,32 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
+                } else if (r == EXCP_ATOMIC) {
+                    qemu_mutex_unlock_iothread();
+                    cpu_exec_step_atomic(cpu);
+                    qemu_mutex_lock_iothread();
+                    break;
                }
-            } else if (cpu->stop || cpu->stopped) {
+            } else if (cpu->stop) {
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

-        } /* for cpu.. */
+            cpu = CPU_NEXT(cpu);
+        } /* while (cpu && !cpu->exit_request).. */

-        /* Pairs with smp_wmb in qemu_cpu_kick.  */
-        atomic_mb_set(&exit_request, 0);
+        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
+        atomic_set(&tcg_current_rr_cpu, NULL);
+
+        if (cpu && cpu->exit_request) {
+            atomic_mb_set(&cpu->exit_request, 0);
+        }

        handle_icount_deadline();

-        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
+        qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

@ -1262,6 +1420,68 @@ static void CALLBACK dummy_apc_func(ULONG_PTR unused)
 }
 #endif

+/* Multi-threaded TCG
+ *
+ * In the multi-threaded case each vCPU has its own thread. The TLS
+ * variable current_cpu can be used deep in the code to find the
+ * current CPUState for a given thread.
+ */
+
+static void *qemu_tcg_cpu_thread_fn(void *arg)
+{
+    CPUState *cpu = arg;
+
+    rcu_register_thread();
+
+    qemu_mutex_lock_iothread();
+    qemu_thread_get_self(cpu->thread);
+
+    cpu->thread_id = qemu_get_thread_id();
+    cpu->created = true;
+    cpu->can_do_io = 1;
+    current_cpu = cpu;
+    qemu_cond_signal(&qemu_cpu_cond);
+
+    /* process any pending work */
+    cpu->exit_request = 1;
+
+    while (1) {
+        if (cpu_can_run(cpu)) {
+            int r;
+            r = tcg_cpu_exec(cpu);
+            switch (r) {
+            case EXCP_DEBUG:
+                cpu_handle_guest_debug(cpu);
+                break;
+            case EXCP_HALTED:
+                /* during start-up the vCPU is reset and the thread is
+                 * kicked several times. If we don't ensure we go back
+                 * to sleep in the halted state we won't cleanly
+                 * start-up when the vCPU is enabled.
+                 *
+                 * cpu->halted should ensure we sleep in wait_io_event
+                 */
+                g_assert(cpu->halted);
+                break;
+            case EXCP_ATOMIC:
+                qemu_mutex_unlock_iothread();
+                cpu_exec_step_atomic(cpu);
+                qemu_mutex_lock_iothread();
+            default:
+                /* Ignore everything else? */
+                break;
+            }
+        }
+
+        handle_icount_deadline();
+
+        atomic_mb_set(&cpu->exit_request, 0);
+        qemu_tcg_wait_io_event(cpu);
+    }
+
+    return NULL;
+}
+
 static void qemu_cpu_kick_thread(CPUState *cpu)
 {
 #ifndef _WIN32
@ -1287,24 +1507,13 @@ static void qemu_cpu_kick_thread(CPUState *cpu)
 #endif
 }

-static void qemu_cpu_kick_no_halt(void)
-{
-    CPUState *cpu;
-    /* Ensure whatever caused the exit has reached the CPU threads before
-     * writing exit_request.
-     */
-    atomic_mb_set(&exit_request, 1);
-    cpu = atomic_mb_read(&tcg_current_cpu);
-    if (cpu) {
-        cpu_exit(cpu);
-    }
-}
-
 void qemu_cpu_kick(CPUState *cpu)
 {
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
-        qemu_cpu_kick_no_halt();
+        cpu_exit(cpu);
+        /* NOP unless doing single-thread RR */
+        qemu_cpu_kick_rr_cpu();
    } else {
        if (hax_enabled()) {
            /*
@ -1342,27 +1551,14 @@ bool qemu_mutex_iothread_locked(void)

 void qemu_mutex_lock_iothread(void)
 {
-    atomic_inc(&iothread_requesting_mutex);
-    /* In the simple case there is no need to bump the VCPU thread out of
-     * TCG code execution.
-     */
-    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
-        !first_cpu || !first_cpu->created) {
-        qemu_mutex_lock(&qemu_global_mutex);
-        atomic_dec(&iothread_requesting_mutex);
-    } else {
-        if (qemu_mutex_trylock(&qemu_global_mutex)) {
-            qemu_cpu_kick_no_halt();
-            qemu_mutex_lock(&qemu_global_mutex);
-        }
-        atomic_dec(&iothread_requesting_mutex);
-        qemu_cond_broadcast(&qemu_io_proceeded_cond);
-    }
+    g_assert(!qemu_mutex_iothread_locked());
+    qemu_mutex_lock(&qemu_global_mutex);
    iothread_locked = true;
 }

 void qemu_mutex_unlock_iothread(void)
 {
+    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
 }
@ -1392,13 +1588,6 @@ void pause_all_vcpus(void)

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
-        if (!kvm_enabled()) {
-            CPU_FOREACH(cpu) {
-                cpu->stop = false;
-                cpu->stopped = true;
-            }
-            return;
-        }
    }

    while (!all_vcpus_paused()) {
@ -1447,29 +1636,43 @@ void cpu_remove_sync(CPUState *cpu)
 static void qemu_tcg_init_vcpu(CPUState *cpu)
 {
    char thread_name[VCPU_THREAD_NAME_SIZE];
-    static QemuCond *tcg_halt_cond;
-    static QemuThread *tcg_cpu_thread;
+    static QemuCond *single_tcg_halt_cond;
+    static QemuThread *single_tcg_cpu_thread;

-    /* share a single thread for all cpus with TCG */
-    if (!tcg_cpu_thread) {
+    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
-        tcg_halt_cond = cpu->halt_cond;
-        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
+
+        if (qemu_tcg_mttcg_enabled()) {
+            /* create a thread per vCPU with TCG (MTTCG) */
+            parallel_cpus = true;
+            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
-        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
-                           cpu, QEMU_THREAD_JOINABLE);
+
+            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
+                               cpu, QEMU_THREAD_JOINABLE);
+
+        } else {
+            /* share a single thread for all cpus with TCG */
+            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
+            qemu_thread_create(cpu->thread, thread_name,
+                               qemu_tcg_rr_cpu_thread_fn,
+                               cpu, QEMU_THREAD_JOINABLE);
+
+            single_tcg_halt_cond = cpu->halt_cond;
+            single_tcg_cpu_thread = cpu->thread;
+        }
 #ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
 #endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
-        tcg_cpu_thread = cpu->thread;
    } else {
-        cpu->thread = tcg_cpu_thread;
-        cpu->halt_cond = tcg_halt_cond;
+        /* For non-MTTCG cases we share the thread */
+        cpu->thread = single_tcg_cpu_thread;
+        cpu->halt_cond = single_tcg_halt_cond;
    }
 }

--- a/cputlb.c
+++ b/cputlb.c
@ -18,6 +18,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/main-loop.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "exec/memory.h"
@ -57,6 +58,40 @@
    } \
 } while (0)

+#define assert_cpu_is_self(this_cpu) do {                         \
+        if (DEBUG_TLB_GATE) {                                     \
+            g_assert(!cpu->created || qemu_cpu_is_self(cpu));     \
+        }                                                         \
+    } while (0)
+
+/* run_on_cpu_data.target_ptr should always be big enough for a
+ * target_ulong even on 32 bit builds */
+QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
+
+/* We currently can't handle more than 16 bits in the MMUIDX bitmask.
+ */
+QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
+#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
+
+/* flush_all_helper: run fn across all cpus
+ *
+ * If the wait flag is set then the src cpu's helper will be queued as
+ * "safe" work and the loop exited creating a synchronisation point
+ * where all queued work will be finished before execution starts
+ * again.
+ */
+static void flush_all_helper(CPUState *src, run_on_cpu_func fn,
+                             run_on_cpu_data d)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        if (cpu != src) {
+            async_run_on_cpu(cpu, fn, d);
+        }
+    }
+}
+
 /* statistics */
 int tlb_flush_count;

@ -65,10 +100,22 @@ int tlb_flush_count;
 * flushing more entries than required is only an efficiency issue,
 * not a correctness issue.
 */
-void tlb_flush(CPUState *cpu)
+static void tlb_flush_nocheck(CPUState *cpu)
 {
    CPUArchState *env = cpu->env_ptr;

+    /* The QOM tests will trigger tlb_flushes without setting up TCG
+     * so we bug out here in that case.
+     */
+    if (!tcg_enabled()) {
+        return;
+    }
+
+    assert_cpu_is_self(cpu);
+    tlb_debug("(count: %d)\n", tlb_flush_count++);
+
+    tb_lock();
+
    memset(env->tlb_table, -1, sizeof(env->tlb_table));
    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
@ -76,39 +123,117 @@ void tlb_flush(CPUState *cpu)
    env->vtlb_index = 0;
    env->tlb_flush_addr = -1;
    env->tlb_flush_mask = 0;
-    tlb_flush_count++;
+
+    tb_unlock();
+
+    atomic_mb_set(&cpu->pending_tlb_flush, 0);
 }

-static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
+static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
+{
+    tlb_flush_nocheck(cpu);
+}
+
+void tlb_flush(CPUState *cpu)
+{
+    if (cpu->created && !qemu_cpu_is_self(cpu)) {
+        if (atomic_mb_read(&cpu->pending_tlb_flush) != ALL_MMUIDX_BITS) {
+            atomic_mb_set(&cpu->pending_tlb_flush, ALL_MMUIDX_BITS);
+            async_run_on_cpu(cpu, tlb_flush_global_async_work,
+                             RUN_ON_CPU_NULL);
+        }
+    } else {
+        tlb_flush_nocheck(cpu);
+    }
+}
+
+void tlb_flush_all_cpus(CPUState *src_cpu)
+{
+    const run_on_cpu_func fn = tlb_flush_global_async_work;
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_NULL);
+    fn(src_cpu, RUN_ON_CPU_NULL);
+}
+
+void tlb_flush_all_cpus_synced(CPUState *src_cpu)
+{
+    const run_on_cpu_func fn = tlb_flush_global_async_work;
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_NULL);
+    async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_NULL);
+}
+
+static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 {
    CPUArchState *env = cpu->env_ptr;
+    unsigned long mmu_idx_bitmask = data.host_int;
+    int mmu_idx;

-    tlb_debug("start\n");
+    assert_cpu_is_self(cpu);

-    for (;;) {
-        int mmu_idx = va_arg(argp, int);
+    tb_lock();

-        if (mmu_idx < 0) {
-            break;
+    tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);
+
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+
+        if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
+            tlb_debug("%d\n", mmu_idx);
+
+            memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+            memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
        }
-
-        tlb_debug("%d\n", mmu_idx);
-
-        memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
-        memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
    }

    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
+
+    tlb_debug("done\n");
+
+    tb_unlock();
 }

-void tlb_flush_by_mmuidx(CPUState *cpu, ...)
+void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
 {
-    va_list argp;
-    va_start(argp, cpu);
-    v_tlb_flush_by_mmuidx(cpu, argp);
-    va_end(argp);
+    tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap);
+
+    if (!qemu_cpu_is_self(cpu)) {
+        uint16_t pending_flushes = idxmap;
+        pending_flushes &= ~atomic_mb_read(&cpu->pending_tlb_flush);
+
+        if (pending_flushes) {
+            tlb_debug("reduced mmu_idx: 0x%" PRIx16 "\n", pending_flushes);
+
+            atomic_or(&cpu->pending_tlb_flush, pending_flushes);
+            async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
+                             RUN_ON_CPU_HOST_INT(pending_flushes));
+        }
+    } else {
+        tlb_flush_by_mmuidx_async_work(cpu,
+                                       RUN_ON_CPU_HOST_INT(idxmap));
+    }
 }

+void tlb_flush_by_mmuidx_all_cpus(CPUState *src_cpu, uint16_t idxmap)
+{
+    const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work;
+
+    tlb_debug("mmu_idx: 0x%"PRIx16"\n", idxmap);
+
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap));
+    fn(src_cpu, RUN_ON_CPU_HOST_INT(idxmap));
+}
+
+void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
+                                                       uint16_t idxmap)
+{
+    const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work;
+
+    tlb_debug("mmu_idx: 0x%"PRIx16"\n", idxmap);
+
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap));
+    async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap));
+}
+
+
+
 static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
 {
    if (addr == (tlb_entry->addr_read &
@ -121,12 +246,15 @@ static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
    }
 }

-void tlb_flush_page(CPUState *cpu, target_ulong addr)
+static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
 {
    CPUArchState *env = cpu->env_ptr;
+    target_ulong addr = (target_ulong) data.target_ptr;
    int i;
    int mmu_idx;

+    assert_cpu_is_self(cpu);
+
    tlb_debug("page :" TARGET_FMT_lx "\n", addr);

    /* Check if we need to flush due to large pages.  */
@ -156,15 +284,62 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
    tb_flush_jmp_cache(cpu, addr);
 }

-void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
+void tlb_flush_page(CPUState *cpu, target_ulong addr)
+{
+    tlb_debug("page :" TARGET_FMT_lx "\n", addr);
+
+    if (!qemu_cpu_is_self(cpu)) {
+        async_run_on_cpu(cpu, tlb_flush_page_async_work,
+                         RUN_ON_CPU_TARGET_PTR(addr));
+    } else {
+        tlb_flush_page_async_work(cpu, RUN_ON_CPU_TARGET_PTR(addr));
+    }
+}
+
+/* As we are going to hijack the bottom bits of the page address for a
+ * mmuidx bit mask we need to fail to build if we can't do that
+ */
+QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS_MIN);
+
+static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
+                                                run_on_cpu_data data)
 {
    CPUArchState *env = cpu->env_ptr;
-    int i, k;
-    va_list argp;
+    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
+    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
+    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
+    int page = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    int mmu_idx;
+    int i;

-    va_start(argp, addr);
+    assert_cpu_is_self(cpu);

-    tlb_debug("addr "TARGET_FMT_lx"\n", addr);
+    tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx:0x%lx\n",
+              page, addr, mmu_idx_bitmap);
+
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+        if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
+            tlb_flush_entry(&env->tlb_table[mmu_idx][page], addr);
+
+            /* check whether there are vltb entries that need to be flushed */
+            for (i = 0; i < CPU_VTLB_SIZE; i++) {
+                tlb_flush_entry(&env->tlb_v_table[mmu_idx][i], addr);
+            }
+        }
+    }
+
+    tb_flush_jmp_cache(cpu, addr);
+}
+
+static void tlb_check_page_and_flush_by_mmuidx_async_work(CPUState *cpu,
+                                                          run_on_cpu_data data)
+{
+    CPUArchState *env = cpu->env_ptr;
+    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
+    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
+    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
+
+    tlb_debug("addr:"TARGET_FMT_lx" mmu_idx: %04lx\n", addr, mmu_idx_bitmap);

    /* Check if we need to flush due to large pages.  */
    if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
@ -172,33 +347,80 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
                  TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
                  env->tlb_flush_addr, env->tlb_flush_mask);

-        v_tlb_flush_by_mmuidx(cpu, argp);
-        va_end(argp);
-        return;
+        tlb_flush_by_mmuidx_async_work(cpu,
+                                       RUN_ON_CPU_HOST_INT(mmu_idx_bitmap));
+    } else {
+        tlb_flush_page_by_mmuidx_async_work(cpu, data);
    }
+}

-    addr &= TARGET_PAGE_MASK;
-    i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
+{
+    target_ulong addr_and_mmu_idx;

-    for (;;) {
-        int mmu_idx = va_arg(argp, int);
+    tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%" PRIx16 "\n", addr, idxmap);

-        if (mmu_idx < 0) {
-            break;
-        }
+    /* This should already be page aligned */
+    addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
+    addr_and_mmu_idx |= idxmap;

-        tlb_debug("idx %d\n", mmu_idx);
-
-        tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
-
-        /* check whether there are vltb entries that need to be flushed */
-        for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
-        }
+    if (!qemu_cpu_is_self(cpu)) {
+        async_run_on_cpu(cpu, tlb_check_page_and_flush_by_mmuidx_async_work,
+                         RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+    } else {
+        tlb_check_page_and_flush_by_mmuidx_async_work(
+            cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
    }
-    va_end(argp);
+}

-    tb_flush_jmp_cache(cpu, addr);
+void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr,
+                                       uint16_t idxmap)
+{
+    const run_on_cpu_func fn = tlb_check_page_and_flush_by_mmuidx_async_work;
+    target_ulong addr_and_mmu_idx;
+
+    tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
+
+    /* This should already be page aligned */
+    addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
+    addr_and_mmu_idx |= idxmap;
+
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+    fn(src_cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+}
+
+void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
+                                                            target_ulong addr,
+                                                            uint16_t idxmap)
+{
+    const run_on_cpu_func fn = tlb_check_page_and_flush_by_mmuidx_async_work;
+    target_ulong addr_and_mmu_idx;
+
+    tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
+
+    /* This should already be page aligned */
+    addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
+    addr_and_mmu_idx |= idxmap;
+
+    flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+    async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+}
+
+void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr)
+{
+    const run_on_cpu_func fn = tlb_flush_page_async_work;
+
+    flush_all_helper(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
+    fn(src, RUN_ON_CPU_TARGET_PTR(addr));
+}
+
+void tlb_flush_page_all_cpus_synced(CPUState *src,
+                                                  target_ulong addr)
+{
+    const run_on_cpu_func fn = tlb_flush_page_async_work;
+
+    flush_all_helper(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
+    async_safe_run_on_cpu(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
 }

 /* update the TLBs so that writes to code in the virtual page 'addr'
@ -216,36 +438,84 @@ void tlb_unprotect_code(ram_addr_t ram_addr)
    cpu_physical_memory_set_dirty_flag(ram_addr, DIRTY_MEMORY_CODE);
 }

-static bool tlb_is_dirty_ram(CPUTLBEntry *tlbe)
-{
-    return (tlbe->addr_write & (TLB_INVALID_MASK|TLB_MMIO|TLB_NOTDIRTY)) == 0;
-}

-void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
+/*
+ * Dirty write flag handling
+ *
+ * When the TCG code writes to a location it looks up the address in
+ * the TLB and uses that data to compute the final address. If any of
+ * the lower bits of the address are set then the slow path is forced.
+ * There are a number of reasons to do this but for normal RAM the
+ * most usual is detecting writes to code regions which may invalidate
+ * generated code.
+ *
+ * Because we want other vCPUs to respond to changes straight away we
+ * update the te->addr_write field atomically. If the TLB entry has
+ * been changed by the vCPU in the mean time we skip the update.
+ *
+ * As this function uses atomic accesses we also need to ensure
+ * updates to tlb_entries follow the same access rules. We don't need
+ * to worry about this for oversized guests as MTTCG is disabled for
+ * them.
+ */
+
+static void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
                           uintptr_t length)
 {
-    uintptr_t addr;
+#if TCG_OVERSIZED_GUEST
+    uintptr_t addr = tlb_entry->addr_write;

-    if (tlb_is_dirty_ram(tlb_entry)) {
-        addr = (tlb_entry->addr_write & TARGET_PAGE_MASK) + tlb_entry->addend;
+    if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
+        addr &= TARGET_PAGE_MASK;
+        addr += tlb_entry->addend;
        if ((addr - start) < length) {
            tlb_entry->addr_write |= TLB_NOTDIRTY;
        }
    }
-}
+#else
+    /* paired with atomic_mb_set in tlb_set_page_with_attrs */
+    uintptr_t orig_addr = atomic_mb_read(&tlb_entry->addr_write);
+    uintptr_t addr = orig_addr;

-static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
-{
-    ram_addr_t ram_addr;
-
-    ram_addr = qemu_ram_addr_from_host(ptr);
-    if (ram_addr == RAM_ADDR_INVALID) {
-        fprintf(stderr, "Bad ram pointer %p\n", ptr);
-        abort();
+    if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
+        addr &= TARGET_PAGE_MASK;
+        addr += atomic_read(&tlb_entry->addend);
+        if ((addr - start) < length) {
+            uintptr_t notdirty_addr = orig_addr | TLB_NOTDIRTY;
+            atomic_cmpxchg(&tlb_entry->addr_write, orig_addr, notdirty_addr);
+        }
    }
-    return ram_addr;
+#endif
 }

+/* For atomic correctness when running MTTCG we need to use the right
+ * primitives when copying entries */
+static inline void copy_tlb_helper(CPUTLBEntry *d, CPUTLBEntry *s,
+                                   bool atomic_set)
+{
+#if TCG_OVERSIZED_GUEST
+    *d = *s;
+#else
+    if (atomic_set) {
+        d->addr_read = s->addr_read;
+        d->addr_code = s->addr_code;
+        atomic_set(&d->addend, atomic_read(&s->addend));
+        /* Pairs with flag setting in tlb_reset_dirty_range */
+        atomic_mb_set(&d->addr_write, atomic_read(&s->addr_write));
+    } else {
+        d->addr_read = s->addr_read;
+        d->addr_write = atomic_read(&s->addr_write);
+        d->addr_code = s->addr_code;
+        d->addend = atomic_read(&s->addend);
+    }
+#endif
+}
+
+/* This is a cross vCPU call (i.e. another vCPU resetting the flags of
+ * the target vCPU). As such care needs to be taken that we don't
+ * dangerously race with another vCPU update. The only thing actually
+ * updated is the target TLB entry ->addr_write flags.
+ */
 void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
 {
    CPUArchState *env;
@ -283,6 +553,8 @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
    int i;
    int mmu_idx;

+    assert_cpu_is_self(cpu);
+
    vaddr &= TARGET_PAGE_MASK;
    i = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
@ -337,11 +609,12 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
    target_ulong address;
    target_ulong code_address;
    uintptr_t addend;
-    CPUTLBEntry *te;
+    CPUTLBEntry *te, *tv, tn;
    hwaddr iotlb, xlat, sz;
    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
    int asidx = cpu_asidx_from_attrs(cpu, attrs);

+    assert_cpu_is_self(cpu);
    assert(size >= TARGET_PAGE_SIZE);
    if (size != TARGET_PAGE_SIZE) {
        tlb_add_large_page(env, vaddr, size);
@ -371,41 +644,50 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,

    index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
    te = &env->tlb_table[mmu_idx][index];
-
    /* do not discard the translation in te, evict it into a victim tlb */
-    env->tlb_v_table[mmu_idx][vidx] = *te;
+    tv = &env->tlb_v_table[mmu_idx][vidx];
+
+    /* addr_write can race with tlb_reset_dirty_range */
+    copy_tlb_helper(tv, te, true);
+
    env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];

    /* refill the tlb */
    env->iotlb[mmu_idx][index].addr = iotlb - vaddr;
    env->iotlb[mmu_idx][index].attrs = attrs;
-    te->addend = addend - vaddr;
+
+    /* Now calculate the new entry */
+    tn.addend = addend - vaddr;
    if (prot & PAGE_READ) {
-        te->addr_read = address;
+        tn.addr_read = address;
    } else {
-        te->addr_read = -1;
+        tn.addr_read = -1;
    }

    if (prot & PAGE_EXEC) {
-        te->addr_code = code_address;
+        tn.addr_code = code_address;
    } else {
-        te->addr_code = -1;
+        tn.addr_code = -1;
    }
+
+    tn.addr_write = -1;
    if (prot & PAGE_WRITE) {
        if ((memory_region_is_ram(section->mr) && section->readonly)
            || memory_region_is_romd(section->mr)) {
            /* Write access calls the I/O callback.  */
-            te->addr_write = address | TLB_MMIO;
+            tn.addr_write = address | TLB_MMIO;
        } else if (memory_region_is_ram(section->mr)
                   && cpu_physical_memory_is_clean(
                        memory_region_get_ram_addr(section->mr) + xlat)) {
-            te->addr_write = address | TLB_NOTDIRTY;
+            tn.addr_write = address | TLB_NOTDIRTY;
        } else {
-            te->addr_write = address;
+            tn.addr_write = address;
        }
-    } else {
-        te->addr_write = -1;
    }
+
+    /* Pairs with flag setting in tlb_reset_dirty_range */
+    copy_tlb_helper(te, &tn, true);
+    /* atomic_mb_set(&te->addr_write, write_address); */
 }

 /* Add a new TLB entry, but without specifying the memory
@ -452,6 +734,18 @@ static void report_bad_exec(CPUState *cpu, target_ulong addr)
    log_cpu_state_mask(LOG_GUEST_ERROR, cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
 }

+static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
+{
+    ram_addr_t ram_addr;
+
+    ram_addr = qemu_ram_addr_from_host(ptr);
+    if (ram_addr == RAM_ADDR_INVALID) {
+        error_report("Bad ram pointer %p", ptr);
+        abort();
+    }
+    return ram_addr;
+}
+
 /* NOTE: this function can trigger an exception */
 /* NOTE2: the returned address is not exactly the physical address: it
 * is actually a ram_addr_t (in system mode; the user mode emulation
@ -495,6 +789,7 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
    hwaddr physaddr = iotlbentry->addr;
    MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
    uint64_t val;
+    bool locked = false;

    physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
    cpu->mem_io_pc = retaddr;
@ -503,7 +798,16 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
    }

    cpu->mem_io_vaddr = addr;
+
+    if (mr->global_locking) {
+        qemu_mutex_lock_iothread();
+        locked = true;
+    }
    memory_region_dispatch_read(mr, physaddr, &val, size, iotlbentry->attrs);
+    if (locked) {
+        qemu_mutex_unlock_iothread();
+    }
+
    return val;
 }

@ -514,15 +818,23 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
    CPUState *cpu = ENV_GET_CPU(env);
    hwaddr physaddr = iotlbentry->addr;
    MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
+    bool locked = false;

    physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
        cpu_io_recompile(cpu, retaddr);
    }
-
    cpu->mem_io_vaddr = addr;
    cpu->mem_io_pc = retaddr;
+
+    if (mr->global_locking) {
+        qemu_mutex_lock_iothread();
+        locked = true;
+    }
    memory_region_dispatch_write(mr, physaddr, val, size, iotlbentry->attrs);
+    if (locked) {
+        qemu_mutex_unlock_iothread();
+    }
 }

 /* Return true if ADDR is present in the victim tlb, and has been copied
@ -538,10 +850,13 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
        if (cmp == page) {
            /* Found entry in victim tlb, swap tlb and iotlb.  */
            CPUTLBEntry tmptlb, *tlb = &env->tlb_table[mmu_idx][index];
+
+            copy_tlb_helper(&tmptlb, tlb, false);
+            copy_tlb_helper(tlb, vtlb, true);
+            copy_tlb_helper(vtlb, &tmptlb, true);
+
            CPUIOTLBEntry tmpio, *io = &env->iotlb[mmu_idx][index];
            CPUIOTLBEntry *vio = &env->iotlb_v[mmu_idx][vidx];
-
-            tmptlb = *tlb; *tlb = *vtlb; *vtlb = tmptlb;
            tmpio = *io; *io = *vio; *vio = tmpio;
            return true;
        }
--- a/docs/multi-thread-tcg.txt
+++ b/docs/multi-thread-tcg.txt
@ -0,0 +1,350 @@
+Copyright (c) 2015-2016 Linaro Ltd.
+
+This work is licensed under the terms of the GNU GPL, version 2 or
+later. See the COPYING file in the top-level directory.
+
+Introduction
+============
+
+This document outlines the design for multi-threaded TCG system-mode
+emulation. The current user-mode emulation mirrors the thread
+structure of the translated executable. Some of the work will be
+applicable to both system and linux-user emulation.
+
+The original system-mode TCG implementation was single threaded and
+dealt with multiple CPUs with simple round-robin scheduling. This
+simplified a lot of things but became increasingly limited as systems
+being emulated gained additional cores and per-core performance gains
+for host systems started to level off.
+
+vCPU Scheduling
+===============
+
+We introduce a new running mode where each vCPU will run on its own
+user-space thread. This will be enabled by default for all FE/BE
+combinations that have had the required work done to support this
+safely.
+
+In the general case of running translated code there should be no
+inter-vCPU dependencies and all vCPUs should be able to run at full
+speed. Synchronisation will only be required while accessing internal
+shared data structures or when the emulated architecture requires a
+coherent representation of the emulated machine state.
+
+Shared Data Structures
+======================
+
+Main Run Loop
+-------------
+
+Even when there is no code being generated there are a number of
+structures associated with the hot-path through the main run-loop.
+These are associated with looking up the next translation block to
+execute. These include:
+
+    tb_jmp_cache (per-vCPU, cache of recent jumps)
+    tb_ctx.htable (global hash table, phys address->tb lookup)
+
+As TB linking only occurs when blocks are in the same page this code
+is critical to performance as looking up the next TB to execute is the
+most common reason to exit the generated code.
+
+DESIGN REQUIREMENT: Make access to lookup structures safe with
+multiple reader/writer threads. Minimise any lock contention to do it.
+
+The hot-path avoids using locks where possible. The tb_jmp_cache is
+updated with atomic accesses to ensure consistent results. The fall
+back QHT based hash table is also designed for lockless lookups. Locks
+are only taken when code generation is required or TranslationBlocks
+have their block-to-block jumps patched.
+
+Global TCG State
+----------------
+
+We need to protect the entire code generation cycle including any post
+generation patching of the translated code. This also implies a shared
+translation buffer which contains code running on all cores. Any
+execution path that comes to the main run loop will need to hold a
+mutex for code generation. This also includes times when we need flush
+code or entries from any shared lookups/caches. Structures held on a
+per-vCPU basis won't need locking unless other vCPUs will need to
+modify them.
+
+DESIGN REQUIREMENT: Add locking around all code generation and TB
+patching.
+
+(Current solution)
+
+Mainly as part of the linux-user work all code generation is
+serialised with a tb_lock(). For the SoftMMU tb_lock() also takes the
+place of mmap_lock() in linux-user.
+
+Translation Blocks
+------------------
+
+Currently the whole system shares a single code generation buffer
+which when full will force a flush of all translations and start from
+scratch again. Some operations also force a full flush of translations
+including:
+
+  - debugging operations (breakpoint insertion/removal)
+  - some CPU helper functions
+
+This is done with the async_safe_run_on_cpu() mechanism to ensure all
+vCPUs are quiescent when changes are being made to shared global
+structures.
+
+More granular translation invalidation events are typically due
+to a change of the state of a physical page:
+
+  - code modification (self modify code, patching code)
+  - page changes (new page mapping in linux-user mode)
+
+While setting the invalid flag in a TranslationBlock will stop it
+being used when looked up in the hot-path there are a number of other
+book-keeping structures that need to be safely cleared.
+
+Any TranslationBlocks which have been patched to jump directly to the
+now invalid blocks need the jump patches reversing so they will return
+to the C code.
+
+There are a number of look-up caches that need to be properly updated
+including the:
+
+  - jump lookup cache
+  - the physical-to-tb lookup hash table
+  - the global page table
+
+The global page table (l1_map) which provides a multi-level look-up
+for PageDesc structures which contain pointers to the start of a
+linked list of all Translation Blocks in that page (see page_next).
+
+Both the jump patching and the page cache involve linked lists that
+the invalidated TranslationBlock needs to be removed from.
+
+DESIGN REQUIREMENT: Safely handle invalidation of TBs
+                      - safely patch/revert direct jumps
+                      - remove central PageDesc lookup entries
+                      - ensure lookup caches/hashes are safely updated
+
+(Current solution)
+
+The direct jump themselves are updated atomically by the TCG
+tb_set_jmp_target() code. Modification to the linked lists that allow
+searching for linked pages are done under the protect of the
+tb_lock().
+
+The global page table is protected by the tb_lock() in system-mode and
+mmap_lock() in linux-user mode.
+
+The lookup caches are updated atomically and the lookup hash uses QHT
+which is designed for concurrent safe lookup.
+
+
+Memory maps and TLBs
+--------------------
+
+The memory handling code is fairly critical to the speed of memory
+access in the emulated system. The SoftMMU code is designed so the
+hot-path can be handled entirely within translated code. This is
+handled with a per-vCPU TLB structure which once populated will allow
+a series of accesses to the page to occur without exiting the
+translated code. It is possible to set flags in the TLB address which
+will ensure the slow-path is taken for each access. This can be done
+to support:
+
+  - Memory regions (dividing up access to PIO, MMIO and RAM)
+  - Dirty page tracking (for code gen, SMC detection, migration and display)
+  - Virtual TLB (for translating guest address->real address)
+
+When the TLB tables are updated by a vCPU thread other than their own
+we need to ensure it is done in a safe way so no inconsistent state is
+seen by the vCPU thread.
+
+Some operations require updating a number of vCPUs TLBs at the same
+time in a synchronised manner.
+
+DESIGN REQUIREMENTS:
+
+  - TLB Flush All/Page
+    - can be across-vCPUs
+    - cross vCPU TLB flush may need other vCPU brought to halt
+    - change may need to be visible to the calling vCPU immediately
+  - TLB Flag Update
+    - usually cross-vCPU
+    - want change to be visible as soon as possible
+  - TLB Update (update a CPUTLBEntry, via tlb_set_page_with_attrs)
+    - This is a per-vCPU table - by definition can't race
+    - updated by its own thread when the slow-path is forced
+
+(Current solution)
+
+We have updated cputlb.c to defer operations when a cross-vCPU
+operation with async_run_on_cpu() which ensures each vCPU sees a
+coherent state when it next runs its work (in a few instructions
+time).
+
+A new set up operations (tlb_flush_*_all_cpus) take an additional flag
+which when set will force synchronisation by setting the source vCPUs
+work as "safe work" and exiting the cpu run loop. This ensure by the
+time execution restarts all flush operations have completed.
+
+TLB flag updates are all done atomically and are also protected by the
+tb_lock() which is used by the functions that update the TLB in bulk.
+
+(Known limitation)
+
+Not really a limitation but the wait mechanism is overly strict for
+some architectures which only need flushes completed by a barrier
+instruction. This could be a future optimisation.
+
+Emulated hardware state
+-----------------------
+
+Currently thanks to KVM work any access to IO memory is automatically
+protected by the global iothread mutex, also known as the BQL (Big
+Qemu Lock). Any IO region that doesn't use global mutex is expected to
+do its own locking.
+
+However IO memory isn't the only way emulated hardware state can be
+modified. Some architectures have model specific registers that
+trigger hardware emulation features. Generally any translation helper
+that needs to update more than a single vCPUs of state should take the
+BQL.
+
+As the BQL, or global iothread mutex is shared across the system we
+push the use of the lock as far down into the TCG code as possible to
+minimise contention.
+
+(Current solution)
+
+MMIO access automatically serialises hardware emulation by way of the
+BQL. Currently ARM targets serialise all ARM_CP_IO register accesses
+and also defer the reset/startup of vCPUs to the vCPU context by way
+of async_run_on_cpu().
+
+Updates to interrupt state are also protected by the BQL as they can
+often be cross vCPU.
+
+Memory Consistency
+==================
+
+Between emulated guests and host systems there are a range of memory
+consistency models. Even emulating weakly ordered systems on strongly
+ordered hosts needs to ensure things like store-after-load re-ordering
+can be prevented when the guest wants to.
+
+Memory Barriers
+---------------
+
+Barriers (sometimes known as fences) provide a mechanism for software
+to enforce a particular ordering of memory operations from the point
+of view of external observers (e.g. another processor core). They can
+apply to any memory operations as well as just loads or stores.
+
+The Linux kernel has an excellent write-up on the various forms of
+memory barrier and the guarantees they can provide [1].
+
+Barriers are often wrapped around synchronisation primitives to
+provide explicit memory ordering semantics. However they can be used
+by themselves to provide safe lockless access by ensuring for example
+a change to a signal flag will only be visible once the changes to
+payload are.
+
+DESIGN REQUIREMENT: Add a new tcg_memory_barrier op
+
+This would enforce a strong load/store ordering so all loads/stores
+complete at the memory barrier. On single-core non-SMP strongly
+ordered backends this could become a NOP.
+
+Aside from explicit standalone memory barrier instructions there are
+also implicit memory ordering semantics which comes with each guest
+memory access instruction. For example all x86 load/stores come with
+fairly strong guarantees of sequential consistency where as ARM has
+special variants of load/store instructions that imply acquire/release
+semantics.
+
+In the case of a strongly ordered guest architecture being emulated on
+a weakly ordered host the scope for a heavy performance impact is
+quite high.
+
+DESIGN REQUIREMENTS: Be efficient with use of memory barriers
+       - host systems with stronger implied guarantees can skip some barriers
+       - merge consecutive barriers to the strongest one
+
+(Current solution)
+
+The system currently has a tcg_gen_mb() which will add memory barrier
+operations if code generation is being done in a parallel context. The
+tcg_optimize() function attempts to merge barriers up to their
+strongest form before any load/store operations. The solution was
+originally developed and tested for linux-user based systems. All
+backends have been converted to emit fences when required. So far the
+following front-ends have been updated to emit fences when required:
+
+    - target-i386
+    - target-arm
+    - target-aarch64
+    - target-alpha
+    - target-mips
+
+Memory Control and Maintenance
+------------------------------
+
+This includes a class of instructions for controlling system cache
+behaviour. While QEMU doesn't model cache behaviour these instructions
+are often seen when code modification has taken place to ensure the
+changes take effect.
+
+Synchronisation Primitives
+--------------------------
+
+There are two broad types of synchronisation primitives found in
+modern ISAs: atomic instructions and exclusive regions.
+
+The first type offer a simple atomic instruction which will guarantee
+some sort of test and conditional store will be truly atomic w.r.t.
+other cores sharing access to the memory. The classic example is the
+x86 cmpxchg instruction.
+
+The second type offer a pair of load/store instructions which offer a
+guarantee that an region of memory has not been touched between the
+load and store instructions. An example of this is ARM's ldrex/strex
+pair where the strex instruction will return a flag indicating a
+successful store only if no other CPU has accessed the memory region
+since the ldrex.
+
+Traditionally TCG has generated a series of operations that work
+because they are within the context of a single translation block so
+will have completed before another CPU is scheduled. However with
+the ability to have multiple threads running to emulate multiple CPUs
+we will need to explicitly expose these semantics.
+
+DESIGN REQUIREMENTS:
+  - Support classic atomic instructions
+  - Support load/store exclusive (or load link/store conditional) pairs
+  - Generic enough infrastructure to support all guest architectures
+CURRENT OPEN QUESTIONS:
+  - How problematic is the ABA problem in general?
+
+(Current solution)
+
+The TCG provides a number of atomic helpers (tcg_gen_atomic_*) which
+can be used directly or combined to emulate other instructions like
+ARM's ldrex/strex instructions. While they are susceptible to the ABA
+problem so far common guests have not implemented patterns where
+this may be a problem - typically presenting a locking ABI which
+assumes cmpxchg like semantics.
+
+The code also includes a fall-back for cases where multi-threaded TCG
+ops can't work (e.g. guest atomic width > host atomic width). In this
+case an EXCP_ATOMIC exit occurs and the instruction is emulated with
+an exclusive lock which ensures all emulation is serialised.
+
+While the atomic helpers look good enough for now there may be a need
+to look at solutions that can more closely model the guest
+architectures semantics.
+
+==========
+
+[1] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/plain/Documentation/memory-barriers.txt
--- a/exec.c
+++ b/exec.c
@ -2134,9 +2134,9 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
                }
                cpu->watchpoint_hit = wp;

-                /* The tb_lock will be reset when cpu_loop_exit or
-                 * cpu_loop_exit_noexc longjmp back into the cpu_exec
-                 * main loop.
+                /* Both tb_lock and iothread_mutex will be reset when
+                 * cpu_loop_exit or cpu_loop_exit_noexc longjmp
+                 * back into the cpu_exec main loop.
                 */
                tb_lock();
                tb_check_watchpoint(cpu);
@ -2371,8 +2371,14 @@ static void io_mem_init(void)
    memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                          NULL, UINT64_MAX);
+
+    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
+     * which can be called without the iothread mutex.
+     */
    memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
                          NULL, UINT64_MAX);
+    memory_region_clear_global_locking(&io_mem_notdirty);
+
    memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
                          NULL, UINT64_MAX);
 }
--- a/hw/core/irq.c
+++ b/hw/core/irq.c
@ -22,6 +22,7 @@
 * THE SOFTWARE.
 */
 #include "qemu/osdep.h"
+#include "qemu/main-loop.h"
 #include "qemu-common.h"
 #include "hw/irq.h"
 #include "qom/object.h"
--- a/hw/i386/kvmvapic.c
+++ b/hw/i386/kvmvapic.c
@ -457,8 +457,8 @@ static void patch_instruction(VAPICROMState *s, X86CPU *cpu, target_ulong ip)
    resume_all_vcpus();

    if (!kvm_enabled()) {
-        /* tb_lock will be reset when cpu_loop_exit_noexc longjmps
-         * back into the cpu_exec loop. */
+        /* Both tb_lock and iothread_mutex will be reset when
+         *  longjmps back into the cpu_exec loop. */
        tb_lock();
        tb_gen_code(cs, current_pc, current_cs_base, current_flags, 1);
        cpu_loop_exit_noexc(cs);
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@ -14,6 +14,7 @@

 #include "qemu/osdep.h"
 #include "qemu/bitops.h"
+#include "qemu/main-loop.h"
 #include "trace.h"
 #include "gicv3_internal.h"
 #include "cpu.h"
@ -733,6 +734,8 @@ void gicv3_cpuif_update(GICv3CPUState *cs)
    ARMCPU *cpu = ARM_CPU(cs->cpu);
    CPUARMState *env = &cpu->env;

+    g_assert(qemu_mutex_iothread_locked());
+
    trace_gicv3_cpuif_update(gicv3_redist_affid(cs), cs->hppi.irq,
                             cs->hppi.grp, cs->hppi.prio);

--- a/hw/misc/imx6_src.c
+++ b/hw/misc/imx6_src.c
@ -14,6 +14,7 @@
 #include "qemu/bitops.h"
 #include "qemu/log.h"
 #include "arm-powerctl.h"
+#include "qom/cpu.h"

 #ifndef DEBUG_IMX6_SRC
 #define DEBUG_IMX6_SRC 0
@ -113,6 +114,45 @@ static uint64_t imx6_src_read(void *opaque, hwaddr offset, unsigned size)
    return value;
 }

+
+/* The reset is asynchronous so we need to defer clearing the reset
+ * bit until the work is completed.
+ */
+
+struct SRCSCRResetInfo {
+    IMX6SRCState *s;
+    int reset_bit;
+};
+
+static void imx6_clear_reset_bit(CPUState *cpu, run_on_cpu_data data)
+{
+    struct SRCSCRResetInfo *ri = data.host_ptr;
+    IMX6SRCState *s = ri->s;
+
+    assert(qemu_mutex_iothread_locked());
+
+    s->regs[SRC_SCR] = deposit32(s->regs[SRC_SCR], ri->reset_bit, 1, 0);
+    DPRINTF("reg[%s] <= 0x%" PRIx32 "\n",
+            imx6_src_reg_name(SRC_SCR), s->regs[SRC_SCR]);
+
+    g_free(ri);
+}
+
+static void imx6_defer_clear_reset_bit(int cpuid,
+                                       IMX6SRCState *s,
+                                       unsigned long reset_shift)
+{
+    struct SRCSCRResetInfo *ri;
+
+    ri = g_malloc(sizeof(struct SRCSCRResetInfo));
+    ri->s = s;
+    ri->reset_bit = reset_shift;
+
+    async_run_on_cpu(arm_get_cpu_by_id(cpuid), imx6_clear_reset_bit,
+                     RUN_ON_CPU_HOST_PTR(ri));
+}
+
+
 static void imx6_src_write(void *opaque, hwaddr offset, uint64_t value,
                           unsigned size)
 {
@ -153,7 +193,7 @@ static void imx6_src_write(void *opaque, hwaddr offset, uint64_t value,
                arm_set_cpu_off(3);
            }
            /* We clear the reset bits as the processor changed state */
-            clear_bit(CORE3_RST_SHIFT, &current_value);
+            imx6_defer_clear_reset_bit(3, s, CORE3_RST_SHIFT);
            clear_bit(CORE3_RST_SHIFT, &change_mask);
        }
        if (EXTRACT(change_mask, CORE2_ENABLE)) {
@ -162,11 +202,11 @@ static void imx6_src_write(void *opaque, hwaddr offset, uint64_t value,
                arm_set_cpu_on(2, s->regs[SRC_GPR5], s->regs[SRC_GPR6],
                               3, false);
            } else {
-                /* CORE 3 is shut down */
+                /* CORE 2 is shut down */
                arm_set_cpu_off(2);
            }
            /* We clear the reset bits as the processor changed state */
-            clear_bit(CORE2_RST_SHIFT, &current_value);
+            imx6_defer_clear_reset_bit(2, s, CORE2_RST_SHIFT);
            clear_bit(CORE2_RST_SHIFT, &change_mask);
        }
        if (EXTRACT(change_mask, CORE1_ENABLE)) {
@ -175,28 +215,28 @@ static void imx6_src_write(void *opaque, hwaddr offset, uint64_t value,
                arm_set_cpu_on(1, s->regs[SRC_GPR3], s->regs[SRC_GPR4],
                               3, false);
            } else {
-                /* CORE 3 is shut down */
+                /* CORE 1 is shut down */
                arm_set_cpu_off(1);
            }
            /* We clear the reset bits as the processor changed state */
-            clear_bit(CORE1_RST_SHIFT, &current_value);
+            imx6_defer_clear_reset_bit(1, s, CORE1_RST_SHIFT);
            clear_bit(CORE1_RST_SHIFT, &change_mask);
        }
        if (EXTRACT(change_mask, CORE0_RST)) {
            arm_reset_cpu(0);
-            clear_bit(CORE0_RST_SHIFT, &current_value);
+            imx6_defer_clear_reset_bit(0, s, CORE0_RST_SHIFT);
        }
        if (EXTRACT(change_mask, CORE1_RST)) {
            arm_reset_cpu(1);
-            clear_bit(CORE1_RST_SHIFT, &current_value);
+            imx6_defer_clear_reset_bit(1, s, CORE1_RST_SHIFT);
        }
        if (EXTRACT(change_mask, CORE2_RST)) {
            arm_reset_cpu(2);
-            clear_bit(CORE2_RST_SHIFT, &current_value);
+            imx6_defer_clear_reset_bit(2, s, CORE2_RST_SHIFT);
        }
        if (EXTRACT(change_mask, CORE3_RST)) {
            arm_reset_cpu(3);
-            clear_bit(CORE3_RST_SHIFT, &current_value);
+            imx6_defer_clear_reset_bit(3, s, CORE3_RST_SHIFT);
        }
        if (EXTRACT(change_mask, SW_IPU2_RST)) {
            /* We pretend the IPU2 is reset */
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@ -62,7 +62,16 @@ void ppc_set_irq(PowerPCCPU *cpu, int n_IRQ, int level)
 {
    CPUState *cs = CPU(cpu);
    CPUPPCState *env = &cpu->env;
-    unsigned int old_pending = env->pending_interrupts;
+    unsigned int old_pending;
+    bool locked = false;
+
+    /* We may already have the BQL if coming from the reset path */
+    if (!qemu_mutex_iothread_locked()) {
+        locked = true;
+        qemu_mutex_lock_iothread();
+    }
+
+    old_pending = env->pending_interrupts;

    if (level) {
        env->pending_interrupts |= 1 << n_IRQ;
@ -80,9 +89,14 @@ void ppc_set_irq(PowerPCCPU *cpu, int n_IRQ, int level)
 #endif
    }

+
    LOG_IRQ("%s: %p n_IRQ %d level %d => pending %08" PRIx32
                "req %08x\n", __func__, env, n_IRQ, level,
                env->pending_interrupts, CPU(cpu)->interrupt_request);
+
+    if (locked) {
+        qemu_mutex_unlock_iothread();
+    }
 }

 /* PowerPC 6xx / 7xx internal IRQ controller */
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@ -1010,6 +1010,9 @@ static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
 {
    CPUPPCState *env = &cpu->env;

+    /* The TCG path should also be holding the BQL at this point */
+    g_assert(qemu_mutex_iothread_locked());
+
    if (msr_pr) {
        hcall_dprintf("Hypercall made with MSR[PR]=1\n");
        env->gpr[3] = H_PRIVILEGE;
--- a/include/exec/cputlb.h
+++ b/include/exec/cputlb.h
@ -23,8 +23,6 @@
 /* cputlb.c */
 void tlb_protect_code(ram_addr_t ram_addr);
 void tlb_unprotect_code(ram_addr_t ram_addr);
-void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
-                           uintptr_t length);
 extern int tlb_flush_count;

 #endif
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@ -92,6 +92,27 @@ void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx);
 * MMU indexes.
 */
 void tlb_flush_page(CPUState *cpu, target_ulong addr);
+/**
+ * tlb_flush_page_all_cpus:
+ * @cpu: src CPU of the flush
+ * @addr: virtual address of page to be flushed
+ *
+ * Flush one page from the TLB of the specified CPU, for all
+ * MMU indexes.
+ */
+void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr);
+/**
+ * tlb_flush_page_all_cpus_synced:
+ * @cpu: src CPU of the flush
+ * @addr: virtual address of page to be flushed
+ *
+ * Flush one page from the TLB of the specified CPU, for all MMU
+ * indexes like tlb_flush_page_all_cpus except the source vCPUs work
+ * is scheduled as safe work meaning all flushes will be complete once
+ * the source vCPUs safe work is complete. This will depend on when
+ * the guests translation ends the TB.
+ */
+void tlb_flush_page_all_cpus_synced(CPUState *src, target_ulong addr);
 /**
 * tlb_flush:
 * @cpu: CPU whose TLB should be flushed
@ -102,25 +123,88 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr);
 * use one of the other functions for efficiency.
 */
 void tlb_flush(CPUState *cpu);
+/**
+ * tlb_flush_all_cpus:
+ * @cpu: src CPU of the flush
+ */
+void tlb_flush_all_cpus(CPUState *src_cpu);
+/**
+ * tlb_flush_all_cpus_synced:
+ * @cpu: src CPU of the flush
+ *
+ * Like tlb_flush_all_cpus except this except the source vCPUs work is
+ * scheduled as safe work meaning all flushes will be complete once
+ * the source vCPUs safe work is complete. This will depend on when
+ * the guests translation ends the TB.
+ */
+void tlb_flush_all_cpus_synced(CPUState *src_cpu);
 /**
 * tlb_flush_page_by_mmuidx:
 * @cpu: CPU whose TLB should be flushed
 * @addr: virtual address of page to be flushed
- * @...: list of MMU indexes to flush, terminated by a negative value
+ * @idxmap: bitmap of MMU indexes to flush
 *
 * Flush one page from the TLB of the specified CPU, for the specified
 * MMU indexes.
 */
-void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...);
+void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr,
+                              uint16_t idxmap);
+/**
+ * tlb_flush_page_by_mmuidx_all_cpus:
+ * @cpu: Originating CPU of the flush
+ * @addr: virtual address of page to be flushed
+ * @idxmap: bitmap of MMU indexes to flush
+ *
+ * Flush one page from the TLB of all CPUs, for the specified
+ * MMU indexes.
+ */
+void tlb_flush_page_by_mmuidx_all_cpus(CPUState *cpu, target_ulong addr,
+                                       uint16_t idxmap);
+/**
+ * tlb_flush_page_by_mmuidx_all_cpus_synced:
+ * @cpu: Originating CPU of the flush
+ * @addr: virtual address of page to be flushed
+ * @idxmap: bitmap of MMU indexes to flush
+ *
+ * Flush one page from the TLB of all CPUs, for the specified MMU
+ * indexes like tlb_flush_page_by_mmuidx_all_cpus except the source
+ * vCPUs work is scheduled as safe work meaning all flushes will be
+ * complete once  the source vCPUs safe work is complete. This will
+ * depend on when the guests translation ends the TB.
+ */
+void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *cpu, target_ulong addr,
+                                              uint16_t idxmap);
 /**
 * tlb_flush_by_mmuidx:
 * @cpu: CPU whose TLB should be flushed
- * @...: list of MMU indexes to flush, terminated by a negative value
+ * @wait: If true ensure synchronisation by exiting the cpu_loop
+ * @idxmap: bitmap of MMU indexes to flush
 *
 * Flush all entries from the TLB of the specified CPU, for the specified
 * MMU indexes.
 */
-void tlb_flush_by_mmuidx(CPUState *cpu, ...);
+void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap);
+/**
+ * tlb_flush_by_mmuidx_all_cpus:
+ * @cpu: Originating CPU of the flush
+ * @idxmap: bitmap of MMU indexes to flush
+ *
+ * Flush all entries from all TLBs of all CPUs, for the specified
+ * MMU indexes.
+ */
+void tlb_flush_by_mmuidx_all_cpus(CPUState *cpu, uint16_t idxmap);
+/**
+ * tlb_flush_by_mmuidx_all_cpus_synced:
+ * @cpu: Originating CPU of the flush
+ * @idxmap: bitmap of MMU indexes to flush
+ *
+ * Flush all entries from all TLBs of all CPUs, for the specified
+ * MMU indexes like tlb_flush_by_mmuidx_all_cpus except except the source
+ * vCPUs work is scheduled as safe work meaning all flushes will be
+ * complete once  the source vCPUs safe work is complete. This will
+ * depend on when the guests translation ends the TB.
+ */
+void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu, uint16_t idxmap);
 /**
 * tlb_set_page_with_attrs:
 * @cpu: CPU to add this TLB entry for
@ -162,17 +246,45 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
 static inline void tlb_flush_page(CPUState *cpu, target_ulong addr)
 {
 }
-
+static inline void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr)
+{
+}
+static inline void tlb_flush_page_all_cpus_synced(CPUState *src,
+                                                  target_ulong addr)
+{
+}
 static inline void tlb_flush(CPUState *cpu)
 {
 }
-
+static inline void tlb_flush_all_cpus(CPUState *src_cpu)
+{
+}
+static inline void tlb_flush_all_cpus_synced(CPUState *src_cpu)
+{
+}
 static inline void tlb_flush_page_by_mmuidx(CPUState *cpu,
-                                            target_ulong addr, ...)
+                                            target_ulong addr, uint16_t idxmap)
 {
 }

-static inline void tlb_flush_by_mmuidx(CPUState *cpu, ...)
+static inline void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
+{
+}
+static inline void tlb_flush_page_by_mmuidx_all_cpus(CPUState *cpu,
+                                                     target_ulong addr,
+                                                     uint16_t idxmap)
+{
+}
+static inline void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *cpu,
+                                                            target_ulong addr,
+                                                            uint16_t idxmap)
+{
+}
+static inline void tlb_flush_by_mmuidx_all_cpus(CPUState *cpu, uint16_t idxmap)
+{
+}
+static inline void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu,
+                                                       uint16_t idxmap)
 {
 }
 #endif
@ -404,8 +516,4 @@ bool memory_region_is_unassigned(MemoryRegion *mr);
 /* vl.c */
 extern int singlestep;

-/* cpu-exec.c, accessed with atomic_mb_read/atomic_mb_set */
-extern CPUState *tcg_current_cpu;
-extern bool exit_request;
-
 #endif
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@ -329,6 +329,7 @@ struct CPUState {
    bool unplug;
    bool crash_occurred;
    bool exit_request;
+    /* updates protected by BQL */
    uint32_t interrupt_request;
    int singlestep_enabled;
    int64_t icount_extra;
@ -401,6 +402,12 @@ struct CPUState {

    bool hax_vcpu_dirty;
    struct hax_vcpu_state *hax_vcpu;
+
+    /* The pending_tlb_flush flag is set and cleared atomically to
+     * avoid potential races. The aim of the flag is to avoid
+     * unnecessary flushes.
+     */
+    uint16_t pending_tlb_flush;
 };

 QTAILQ_HEAD(CPUTailQ, CPUState);
@ -415,6 +422,15 @@ extern struct CPUTailQ cpus;

 extern __thread CPUState *current_cpu;

+/**
+ * qemu_tcg_mttcg_enabled:
+ * Check whether we are running MultiThread TCG or not.
+ *
+ * Returns: %true if we are in MTTCG mode %false otherwise.
+ */
+extern bool mttcg_enabled;
+#define qemu_tcg_mttcg_enabled() (mttcg_enabled)
+
 /**
 * cpu_paging_enabled:
 * @cpu: The CPU whose state is to be inspected.
--- a/include/sysemu/cpus.h
+++ b/include/sysemu/cpus.h
@ -36,4 +36,6 @@ extern int smp_threads;

 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg);

+void qemu_tcg_configure(QemuOpts *opts, Error **errp);
+
 #endif
--- a/memory.c
+++ b/memory.c
@ -917,6 +917,8 @@ void memory_region_transaction_commit(void)
    AddressSpace *as;

    assert(memory_region_transaction_depth);
+    assert(qemu_mutex_iothread_locked());
+
    --memory_region_transaction_depth;
    if (!memory_region_transaction_depth) {
        if (memory_region_update_pending) {
--- a/qemu-options.hx
+++ b/qemu-options.hx
@ -95,6 +95,26 @@ STEXI
 Select CPU model (@code{-cpu help} for list and additional feature selection)
 ETEXI

+DEF("accel", HAS_ARG, QEMU_OPTION_accel,
+    "-accel [accel=]accelerator[,thread=single|multi]\n"
+    "               select accelerator ('-accel help for list')\n"
+    "               thread=single|multi (enable multi-threaded TCG)", QEMU_ARCH_ALL)
+STEXI
+@item -accel @var{name}[,prop=@var{value}[,...]]
+@findex -accel
+This is used to enable an accelerator. Depending on the target architecture,
+kvm, xen, or tcg can be available. By default, tcg is used. If there is more
+than one accelerator specified, the next one is used if the previous one fails
+to initialize.
+@table @option
+@item thread=single|multi
+Controls number of TCG threads. When the TCG is multi-threaded there will be one
+thread per vCPU therefor taking advantage of additional host cores. The default
+is to enable multi-threading where both the back-end and front-ends support it and
+no incompatible TCG features have been enabled (e.g. icount/replay).
+@end table
+ETEXI
+
 DEF("smp", HAS_ARG, QEMU_OPTION_smp,
    "-smp [cpus=]n[,maxcpus=cpus][,cores=cores][,threads=threads][,sockets=sockets]\n"
    "                set the number of CPUs to 'n' [default=1]\n"
--- a/qom/cpu.c
+++ b/qom/cpu.c
@ -113,9 +113,19 @@ static void cpu_common_get_memory_mapping(CPUState *cpu,
    error_setg(errp, "Obtaining memory mappings is unsupported on this CPU.");
 }

+/* Resetting the IRQ comes from across the code base so we take the
+ * BQL here if we need to.  cpu_interrupt assumes it is held.*/
 void cpu_reset_interrupt(CPUState *cpu, int mask)
 {
+    bool need_lock = !qemu_mutex_iothread_locked();
+
+    if (need_lock) {
+        qemu_mutex_lock_iothread();
+    }
    cpu->interrupt_request &= ~mask;
+    if (need_lock) {
+        qemu_mutex_unlock_iothread();
+    }
 }

 void cpu_exit(CPUState *cpu)
--- a/target/arm/arm-powerctl.c
+++ b/target/arm/arm-powerctl.c
@ -14,6 +14,7 @@
 #include "internals.h"
 #include "arm-powerctl.h"
 #include "qemu/log.h"
+#include "qemu/main-loop.h"
 #include "exec/exec-all.h"

 #ifndef DEBUG_ARM_POWERCTL
@ -48,11 +49,93 @@ CPUState *arm_get_cpu_by_id(uint64_t id)
    return NULL;
 }

+struct CpuOnInfo {
+    uint64_t entry;
+    uint64_t context_id;
+    uint32_t target_el;
+    bool target_aa64;
+};
+
+
+static void arm_set_cpu_on_async_work(CPUState *target_cpu_state,
+                                      run_on_cpu_data data)
+{
+    ARMCPU *target_cpu = ARM_CPU(target_cpu_state);
+    struct CpuOnInfo *info = (struct CpuOnInfo *) data.host_ptr;
+
+    /* Initialize the cpu we are turning on */
+    cpu_reset(target_cpu_state);
+    target_cpu_state->halted = 0;
+
+    if (info->target_aa64) {
+        if ((info->target_el < 3) && arm_feature(&target_cpu->env,
+                                                 ARM_FEATURE_EL3)) {
+            /*
+             * As target mode is AArch64, we need to set lower
+             * exception level (the requested level 2) to AArch64
+             */
+            target_cpu->env.cp15.scr_el3 |= SCR_RW;
+        }
+
+        if ((info->target_el < 2) && arm_feature(&target_cpu->env,
+                                                 ARM_FEATURE_EL2)) {
+            /*
+             * As target mode is AArch64, we need to set lower
+             * exception level (the requested level 1) to AArch64
+             */
+            target_cpu->env.cp15.hcr_el2 |= HCR_RW;
+        }
+
+        target_cpu->env.pstate = aarch64_pstate_mode(info->target_el, true);
+    } else {
+        /* We are requested to boot in AArch32 mode */
+        static const uint32_t mode_for_el[] = { 0,
+                                                ARM_CPU_MODE_SVC,
+                                                ARM_CPU_MODE_HYP,
+                                                ARM_CPU_MODE_SVC };
+
+        cpsr_write(&target_cpu->env, mode_for_el[info->target_el], CPSR_M,
+                   CPSRWriteRaw);
+    }
+
+    if (info->target_el == 3) {
+        /* Processor is in secure mode */
+        target_cpu->env.cp15.scr_el3 &= ~SCR_NS;
+    } else {
+        /* Processor is not in secure mode */
+        target_cpu->env.cp15.scr_el3 |= SCR_NS;
+    }
+
+    /* We check if the started CPU is now at the correct level */
+    assert(info->target_el == arm_current_el(&target_cpu->env));
+
+    if (info->target_aa64) {
+        target_cpu->env.xregs[0] = info->context_id;
+        target_cpu->env.thumb = false;
+    } else {
+        target_cpu->env.regs[0] = info->context_id;
+        target_cpu->env.thumb = info->entry & 1;
+        info->entry &= 0xfffffffe;
+    }
+
+    /* Start the new CPU at the requested address */
+    cpu_set_pc(target_cpu_state, info->entry);
+
+    g_free(info);
+
+    /* Finally set the power status */
+    assert(qemu_mutex_iothread_locked());
+    target_cpu->power_state = PSCI_ON;
+}
+
 int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id,
                   uint32_t target_el, bool target_aa64)
 {
    CPUState *target_cpu_state;
    ARMCPU *target_cpu;
+    struct CpuOnInfo *info;
+
+    assert(qemu_mutex_iothread_locked());

    DPRINTF("cpu %" PRId64 " (EL %d, %s) @ 0x%" PRIx64 " with R0 = 0x%" PRIx64
            "\n", cpuid, target_el, target_aa64 ? "aarch64" : "aarch32", entry,
@ -77,7 +160,7 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id,
    }

    target_cpu = ARM_CPU(target_cpu_state);
-    if (!target_cpu->powered_off) {
+    if (target_cpu->power_state == PSCI_ON) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "[ARM]%s: CPU %" PRId64 " is already on\n",
                      __func__, cpuid);
@ -109,74 +192,54 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id,
        return QEMU_ARM_POWERCTL_INVALID_PARAM;
    }

-    /* Initialize the cpu we are turning on */
-    cpu_reset(target_cpu_state);
-    target_cpu->powered_off = false;
-    target_cpu_state->halted = 0;
-
-    if (target_aa64) {
-        if ((target_el < 3) && arm_feature(&target_cpu->env, ARM_FEATURE_EL3)) {
-            /*
-             * As target mode is AArch64, we need to set lower
-             * exception level (the requested level 2) to AArch64
-             */
-            target_cpu->env.cp15.scr_el3 |= SCR_RW;
-        }
-
-        if ((target_el < 2) && arm_feature(&target_cpu->env, ARM_FEATURE_EL2)) {
-            /*
-             * As target mode is AArch64, we need to set lower
-             * exception level (the requested level 1) to AArch64
-             */
-            target_cpu->env.cp15.hcr_el2 |= HCR_RW;
-        }
-
-        target_cpu->env.pstate = aarch64_pstate_mode(target_el, true);
-    } else {
-        /* We are requested to boot in AArch32 mode */
-        static uint32_t mode_for_el[] = { 0,
-                                          ARM_CPU_MODE_SVC,
-                                          ARM_CPU_MODE_HYP,
-                                          ARM_CPU_MODE_SVC };
-
-        cpsr_write(&target_cpu->env, mode_for_el[target_el], CPSR_M,
-                   CPSRWriteRaw);
+    /*
+     * If another CPU has powered the target on we are in the state
+     * ON_PENDING and additional attempts to power on the CPU should
+     * fail (see 6.6 Implementation CPU_ON/CPU_OFF races in the PSCI
+     * spec)
+     */
+    if (target_cpu->power_state == PSCI_ON_PENDING) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "[ARM]%s: CPU %" PRId64 " is already powering on\n",
+                      __func__, cpuid);
+        return QEMU_ARM_POWERCTL_ON_PENDING;
    }

-    if (target_el == 3) {
-        /* Processor is in secure mode */
-        target_cpu->env.cp15.scr_el3 &= ~SCR_NS;
-    } else {
-        /* Processor is not in secure mode */
-        target_cpu->env.cp15.scr_el3 |= SCR_NS;
-    }
+    /* To avoid racing with a CPU we are just kicking off we do the
+     * final bit of preparation for the work in the target CPUs
+     * context.
+     */
+    info = g_new(struct CpuOnInfo, 1);
+    info->entry = entry;
+    info->context_id = context_id;
+    info->target_el = target_el;
+    info->target_aa64 = target_aa64;

-    /* We check if the started CPU is now at the correct level */
-    assert(target_el == arm_current_el(&target_cpu->env));
-
-    if (target_aa64) {
-        target_cpu->env.xregs[0] = context_id;
-        target_cpu->env.thumb = false;
-    } else {
-        target_cpu->env.regs[0] = context_id;
-        target_cpu->env.thumb = entry & 1;
-        entry &= 0xfffffffe;
-    }
-
-    /* Start the new CPU at the requested address */
-    cpu_set_pc(target_cpu_state, entry);
-
-    qemu_cpu_kick(target_cpu_state);
+    async_run_on_cpu(target_cpu_state, arm_set_cpu_on_async_work,
+                     RUN_ON_CPU_HOST_PTR(info));

    /* We are good to go */
    return QEMU_ARM_POWERCTL_RET_SUCCESS;
 }

+static void arm_set_cpu_off_async_work(CPUState *target_cpu_state,
+                                       run_on_cpu_data data)
+{
+    ARMCPU *target_cpu = ARM_CPU(target_cpu_state);
+
+    assert(qemu_mutex_iothread_locked());
+    target_cpu->power_state = PSCI_OFF;
+    target_cpu_state->halted = 1;
+    target_cpu_state->exception_index = EXCP_HLT;
+}
+
 int arm_set_cpu_off(uint64_t cpuid)
 {
    CPUState *target_cpu_state;
    ARMCPU *target_cpu;

+    assert(qemu_mutex_iothread_locked());
+
    DPRINTF("cpu %" PRId64 "\n", cpuid);

    /* change to the cpu we are powering up */
@ -185,27 +248,34 @@ int arm_set_cpu_off(uint64_t cpuid)
        return QEMU_ARM_POWERCTL_INVALID_PARAM;
    }
    target_cpu = ARM_CPU(target_cpu_state);
-    if (target_cpu->powered_off) {
+    if (target_cpu->power_state == PSCI_OFF) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "[ARM]%s: CPU %" PRId64 " is already off\n",
                      __func__, cpuid);
        return QEMU_ARM_POWERCTL_IS_OFF;
    }

-    target_cpu->powered_off = true;
-    target_cpu_state->halted = 1;
-    target_cpu_state->exception_index = EXCP_HLT;
-    cpu_loop_exit(target_cpu_state);
-    /* notreached */
+    /* Queue work to run under the target vCPUs context */
+    async_run_on_cpu(target_cpu_state, arm_set_cpu_off_async_work,
+                     RUN_ON_CPU_NULL);

    return QEMU_ARM_POWERCTL_RET_SUCCESS;
 }

+static void arm_reset_cpu_async_work(CPUState *target_cpu_state,
+                                     run_on_cpu_data data)
+{
+    /* Reset the cpu */
+    cpu_reset(target_cpu_state);
+}
+
 int arm_reset_cpu(uint64_t cpuid)
 {
    CPUState *target_cpu_state;
    ARMCPU *target_cpu;

+    assert(qemu_mutex_iothread_locked());
+
    DPRINTF("cpu %" PRId64 "\n", cpuid);

    /* change to the cpu we are resetting */
@ -214,15 +284,17 @@ int arm_reset_cpu(uint64_t cpuid)
        return QEMU_ARM_POWERCTL_INVALID_PARAM;
    }
    target_cpu = ARM_CPU(target_cpu_state);
-    if (target_cpu->powered_off) {
+
+    if (target_cpu->power_state == PSCI_OFF) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "[ARM]%s: CPU %" PRId64 " is off\n",
                      __func__, cpuid);
        return QEMU_ARM_POWERCTL_IS_OFF;
    }

-    /* Reset the cpu */
-    cpu_reset(target_cpu_state);
+    /* Queue work to run under the target vCPUs context */
+    async_run_on_cpu(target_cpu_state, arm_reset_cpu_async_work,
+                     RUN_ON_CPU_NULL);

    return QEMU_ARM_POWERCTL_RET_SUCCESS;
 }
--- a/target/arm/arm-powerctl.h
+++ b/target/arm/arm-powerctl.h
@ -17,6 +17,7 @@
 #define QEMU_ARM_POWERCTL_INVALID_PARAM QEMU_PSCI_RET_INVALID_PARAMS
 #define QEMU_ARM_POWERCTL_ALREADY_ON QEMU_PSCI_RET_ALREADY_ON
 #define QEMU_ARM_POWERCTL_IS_OFF QEMU_PSCI_RET_DENIED
+#define QEMU_ARM_POWERCTL_ON_PENDING QEMU_PSCI_RET_ON_PENDING

 /*
 * arm_get_cpu_by_id:
@ -43,6 +44,7 @@ CPUState *arm_get_cpu_by_id(uint64_t cpuid);
 * Returns: QEMU_ARM_POWERCTL_RET_SUCCESS on success.
 * QEMU_ARM_POWERCTL_INVALID_PARAM if bad parameters are provided.
 * QEMU_ARM_POWERCTL_ALREADY_ON if the CPU was already started.
+ * QEMU_ARM_POWERCTL_ON_PENDING if the CPU is still powering up
 */
 int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id,
                   uint32_t target_el, bool target_aa64);
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@ -45,7 +45,7 @@ static bool arm_cpu_has_work(CPUState *cs)
 {
    ARMCPU *cpu = ARM_CPU(cs);

-    return !cpu->powered_off
+    return (cpu->power_state != PSCI_OFF)
        && cs->interrupt_request &
        (CPU_INTERRUPT_FIQ | CPU_INTERRUPT_HARD
         | CPU_INTERRUPT_VFIQ | CPU_INTERRUPT_VIRQ
@ -132,7 +132,7 @@ static void arm_cpu_reset(CPUState *s)
    env->vfp.xregs[ARM_VFP_MVFR1] = cpu->mvfr1;
    env->vfp.xregs[ARM_VFP_MVFR2] = cpu->mvfr2;

-    cpu->powered_off = cpu->start_powered_off;
+    cpu->power_state = cpu->start_powered_off ? PSCI_OFF : PSCI_ON;
    s->halted = cpu->start_powered_off;

    if (arm_feature(env, ARM_FEATURE_IWMMXT)) {
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@ -30,6 +30,9 @@
 #  define TARGET_LONG_BITS 32
 #endif

+/* ARM processors have a weak memory model */
+#define TCG_GUEST_DEFAULT_MO      (0)
+
 #define CPUArchState struct CPUARMState

 #include "qemu-common.h"
@ -526,6 +529,15 @@ typedef struct CPUARMState {
 */
 typedef void ARMELChangeHook(ARMCPU *cpu, void *opaque);

+
+/* These values map onto the return values for
+ * QEMU_PSCI_0_2_FN_AFFINITY_INFO */
+typedef enum ARMPSCIState {
+    PSCI_OFF = 0,
+    PSCI_ON = 1,
+    PSCI_ON_PENDING = 2
+} ARMPSCIState;
+
 /**
 * ARMCPU:
 * @env: #CPUARMState
@ -582,8 +594,10 @@ struct ARMCPU {

    /* Should CPU start in PSCI powered-off state? */
    bool start_powered_off;
-    /* CPU currently in PSCI powered-off state */
-    bool powered_off;
+
+    /* Current power state, access guarded by BQL */
+    ARMPSCIState power_state;
+
    /* CPU has virtualization extension */
    bool has_el2;
    /* CPU has security extension */
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@ -536,41 +536,33 @@ static void tlbimvaa_write(CPUARMState *env, const ARMCPRegInfo *ri,
 static void tlbiall_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                             uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);

-    CPU_FOREACH(other_cs) {
-        tlb_flush(other_cs);
-    }
+    tlb_flush_all_cpus_synced(cs);
 }

 static void tlbiasid_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                             uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);

-    CPU_FOREACH(other_cs) {
-        tlb_flush(other_cs);
-    }
+    tlb_flush_all_cpus_synced(cs);
 }

 static void tlbimva_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                             uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_page(other_cs, value & TARGET_PAGE_MASK);
-    }
+    tlb_flush_page_all_cpus_synced(cs, value & TARGET_PAGE_MASK);
 }

 static void tlbimvaa_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                             uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_page(other_cs, value & TARGET_PAGE_MASK);
-    }
+    tlb_flush_page_all_cpus_synced(cs, value & TARGET_PAGE_MASK);
 }

 static void tlbiall_nsnh_write(CPUARMState *env, const ARMCPRegInfo *ri,
@ -578,19 +570,21 @@ static void tlbiall_nsnh_write(CPUARMState *env, const ARMCPRegInfo *ri,
 {
    CPUState *cs = ENV_GET_CPU(env);

-    tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0,
-                        ARMMMUIdx_S2NS, -1);
+    tlb_flush_by_mmuidx(cs,
+                        (1 << ARMMMUIdx_S12NSE1) |
+                        (1 << ARMMMUIdx_S12NSE0) |
+                        (1 << ARMMMUIdx_S2NS));
 }

 static void tlbiall_nsnh_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                  uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1,
-                            ARMMMUIdx_S12NSE0, ARMMMUIdx_S2NS, -1);
-    }
+    tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                        (1 << ARMMMUIdx_S12NSE1) |
+                                        (1 << ARMMMUIdx_S12NSE0) |
+                                        (1 << ARMMMUIdx_S2NS));
 }

 static void tlbiipas2_write(CPUARMState *env, const ARMCPRegInfo *ri,
@ -611,13 +605,13 @@ static void tlbiipas2_write(CPUARMState *env, const ARMCPRegInfo *ri,

    pageaddr = sextract64(value << 12, 0, 40);

-    tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S2NS, -1);
+    tlb_flush_page_by_mmuidx(cs, pageaddr, (1 << ARMMMUIdx_S2NS));
 }

 static void tlbiipas2_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                               uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
    uint64_t pageaddr;

    if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) {
@ -626,9 +620,8 @@ static void tlbiipas2_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

    pageaddr = sextract64(value << 12, 0, 40);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S2NS, -1);
-    }
+    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                             (1 << ARMMMUIdx_S2NS));
 }

 static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,
@ -636,17 +629,15 @@ static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,
 {
    CPUState *cs = ENV_GET_CPU(env);

-    tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1E2, -1);
+    tlb_flush_by_mmuidx(cs, (1 << ARMMMUIdx_S1E2));
 }

 static void tlbiall_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                 uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1E2, -1);
-    }
+    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E2));
 }

 static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,
@ -655,18 +646,17 @@ static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,
    CPUState *cs = ENV_GET_CPU(env);
    uint64_t pageaddr = value & ~MAKE_64BIT_MASK(0, 12);

-    tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1E2, -1);
+    tlb_flush_page_by_mmuidx(cs, pageaddr, (1 << ARMMMUIdx_S1E2));
 }

 static void tlbimva_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                 uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
    uint64_t pageaddr = value & ~MAKE_64BIT_MASK(0, 12);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1E2, -1);
-    }
+    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                             (1 << ARMMMUIdx_S1E2));
 }

 static const ARMCPRegInfo cp_reginfo[] = {
@ -2542,8 +2532,10 @@ static void vttbr_write(CPUARMState *env, const ARMCPRegInfo *ri,

    /* Accesses to VTTBR may change the VMID so we must flush the TLB.  */
    if (raw_read(env, ri) != value) {
-        tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0,
-                            ARMMMUIdx_S2NS, -1);
+        tlb_flush_by_mmuidx(cs,
+                            (1 << ARMMMUIdx_S12NSE1) |
+                            (1 << ARMMMUIdx_S12NSE0) |
+                            (1 << ARMMMUIdx_S2NS));
        raw_write(env, ri, value);
    }
 }
@ -2898,29 +2890,33 @@ static CPAccessResult aa64_cacheop_access(CPUARMState *env,
 static void tlbi_aa64_vmalle1_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                    uint64_t value)
 {
-    ARMCPU *cpu = arm_env_get_cpu(env);
-    CPUState *cs = CPU(cpu);
+    CPUState *cs = ENV_GET_CPU(env);

    if (arm_is_secure_below_el3(env)) {
-        tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1);
+        tlb_flush_by_mmuidx(cs,
+                            (1 << ARMMMUIdx_S1SE1) |
+                            (1 << ARMMMUIdx_S1SE0));
    } else {
-        tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0, -1);
+        tlb_flush_by_mmuidx(cs,
+                            (1 << ARMMMUIdx_S12NSE1) |
+                            (1 << ARMMMUIdx_S12NSE0));
    }
 }

 static void tlbi_aa64_vmalle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                      uint64_t value)
 {
+    CPUState *cs = ENV_GET_CPU(env);
    bool sec = arm_is_secure_below_el3(env);
-    CPUState *other_cs;

-    CPU_FOREACH(other_cs) {
-        if (sec) {
-            tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1);
-        } else {
-            tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1,
-                                ARMMMUIdx_S12NSE0, -1);
-        }
+    if (sec) {
+        tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                            (1 << ARMMMUIdx_S1SE1) |
+                                            (1 << ARMMMUIdx_S1SE0));
+    } else {
+        tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                            (1 << ARMMMUIdx_S12NSE1) |
+                                            (1 << ARMMMUIdx_S12NSE0));
    }
 }

@ -2935,13 +2931,19 @@ static void tlbi_aa64_alle1_write(CPUARMState *env, const ARMCPRegInfo *ri,
    CPUState *cs = CPU(cpu);

    if (arm_is_secure_below_el3(env)) {
-        tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1);
+        tlb_flush_by_mmuidx(cs,
+                            (1 << ARMMMUIdx_S1SE1) |
+                            (1 << ARMMMUIdx_S1SE0));
    } else {
        if (arm_feature(env, ARM_FEATURE_EL2)) {
-            tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0,
-                                ARMMMUIdx_S2NS, -1);
+            tlb_flush_by_mmuidx(cs,
+                                (1 << ARMMMUIdx_S12NSE1) |
+                                (1 << ARMMMUIdx_S12NSE0) |
+                                (1 << ARMMMUIdx_S2NS));
        } else {
-            tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0, -1);
+            tlb_flush_by_mmuidx(cs,
+                                (1 << ARMMMUIdx_S12NSE1) |
+                                (1 << ARMMMUIdx_S12NSE0));
        }
    }
 }
@ -2952,7 +2954,7 @@ static void tlbi_aa64_alle2_write(CPUARMState *env, const ARMCPRegInfo *ri,
    ARMCPU *cpu = arm_env_get_cpu(env);
    CPUState *cs = CPU(cpu);

-    tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1E2, -1);
+    tlb_flush_by_mmuidx(cs, (1 << ARMMMUIdx_S1E2));
 }

 static void tlbi_aa64_alle3_write(CPUARMState *env, const ARMCPRegInfo *ri,
@ -2961,7 +2963,7 @@ static void tlbi_aa64_alle3_write(CPUARMState *env, const ARMCPRegInfo *ri,
    ARMCPU *cpu = arm_env_get_cpu(env);
    CPUState *cs = CPU(cpu);

-    tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1E3, -1);
+    tlb_flush_by_mmuidx(cs, (1 << ARMMMUIdx_S1E3));
 }

 static void tlbi_aa64_alle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
@ -2971,41 +2973,40 @@ static void tlbi_aa64_alle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
     * stage 2 translations, whereas most other scopes only invalidate
     * stage 1 translations.
     */
+    CPUState *cs = ENV_GET_CPU(env);
    bool sec = arm_is_secure_below_el3(env);
    bool has_el2 = arm_feature(env, ARM_FEATURE_EL2);
-    CPUState *other_cs;

-    CPU_FOREACH(other_cs) {
-        if (sec) {
-            tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1);
-        } else if (has_el2) {
-            tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1,
-                                ARMMMUIdx_S12NSE0, ARMMMUIdx_S2NS, -1);
-        } else {
-            tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1,
-                                ARMMMUIdx_S12NSE0, -1);
-        }
+    if (sec) {
+        tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                            (1 << ARMMMUIdx_S1SE1) |
+                                            (1 << ARMMMUIdx_S1SE0));
+    } else if (has_el2) {
+        tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                            (1 << ARMMMUIdx_S12NSE1) |
+                                            (1 << ARMMMUIdx_S12NSE0) |
+                                            (1 << ARMMMUIdx_S2NS));
+    } else {
+          tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                              (1 << ARMMMUIdx_S12NSE1) |
+                                              (1 << ARMMMUIdx_S12NSE0));
    }
 }

 static void tlbi_aa64_alle2is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                    uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1E2, -1);
-    }
+    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E2));
 }

 static void tlbi_aa64_alle3is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                    uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1E3, -1);
-    }
+    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E3));
 }

 static void tlbi_aa64_vae1_write(CPUARMState *env, const ARMCPRegInfo *ri,
@ -3021,11 +3022,13 @@ static void tlbi_aa64_vae1_write(CPUARMState *env, const ARMCPRegInfo *ri,
    uint64_t pageaddr = sextract64(value << 12, 0, 56);

    if (arm_is_secure_below_el3(env)) {
-        tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1SE1,
-                                 ARMMMUIdx_S1SE0, -1);
+        tlb_flush_page_by_mmuidx(cs, pageaddr,
+                                 (1 << ARMMMUIdx_S1SE1) |
+                                 (1 << ARMMMUIdx_S1SE0));
    } else {
-        tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S12NSE1,
-                                 ARMMMUIdx_S12NSE0, -1);
+        tlb_flush_page_by_mmuidx(cs, pageaddr,
+                                 (1 << ARMMMUIdx_S12NSE1) |
+                                 (1 << ARMMMUIdx_S12NSE0));
    }
 }

@ -3040,7 +3043,7 @@ static void tlbi_aa64_vae2_write(CPUARMState *env, const ARMCPRegInfo *ri,
    CPUState *cs = CPU(cpu);
    uint64_t pageaddr = sextract64(value << 12, 0, 56);

-    tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1E2, -1);
+    tlb_flush_page_by_mmuidx(cs, pageaddr, (1 << ARMMMUIdx_S1E2));
 }

 static void tlbi_aa64_vae3_write(CPUARMState *env, const ARMCPRegInfo *ri,
@ -3054,47 +3057,46 @@ static void tlbi_aa64_vae3_write(CPUARMState *env, const ARMCPRegInfo *ri,
    CPUState *cs = CPU(cpu);
    uint64_t pageaddr = sextract64(value << 12, 0, 56);

-    tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1E3, -1);
+    tlb_flush_page_by_mmuidx(cs, pageaddr, (1 << ARMMMUIdx_S1E3));
 }

 static void tlbi_aa64_vae1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                   uint64_t value)
 {
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    CPUState *cs = CPU(cpu);
    bool sec = arm_is_secure_below_el3(env);
-    CPUState *other_cs;
    uint64_t pageaddr = sextract64(value << 12, 0, 56);

-    CPU_FOREACH(other_cs) {
-        if (sec) {
-            tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1SE1,
-                                     ARMMMUIdx_S1SE0, -1);
-        } else {
-            tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S12NSE1,
-                                     ARMMMUIdx_S12NSE0, -1);
-        }
+    if (sec) {
+        tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                                 (1 << ARMMMUIdx_S1SE1) |
+                                                 (1 << ARMMMUIdx_S1SE0));
+    } else {
+        tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                                 (1 << ARMMMUIdx_S12NSE1) |
+                                                 (1 << ARMMMUIdx_S12NSE0));
    }
 }

 static void tlbi_aa64_vae2is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                   uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
    uint64_t pageaddr = sextract64(value << 12, 0, 56);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1E2, -1);
-    }
+    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                             (1 << ARMMMUIdx_S1E2));
 }

 static void tlbi_aa64_vae3is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                   uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
    uint64_t pageaddr = sextract64(value << 12, 0, 56);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1E3, -1);
-    }
+    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                             (1 << ARMMMUIdx_S1E3));
 }

 static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri,
@ -3116,13 +3118,13 @@ static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri,

    pageaddr = sextract64(value << 12, 0, 48);

-    tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S2NS, -1);
+    tlb_flush_page_by_mmuidx(cs, pageaddr, (1 << ARMMMUIdx_S2NS));
 }

 static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                      uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
    uint64_t pageaddr;

    if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) {
@ -3131,9 +3133,8 @@ static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

    pageaddr = sextract64(value << 12, 0, 48);

-    CPU_FOREACH(other_cs) {
-        tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S2NS, -1);
-    }
+    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                             (1 << ARMMMUIdx_S2NS));
 }

 static CPAccessResult aa64_zva_access(CPUARMState *env, const ARMCPRegInfo *ri,
@ -6769,6 +6770,12 @@ void arm_cpu_do_interrupt(CPUState *cs)
        arm_cpu_do_interrupt_aarch32(cs);
    }

+    /* Hooks may change global state so BQL should be held, also the
+     * BQL needs to be held for any modification of
+     * cs->interrupt_request.
+     */
+    g_assert(qemu_mutex_iothread_locked());
+
    arm_call_el_change_hook(cpu);

    if (!kvm_enabled()) {
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@ -488,8 +488,8 @@ int kvm_arm_sync_mpstate_to_kvm(ARMCPU *cpu)
 {
    if (cap_has_mp_state) {
        struct kvm_mp_state mp_state = {
-            .mp_state =
-            cpu->powered_off ? KVM_MP_STATE_STOPPED : KVM_MP_STATE_RUNNABLE
+            .mp_state = (cpu->power_state == PSCI_OFF) ?
+            KVM_MP_STATE_STOPPED : KVM_MP_STATE_RUNNABLE
        };
        int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
        if (ret) {
@ -515,7 +515,8 @@ int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu)
                    __func__, ret, strerror(-ret));
            abort();
        }
-        cpu->powered_off = (mp_state.mp_state == KVM_MP_STATE_STOPPED);
+        cpu->power_state = (mp_state.mp_state == KVM_MP_STATE_STOPPED) ?
+            PSCI_OFF : PSCI_ON;
    }

    return 0;
--- a/target/arm/machine.c
+++ b/target/arm/machine.c
@ -211,6 +211,38 @@ static const VMStateInfo vmstate_cpsr = {
    .put = put_cpsr,
 };

+static int get_power(QEMUFile *f, void *opaque, size_t size,
+                    VMStateField *field)
+{
+    ARMCPU *cpu = opaque;
+    bool powered_off = qemu_get_byte(f);
+    cpu->power_state = powered_off ? PSCI_OFF : PSCI_ON;
+    return 0;
+}
+
+static int put_power(QEMUFile *f, void *opaque, size_t size,
+                    VMStateField *field, QJSON *vmdesc)
+{
+    ARMCPU *cpu = opaque;
+
+    /* Migration should never happen while we transition power states */
+
+    if (cpu->power_state == PSCI_ON ||
+        cpu->power_state == PSCI_OFF) {
+        bool powered_off = (cpu->power_state == PSCI_OFF) ? true : false;
+        qemu_put_byte(f, powered_off);
+        return 0;
+    } else {
+        return 1;
+    }
+}
+
+static const VMStateInfo vmstate_powered_off = {
+    .name = "powered_off",
+    .get = get_power,
+    .put = put_power,
+};
+
 static void cpu_pre_save(void *opaque)
 {
    ARMCPU *cpu = opaque;
@ -329,7 +361,14 @@ const VMStateDescription vmstate_arm_cpu = {
        VMSTATE_UINT64(env.exception.vaddress, ARMCPU),
        VMSTATE_TIMER_PTR(gt_timer[GTIMER_PHYS], ARMCPU),
        VMSTATE_TIMER_PTR(gt_timer[GTIMER_VIRT], ARMCPU),
-        VMSTATE_BOOL(powered_off, ARMCPU),
+        {
+            .name = "power_state",
+            .version_id = 0,
+            .size = sizeof(bool),
+            .info = &vmstate_powered_off,
+            .flags = VMS_SINGLE,
+            .offset = 0,
+        },
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
--- a/target/arm/op_helper.c
+++ b/target/arm/op_helper.c
@ -18,6 +18,7 @@
 */
 #include "qemu/osdep.h"
 #include "qemu/log.h"
+#include "qemu/main-loop.h"
 #include "cpu.h"
 #include "exec/helper-proto.h"
 #include "internals.h"
@ -435,6 +436,13 @@ void HELPER(yield)(CPUARMState *env)
    ARMCPU *cpu = arm_env_get_cpu(env);
    CPUState *cs = CPU(cpu);

+    /* When running in MTTCG we don't generate jumps to the yield and
+     * WFE helpers as it won't affect the scheduling of other vCPUs.
+     * If we wanted to more completely model WFE/SEV so we don't busy
+     * spin unnecessarily we would need to do something more involved.
+     */
+    g_assert(!parallel_cpus);
+
    /* This is a non-trappable hint instruction that generally indicates
     * that the guest is currently busy-looping. Yield control back to the
     * top level loop so that a more deserving VCPU has a chance to run.
@ -487,7 +495,9 @@ void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val)
     */
    env->regs[15] &= (env->thumb ? ~1 : ~3);

+    qemu_mutex_lock_iothread();
    arm_call_el_change_hook(arm_env_get_cpu(env));
+    qemu_mutex_unlock_iothread();
 }

 /* Access to user mode registers from privileged modes.  */
@ -735,28 +745,58 @@ void HELPER(set_cp_reg)(CPUARMState *env, void *rip, uint32_t value)
 {
    const ARMCPRegInfo *ri = rip;

-    ri->writefn(env, ri, value);
+    if (ri->type & ARM_CP_IO) {
+        qemu_mutex_lock_iothread();
+        ri->writefn(env, ri, value);
+        qemu_mutex_unlock_iothread();
+    } else {
+        ri->writefn(env, ri, value);
+    }
 }

 uint32_t HELPER(get_cp_reg)(CPUARMState *env, void *rip)
 {
    const ARMCPRegInfo *ri = rip;
+    uint32_t res;

-    return ri->readfn(env, ri);
+    if (ri->type & ARM_CP_IO) {
+        qemu_mutex_lock_iothread();
+        res = ri->readfn(env, ri);
+        qemu_mutex_unlock_iothread();
+    } else {
+        res = ri->readfn(env, ri);
+    }
+
+    return res;
 }

 void HELPER(set_cp_reg64)(CPUARMState *env, void *rip, uint64_t value)
 {
    const ARMCPRegInfo *ri = rip;

-    ri->writefn(env, ri, value);
+    if (ri->type & ARM_CP_IO) {
+        qemu_mutex_lock_iothread();
+        ri->writefn(env, ri, value);
+        qemu_mutex_unlock_iothread();
+    } else {
+        ri->writefn(env, ri, value);
+    }
 }

 uint64_t HELPER(get_cp_reg64)(CPUARMState *env, void *rip)
 {
    const ARMCPRegInfo *ri = rip;
+    uint64_t res;

-    return ri->readfn(env, ri);
+    if (ri->type & ARM_CP_IO) {
+        qemu_mutex_lock_iothread();
+        res = ri->readfn(env, ri);
+        qemu_mutex_unlock_iothread();
+    } else {
+        res = ri->readfn(env, ri);
+    }
+
+    return res;
 }

 void HELPER(msr_i_pstate)(CPUARMState *env, uint32_t op, uint32_t imm)
@ -989,7 +1029,9 @@ void HELPER(exception_return)(CPUARMState *env)
                      cur_el, new_el, env->pc);
    }

+    qemu_mutex_lock_iothread();
    arm_call_el_change_hook(arm_env_get_cpu(env));
+    qemu_mutex_unlock_iothread();

    return;

--- a/target/arm/psci.c
+++ b/target/arm/psci.c
@ -127,7 +127,9 @@ void arm_handle_psci_call(ARMCPU *cpu)
                break;
            }
            target_cpu = ARM_CPU(target_cpu_state);
-            ret = target_cpu->powered_off ? 1 : 0;
+
+            g_assert(qemu_mutex_iothread_locked());
+            ret = target_cpu->power_state;
            break;
        default:
            /* Everything above affinity level 0 is always on. */
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@ -1328,10 +1328,14 @@ static void handle_hint(DisasContext *s, uint32_t insn,
        s->is_jmp = DISAS_WFI;
        return;
    case 1: /* YIELD */
-        s->is_jmp = DISAS_YIELD;
+        if (!parallel_cpus) {
+            s->is_jmp = DISAS_YIELD;
+        }
        return;
    case 2: /* WFE */
-        s->is_jmp = DISAS_WFE;
+        if (!parallel_cpus) {
+            s->is_jmp = DISAS_WFE;
+        }
        return;
    case 4: /* SEV */
    case 5: /* SEVL */
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@ -4404,20 +4404,32 @@ static void gen_exception_return(DisasContext *s, TCGv_i32 pc)
    gen_rfe(s, pc, load_cpu_field(spsr));
 }

+/*
+ * For WFI we will halt the vCPU until an IRQ. For WFE and YIELD we
+ * only call the helper when running single threaded TCG code to ensure
+ * the next round-robin scheduled vCPU gets a crack. In MTTCG mode we
+ * just skip this instruction. Currently the SEV/SEVL instructions
+ * which are *one* of many ways to wake the CPU from WFE are not
+ * implemented so we can't sleep like WFI does.
+ */
 static void gen_nop_hint(DisasContext *s, int val)
 {
    switch (val) {
    case 1: /* yield */
-        gen_set_pc_im(s, s->pc);
-        s->is_jmp = DISAS_YIELD;
+        if (!parallel_cpus) {
+            gen_set_pc_im(s, s->pc);
+            s->is_jmp = DISAS_YIELD;
+        }
        break;
    case 3: /* wfi */
        gen_set_pc_im(s, s->pc);
        s->is_jmp = DISAS_WFI;
        break;
    case 2: /* wfe */
-        gen_set_pc_im(s, s->pc);
-        s->is_jmp = DISAS_WFE;
+        if (!parallel_cpus) {
+            gen_set_pc_im(s, s->pc);
+            s->is_jmp = DISAS_WFE;
+        }
        break;
    case 4: /* sev */
    case 5: /* sevl */
--- a/target/i386/smm_helper.c
+++ b/target/i386/smm_helper.c
@ -18,6 +18,7 @@
 */

 #include "qemu/osdep.h"
+#include "qemu/main-loop.h"
 #include "cpu.h"
 #include "exec/helper-proto.h"
 #include "exec/log.h"
@ -42,11 +43,14 @@ void helper_rsm(CPUX86State *env)
 #define SMM_REVISION_ID 0x00020000
 #endif

+/* Called with iothread lock taken */
 void cpu_smm_update(X86CPU *cpu)
 {
    CPUX86State *env = &cpu->env;
    bool smm_enabled = (env->hflags & HF_SMM_MASK);

+    g_assert(qemu_mutex_iothread_locked());
+
    if (cpu->smram) {
        memory_region_set_enabled(cpu->smram, smm_enabled);
    }
@ -333,7 +337,10 @@ void helper_rsm(CPUX86State *env)
    }
    env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
    env->hflags &= ~HF_SMM_MASK;
+
+    qemu_mutex_lock_iothread();
    cpu_smm_update(cpu);
+    qemu_mutex_unlock_iothread();

    qemu_log_mask(CPU_LOG_INT, "SMM: after RSM\n");
    log_cpu_state_mask(CPU_LOG_INT, CPU(cpu), CPU_DUMP_CCOP);
--- a/target/s390x/misc_helper.c
+++ b/target/s390x/misc_helper.c
@ -25,6 +25,7 @@
 #include "exec/helper-proto.h"
 #include "sysemu/kvm.h"
 #include "qemu/timer.h"
+#include "qemu/main-loop.h"
 #include "exec/address-spaces.h"
 #ifdef CONFIG_KVM
 #include <linux/kvm.h>
@ -109,11 +110,13 @@ void program_interrupt(CPUS390XState *env, uint32_t code, int ilen)
 /* SCLP service call */
 uint32_t HELPER(servc)(CPUS390XState *env, uint64_t r1, uint64_t r2)
 {
+    qemu_mutex_lock_iothread();
    int r = sclp_service_call(env, r1, r2);
    if (r < 0) {
        program_interrupt(env, -r, 4);
-        return 0;
+        r = 0;
    }
+    qemu_mutex_unlock_iothread();
    return r;
 }

--- a/target/sparc/ldst_helper.c
+++ b/target/sparc/ldst_helper.c
@ -1768,13 +1768,15 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, target_ulong val,
          case 1:
              env->dmmu.mmu_primary_context = val;
              env->immu.mmu_primary_context = val;
-              tlb_flush_by_mmuidx(CPU(cpu), MMU_USER_IDX, MMU_KERNEL_IDX, -1);
+              tlb_flush_by_mmuidx(CPU(cpu),
+                                  (1 << MMU_USER_IDX) | (1 << MMU_KERNEL_IDX));
              break;
          case 2:
              env->dmmu.mmu_secondary_context = val;
              env->immu.mmu_secondary_context = val;
-              tlb_flush_by_mmuidx(CPU(cpu), MMU_USER_SECONDARY_IDX,
-                                  MMU_KERNEL_SECONDARY_IDX, -1);
+              tlb_flush_by_mmuidx(CPU(cpu),
+                                  (1 << MMU_USER_SECONDARY_IDX) |
+                                  (1 << MMU_KERNEL_SECONDARY_IDX));
              break;
          default:
              cpu_unassigned_access(cs, addr, true, false, 1, size);
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@ -165,4 +165,15 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
 {
 }

+/* This defines the natural memory order supported by this
+ * architecture before guarantees made by various barrier
+ * instructions.
+ *
+ * The x86 has a pretty strong memory ordering which only really
+ * allows for some stores to be re-ordered after loads.
+ */
+#include "tcg-mo.h"
+
+#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
+
 #endif
--- a/tcg/tcg-mo.h
+++ b/tcg/tcg-mo.h
@ -0,0 +1,48 @@
+/*
+ * Tiny Code Generator for QEMU
+ *
+ * Copyright (c) 2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef TCG_MO_H
+#define TCG_MO_H
+
+typedef enum {
+    /* Used to indicate the type of accesses on which ordering
+       is to be ensured.  Modeled after SPARC barriers.
+
+       This is of the form TCG_MO_A_B where A is before B in program order.
+    */
+    TCG_MO_LD_LD  = 0x01,
+    TCG_MO_ST_LD  = 0x02,
+    TCG_MO_LD_ST  = 0x04,
+    TCG_MO_ST_ST  = 0x08,
+    TCG_MO_ALL    = 0x0F,  /* OR of the above */
+
+    /* Used to indicate the kind of ordering which is to be ensured by the
+       instruction.  These types are derived from x86/aarch64 instructions.
+       It should be noted that these are different from C11 semantics.  */
+    TCG_BAR_LDAQ  = 0x10,  /* Following ops will not come forward */
+    TCG_BAR_STRL  = 0x20,  /* Previous ops will not be delayed */
+    TCG_BAR_SC    = 0x30,  /* No ops cross barrier; OR of the above */
+} TCGBar;
+
+#endif /* TCG_MO_H */
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@ -29,6 +29,7 @@
 #include "cpu.h"
 #include "exec/tb-context.h"
 #include "qemu/bitops.h"
+#include "tcg-mo.h"
 #include "tcg-target.h"

 /* XXX: make safe guess about sizes */
@ -79,6 +80,15 @@ typedef uint64_t tcg_target_ulong;
 #error unsupported
 #endif

+/* Oversized TCG guests make things like MTTCG hard
+ * as we can't use atomics for cputlb updates.
+ */
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+#define TCG_OVERSIZED_GUEST 1
+#else
+#define TCG_OVERSIZED_GUEST 0
+#endif
+
 #if TCG_TARGET_NB_REGS <= 32
 typedef uint32_t TCGRegSet;
 #elif TCG_TARGET_NB_REGS <= 64
@ -498,23 +508,6 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
 #define TCG_CALL_DUMMY_TCGV     MAKE_TCGV_I32(-1)
 #define TCG_CALL_DUMMY_ARG      ((TCGArg)(-1))

-typedef enum {
-    /* Used to indicate the type of accesses on which ordering
-       is to be ensured.  Modeled after SPARC barriers.  */
-    TCG_MO_LD_LD  = 0x01,
-    TCG_MO_ST_LD  = 0x02,
-    TCG_MO_LD_ST  = 0x04,
-    TCG_MO_ST_ST  = 0x08,
-    TCG_MO_ALL    = 0x0F,  /* OR of the above */
-
-    /* Used to indicate the kind of ordering which is to be ensured by the
-       instruction.  These types are derived from x86/aarch64 instructions.
-       It should be noted that these are different from C11 semantics.  */
-    TCG_BAR_LDAQ  = 0x10,  /* Following ops will not come forward */
-    TCG_BAR_STRL  = 0x20,  /* Previous ops will not be delayed */
-    TCG_BAR_SC    = 0x30,  /* No ops cross barrier; OR of the above */
-} TCGBar;
-
 /* Conditions.  Note that these are laid out for easy manipulation by
   the functions below:
     bit 0 is used for inverting;
--- a/translate-all.c
+++ b/translate-all.c
@ -55,11 +55,11 @@
 #include "translate-all.h"
 #include "qemu/bitmap.h"
 #include "qemu/timer.h"
+#include "qemu/main-loop.h"
 #include "exec/log.h"

 /* #define DEBUG_TB_INVALIDATE */
 /* #define DEBUG_TB_FLUSH */
-/* #define DEBUG_LOCKING */
 /* make various TB consistency checks */
 /* #define DEBUG_TB_CHECK */

@ -74,20 +74,10 @@
 * access to the memory related structures are protected with the
 * mmap_lock.
 */
-#ifdef DEBUG_LOCKING
-#define DEBUG_MEM_LOCKS 1
-#else
-#define DEBUG_MEM_LOCKS 0
-#endif
-
 #ifdef CONFIG_SOFTMMU
-#define assert_memory_lock() do { /* nothing */ } while (0)
+#define assert_memory_lock() tcg_debug_assert(have_tb_lock)
 #else
-#define assert_memory_lock() do {               \
-        if (DEBUG_MEM_LOCKS) {                  \
-            g_assert(have_mmap_lock());         \
-        }                                       \
-    } while (0)
+#define assert_memory_lock() tcg_debug_assert(have_mmap_lock())
 #endif

 #define SMC_BITMAP_USE_THRESHOLD 10
@ -145,9 +135,7 @@ TCGContext tcg_ctx;
 bool parallel_cpus;

 /* translation block context */
-#ifdef CONFIG_USER_ONLY
 __thread int have_tb_lock;
-#endif

 static void page_table_config_init(void)
 {
@ -169,51 +157,31 @@ static void page_table_config_init(void)
    assert(v_l2_levels >= 0);
 }

+#define assert_tb_locked() tcg_debug_assert(have_tb_lock)
+#define assert_tb_unlocked() tcg_debug_assert(!have_tb_lock)
+
 void tb_lock(void)
 {
-#ifdef CONFIG_USER_ONLY
-    assert(!have_tb_lock);
+    assert_tb_unlocked();
    qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock);
    have_tb_lock++;
-#endif
 }

 void tb_unlock(void)
 {
-#ifdef CONFIG_USER_ONLY
-    assert(have_tb_lock);
+    assert_tb_locked();
    have_tb_lock--;
    qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
-#endif
 }

 void tb_lock_reset(void)
 {
-#ifdef CONFIG_USER_ONLY
    if (have_tb_lock) {
        qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
        have_tb_lock = 0;
    }
-#endif
 }

-#ifdef DEBUG_LOCKING
-#define DEBUG_TB_LOCKS 1
-#else
-#define DEBUG_TB_LOCKS 0
-#endif
-
-#ifdef CONFIG_SOFTMMU
-#define assert_tb_lock() do { /* nothing */ } while (0)
-#else
-#define assert_tb_lock() do {               \
-        if (DEBUG_TB_LOCKS) {               \
-            g_assert(have_tb_lock);         \
-        }                                   \
-    } while (0)
-#endif
-
-
 static TranslationBlock *tb_find_pc(uintptr_t tc_ptr);

 void cpu_gen_init(void)
@ -847,7 +815,7 @@ static TranslationBlock *tb_alloc(target_ulong pc)
 {
    TranslationBlock *tb;

-    assert_tb_lock();
+    assert_tb_locked();

    if (tcg_ctx.tb_ctx.nb_tbs >= tcg_ctx.code_gen_max_blocks) {
        return NULL;
@ -862,7 +830,7 @@ static TranslationBlock *tb_alloc(target_ulong pc)
 /* Called with tb_lock held.  */
 void tb_free(TranslationBlock *tb)
 {
-    assert_tb_lock();
+    assert_tb_locked();

    /* In practice this is mostly used for single use temporary TB
       Ignore the hard cases and just back up if this TB happens to
@ -1104,7 +1072,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
    uint32_t h;
    tb_page_addr_t phys_pc;

-    assert_tb_lock();
+    assert_tb_locked();

    atomic_set(&tb->invalid, true);

@ -1421,7 +1389,7 @@ static void tb_invalidate_phys_range_1(tb_page_addr_t start, tb_page_addr_t end)
 #ifdef CONFIG_SOFTMMU
 void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
 {
-    assert_tb_lock();
+    assert_tb_locked();
    tb_invalidate_phys_range_1(start, end);
 }
 #else
@ -1464,7 +1432,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
 #endif /* TARGET_HAS_PRECISE_SMC */

    assert_memory_lock();
-    assert_tb_lock();
+    assert_tb_locked();

    p = page_find(start >> TARGET_PAGE_BITS);
    if (!p) {
@ -1543,7 +1511,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
 #ifdef CONFIG_SOFTMMU
 /* len must be <= 8 and start must be a multiple of len.
 * Called via softmmu_template.h when code areas are written to with
- * tb_lock held.
+ * iothread mutex not held.
 */
 void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len)
 {
@ -1745,7 +1713,10 @@ void tb_check_watchpoint(CPUState *cpu)

 #ifndef CONFIG_USER_ONLY
 /* in deterministic execution mode, instructions doing device I/Os
-   must be at the end of the TB */
+ * must be at the end of the TB.
+ *
+ * Called by softmmu_template.h, with iothread mutex not held.
+ */
 void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
 {
 #if defined(TARGET_MIPS) || defined(TARGET_SH4)
@ -1957,6 +1928,7 @@ void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf)

 void cpu_interrupt(CPUState *cpu, int mask)
 {
+    g_assert(qemu_mutex_iothread_locked());
    cpu->interrupt_request |= mask;
    cpu->tcg_exit_req = 1;
 }
--- a/translate-common.c
+++ b/translate-common.c
@ -21,6 +21,7 @@
 #include "qemu-common.h"
 #include "qom/cpu.h"
 #include "sysemu/cpus.h"
+#include "qemu/main-loop.h"

 uintptr_t qemu_real_host_page_size;
 intptr_t qemu_real_host_page_mask;
@ -30,6 +31,7 @@ intptr_t qemu_real_host_page_mask;
 static void tcg_handle_interrupt(CPUState *cpu, int mask)
 {
    int old_mask;
+    g_assert(qemu_mutex_iothread_locked());

    old_mask = cpu->interrupt_request;
    cpu->interrupt_request |= mask;
@ -40,17 +42,16 @@ static void tcg_handle_interrupt(CPUState *cpu, int mask)
     */
    if (!qemu_cpu_is_self(cpu)) {
        qemu_cpu_kick(cpu);
-        return;
-    }
-
-    if (use_icount) {
-        cpu->icount_decr.u16.high = 0xffff;
-        if (!cpu->can_do_io
-            && (mask & ~old_mask) != 0) {
-            cpu_abort(cpu, "Raised interrupt while not in I/O function");
-        }
    } else {
-        cpu->tcg_exit_req = 1;
+        if (use_icount) {
+            cpu->icount_decr.u16.high = 0xffff;
+            if (!cpu->can_do_io
+                && (mask & ~old_mask) != 0) {
+                cpu_abort(cpu, "Raised interrupt while not in I/O function");
+            }
+        } else {
+            cpu->tcg_exit_req = 1;
+        }
    }
 }

--- a/vl.c
+++ b/vl.c
@ -300,6 +300,26 @@ static QemuOptsList qemu_machine_opts = {
    },
 };

+static QemuOptsList qemu_accel_opts = {
+    .name = "accel",
+    .implied_opt_name = "accel",
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_accel_opts.head),
+    .merge_lists = true,
+    .desc = {
+        {
+            .name = "accel",
+            .type = QEMU_OPT_STRING,
+            .help = "Select the type of accelerator",
+        },
+        {
+            .name = "thread",
+            .type = QEMU_OPT_STRING,
+            .help = "Enable/disable multi-threaded TCG",
+        },
+        { /* end of list */ }
+    },
+};
+
 static QemuOptsList qemu_boot_opts = {
    .name = "boot-opts",
    .implied_opt_name = "order",
@ -2928,7 +2948,8 @@ int main(int argc, char **argv, char **envp)
    const char *boot_once = NULL;
    DisplayState *ds;
    int cyls, heads, secs, translation;
-    QemuOpts *hda_opts = NULL, *opts, *machine_opts, *icount_opts = NULL;
+    QemuOpts *opts, *machine_opts;
+    QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL;
    QemuOptsList *olist;
    int optind;
    const char *optarg;
@ -2983,6 +3004,7 @@ int main(int argc, char **argv, char **envp)
    qemu_add_opts(&qemu_trace_opts);
    qemu_add_opts(&qemu_option_rom_opts);
    qemu_add_opts(&qemu_machine_opts);
+    qemu_add_opts(&qemu_accel_opts);
    qemu_add_opts(&qemu_mem_opts);
    qemu_add_opts(&qemu_smp_opts);
    qemu_add_opts(&qemu_boot_opts);
@ -3675,6 +3697,26 @@ int main(int argc, char **argv, char **envp)
                qdev_prop_register_global(&kvm_pit_lost_tick_policy);
                break;
            }
+            case QEMU_OPTION_accel:
+                accel_opts = qemu_opts_parse_noisily(qemu_find_opts("accel"),
+                                                     optarg, true);
+                optarg = qemu_opt_get(accel_opts, "accel");
+
+                olist = qemu_find_opts("machine");
+                if (strcmp("kvm", optarg) == 0) {
+                    qemu_opts_parse_noisily(olist, "accel=kvm", false);
+                } else if (strcmp("xen", optarg) == 0) {
+                    qemu_opts_parse_noisily(olist, "accel=xen", false);
+                } else if (strcmp("tcg", optarg) == 0) {
+                    qemu_opts_parse_noisily(olist, "accel=tcg", false);
+                } else {
+                    if (!is_help_option(optarg)) {
+                        error_printf("Unknown accelerator: %s", optarg);
+                    }
+                    error_printf("Supported accelerators: kvm, xen, tcg\n");
+                    exit(1);
+                }
+                break;
            case QEMU_OPTION_usb:
                olist = qemu_find_opts("machine");
                qemu_opts_parse_noisily(olist, "usb=on", false);
@ -3983,6 +4025,8 @@ int main(int argc, char **argv, char **envp)

    replay_configure(icount_opts);

+    qemu_tcg_configure(accel_opts, &error_fatal);
+
    machine_class = select_machine();

    set_memory_options(&ram_slots, &maxram_size, machine_class);
@ -4349,6 +4393,9 @@ int main(int argc, char **argv, char **envp)
        if (!tcg_enabled()) {
            error_report("-icount is not allowed with hardware virtualization");
            exit(1);
+        } else if (qemu_tcg_mttcg_enabled()) {
+            error_report("-icount does not currently work with MTTCG");
+            exit(1);
        }
        configure_icount(icount_opts, &error_abort);
        qemu_opts_del(icount_opts);