Merge tag 'pull-tcg-20230530' of https://gitlab.com/rth7680/qemu into staging

Improvements to 128-bit atomics:
  - Separate __int128_t type and arithmetic detection
  - Support 128-bit load/store in backend for i386, aarch64, ppc64, s390x
  - Accelerate atomics via host/include/
Decodetree:
  - Add named field syntax
  - Move tests to meson

# -----BEGIN PGP SIGNATURE-----
#
# iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmR2R10dHHJpY2hhcmQu
# aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV/bsgf/XLi8q+ITyoEAKwG4
# 6ML7DktLAdIs9Euah9twqe16U0BM0YzpKfymBfVVBKKaIa0524N4ZKIT3h6EeJo+
# f+ultqrpsnH+aQh4wc3ZCkEvRdhzhFT8VcoRTunJuJrbL3Y8n2ZSgODUL2a0tahT
# Nn+zEPm8rzQanSKQHq5kyNBLpgTUKjc5wKfvy/WwttnFmkTnqzcuEA6nPVOVwOHC
# lZBQCByIQWsHfFHUVJFvsFzBQbm0mAiW6FNKzPBkoXon0h/UZUI1lV+xXzgutFs+
# zR2O8IZwLYRu2wOWiTF8Nn2qQafkB3Dhwoq3JTEXhOqosOPExbIiWlsZDlPiKRJk
# bwmQlg==
# =XQMb
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 30 May 2023 11:58:37 AM PDT
# gpg:                using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg:                issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [ultimate]

* tag 'pull-tcg-20230530' of https://gitlab.com/rth7680/qemu: (27 commits)
  tests/decode: Add tests for various named-field cases
  scripts/decodetree: Implement named field support
  scripts/decodetree: Implement a topological sort
  scripts/decodetree: Pass lvalue-formatter function to str_extract()
  docs: Document decodetree named field syntax
  tests/decode: Convert tests to meson
  decodetree: Do not remove output_file from /dev
  decodetree: Diagnose empty pattern group
  decodetree: Fix recursion in prop_format and build_tree
  decodetree: Add --test-for-error
  tcg: Remove TCG_TARGET_TLB_DISPLACEMENT_BITS
  accel/tcg: Add aarch64 store_atom_insert_al16
  accel/tcg: Add aarch64 lse2 load_atom_extract_al16_or_al8
  accel/tcg: Add x86_64 load_atom_extract_al16_or_al8
  accel/tcg: Extract store_atom_insert_al16 to host header
  accel/tcg: Extract load_atom_extract_al16_or_al8 to host header
  tcg/s390x: Support 128-bit load/store
  tcg/ppc: Support 128-bit load/store
  tcg/aarch64: Support 128-bit load/store
  tcg/aarch64: Simplify constraints on qemu_ld/st
  ...

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Author: Richard Henderson
Date:   2023-05-30 13:25:18 -07:00
Commit: 51bdb0b57a
38 changed files with 1312 additions and 225 deletions

View file

@ -9,6 +9,9 @@
* See the COPYING file in the top-level directory.
*/
#include "host/load-extract-al16-al8.h"
#include "host/store-insert-al16.h"
#ifdef CONFIG_ATOMIC64
# define HAVE_al8 true
#else
@ -156,7 +159,7 @@ static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
* another process, because the fallback start_exclusive solution
* provides no protection across processes.
*/
if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
uint64_t *p = __builtin_assume_aligned(pv, 8);
return *p;
}
@ -191,7 +194,7 @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
* another process, because the fallback start_exclusive solution
* provides no protection across processes.
*/
if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
return *p;
}
#endif
@ -311,40 +314,6 @@ static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
return int128_getlo(r);
}
/**
* load_atom_extract_al16_or_al8:
* @p: host address
* @s: object size in bytes, @s <= 8.
*
* Load @s bytes from @p, when p % s != 0. If [p, p+s-1] does not
* cross a 16-byte boundary then the access must be 16-byte atomic,
* otherwise the access must be 8-byte atomic.
*/
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
load_atom_extract_al16_or_al8(void *pv, int s)
{
uintptr_t pi = (uintptr_t)pv;
int o = pi & 7;
int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
Int128 r;
pv = (void *)(pi & ~7);
if (pi & 8) {
uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
uint64_t a = qatomic_read__nocheck(p8);
uint64_t b = qatomic_read__nocheck(p8 + 1);
if (HOST_BIG_ENDIAN) {
r = int128_make128(b, a);
} else {
r = int128_make128(a, b);
}
} else {
r = atomic16_read_ro(pv);
}
return int128_getlo(int128_urshift(r, shr));
}
/**
* load_atom_4_by_2:
* @pv: host address
@ -713,45 +682,6 @@ static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
__ATOMIC_RELAXED, __ATOMIC_RELAXED));
}
/**
* store_atom_insert_al16:
* @p: host address
* @val: shifted value to store
* @msk: mask for value to store
*
* Atomically store @val to @p masked by @msk.
*/
static void ATTRIBUTE_ATOMIC128_OPT
store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
{
#if defined(CONFIG_ATOMIC128)
__uint128_t *pu, old, new;
/* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
pu = __builtin_assume_aligned(ps, 16);
old = *pu;
do {
new = (old & ~msk.u) | val.u;
} while (!__atomic_compare_exchange_n(pu, &old, new, true,
__ATOMIC_RELAXED, __ATOMIC_RELAXED));
#elif defined(CONFIG_CMPXCHG128)
__uint128_t *pu, old, new;
/*
* Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
* defer to libatomic, so we must use __sync_*_compare_and_swap_16
* and accept the sequential consistency that comes with it.
*/
pu = __builtin_assume_aligned(ps, 16);
do {
old = *pu;
new = (old & ~msk.u) | val.u;
} while (!__sync_bool_compare_and_swap_16(pu, old, new));
#else
qemu_build_not_reached();
#endif
}
/**
* store_bytes_leN:
* @pv: host address

View file

@ -23,22 +23,42 @@ Fields
Syntax::
field_def := '%' identifier ( unnamed_field )* ( !function=identifier )?
field_def := '%' identifier ( field )* ( !function=identifier )?
field := unnamed_field | named_field
unnamed_field := number ':' ( 's' ) number
named_field := identifier ':' ( 's' ) number
For *unnamed_field*, the first number is the least-significant bit position
of the field and the second number is the length of the field. If the 's' is
present, the field is considered signed. If multiple ``unnamed_fields`` are
present, they are concatenated. In this way one can define disjoint fields.
present, the field is considered signed.
A *named_field* refers to some other field in the instruction pattern
or format. Regardless of the length of the other field where it is
defined, it will be inserted into this field with the specified
signedness and bit width.
Field definitions that involve loops (i.e. where a field is defined
directly or indirectly in terms of itself) are errors.
A format can include fields that refer to named fields that are
defined in the instruction pattern(s) that use the format.
Conversely, an instruction pattern can include fields that refer to
named fields that are defined in the format it uses. However you
cannot currently do both at once (i.e. pattern P uses format F; F has
a field A that refers to a named field B that is defined in P, and P
has a field C that refers to a named field D that is defined in F).
If multiple ``fields`` are present, they are concatenated.
In this way one can define disjoint fields.
If ``!function`` is specified, the concatenated result is passed through the
named function, taking and returning an integral value.
One may use ``!function`` with zero ``unnamed_fields``. This case is called
One may use ``!function`` with zero ``fields``. This case is called
a *parameter*, and the named function is only passed the ``DisasContext``
and returns an integral value extracted from there.
A field with no ``unnamed_fields`` and no ``!function`` is in error.
A field with no ``fields`` and no ``!function`` is in error.
Field examples:
@ -56,6 +76,9 @@ Field examples:
| %shimm8 5:s8 13:1 | expand_shimm8(sextract(i, 5, 8) << 1 | |
| !function=expand_shimm8 | extract(i, 13, 1)) |
+---------------------------+---------------------------------------------+
| %sz_imm 10:2 sz:3 | expand_sz_imm(extract(i, 10, 2) << 3 | |
| !function=expand_sz_imm | extract(a->sz, 0, 3)) |
+---------------------------+---------------------------------------------+
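As a rough illustration of what the generated extractor for the new ``%sz_imm`` example looks like, here is a hedged, self-contained C sketch. The struct, function, and variable names below are invented for the demo, and the real decodetree output nests deposit32() calls and writes into the pattern's argument-set struct rather than using a plain OR, but the arithmetic matches the table entry above.

/* Hedged sketch of the code decodetree emits for
 *   %sz_imm 10:2 sz:3 !function=expand_sz_imm
 * All names here (arg_demo, expand_sz_imm, ...) are invented for the demo. */
#include <stdint.h>
#include <stdio.h>

typedef struct { int dummy; } DisasContext;            /* stand-in */
typedef struct { int sz; int sz_imm; } arg_demo;       /* hypothetical arg set */

static uint32_t extract32(uint32_t value, int start, int length)
{
    return (value >> start) & (~0u >> (32 - length));
}

static int expand_sz_imm(DisasContext *ctx, int x)     /* hypothetical !function */
{
    return x;                                          /* identity, for the demo */
}

int main(void)
{
    DisasContext ctx;
    arg_demo a = { .sz = 5 };           /* named field 'sz', already decoded */
    uint32_t insn = 3 << 10;            /* insn[11:10] = 0b11 */

    /* %sz_imm concatenates insn[11:10] above the 3-bit named field 'sz',
     * then passes the result (and ctx) to the !function. */
    a.sz_imm = expand_sz_imm(&ctx,
                             extract32(insn, 10, 2) << 3 | extract32(a.sz, 0, 3));
    printf("sz_imm = %d\n", a.sz_imm);  /* 3 << 3 | 5 = 29 */
    return 0;
}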
Argument Sets
=============

View file

@ -0,0 +1,40 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
* Atomic extract 64 from 128-bit, AArch64 version.
*
* Copyright (C) 2023 Linaro, Ltd.
*/
#ifndef AARCH64_LOAD_EXTRACT_AL16_AL8_H
#define AARCH64_LOAD_EXTRACT_AL16_AL8_H
#include "host/cpuinfo.h"
#include "tcg/debug-assert.h"
/**
* load_atom_extract_al16_or_al8:
* @pv: host address
* @s: object size in bytes, @s <= 8.
*
* Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
* cross a 16-byte boundary then the access must be 16-byte atomic,
* otherwise the access must be 8-byte atomic.
*/
static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
{
uintptr_t pi = (uintptr_t)pv;
__int128_t *ptr_align = (__int128_t *)(pi & ~7);
int shr = (pi & 7) * 8;
uint64_t l, h;
/*
* With FEAT_LSE2, LDP is single-copy atomic if 16-byte aligned
* and single-copy atomic on the parts if 8-byte aligned.
* All we need do is align the pointer mod 8.
*/
tcg_debug_assert(HAVE_ATOMIC128_RO);
asm("ldp %0, %1, %2" : "=r"(l), "=r"(h) : "m"(*ptr_align));
return (l >> shr) | (h << (-shr & 63));
}
#endif /* AARCH64_LOAD_EXTRACT_AL16_AL8_H */

View file

@ -0,0 +1,47 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
* Atomic store insert into 128-bit, AArch64 version.
*
* Copyright (C) 2023 Linaro, Ltd.
*/
#ifndef AARCH64_STORE_INSERT_AL16_H
#define AARCH64_STORE_INSERT_AL16_H
/**
* store_atom_insert_al16:
* @p: host address
* @val: shifted value to store
* @msk: mask for value to store
*
* Atomically store @val to @p masked by @msk.
*/
static inline void ATTRIBUTE_ATOMIC128_OPT
store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
{
/*
* GCC only implements __sync* primitives for int128 on aarch64.
* We can do better without the barriers, and integrating the
* arithmetic into the load-exclusive/store-conditional pair.
*/
uint64_t tl, th, vl, vh, ml, mh;
uint32_t fail;
qemu_build_assert(!HOST_BIG_ENDIAN);
vl = int128_getlo(val);
vh = int128_gethi(val);
ml = int128_getlo(msk);
mh = int128_gethi(msk);
asm("0: ldxp %[l], %[h], %[mem]\n\t"
"bic %[l], %[l], %[ml]\n\t"
"bic %[h], %[h], %[mh]\n\t"
"orr %[l], %[l], %[vl]\n\t"
"orr %[h], %[h], %[vh]\n\t"
"stxp %w[f], %[l], %[h], %[mem]\n\t"
"cbnz %w[f], 0b\n"
: [mem] "+Q"(*ps), [f] "=&r"(fail), [l] "=&r"(tl), [h] "=&r"(th)
: [vl] "r"(vl), [vh] "r"(vh), [ml] "r"(ml), [mh] "r"(mh));
}
#endif /* AARCH64_STORE_INSERT_AL16_H */
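To make the val/msk convention concrete, here is a hedged, standalone C demo (64-bit GCC/Clang only, since it uses unsigned __int128; insert_demo is an invented name). It performs the same (*p & ~msk) | val update non-atomically that the LDXP/STXP loop above performs atomically, with the caller pre-shifting both the value and the mask to the target byte offset.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hedged demo of the val/msk convention store_atom_insert_al16() expects:
 * the caller pre-shifts both the value and the mask to the byte offset
 * inside the 16-byte slot, and the helper performs *p = (*p & ~msk) | val
 * atomically.  Done non-atomically here with unsigned __int128 purely to
 * show the arithmetic; insert_demo() is an invented name. */
static void insert_demo(unsigned __int128 *p, uint64_t val, uint64_t fieldmask,
                        int byte_ofs)
{
    unsigned __int128 v = (unsigned __int128)val << (byte_ofs * 8);
    unsigned __int128 m = (unsigned __int128)fieldmask << (byte_ofs * 8);

    *p = (*p & ~m) | v;   /* what the LDXP/STXP loop above does atomically */
}

int main(void)
{
    unsigned __int128 slot;
    uint8_t bytes[16];

    memset(&slot, 0xff, sizeof(slot));
    insert_demo(&slot, 0x1234, 0xffff, 7);   /* insert 2 bytes at byte offset 7 */

    memcpy(bytes, &slot, sizeof(bytes));
    for (int i = 0; i < 16; i++) {
        printf("%02x ", bytes[i]);           /* bytes 7 and 8 replaced on LE hosts */
    }
    printf("\n");
    return 0;
}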

View file

@ -0,0 +1,45 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
* Atomic extract 64 from 128-bit, generic version.
*
* Copyright (C) 2023 Linaro, Ltd.
*/
#ifndef HOST_LOAD_EXTRACT_AL16_AL8_H
#define HOST_LOAD_EXTRACT_AL16_AL8_H
/**
* load_atom_extract_al16_or_al8:
* @pv: host address
* @s: object size in bytes, @s <= 8.
*
* Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
* cross a 16-byte boundary then the access must be 16-byte atomic,
* otherwise the access must be 8-byte atomic.
*/
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
load_atom_extract_al16_or_al8(void *pv, int s)
{
uintptr_t pi = (uintptr_t)pv;
int o = pi & 7;
int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
Int128 r;
pv = (void *)(pi & ~7);
if (pi & 8) {
uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
uint64_t a = qatomic_read__nocheck(p8);
uint64_t b = qatomic_read__nocheck(p8 + 1);
if (HOST_BIG_ENDIAN) {
r = int128_make128(b, a);
} else {
r = int128_make128(a, b);
}
} else {
r = atomic16_read_ro(pv);
}
return int128_getlo(int128_urshift(r, shr));
}
#endif /* HOST_LOAD_EXTRACT_AL16_AL8_H */
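The shift arithmetic above is easiest to see with concrete numbers. Below is a hedged, little-endian-only C demo (all names invented) that loads the two aligned 8-byte halves with plain memcpy instead of atomic reads and recovers a misaligned 4-byte value exactly as the helper does; on a big-endian host the helper would instead use shr = (16 - s - o) * 8.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hedged, little-endian-only demo of the shift selection above, with plain
 * memcpy loads standing in for the two atomic 8-byte reads.  All names are
 * invented; the real helper requires the 16-byte block to be aligned. */
int main(void)
{
    uint8_t buf[24];
    uint64_t lo, hi, expect = 0, got;
    int o = 5, s = 4;                     /* load 4 bytes at offset 5 */

    for (int i = 0; i < 24; i++) {
        buf[i] = i;                       /* recognisable byte pattern */
    }

    memcpy(&lo, buf, 8);                  /* low aligned half */
    memcpy(&hi, buf + 8, 8);              /* high aligned half */
    memcpy(&expect, buf + o, s);          /* the misaligned value we want */

    int shr = o * 8;                      /* little endian: discard o low bytes */
    got = (lo >> shr) | (hi << (-shr & 63));
    got &= ~0ull >> ((8 - s) * 8);        /* keep only s bytes */

    printf("expect=%016llx got=%016llx\n",
           (unsigned long long)expect, (unsigned long long)got);
    return 0;
}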

View file

@ -0,0 +1,50 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
* Atomic store insert into 128-bit, generic version.
*
* Copyright (C) 2023 Linaro, Ltd.
*/
#ifndef HOST_STORE_INSERT_AL16_H
#define HOST_STORE_INSERT_AL16_H
/**
* store_atom_insert_al16:
* @p: host address
* @val: shifted value to store
* @msk: mask for value to store
*
* Atomically store @val to @p masked by @msk.
*/
static inline void ATTRIBUTE_ATOMIC128_OPT
store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
{
#if defined(CONFIG_ATOMIC128)
__uint128_t *pu;
Int128Alias old, new;
/* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
pu = __builtin_assume_aligned(ps, 16);
old.u = *pu;
msk = int128_not(msk);
do {
new.s = int128_and(old.s, msk);
new.s = int128_or(new.s, val);
} while (!__atomic_compare_exchange_n(pu, &old.u, new.u, true,
__ATOMIC_RELAXED, __ATOMIC_RELAXED));
#else
Int128 old, new, cmp;
ps = __builtin_assume_aligned(ps, 16);
old = *ps;
msk = int128_not(msk);
do {
cmp = old;
new = int128_and(old, msk);
new = int128_or(new, val);
old = atomic16_cmpxchg(ps, cmp, new);
} while (int128_ne(cmp, old));
#endif
}
#endif /* HOST_STORE_INSERT_AL16_H */

View file

@ -0,0 +1,68 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
* Load/store for 128-bit atomic operations, x86_64 version.
*
* Copyright (C) 2023 Linaro, Ltd.
*
* See docs/devel/atomics.rst for discussion about the guarantees each
* atomic primitive is meant to provide.
*/
#ifndef X86_64_ATOMIC128_LDST_H
#define X86_64_ATOMIC128_LDST_H
#ifdef CONFIG_INT128_TYPE
#include "host/cpuinfo.h"
#include "tcg/debug-assert.h"
/*
* Through clang 16, with -mcx16, __atomic_load_n is incorrectly
* expanded to a read-write operation: lock cmpxchg16b.
*/
#define HAVE_ATOMIC128_RO likely(cpuinfo & CPUINFO_ATOMIC_VMOVDQA)
#define HAVE_ATOMIC128_RW 1
static inline Int128 atomic16_read_ro(const Int128 *ptr)
{
Int128Alias r;
tcg_debug_assert(HAVE_ATOMIC128_RO);
asm("vmovdqa %1, %0" : "=x" (r.i) : "m" (*ptr));
return r.s;
}
static inline Int128 atomic16_read_rw(Int128 *ptr)
{
__int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
Int128Alias r;
if (HAVE_ATOMIC128_RO) {
asm("vmovdqa %1, %0" : "=x" (r.i) : "m" (*ptr_align));
} else {
r.i = __sync_val_compare_and_swap_16(ptr_align, 0, 0);
}
return r.s;
}
static inline void atomic16_set(Int128 *ptr, Int128 val)
{
__int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
Int128Alias new = { .s = val };
if (HAVE_ATOMIC128_RO) {
asm("vmovdqa %1, %0" : "=m"(*ptr_align) : "x" (new.i));
} else {
__int128_t old;
do {
old = *ptr_align;
} while (!__sync_bool_compare_and_swap_16(ptr_align, old, new.i));
}
}
#else
/* Provide QEMU_ERROR stubs. */
#include "host/include/generic/host/atomic128-ldst.h"
#endif
#endif /* X86_64_ATOMIC128_LDST_H */
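A hedged, standalone x86-64 sketch of why the header provides two readers follows (not QEMU code; compile with -mavx -mcx16, function names invented). VMOVDQA never stores, so it is usable on memory the host may have mapped read-only, while a CMPXCHG16B-based read always performs a locked write and would fault there, which is also why the clang expansion mentioned above is problematic.

#include <stdint.h>
#include <stdio.h>

/* Hedged, x86-64-only sketch (not QEMU code; compile with -mavx -mcx16,
 * function names invented).  vmovdqa_read() never stores, so it is safe
 * on read-only mappings; cas_read() uses CMPXCHG16B, which performs a
 * locked write even when the comparison fails. */
static __uint128_t vmovdqa_read(const __uint128_t *p)
{
    __uint128_t r;
    asm("vmovdqa %1, %0" : "=x"(r) : "m"(*p));   /* 16-byte atomic, read-only */
    return r;
}

static __uint128_t cas_read(__uint128_t *p)
{
    return __sync_val_compare_and_swap_16(p, 0, 0);   /* read-write */
}

int main(void)
{
    _Alignas(16) __uint128_t x = ((__uint128_t)0x0123456789abcdefULL << 64) | 1;

    printf("%d %d\n", vmovdqa_read(&x) == x, cas_read(&x) == x);   /* 1 1 */
    return 0;
}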

View file

@ -0,0 +1,50 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
* Atomic extract 64 from 128-bit, x86_64 version.
*
* Copyright (C) 2023 Linaro, Ltd.
*/
#ifndef X86_64_LOAD_EXTRACT_AL16_AL8_H
#define X86_64_LOAD_EXTRACT_AL16_AL8_H
#ifdef CONFIG_INT128_TYPE
#include "host/cpuinfo.h"
/**
* load_atom_extract_al16_or_al8:
* @pv: host address
* @s: object size in bytes, @s <= 8.
*
* Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
* cross a 16-byte boundary then the access must be 16-byte atomic,
* otherwise the access must be 8-byte atomic.
*/
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
load_atom_extract_al16_or_al8(void *pv, int s)
{
uintptr_t pi = (uintptr_t)pv;
__int128_t *ptr_align = (__int128_t *)(pi & ~7);
int shr = (pi & 7) * 8;
Int128Alias r;
/*
* ptr_align % 16 is now only 0 or 8.
* If the host supports atomic loads with VMOVDQU, then always use that,
* making the branch highly predictable. Otherwise we must use VMOVDQA
* when ptr_align % 16 == 0 for 16-byte atomicity.
*/
if ((cpuinfo & CPUINFO_ATOMIC_VMOVDQU) || (pi & 8)) {
asm("vmovdqu %1, %0" : "=x" (r.i) : "m" (*ptr_align));
} else {
asm("vmovdqa %1, %0" : "=x" (r.i) : "m" (*ptr_align));
}
return int128_getlo(int128_urshift(r.s, shr));
}
#else
/* Fallback definition that must be optimized away, or error. */
uint64_t QEMU_ERROR("unsupported atomic")
load_atom_extract_al16_or_al8(void *pv, int s);
#endif
#endif /* X86_64_LOAD_EXTRACT_AL16_AL8_H */

View file

@ -481,7 +481,7 @@ static inline void bswap128s(Int128 *s)
* a possible structure and the native types. Ease parameter passing
* via use of the transparent union extension.
*/
#ifdef CONFIG_INT128
#ifdef CONFIG_INT128_TYPE
typedef union {
__uint128_t u;
__int128_t i;
@ -489,6 +489,6 @@ typedef union {
} Int128Alias __attribute__((transparent_union));
#else
typedef Int128 Int128Alias;
#endif /* CONFIG_INT128 */
#endif /* CONFIG_INT128_TYPE */
#endif /* INT128_H */
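For readers unfamiliar with the transparent-union trick referenced in the comment, here is a hedged, standalone sketch (GCC/Clang extension; the union and function names are invented, and the Int128 struct member is omitted so the demo compiles on its own). The alias itself only needs the 128-bit type to exist, which is why it is now gated on CONFIG_INT128_TYPE rather than full CONFIG_INT128 arithmetic detection.

#include <stdint.h>

typedef union {
    __uint128_t u;
    __int128_t i;
    /* Int128 s; -- the QEMU struct form, omitted in this standalone demo */
} Int128AliasDemo __attribute__((transparent_union));

/* A callee that wants the native 128-bit representation. */
static uint64_t low_half(Int128AliasDemo x)
{
    return (uint64_t)x.u;
}

int main(void)
{
    __uint128_t a = ((__uint128_t)7 << 64) | 42;

    /* The transparent union lets a plain __uint128_t (or, in QEMU, an
     * Int128 struct) be passed directly, with no cast at the call site. */
    return low_half(a) == 42 ? 0 : 1;
}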

View file

@ -2543,7 +2543,13 @@ config_host_data.set('CONFIG_ATOMIC64', cc.links('''
return 0;
}'''))
has_int128 = cc.links('''
has_int128_type = cc.compiles('''
__int128_t a;
__uint128_t b;
int main(void) { b = a; }''')
config_host_data.set('CONFIG_INT128_TYPE', has_int128_type)
has_int128 = has_int128_type and cc.links('''
__int128_t a;
__uint128_t b;
int main (void) {
@ -2552,10 +2558,9 @@ has_int128 = cc.links('''
a = a * a;
return 0;
}''')
config_host_data.set('CONFIG_INT128', has_int128)
if has_int128
if has_int128_type
# "do we have 128-bit atomics which are handled inline and specifically not
# via libatomic". The reason we can't use libatomic is documented in the
# comment starting "GCC is a house divided" in include/qemu/atomic128.h.
@ -2564,7 +2569,7 @@ if has_int128
# __alignof(unsigned __int128) for the host.
atomic_test_128 = '''
int main(int ac, char **av) {
unsigned __int128 *p = __builtin_assume_aligned(av[ac - 1], 16);
__uint128_t *p = __builtin_assume_aligned(av[ac - 1], 16);
p[1] = __atomic_load_n(&p[0], __ATOMIC_RELAXED);
__atomic_store_n(&p[2], p[3], __ATOMIC_RELAXED);
__atomic_compare_exchange_n(&p[4], &p[5], p[6], 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
@ -2586,7 +2591,7 @@ if has_int128
config_host_data.set('CONFIG_CMPXCHG128', cc.links('''
int main(void)
{
unsigned __int128 x = 0, y = 0;
__uint128_t x = 0, y = 0;
__sync_val_compare_and_swap_16(&x, y, x);
return 0;
}

View file

@ -35,6 +35,7 @@
formats = {}
allpatterns = []
anyextern = False
testforerror = False
translate_prefix = 'trans'
translate_scope = 'static '
@ -53,6 +54,80 @@
re_fmt_ident = '@[a-zA-Z0-9_]*'
re_pat_ident = '[a-zA-Z0-9_]*'
# Local implementation of a topological sort. We use the same API that
# the Python graphlib does, so that when QEMU moves forward to a
# baseline of Python 3.9 or newer this code can all be dropped and
# replaced with:
# from graphlib import TopologicalSorter, CycleError
#
# https://docs.python.org/3.9/library/graphlib.html#graphlib.TopologicalSorter
#
# We only implement the parts of TopologicalSorter we care about:
# ts = TopologicalSorter(graph=None)
# create the sorter. graph is a dictionary whose keys are
# nodes and whose values are lists of the predecessors of that node.
# (That is, if graph contains "A" -> ["B", "C"] then we must output
# B and C before A.)
# ts.static_order()
# returns a list of all the nodes in sorted order, or raises CycleError
# CycleError
# exception raised if there are cycles in the graph. The second
# element in the args attribute is a list of nodes which form a
# cycle; the first and last element are the same, eg [a, b, c, a]
# (Our implementation doesn't give the order correctly.)
#
# For our purposes we can assume that the data set is always small
# (typically 10 nodes or less, actual links in the graph very rare),
# so we don't need to worry about efficiency of implementation.
#
# The core of this implementation is from
# https://code.activestate.com/recipes/578272-topological-sort/
# (but updated to Python 3), and is under the MIT license.
class CycleError(ValueError):
"""Subclass of ValueError raised if cycles exist in the graph"""
pass
class TopologicalSorter:
"""Topologically sort a graph"""
def __init__(self, graph=None):
self.graph = graph
def static_order(self):
# We do the sort right here, unlike the stdlib version
from functools import reduce
data = {}
r = []
if not self.graph:
return []
# This code wants the values in the dict to be specifically sets
for k, v in self.graph.items():
data[k] = set(v)
# Find all items that don't depend on anything.
extra_items_in_deps = (reduce(set.union, data.values())
- set(data.keys()))
# Add empty dependencies where needed
data.update({item:{} for item in extra_items_in_deps})
while True:
ordered = set(item for item, dep in data.items() if not dep)
if not ordered:
break
r.extend(ordered)
data = {item: (dep - ordered)
for item, dep in data.items()
if item not in ordered}
if data:
# This doesn't give as nice results as the stdlib, which
# gives you the cycle by listing the nodes in order. Here
# we only know the nodes in the cycle but not their order.
raise CycleError(f'nodes are in a cycle', list(data.keys()))
return r
# end TopologicalSorter
def error_with_file(file, lineno, *args):
"""Print an error message from file:line and args and exit."""
global output_file
@ -70,8 +145,13 @@ def error_with_file(file, lineno, *args):
if output_file and output_fd:
output_fd.close()
os.remove(output_file)
exit(1)
# Do not try to remove e.g. -o /dev/null
if not output_file.startswith("/dev"):
try:
os.remove(output_file)
except PermissionError:
pass
exit(0 if testforerror else 1)
# end error_with_file
@ -205,11 +285,14 @@ def __str__(self):
s = ''
return str(self.pos) + ':' + s + str(self.len)
def str_extract(self):
def str_extract(self, lvalue_formatter):
global bitop_width
s = 's' if self.sign else ''
return f'{s}extract{bitop_width}(insn, {self.pos}, {self.len})'
def referenced_fields(self):
return []
def __eq__(self, other):
return self.sign == other.sign and self.mask == other.mask
@ -228,12 +311,12 @@ def __init__(self, subs, mask):
def __str__(self):
return str(self.subs)
def str_extract(self):
def str_extract(self, lvalue_formatter):
global bitop_width
ret = '0'
pos = 0
for f in reversed(self.subs):
ext = f.str_extract()
ext = f.str_extract(lvalue_formatter)
if pos == 0:
ret = ext
else:
@ -241,6 +324,12 @@ def str_extract(self):
pos += f.len
return ret
def referenced_fields(self):
l = []
for f in self.subs:
l.extend(f.referenced_fields())
return l
def __ne__(self, other):
if len(self.subs) != len(other.subs):
return True
@ -264,9 +353,12 @@ def __init__(self, value):
def __str__(self):
return str(self.value)
def str_extract(self):
def str_extract(self, lvalue_formatter):
return str(self.value)
def referenced_fields(self):
return []
def __cmp__(self, other):
return self.value - other.value
# end ConstField
@ -283,8 +375,12 @@ def __init__(self, func, base):
def __str__(self):
return self.func + '(' + str(self.base) + ')'
def str_extract(self):
return self.func + '(ctx, ' + self.base.str_extract() + ')'
def str_extract(self, lvalue_formatter):
return (self.func + '(ctx, '
+ self.base.str_extract(lvalue_formatter) + ')')
def referenced_fields(self):
return self.base.referenced_fields()
def __eq__(self, other):
return self.func == other.func and self.base == other.base
@ -304,9 +400,12 @@ def __init__(self, func):
def __str__(self):
return self.func
def str_extract(self):
def str_extract(self, lvalue_formatter):
return self.func + '(ctx)'
def referenced_fields(self):
return []
def __eq__(self, other):
return self.func == other.func
@ -314,6 +413,32 @@ def __ne__(self, other):
return not self.__eq__(other)
# end ParameterField
class NamedField:
"""Class representing a field already named in the pattern"""
def __init__(self, name, sign, len):
self.mask = 0
self.sign = sign
self.len = len
self.name = name
def __str__(self):
return self.name
def str_extract(self, lvalue_formatter):
global bitop_width
s = 's' if self.sign else ''
lvalue = lvalue_formatter(self.name)
return f'{s}extract{bitop_width}({lvalue}, 0, {self.len})'
def referenced_fields(self):
return [self.name]
def __eq__(self, other):
return self.name == other.name
def __ne__(self, other):
return not self.__eq__(other)
# end NamedField
class Arguments:
"""Class representing the extracted fields of a format"""
@ -337,7 +462,6 @@ def output_def(self):
output('} ', self.struct_name(), ';\n\n')
# end Arguments
class General:
"""Common code between instruction formats and instruction patterns"""
def __init__(self, name, lineno, base, fixb, fixm, udfm, fldm, flds, w):
@ -351,12 +475,59 @@ def __init__(self, name, lineno, base, fixb, fixm, udfm, fldm, flds, w):
self.fieldmask = fldm
self.fields = flds
self.width = w
self.dangling = None
def __str__(self):
return self.name + ' ' + str_match_bits(self.fixedbits, self.fixedmask)
def str1(self, i):
return str_indent(i) + self.__str__()
def dangling_references(self):
# Return a list of all named references which aren't satisfied
# directly by this format/pattern. This will be either:
# * a format referring to a field which is specified by the
# pattern(s) using it
# * a pattern referring to a field which is specified by the
# format it uses
# * a user error (referring to a field that doesn't exist at all)
if self.dangling is None:
# Compute this once and cache the answer
dangling = []
for n, f in self.fields.items():
for r in f.referenced_fields():
if r not in self.fields:
dangling.append(r)
self.dangling = dangling
return self.dangling
def output_fields(self, indent, lvalue_formatter):
# We use a topological sort to ensure that any use of NamedField
# comes after the initialization of the field it is referencing.
graph = {}
for n, f in self.fields.items():
refs = f.referenced_fields()
graph[n] = refs
try:
ts = TopologicalSorter(graph)
for n in ts.static_order():
# We only want to emit assignments for the keys
# in our fields list, not for anything that ends up
# in the tsort graph only because it was referenced as
# a NamedField.
try:
f = self.fields[n]
output(indent, lvalue_formatter(n), ' = ',
f.str_extract(lvalue_formatter), ';\n')
except KeyError:
pass
except CycleError as e:
# The second element of args is a list of nodes which form
# a cycle (there might be others too, but only one is reported).
# Pretty-print it to tell the user.
cycle = ' => '.join(e.args[1])
error(self.lineno, 'field definitions form a cycle: ' + cycle)
# end General
@ -370,8 +541,7 @@ def extract_name(self):
def output_extract(self):
output('static void ', self.extract_name(), '(DisasContext *ctx, ',
self.base.struct_name(), ' *a, ', insntype, ' insn)\n{\n')
for n, f in self.fields.items():
output(' a->', n, ' = ', f.str_extract(), ';\n')
self.output_fields(str_indent(4), lambda n: 'a->' + n)
output('}\n\n')
# end Format
@ -392,11 +562,36 @@ def output_code(self, i, extracted, outerbits, outermask):
ind = str_indent(i)
arg = self.base.base.name
output(ind, '/* ', self.file, ':', str(self.lineno), ' */\n')
# We might have named references in the format that refer to fields
# in the pattern, or named references in the pattern that refer
# to fields in the format. This affects whether we extract the fields
# for the format before or after the ones for the pattern.
# For simplicity we don't allow cross references in both directions.
# This is also where we catch the syntax error of referring to
# a nonexistent field.
fmt_refs = self.base.dangling_references()
for r in fmt_refs:
if r not in self.fields:
error(self.lineno, f'format refers to undefined field {r}')
pat_refs = self.dangling_references()
for r in pat_refs:
if r not in self.base.fields:
error(self.lineno, f'pattern refers to undefined field {r}')
if pat_refs and fmt_refs:
error(self.lineno, ('pattern that uses fields defined in format '
'cannot use format that uses fields defined '
'in pattern'))
if fmt_refs:
# pattern fields first
self.output_fields(ind, lambda n: 'u.f_' + arg + '.' + n)
assert not extracted, "dangling fmt refs but it was already extracted"
if not extracted:
output(ind, self.base.extract_name(),
'(ctx, &u.f_', arg, ', insn);\n')
for n, f in self.fields.items():
output(ind, 'u.f_', arg, '.', n, ' = ', f.str_extract(), ';\n')
if not fmt_refs:
# pattern fields last
self.output_fields(ind, lambda n: 'u.f_' + arg + '.' + n)
output(ind, 'if (', translate_prefix, '_', self.name,
'(ctx, &u.f_', arg, ')) return true;\n')
@ -473,7 +668,7 @@ def build_tree(self):
def prop_format(self):
for p in self.pats:
p.build_tree()
p.prop_format()
def prop_width(self):
width = None
@ -505,6 +700,12 @@ def output_code(self, i, extracted, outerbits, outermask):
output(ind, '}\n')
else:
p.output_code(i, extracted, p.fixedbits, p.fixedmask)
def build_tree(self):
if not self.pats:
error_with_file(self.file, self.lineno, 'empty pattern group')
super().build_tree()
#end IncMultiPattern
@ -536,8 +737,10 @@ def output_code(self, i, extracted, outerbits, outermask):
ind = str_indent(i)
# If we identified all nodes below have the same format,
# extract the fields now.
if not extracted and self.base:
# extract the fields now. But don't do it if the format relies
# on named fields from the insn pattern, as those won't have
# been initialised at this point.
if not extracted and self.base and not self.base.dangling_references():
output(ind, self.base.extract_name(),
'(ctx, &u.f_', self.base.base.name, ', insn);\n')
extracted = True
@ -623,7 +826,7 @@ def __build_tree(pats, outerbits, outermask):
return t
def build_tree(self):
super().prop_format()
super().build_tree()
self.tree = self.__build_tree(self.pats, self.fixedbits,
self.fixedmask)
@ -659,6 +862,7 @@ def parse_field(lineno, name, toks):
"""Parse one instruction field from TOKS at LINENO"""
global fields
global insnwidth
global re_C_ident
# A "simple" field will have only one entry;
# a "multifield" will have several.
@ -673,6 +877,25 @@ def parse_field(lineno, name, toks):
func = func[1]
continue
if re.fullmatch(re_C_ident + ':s[0-9]+', t):
# Signed named field
subtoks = t.split(':')
n = subtoks[0]
le = int(subtoks[1])
f = NamedField(n, True, le)
subs.append(f)
width += le
continue
if re.fullmatch(re_C_ident + ':[0-9]+', t):
# Unsigned named field
subtoks = t.split(':')
n = subtoks[0]
le = int(subtoks[1])
f = NamedField(n, False, le)
subs.append(f)
width += le
continue
if re.fullmatch('[0-9]+:s[0-9]+', t):
# Signed field extract
subtoks = t.split(':s')
@ -1286,11 +1509,12 @@ def main():
global bitop_width
global variablewidth
global anyextern
global testforerror
decode_scope = 'static '
long_opts = ['decode=', 'translate=', 'output=', 'insnwidth=',
'static-decode=', 'varinsnwidth=']
'static-decode=', 'varinsnwidth=', 'test-for-error']
try:
(opts, args) = getopt.gnu_getopt(sys.argv[1:], 'o:vw:', long_opts)
except getopt.GetoptError as err:
@ -1319,6 +1543,8 @@ def main():
bitop_width = 64
elif insnwidth != 32:
error(0, 'cannot handle insns of width', insnwidth)
elif o == '--test-for-error':
testforerror = True
else:
assert False, 'unhandled option'
@ -1417,6 +1643,7 @@ def main():
if output_file:
output_fd.close()
exit(1 if testforerror else 0)
# end main

View file

@ -10,11 +10,10 @@
* tcg-target-con-str.h; the constraint combination is inclusive or.
*/
C_O0_I1(r)
C_O0_I2(lZ, l)
C_O0_I2(r, rA)
C_O0_I2(rZ, r)
C_O0_I2(w, r)
C_O1_I1(r, l)
C_O0_I3(rZ, rZ, r)
C_O1_I1(r, r)
C_O1_I1(w, r)
C_O1_I1(w, w)
@ -33,4 +32,5 @@ C_O1_I2(w, w, wO)
C_O1_I2(w, w, wZ)
C_O1_I3(w, w, w, w)
C_O1_I4(r, r, rA, rZ, rZ)
C_O2_I1(r, r, r)
C_O2_I4(r, r, rZ, rZ, rA, rMZ)

View file

@ -9,7 +9,6 @@
* REGS(letter, register_mask)
*/
REGS('r', ALL_GENERAL_REGS)
REGS('l', ALL_QLDST_REGS)
REGS('w', ALL_VECTOR_REGS)
/*

View file

@ -40,11 +40,12 @@ static const int tcg_target_reg_alloc_order[] = {
TCG_REG_X8, TCG_REG_X9, TCG_REG_X10, TCG_REG_X11,
TCG_REG_X12, TCG_REG_X13, TCG_REG_X14, TCG_REG_X15,
TCG_REG_X16, TCG_REG_X17,
TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
TCG_REG_X4, TCG_REG_X5, TCG_REG_X6, TCG_REG_X7,
/* X16 reserved as temporary */
/* X17 reserved as temporary */
/* X18 reserved by system */
/* X19 reserved for AREG0 */
/* X29 reserved as fp */
@ -71,8 +72,10 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
return TCG_REG_X0 + slot;
}
#define TCG_REG_TMP TCG_REG_X30
#define TCG_VEC_TMP TCG_REG_V31
#define TCG_REG_TMP0 TCG_REG_X16
#define TCG_REG_TMP1 TCG_REG_X17
#define TCG_REG_TMP2 TCG_REG_X30
#define TCG_VEC_TMP0 TCG_REG_V31
#ifndef CONFIG_SOFTMMU
#define TCG_REG_GUEST_BASE TCG_REG_X28
@ -129,14 +132,6 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
#define ALL_GENERAL_REGS 0xffffffffu
#define ALL_VECTOR_REGS 0xffffffff00000000ull
#ifdef CONFIG_SOFTMMU
#define ALL_QLDST_REGS \
(ALL_GENERAL_REGS & ~((1 << TCG_REG_X0) | (1 << TCG_REG_X1) | \
(1 << TCG_REG_X2) | (1 << TCG_REG_X3)))
#else
#define ALL_QLDST_REGS ALL_GENERAL_REGS
#endif
/* Match a constant valid for addition (12-bit, optionally shifted). */
static inline bool is_aimm(uint64_t val)
{
@ -390,6 +385,10 @@ typedef enum {
I3305_LDR_v64 = 0x5c000000,
I3305_LDR_v128 = 0x9c000000,
/* Load/store exclusive. */
I3306_LDXP = 0xc8600000,
I3306_STXP = 0xc8200000,
/* Load/store register. Described here as 3.3.12, but the helper
that emits them can transform to 3.3.10 or 3.3.13. */
I3312_STRB = 0x38000000 | LDST_ST << 22 | MO_8 << 30,
@ -454,6 +453,9 @@ typedef enum {
I3406_ADR = 0x10000000,
I3406_ADRP = 0x90000000,
/* Add/subtract extended register instructions. */
I3501_ADD = 0x0b200000,
/* Add/subtract shifted register instructions (without a shift). */
I3502_ADD = 0x0b000000,
I3502_ADDS = 0x2b000000,
@ -624,6 +626,12 @@ static void tcg_out_insn_3305(TCGContext *s, AArch64Insn insn,
tcg_out32(s, insn | (imm19 & 0x7ffff) << 5 | rt);
}
static void tcg_out_insn_3306(TCGContext *s, AArch64Insn insn, TCGReg rs,
TCGReg rt, TCGReg rt2, TCGReg rn)
{
tcg_out32(s, insn | rs << 16 | rt2 << 10 | rn << 5 | rt);
}
static void tcg_out_insn_3201(TCGContext *s, AArch64Insn insn, TCGType ext,
TCGReg rt, int imm19)
{
@ -706,6 +714,14 @@ static void tcg_out_insn_3406(TCGContext *s, AArch64Insn insn,
tcg_out32(s, insn | (disp & 3) << 29 | (disp & 0x1ffffc) << (5 - 2) | rd);
}
static inline void tcg_out_insn_3501(TCGContext *s, AArch64Insn insn,
TCGType sf, TCGReg rd, TCGReg rn,
TCGReg rm, int opt, int imm3)
{
tcg_out32(s, insn | sf << 31 | rm << 16 | opt << 13 |
imm3 << 10 | rn << 5 | rd);
}
/* This function is for both 3.5.2 (Add/Subtract shifted register), for
the rare occasion when we actually want to supply a shift amount. */
static inline void tcg_out_insn_3502S(TCGContext *s, AArch64Insn insn,
@ -984,7 +1000,7 @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
TCGReg r, TCGReg base, intptr_t offset)
{
TCGReg temp = TCG_REG_TMP;
TCGReg temp = TCG_REG_TMP0;
if (offset < -0xffffff || offset > 0xffffff) {
tcg_out_movi(s, TCG_TYPE_PTR, temp, offset);
@ -1136,8 +1152,8 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
}
/* Worst-case scenario, move offset to temp register, use reg offset. */
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, offset);
tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, offset);
tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP0);
}
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
@ -1353,8 +1369,8 @@ static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *target)
if (offset == sextract64(offset, 0, 26)) {
tcg_out_insn(s, 3206, BL, offset);
} else {
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, (intptr_t)target);
tcg_out_insn(s, 3207, BLR, TCG_REG_TMP);
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target);
tcg_out_insn(s, 3207, BLR, TCG_REG_TMP0);
}
}
@ -1491,7 +1507,7 @@ static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
AArch64Insn insn;
if (rl == ah || (!const_bh && rl == bh)) {
rl = TCG_REG_TMP;
rl = TCG_REG_TMP0;
}
if (const_bl) {
@ -1508,7 +1524,7 @@ static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
possibility of adding 0+const in the low part, and the
immediate add instructions encode XSP not XZR. Don't try
anything more elaborate here than loading another zero. */
al = TCG_REG_TMP;
al = TCG_REG_TMP0;
tcg_out_movi(s, ext, al, 0);
}
tcg_out_insn_3401(s, insn, ext, rl, al, bl);
@ -1549,7 +1565,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
{
TCGReg a1 = a0;
if (is_ctz) {
a1 = TCG_REG_TMP;
a1 = TCG_REG_TMP0;
tcg_out_insn(s, 3507, RBIT, ext, a1, a0);
}
if (const_b && b == (ext ? 64 : 32)) {
@ -1558,7 +1574,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
AArch64Insn sel = I3506_CSEL;
tcg_out_cmp(s, ext, a0, 0, 1);
tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP, a1);
tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP0, a1);
if (const_b) {
if (b == -1) {
@ -1571,7 +1587,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
b = d;
}
}
tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP, b, TCG_COND_NE);
tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP0, b, TCG_COND_NE);
}
}
@ -1588,7 +1604,7 @@ bool tcg_target_has_memory_bswap(MemOp memop)
}
static const TCGLdstHelperParam ldst_helper_param = {
.ntmp = 1, .tmp = { TCG_REG_TMP }
.ntmp = 1, .tmp = { TCG_REG_TMP0 }
};
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
@ -1633,19 +1649,19 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
TCGType addr_type = s->addr_type;
TCGLabelQemuLdst *ldst = NULL;
MemOp opc = get_memop(oi);
MemOp s_bits = opc & MO_SIZE;
unsigned a_mask;
h->aa = atom_and_align_for_opc(s, opc,
have_lse2 ? MO_ATOM_WITHIN16
: MO_ATOM_IFALIGN,
false);
s_bits == MO_128);
a_mask = (1 << h->aa.align) - 1;
#ifdef CONFIG_SOFTMMU
unsigned s_bits = opc & MO_SIZE;
unsigned s_mask = (1u << s_bits) - 1;
unsigned mem_index = get_mmuidx(oi);
TCGReg x3;
TCGReg addr_adj;
TCGType mask_type;
uint64_t compare_mask;
@ -1657,27 +1673,27 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
mask_type = (s->page_bits + s->tlb_dyn_max_bits > 32
? TCG_TYPE_I64 : TCG_TYPE_I32);
/* Load env_tlb(env)->f[mmu_idx].{mask,table} into {x0,x1}. */
/* Load env_tlb(env)->f[mmu_idx].{mask,table} into {tmp0,tmp1}. */
QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -512);
QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8);
tcg_out_insn(s, 3314, LDP, TCG_REG_X0, TCG_REG_X1, TCG_AREG0,
tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0,
TLB_MASK_TABLE_OFS(mem_index), 1, 0);
/* Extract the TLB index from the address into X0. */
tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64,
TCG_REG_X0, TCG_REG_X0, addr_reg,
TCG_REG_TMP0, TCG_REG_TMP0, addr_reg,
s->page_bits - CPU_TLB_ENTRY_BITS);
/* Add the tlb_table pointer, creating the CPUTLBEntry address into X1. */
tcg_out_insn(s, 3502, ADD, 1, TCG_REG_X1, TCG_REG_X1, TCG_REG_X0);
/* Add the tlb_table pointer, forming the CPUTLBEntry address in TMP1. */
tcg_out_insn(s, 3502, ADD, 1, TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP0);
/* Load the tlb comparator into X0, and the fast path addend into X1. */
tcg_out_ld(s, addr_type, TCG_REG_X0, TCG_REG_X1,
/* Load the tlb comparator into TMP0, and the fast path addend into TMP1. */
tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP1,
is_ld ? offsetof(CPUTLBEntry, addr_read)
: offsetof(CPUTLBEntry, addr_write));
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_X1, TCG_REG_X1,
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1,
offsetof(CPUTLBEntry, addend));
/*
@ -1686,25 +1702,26 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
* cross pages using the address of the last byte of the access.
*/
if (a_mask >= s_mask) {
x3 = addr_reg;
addr_adj = addr_reg;
} else {
addr_adj = TCG_REG_TMP2;
tcg_out_insn(s, 3401, ADDI, addr_type,
TCG_REG_X3, addr_reg, s_mask - a_mask);
x3 = TCG_REG_X3;
addr_adj, addr_reg, s_mask - a_mask);
}
compare_mask = (uint64_t)s->page_mask | a_mask;
/* Store the page mask part of the address into X3. */
tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_X3, x3, compare_mask);
/* Store the page mask part of the address into TMP2. */
tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_TMP2,
addr_adj, compare_mask);
/* Perform the address comparison. */
tcg_out_cmp(s, addr_type, TCG_REG_X0, TCG_REG_X3, 0);
tcg_out_cmp(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, 0);
/* If not equal, we jump to the slow path. */
ldst->label_ptr[0] = s->code_ptr;
tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
h->base = TCG_REG_X1,
h->base = TCG_REG_TMP1;
h->index = addr_reg;
h->index_ext = addr_type;
#else
@ -1822,6 +1839,108 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
}
}
static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi,
TCGReg addr_reg, MemOpIdx oi, bool is_ld)
{
TCGLabelQemuLdst *ldst;
HostAddress h;
TCGReg base;
bool use_pair;
ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld);
/* Compose the final address, as LDP/STP have no indexing. */
if (h.index == TCG_REG_XZR) {
base = h.base;
} else {
base = TCG_REG_TMP2;
if (h.index_ext == TCG_TYPE_I32) {
/* add base, base, index, uxtw */
tcg_out_insn(s, 3501, ADD, TCG_TYPE_I64, base,
h.base, h.index, MO_32, 0);
} else {
/* add base, base, index */
tcg_out_insn(s, 3502, ADD, 1, base, h.base, h.index);
}
}
use_pair = h.aa.atom < MO_128 || have_lse2;
if (!use_pair) {
tcg_insn_unit *branch = NULL;
TCGReg ll, lh, sl, sh;
/*
* If we have already checked for 16-byte alignment, that's all
* we need. Otherwise we have determined that misaligned atomicity
* may be handled with two 8-byte loads.
*/
if (h.aa.align < MO_128) {
/*
* TODO: align should be MO_64, so we only need test bit 3,
* which means we could use TBNZ instead of ANDS+B_C.
*/
tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, 15);
branch = s->code_ptr;
tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
use_pair = true;
}
if (is_ld) {
/*
* 16-byte atomicity without LSE2 requires LDXP+STXP loop:
* ldxp lo, hi, [base]
* stxp t0, lo, hi, [base]
* cbnz t0, .-8
* Require no overlap between data{lo,hi} and base.
*/
if (datalo == base || datahi == base) {
tcg_out_mov(s, TCG_TYPE_REG, TCG_REG_TMP2, base);
base = TCG_REG_TMP2;
}
ll = sl = datalo;
lh = sh = datahi;
} else {
/*
* 16-byte atomicity without LSE2 requires LDXP+STXP loop:
* 1: ldxp t0, t1, [base]
* stxp t0, lo, hi, [base]
* cbnz t0, 1b
*/
tcg_debug_assert(base != TCG_REG_TMP0 && base != TCG_REG_TMP1);
ll = TCG_REG_TMP0;
lh = TCG_REG_TMP1;
sl = datalo;
sh = datahi;
}
tcg_out_insn(s, 3306, LDXP, TCG_REG_XZR, ll, lh, base);
tcg_out_insn(s, 3306, STXP, TCG_REG_TMP0, sl, sh, base);
tcg_out_insn(s, 3201, CBNZ, 0, TCG_REG_TMP0, -2);
if (use_pair) {
/* "b .+8", branching across the one insn of use_pair. */
tcg_out_insn(s, 3206, B, 2);
reloc_pc19(branch, tcg_splitwx_to_rx(s->code_ptr));
}
}
if (use_pair) {
if (is_ld) {
tcg_out_insn(s, 3314, LDP, datalo, datahi, base, 0, 1, 0);
} else {
tcg_out_insn(s, 3314, STP, datalo, datahi, base, 0, 1, 0);
}
}
if (ldst) {
ldst->type = TCG_TYPE_I128;
ldst->datalo_reg = datalo;
ldst->datahi_reg = datahi;
ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
}
}
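For reference, the LDXP/STXP sequence described in the comments above corresponds to roughly the following hedged, aarch64-only C sketch (not QEMU code; the function name is invented). The store-exclusive writes back the bytes just read, which is why this scheme needs writable pages and, as the tcg-target.h change further below notes, is not usable for user-only emulation without FEAT_LSE2.

#include <stdint.h>

/* Hedged, aarch64-only sketch of a 16-byte atomic load without FEAT_LSE2,
 * mirroring the LDXP/STXP loop the backend emits above.  atomic16_read_ldxp
 * is an invented name. */
static inline void atomic16_read_ldxp(void *ptr, uint64_t *lo, uint64_t *hi)
{
    uint64_t l, h;
    uint32_t fail;

    asm volatile("0: ldxp %[l], %[h], %[mem]\n\t"
                 "stxp %w[f], %[l], %[h], %[mem]\n\t"
                 "cbnz %w[f], 0b"
                 : [mem] "+Q"(*(__uint128_t *)ptr),
                   [l] "=&r"(l), [h] "=&r"(h), [f] "=&r"(fail));
    *lo = l;   /* low doubleword (little-endian host) */
    *hi = h;
}

int main(void)
{
    __uint128_t x = 42;        /* on the stack, hence writable */
    uint64_t lo, hi;

    atomic16_read_ldxp(&x, &lo, &hi);
    return !(lo == 42 && hi == 0);
}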
static const tcg_insn_unit *tb_ret_addr;
static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
@ -1847,7 +1966,7 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
set_jmp_insn_offset(s, which);
tcg_out32(s, I3206_B);
tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
set_jmp_reset_offset(s, which);
}
@ -1866,7 +1985,7 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
ptrdiff_t i_offset = i_addr - jmp_rx;
/* Note that we asserted this in range in tcg_out_goto_tb. */
insn = deposit32(I3305_LDR | TCG_REG_TMP, 5, 19, i_offset >> 2);
insn = deposit32(I3305_LDR | TCG_REG_TMP0, 5, 19, i_offset >> 2);
}
qatomic_set((uint32_t *)jmp_rw, insn);
flush_idcache_range(jmp_rx, jmp_rw, 4);
@ -2060,13 +2179,13 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_rem_i64:
case INDEX_op_rem_i32:
tcg_out_insn(s, 3508, SDIV, ext, TCG_REG_TMP, a1, a2);
tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP, a2, a1);
tcg_out_insn(s, 3508, SDIV, ext, TCG_REG_TMP0, a1, a2);
tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1);
break;
case INDEX_op_remu_i64:
case INDEX_op_remu_i32:
tcg_out_insn(s, 3508, UDIV, ext, TCG_REG_TMP, a1, a2);
tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP, a2, a1);
tcg_out_insn(s, 3508, UDIV, ext, TCG_REG_TMP0, a1, a2);
tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1);
break;
case INDEX_op_shl_i64:
@ -2110,8 +2229,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
if (c2) {
tcg_out_rotl(s, ext, a0, a1, a2);
} else {
tcg_out_insn(s, 3502, SUB, 0, TCG_REG_TMP, TCG_REG_XZR, a2);
tcg_out_insn(s, 3508, RORV, ext, a0, a1, TCG_REG_TMP);
tcg_out_insn(s, 3502, SUB, 0, TCG_REG_TMP0, TCG_REG_XZR, a2);
tcg_out_insn(s, 3508, RORV, ext, a0, a1, TCG_REG_TMP0);
}
break;
@ -2161,6 +2280,14 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_qemu_st_a64_i64:
tcg_out_qemu_st(s, REG0(0), a1, a2, ext);
break;
case INDEX_op_qemu_ld_a32_i128:
case INDEX_op_qemu_ld_a64_i128:
tcg_out_qemu_ldst_i128(s, a0, a1, a2, args[3], true);
break;
case INDEX_op_qemu_st_a32_i128:
case INDEX_op_qemu_st_a64_i128:
tcg_out_qemu_ldst_i128(s, REG0(0), REG0(1), a2, args[3], false);
break;
case INDEX_op_bswap64_i64:
tcg_out_rev(s, TCG_TYPE_I64, MO_64, a0, a1);
@ -2517,8 +2644,8 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
break;
}
}
tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0);
a2 = TCG_VEC_TMP;
tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP0, 0);
a2 = TCG_VEC_TMP0;
}
if (is_scalar) {
insn = cmp_scalar_insn[cond];
@ -2799,12 +2926,18 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_qemu_ld_a64_i32:
case INDEX_op_qemu_ld_a32_i64:
case INDEX_op_qemu_ld_a64_i64:
return C_O1_I1(r, l);
return C_O1_I1(r, r);
case INDEX_op_qemu_ld_a32_i128:
case INDEX_op_qemu_ld_a64_i128:
return C_O2_I1(r, r, r);
case INDEX_op_qemu_st_a32_i32:
case INDEX_op_qemu_st_a64_i32:
case INDEX_op_qemu_st_a32_i64:
case INDEX_op_qemu_st_a64_i64:
return C_O0_I2(lZ, l);
return C_O0_I2(rZ, r);
case INDEX_op_qemu_st_a32_i128:
case INDEX_op_qemu_st_a64_i128:
return C_O0_I3(rZ, rZ, r);
case INDEX_op_deposit_i32:
case INDEX_op_deposit_i64:
@ -2900,9 +3033,11 @@ static void tcg_target_init(TCGContext *s)
s->reserved_regs = 0;
tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_FP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_X18); /* platform register */
tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP0);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP1);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP2);
tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP0);
}
/* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)). */

View file

@ -16,7 +16,6 @@
#include "host/cpuinfo.h"
#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
typedef enum {
@ -131,7 +130,16 @@ typedef enum {
#define TCG_TARGET_HAS_muluh_i64 1
#define TCG_TARGET_HAS_mulsh_i64 1
#define TCG_TARGET_HAS_qemu_ldst_i128 0
/*
* Without FEAT_LSE2, we must use LDXP+STXP to implement atomic 128-bit load,
* which requires writable pages. We must defer to the helper for user-only,
* but in system mode all ram is writable for the host.
*/
#ifdef CONFIG_USER_ONLY
#define TCG_TARGET_HAS_qemu_ldst_i128 have_lse2
#else
#define TCG_TARGET_HAS_qemu_ldst_i128 1
#endif
#define TCG_TARGET_HAS_v64 1
#define TCG_TARGET_HAS_v128 1

View file

@ -31,7 +31,6 @@ extern int arm_arch;
#define use_armv7_instructions (__ARM_ARCH >= 7 || arm_arch >= 7)
#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
#define MAX_CODE_GEN_BUFFER_SIZE UINT32_MAX
typedef enum {

View file

@ -91,6 +91,8 @@ static const int tcg_target_reg_alloc_order[] = {
#endif
};
#define TCG_TMP_VEC TCG_REG_XMM5
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
@ -319,6 +321,8 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
@ -1753,7 +1757,21 @@ typedef struct {
bool tcg_target_has_memory_bswap(MemOp memop)
{
return have_movbe;
TCGAtomAlign aa;
if (!have_movbe) {
return false;
}
if ((memop & MO_SIZE) < MO_128) {
return true;
}
/*
* Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
* but do allow a pair of 64-bit operations, i.e. MOVBEQ.
*/
aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
return aa.atom < MO_128;
}
/*
@ -1781,6 +1799,30 @@ static const TCGLdstHelperParam ldst_helper_param = {
static const TCGLdstHelperParam ldst_helper_param = { };
#endif
static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
TCGReg l, TCGReg h, TCGReg v)
{
int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
/* vpmov{d,q} %v, %l */
tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
/* vpextr{d,q} $1, %v, %h */
tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
tcg_out8(s, 1);
}
static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
TCGReg v, TCGReg l, TCGReg h)
{
int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
/* vmov{d,q} %l, %v */
tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
/* vpinsr{d,q} $1, %h, %v, %v */
tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
tcg_out8(s, 1);
}
/*
* Generate code for the slow path for a load at the end of block
*/
@ -1870,6 +1912,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
{
TCGLabelQemuLdst *ldst = NULL;
MemOp opc = get_memop(oi);
MemOp s_bits = opc & MO_SIZE;
unsigned a_mask;
#ifdef CONFIG_SOFTMMU
@ -1880,7 +1923,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
*h = x86_guest_base;
#endif
h->base = addrlo;
h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false);
h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
a_mask = (1 << h->aa.align) - 1;
#ifdef CONFIG_SOFTMMU
@ -1890,7 +1933,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
TCGType tlbtype = TCG_TYPE_I32;
int trexw = 0, hrexw = 0, tlbrexw = 0;
unsigned mem_index = get_mmuidx(oi);
unsigned s_bits = opc & MO_SIZE;
unsigned s_mask = (1 << s_bits) - 1;
int tlb_mask;
@ -2070,6 +2112,72 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
h.base, h.index, 0, h.ofs + 4);
}
break;
case MO_128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
/*
* Without 16-byte atomicity, use integer regs.
* That is where we want the data, and it allows bswaps.
*/
if (h.aa.atom < MO_128) {
if (use_movbe) {
TCGReg t = datalo;
datalo = datahi;
datahi = t;
}
if (h.base == datalo || h.index == datalo) {
tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
h.base, h.index, 0, h.ofs);
tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
datalo, datahi, 0);
tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
datahi, datahi, 8);
} else {
tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
h.base, h.index, 0, h.ofs);
tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
h.base, h.index, 0, h.ofs + 8);
}
break;
}
/*
* With 16-byte atomicity, a vector load is required.
* If we already have 16-byte alignment, then VMOVDQA always works.
* Else if VMOVDQU has atomicity with dynamic alignment, use that.
* Else we require a runtime test for alignment for VMOVDQA;
* use VMOVDQU on the unaligned nonatomic path for simplicity.
*/
if (h.aa.align >= MO_128) {
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
TCG_TMP_VEC, 0,
h.base, h.index, 0, h.ofs);
} else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
TCG_TMP_VEC, 0,
h.base, h.index, 0, h.ofs);
} else {
TCGLabel *l1 = gen_new_label();
TCGLabel *l2 = gen_new_label();
tcg_out_testi(s, h.base, 15);
tcg_out_jxx(s, JCC_JNE, l1, true);
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
TCG_TMP_VEC, 0,
h.base, h.index, 0, h.ofs);
tcg_out_jxx(s, JCC_JMP, l2, true);
tcg_out_label(s, l1);
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
TCG_TMP_VEC, 0,
h.base, h.index, 0, h.ofs);
tcg_out_label(s, l2);
}
tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
break;
default:
g_assert_not_reached();
}
@ -2140,6 +2248,63 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
h.base, h.index, 0, h.ofs + 4);
}
break;
case MO_128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
/*
* Without 16-byte atomicity, use integer regs.
* That is where we have the data, and it allows bswaps.
*/
if (h.aa.atom < MO_128) {
if (use_movbe) {
TCGReg t = datalo;
datalo = datahi;
datahi = t;
}
tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
h.base, h.index, 0, h.ofs);
tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
h.base, h.index, 0, h.ofs + 8);
break;
}
/*
* With 16-byte atomicity, a vector store is required.
* If we already have 16-byte alignment, then VMOVDQA always works.
* Else if VMOVDQU has atomicity with dynamic alignment, use that.
* Else we require a runtime test for alignment for VMOVDQA;
* use VMOVDQU on the unaligned nonatomic path for simplicity.
*/
tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
if (h.aa.align >= MO_128) {
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
TCG_TMP_VEC, 0,
h.base, h.index, 0, h.ofs);
} else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
TCG_TMP_VEC, 0,
h.base, h.index, 0, h.ofs);
} else {
TCGLabel *l1 = gen_new_label();
TCGLabel *l2 = gen_new_label();
tcg_out_testi(s, h.base, 15);
tcg_out_jxx(s, JCC_JNE, l1, true);
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
TCG_TMP_VEC, 0,
h.base, h.index, 0, h.ofs);
tcg_out_jxx(s, JCC_JMP, l2, true);
tcg_out_label(s, l1);
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
TCG_TMP_VEC, 0,
h.base, h.index, 0, h.ofs);
tcg_out_label(s, l2);
}
break;
default:
g_assert_not_reached();
}
@ -2470,6 +2635,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
}
break;
case INDEX_op_qemu_ld_a32_i128:
case INDEX_op_qemu_ld_a64_i128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
break;
case INDEX_op_qemu_st_a64_i32:
case INDEX_op_qemu_st8_a64_i32:
@ -2496,6 +2666,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
}
break;
case INDEX_op_qemu_st_a32_i128:
case INDEX_op_qemu_st_a64_i128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
break;
OP_32_64(mulu2):
tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
@ -3193,6 +3368,15 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_qemu_st_a64_i64:
return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
case INDEX_op_qemu_ld_a32_i128:
case INDEX_op_qemu_ld_a64_i128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
return C_O2_I1(r, r, L);
case INDEX_op_qemu_st_a32_i128:
case INDEX_op_qemu_st_a64_i128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
return C_O0_I3(L, L, L);
case INDEX_op_brcond2_i32:
return C_O0_I4(r, r, ri, ri);
@ -3962,6 +4146,7 @@ static void tcg_target_init(TCGContext *s)
s->reserved_regs = 0;
tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
/* These are call saved, and we don't save them, so don't use them. */
tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);

View file

@ -28,7 +28,6 @@
#include "host/cpuinfo.h"
#define TCG_TARGET_INSN_UNIT_SIZE 1
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
#ifdef __x86_64__
# define TCG_TARGET_REG_BITS 64
@ -118,7 +117,6 @@ typedef enum {
#define have_avx1 (cpuinfo & CPUINFO_AVX1)
#define have_avx2 (cpuinfo & CPUINFO_AVX2)
#define have_movbe (cpuinfo & CPUINFO_MOVBE)
#define have_atomic16 (cpuinfo & CPUINFO_ATOMIC_VMOVDQA)
/*
* There are interesting instructions in AVX512, so long as we have AVX512VL,
@ -202,7 +200,8 @@ typedef enum {
#define TCG_TARGET_HAS_qemu_st8_i32 1
#endif
#define TCG_TARGET_HAS_qemu_ldst_i128 0
#define TCG_TARGET_HAS_qemu_ldst_i128 \
(TCG_TARGET_REG_BITS == 64 && (cpuinfo & CPUINFO_ATOMIC_VMOVDQA))
/* We do not support older SSE systems, only beginning with AVX1. */
#define TCG_TARGET_HAS_v64 have_avx1

View file

@ -36,7 +36,6 @@
#endif
#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
#define TCG_TARGET_NB_REGS 32
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)

View file

@ -14,6 +14,7 @@ C_O0_I2(r, r)
C_O0_I2(r, ri)
C_O0_I2(v, r)
C_O0_I3(r, r, r)
C_O0_I3(o, m, r)
C_O0_I4(r, r, ri, ri)
C_O0_I4(r, r, r, r)
C_O1_I1(r, r)
@ -34,6 +35,7 @@ C_O1_I3(v, v, v, v)
C_O1_I4(r, r, ri, rZ, rZ)
C_O1_I4(r, r, r, ri, ri)
C_O2_I1(r, r, r)
C_O2_I1(o, m, r)
C_O2_I2(r, r, r, r)
C_O2_I4(r, r, rI, rZM, r, r)
C_O2_I4(r, r, r, r, rI, rZM)

View file

@ -9,6 +9,7 @@
* REGS(letter, register_mask)
*/
REGS('r', ALL_GENERAL_REGS)
REGS('o', ALL_GENERAL_REGS & 0xAAAAAAAAu) /* odd registers */
REGS('v', ALL_VECTOR_REGS)
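
A note on the new 'o' constraint above: bit n of a register mask stands for register n, so ANDing ALL_GENERAL_REGS with 0xAAAAAAAAu keeps only the odd-numbered GPRs. A minimal sketch, assuming a hypothetical helper (not part of the backend) and regno < 32:

#include <stdbool.h>

/* 0xAAAAAAAA is binary ...10101010: bits 1, 3, 5, ... are set,
 * so only odd register numbers pass the mask. */
static bool allowed_by_odd_mask(unsigned regno)
{
    return (0xAAAAAAAAu >> regno) & 1;
}

This matches the assertions in the ppc tcg_out_qemu_ldst_i128 further below, where datalo must be odd and datahi the preceding even register, since LQ/STQ operate on an even/odd pair.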
/*

View file

@ -295,25 +295,27 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
#define B OPCD( 18)
#define BC OPCD( 16)
#define LBZ OPCD( 34)
#define LHZ OPCD( 40)
#define LHA OPCD( 42)
#define LWZ OPCD( 32)
#define LWZUX XO31( 55)
#define STB OPCD( 38)
#define STH OPCD( 44)
#define STW OPCD( 36)
#define STD XO62( 0)
#define STDU XO62( 1)
#define STDX XO31(149)
#define LD XO58( 0)
#define LDX XO31( 21)
#define LDU XO58( 1)
#define LDUX XO31( 53)
#define LWA XO58( 2)
#define LWAX XO31(341)
#define LQ OPCD( 56)
#define STB OPCD( 38)
#define STH OPCD( 44)
#define STW OPCD( 36)
#define STD XO62( 0)
#define STDU XO62( 1)
#define STDX XO31(149)
#define STQ XO62( 2)
#define ADDIC OPCD( 12)
#define ADDI OPCD( 14)
@ -2020,7 +2022,18 @@ typedef struct {
bool tcg_target_has_memory_bswap(MemOp memop)
{
return true;
TCGAtomAlign aa;
if ((memop & MO_SIZE) <= MO_64) {
return true;
}
/*
* Reject 16-byte memop with 16-byte atomicity,
* but do allow a pair of 64-bit operations.
*/
aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
return aa.atom <= MO_64;
}
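
In other words, a byte-swapped 16-byte access is only advertised when its atomicity requirement can be met by two 8-byte operations; the LQ/STQ path in tcg_out_qemu_ldst_i128 below asserts !need_bswap, since those instructions have no byte-reversed form.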
/*
@ -2035,7 +2048,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
{
TCGLabelQemuLdst *ldst = NULL;
MemOp opc = get_memop(oi);
MemOp a_bits;
MemOp a_bits, s_bits;
/*
* Book II, Section 1.4, Single-Copy Atomicity, specifies:
@ -2047,10 +2060,11 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
* As of 3.0, "the non-atomic access is performed as described in
* the corresponding list", which matches MO_ATOM_SUBALIGN.
*/
s_bits = opc & MO_SIZE;
h->aa = atom_and_align_for_opc(s, opc,
have_isa_3_00 ? MO_ATOM_SUBALIGN
: MO_ATOM_IFALIGN,
false);
s_bits == MO_128);
a_bits = h->aa.align;
#ifdef CONFIG_SOFTMMU
@ -2060,7 +2074,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
int fast_off = TLB_MASK_TABLE_OFS(mem_index);
int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
int table_off = fast_off + offsetof(CPUTLBDescFast, table);
unsigned s_bits = opc & MO_SIZE;
ldst = new_ldst_label(s);
ldst->is_ld = is_ld;
@ -2303,6 +2316,60 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
}
}
static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi,
TCGReg addr_reg, MemOpIdx oi, bool is_ld)
{
TCGLabelQemuLdst *ldst;
HostAddress h;
bool need_bswap;
uint32_t insn;
TCGReg index;
ldst = prepare_host_addr(s, &h, addr_reg, -1, oi, is_ld);
/* Compose the final address, as LQ/STQ have no indexing. */
index = h.index;
if (h.base != 0) {
index = TCG_REG_TMP1;
tcg_out32(s, ADD | TAB(index, h.base, h.index));
}
need_bswap = get_memop(oi) & MO_BSWAP;
if (h.aa.atom == MO_128) {
tcg_debug_assert(!need_bswap);
tcg_debug_assert(datalo & 1);
tcg_debug_assert(datahi == datalo - 1);
insn = is_ld ? LQ : STQ;
tcg_out32(s, insn | TAI(datahi, index, 0));
} else {
TCGReg d1, d2;
if (HOST_BIG_ENDIAN ^ need_bswap) {
d1 = datahi, d2 = datalo;
} else {
d1 = datalo, d2 = datahi;
}
if (need_bswap) {
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 8);
insn = is_ld ? LDBRX : STDBRX;
tcg_out32(s, insn | TAB(d1, 0, index));
tcg_out32(s, insn | TAB(d2, index, TCG_REG_R0));
} else {
insn = is_ld ? LD : STD;
tcg_out32(s, insn | TAI(d1, index, 0));
tcg_out32(s, insn | TAI(d2, index, 8));
}
}
if (ldst) {
ldst->type = TCG_TYPE_I128;
ldst->datalo_reg = datalo;
ldst->datahi_reg = datahi;
ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
}
}
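
The register ordering on the pair path can be restated compactly. A minimal sketch with a hypothetical helper that only models which 64-bit half of the value ends up at the lower address (byte order within each half is handled by the LDBRX/STDBRX choice above):

#include <stdbool.h>
#include <stdint.h>

/* The half placed at offset 0 is the high half exactly when the
 * in-memory order is big-endian, i.e. when HOST_BIG_ENDIAN ^ need_bswap
 * is true; the other half goes at offset 8. */
static uint64_t half_at_offset_0(uint64_t lo, uint64_t hi,
                                 bool host_big_endian, bool need_bswap)
{
    return (host_big_endian ^ need_bswap) ? hi : lo;
}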
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
int i;
@ -2860,6 +2927,11 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
args[4], TCG_TYPE_I64);
}
break;
case INDEX_op_qemu_ld_a32_i128:
case INDEX_op_qemu_ld_a64_i128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], true);
break;
case INDEX_op_qemu_st_a64_i32:
if (TCG_TARGET_REG_BITS == 32) {
@ -2889,6 +2961,11 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
args[4], TCG_TYPE_I64);
}
break;
case INDEX_op_qemu_st_a32_i128:
case INDEX_op_qemu_st_a64_i128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], false);
break;
case INDEX_op_setcond_i32:
tcg_out_setcond(s, TCG_TYPE_I32, args[3], args[0], args[1], args[2],
@ -3722,6 +3799,13 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_qemu_st_a64_i64:
return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r) : C_O0_I4(r, r, r, r);
case INDEX_op_qemu_ld_a32_i128:
case INDEX_op_qemu_ld_a64_i128:
return C_O2_I1(o, m, r);
case INDEX_op_qemu_st_a32_i128:
case INDEX_op_qemu_st_a64_i128:
return C_O0_I3(o, m, r);
case INDEX_op_add_vec:
case INDEX_op_sub_vec:
case INDEX_op_mul_vec:

View file

@ -34,7 +34,6 @@
#define TCG_TARGET_NB_REGS 64
#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
typedef enum {
TCG_REG_R0, TCG_REG_R1, TCG_REG_R2, TCG_REG_R3,
@ -149,7 +148,8 @@ extern bool have_vsx;
#define TCG_TARGET_HAS_mulsh_i64 1
#endif
#define TCG_TARGET_HAS_qemu_ldst_i128 0
#define TCG_TARGET_HAS_qemu_ldst_i128 \
(TCG_TARGET_REG_BITS == 64 && have_isa_2_07)
/*
* While technically Altivec could support V64, it has no 64-bit store

View file

@ -35,7 +35,6 @@
#define TCG_TARGET_REG_BITS 64
#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 20
#define TCG_TARGET_NB_REGS 32
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)

View file

@ -14,6 +14,7 @@ C_O0_I2(r, r)
C_O0_I2(r, ri)
C_O0_I2(r, rA)
C_O0_I2(v, r)
C_O0_I3(o, m, r)
C_O1_I1(r, r)
C_O1_I1(v, r)
C_O1_I1(v, v)
@ -36,6 +37,7 @@ C_O1_I2(v, v, v)
C_O1_I3(v, v, v, v)
C_O1_I4(r, r, ri, rI, r)
C_O1_I4(r, r, rA, rI, r)
C_O2_I1(o, m, r)
C_O2_I2(o, m, 0, r)
C_O2_I2(o, m, r, r)
C_O2_I3(o, m, 0, 1, r)

View file

@ -243,6 +243,7 @@ typedef enum S390Opcode {
RXY_LLGF = 0xe316,
RXY_LLGH = 0xe391,
RXY_LMG = 0xeb04,
RXY_LPQ = 0xe38f,
RXY_LRV = 0xe31e,
RXY_LRVG = 0xe30f,
RXY_LRVH = 0xe31f,
@ -253,6 +254,7 @@ typedef enum S390Opcode {
RXY_STG = 0xe324,
RXY_STHY = 0xe370,
RXY_STMG = 0xeb24,
RXY_STPQ = 0xe38e,
RXY_STRV = 0xe33e,
RXY_STRVG = 0xe32f,
RXY_STRVH = 0xe33f,
@ -1577,7 +1579,18 @@ typedef struct {
bool tcg_target_has_memory_bswap(MemOp memop)
{
return true;
TCGAtomAlign aa;
if ((memop & MO_SIZE) <= MO_64) {
return true;
}
/*
* Reject 16-byte memop with 16-byte atomicity,
* but do allow a pair of 64-bit operations.
*/
aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
return aa.atom <= MO_64;
}
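
As on ppc, this means a byte-reversed 16-byte access is only advertised when it can be performed as two 8-byte operations (the LRVG/STRVG pair in the function below); the LPQ/STPQ path asserts !need_bswap.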
static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg data,
@ -1734,13 +1747,13 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
{
TCGLabelQemuLdst *ldst = NULL;
MemOp opc = get_memop(oi);
MemOp s_bits = opc & MO_SIZE;
unsigned a_mask;
h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false);
h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
a_mask = (1 << h->aa.align) - 1;
#ifdef CONFIG_SOFTMMU
unsigned s_bits = opc & MO_SIZE;
unsigned s_mask = (1 << s_bits) - 1;
int mem_index = get_mmuidx(oi);
int fast_off = TLB_MASK_TABLE_OFS(mem_index);
@ -1865,6 +1878,80 @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
}
}
static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi,
TCGReg addr_reg, MemOpIdx oi, bool is_ld)
{
TCGLabel *l1 = NULL, *l2 = NULL;
TCGLabelQemuLdst *ldst;
HostAddress h;
bool need_bswap;
bool use_pair;
S390Opcode insn;
ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld);
use_pair = h.aa.atom < MO_128;
need_bswap = get_memop(oi) & MO_BSWAP;
if (!use_pair) {
/*
* Atomicity requires we use LPQ. If we've already checked for
* 16-byte alignment, that's all we need. If we arrive with
         * lesser alignment, we have already determined that the required
         * atomicity can be satisfied with two 8-byte loads.
*/
if (h.aa.align < MO_128) {
use_pair = true;
l1 = gen_new_label();
l2 = gen_new_label();
tcg_out_insn(s, RI, TMLL, addr_reg, 15);
tgen_branch(s, 7, l1); /* CC in {1,2,3} */
}
tcg_debug_assert(!need_bswap);
tcg_debug_assert(datalo & 1);
tcg_debug_assert(datahi == datalo - 1);
insn = is_ld ? RXY_LPQ : RXY_STPQ;
tcg_out_insn_RXY(s, insn, datahi, h.base, h.index, h.disp);
if (use_pair) {
tgen_branch(s, S390_CC_ALWAYS, l2);
tcg_out_label(s, l1);
}
}
if (use_pair) {
TCGReg d1, d2;
if (need_bswap) {
d1 = datalo, d2 = datahi;
insn = is_ld ? RXY_LRVG : RXY_STRVG;
} else {
d1 = datahi, d2 = datalo;
insn = is_ld ? RXY_LG : RXY_STG;
}
if (h.base == d1 || h.index == d1) {
tcg_out_insn(s, RXY, LAY, TCG_TMP0, h.base, h.index, h.disp);
h.base = TCG_TMP0;
h.index = TCG_REG_NONE;
h.disp = 0;
}
tcg_out_insn_RXY(s, insn, d1, h.base, h.index, h.disp);
tcg_out_insn_RXY(s, insn, d2, h.base, h.index, h.disp + 8);
}
if (l2) {
tcg_out_label(s, l2);
}
if (ldst) {
ldst->type = TCG_TYPE_I128;
ldst->datalo_reg = datalo;
ldst->datahi_reg = datahi;
ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
}
}
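
The run-time alignment branch emitted above can be paraphrased in C. This is an illustrative analogue for the big-endian, non-byte-swapped load case only; the helper is hypothetical and byte order within each half is ignored:

#include <stdint.h>
#include <string.h>

static void ld16_sketch(const void *p, uint64_t *hi, uint64_t *lo)
{
    if (((uintptr_t)p & 15) == 0) {
        /* Aligned: stands in for LPQ, a single 16-byte atomic load. */
        unsigned char buf[16];
        memcpy(buf, p, 16);
        memcpy(hi, buf, 8);
        memcpy(lo, buf + 8, 8);
    } else {
        /* Unaligned: only 8-byte atomicity is required, so a pair of
         * LG loads at offsets 0 and 8 suffices. */
        memcpy(hi, p, 8);
        memcpy(lo, (const char *)p + 8, 8);
    }
}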
static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
{
/* Reuse the zeroing that exists for goto_ptr. */
@ -2226,6 +2313,14 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_qemu_st_a64_i64:
tcg_out_qemu_st(s, args[0], args[1], args[2], TCG_TYPE_I64);
break;
case INDEX_op_qemu_ld_a32_i128:
case INDEX_op_qemu_ld_a64_i128:
tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], true);
break;
case INDEX_op_qemu_st_a32_i128:
case INDEX_op_qemu_st_a64_i128:
tcg_out_qemu_ldst_i128(s, args[0], args[1], args[2], args[3], false);
break;
case INDEX_op_ld16s_i64:
tcg_out_mem(s, 0, RXY_LGH, args[0], args[1], TCG_REG_NONE, args[2]);
@ -3107,6 +3202,12 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_qemu_st_a32_i32:
case INDEX_op_qemu_st_a64_i32:
return C_O0_I2(r, r);
case INDEX_op_qemu_ld_a32_i128:
case INDEX_op_qemu_ld_a64_i128:
return C_O2_I1(o, m, r);
case INDEX_op_qemu_st_a32_i128:
case INDEX_op_qemu_st_a64_i128:
return C_O0_I3(o, m, r);
case INDEX_op_deposit_i32:
case INDEX_op_deposit_i64:

View file

@ -26,7 +26,6 @@
#define S390_TCG_TARGET_H
#define TCG_TARGET_INSN_UNIT_SIZE 2
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 19
/* We have a +- 4GB range on the branches; leave some slop. */
#define MAX_CODE_GEN_BUFFER_SIZE (3 * GiB)
@ -140,7 +139,7 @@ extern uint64_t s390_facilities[3];
#define TCG_TARGET_HAS_muluh_i64 0
#define TCG_TARGET_HAS_mulsh_i64 0
#define TCG_TARGET_HAS_qemu_ldst_i128 0
#define TCG_TARGET_HAS_qemu_ldst_i128 1
#define TCG_TARGET_HAS_v64 HAVE_FACILITY(VECTOR)
#define TCG_TARGET_HAS_v128 HAVE_FACILITY(VECTOR)

View file

@ -26,7 +26,6 @@
#define SPARC_TCG_TARGET_H
#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32
#define TCG_TARGET_NB_REGS 32
#define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB)

View file

@ -5736,8 +5736,8 @@ static void tcg_out_ld_helper_ret(TCGContext *s, const TCGLabelQemuLdst *ldst,
mov[0].dst = ldst->datalo_reg;
mov[0].src =
tcg_target_call_oarg_reg(TCG_CALL_RET_NORMAL, HOST_BIG_ENDIAN);
mov[0].dst_type = TCG_TYPE_I32;
mov[0].src_type = TCG_TYPE_I32;
mov[0].dst_type = TCG_TYPE_REG;
mov[0].src_type = TCG_TYPE_REG;
mov[0].src_ext = TCG_TARGET_REG_BITS == 32 ? MO_32 : MO_64;
mov[1].dst = ldst->datahi_reg;

View file

@ -42,7 +42,6 @@
#define TCG_TARGET_INTERPRETER 1
#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
#if UINTPTR_MAX == UINT32_MAX

View file

@ -1,24 +0,0 @@
#!/bin/sh
# This work is licensed under the terms of the GNU LGPL, version 2 or later.
# See the COPYING.LIB file in the top-level directory.
PYTHON=$1
DECODETREE=$2
E=0
# All of these tests should produce errors
for i in err_*.decode; do
if $PYTHON $DECODETREE $i > /dev/null 2> /dev/null; then
# Pass, aka failed to fail.
echo FAIL: $i 1>&2
E=1
fi
done
for i in succ_*.decode; do
if ! $PYTHON $DECODETREE $i > /dev/null 2> /dev/null; then
echo FAIL:$i 1>&2
fi
done
exit $E

View file

@ -0,0 +1,7 @@
# This work is licensed under the terms of the GNU LGPL, version 2 or later.
# See the COPYING.LIB file in the top-level directory.
# Diagnose formats which refer to undefined fields
%field1 field2:3
@fmt ........ ........ ........ ........ %field1
insn 00000000 00000000 00000000 00000000 @fmt

View file

@ -0,0 +1,7 @@
# This work is licensed under the terms of the GNU LGPL, version 2 or later.
# See the COPYING.LIB file in the top-level directory.
# Diagnose fields whose definitions form a loop
%field1 field2:3
%field2 field1:4
insn 00000000 00000000 00000000 00000000 %field1 %field2

View file

@ -0,0 +1,8 @@
# This work is licensed under the terms of the GNU LGPL, version 2 or later.
# See the COPYING.LIB file in the top-level directory.
# Diagnose patterns which refer to undefined fields
&f1 f1 a
%field1 field2:3
@fmt ........ ........ ........ .... a:4 &f1
insn 00000000 00000000 00000000 0000 .... @fmt f1=%field1

View file

@ -0,0 +1,14 @@
# This work is licensed under the terms of the GNU LGPL, version 2 or later.
# See the COPYING.LIB file in the top-level directory.
# Diagnose fields where the format refers to a field defined in the
# pattern and the pattern refers to a field defined in the format.
# This is theoretically not impossible to implement, but is not
# supported by the script at this time.
&abcd a b c d
%refa a:3
%refc c:4
# Format defines 'c' and sets 'b' to an indirect ref to 'a'
@fmt ........ ........ ........ c:8 &abcd b=%refa
# Pattern defines 'a' and sets 'd' to an indirect ref to 'c'
insn 00000000 00000000 00000000 ........ @fmt d=%refc a=6

tests/decode/meson.build  (new file, 64 lines)
View file

@ -0,0 +1,64 @@
err_tests = [
'err_argset1.decode',
'err_argset2.decode',
'err_field1.decode',
'err_field2.decode',
'err_field3.decode',
'err_field4.decode',
'err_field5.decode',
'err_field6.decode',
'err_field7.decode',
'err_field8.decode',
'err_field9.decode',
'err_field10.decode',
'err_init1.decode',
'err_init2.decode',
'err_init3.decode',
'err_init4.decode',
'err_overlap1.decode',
'err_overlap2.decode',
'err_overlap3.decode',
'err_overlap4.decode',
'err_overlap5.decode',
'err_overlap6.decode',
'err_overlap7.decode',
'err_overlap8.decode',
'err_overlap9.decode',
'err_pattern_group_empty.decode',
'err_pattern_group_ident1.decode',
'err_pattern_group_ident2.decode',
'err_pattern_group_nest1.decode',
'err_pattern_group_nest2.decode',
'err_pattern_group_nest3.decode',
'err_pattern_group_overlap1.decode',
'err_width1.decode',
'err_width2.decode',
'err_width3.decode',
'err_width4.decode',
]
succ_tests = [
'succ_argset_type1.decode',
'succ_function.decode',
'succ_ident1.decode',
'succ_named_field.decode',
'succ_pattern_group_nest1.decode',
'succ_pattern_group_nest2.decode',
'succ_pattern_group_nest3.decode',
'succ_pattern_group_nest4.decode',
]
suite = 'decodetree'
decodetree = find_program(meson.project_source_root() / 'scripts/decodetree.py')
foreach t: err_tests
test(fs.replace_suffix(t, ''),
decodetree, args: ['-o', '/dev/null', '--test-for-error', files(t)],
suite: suite)
endforeach
foreach t: succ_tests
test(fs.replace_suffix(t, ''),
decodetree, args: ['-o', '/dev/null', files(t)],
suite: suite)
endforeach
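
With the cases registered here, the whole suite can be run from a configured build directory with, for example, "meson test --suite decodetree"; the err_*.decode inputs are passed --test-for-error so that decodetree itself reports whether the expected diagnostic was produced.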

View file

@ -0,0 +1,19 @@
# This work is licensed under the terms of the GNU LGPL, version 2 or later.
# See the COPYING.LIB file in the top-level directory.
# field using a named_field
%imm_sz 8:8 sz:3
insn 00000000 00000000 ........ 00000000 imm_sz=%imm_sz sz=1
# Ditto, via a format. Here a field in the format
# references a named field defined in the insn pattern:
&imm_a imm alpha
%foo 0:16 alpha:4
@foo 00000001 ........ ........ ........ &imm_a imm=%foo
i1 ........ 00000000 ........ ........ @foo alpha=1
i2 ........ 00000001 ........ ........ @foo alpha=2
# Here the named field is defined in the format and referenced
# from the insn pattern:
@bar 00000010 ........ ........ ........ &imm_a alpha=4
i3 ........ 00000000 ........ ........ @bar imm=%foo

View file

@ -74,10 +74,7 @@ if have_tools and have_vhost_user and 'CONFIG_LINUX' in config_host
dependencies: [qemuutil, vhost_user])
endif
test('decodetree', sh,
args: [ files('decode/check.sh'), config_host['PYTHON'], files('../scripts/decodetree.py') ],
workdir: meson.current_source_dir() / 'decode',
suite: 'decodetree')
subdir('decode')
if 'CONFIG_TCG' in config_all
subdir('fp')