target/arm: Implement MVE scatter-gather immediate forms

Implement the MVE VLDR/VSTR insns which do scatter-gather using base
addresses from Qm plus or minus an immediate offset (possibly with
writeback). Note that writeback is not predicated but it does have
to honour ECI state, so we have to add an eci_mask check to the
VSTR_SG macros (the VLDR_SG macros already needed this to be able
to distinguish "skip beat" from "set predicated element to 0").

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Peter Maydell 2021-08-13 17:11:57 +01:00
parent dc18628b18
commit fac80f0856
4 changed files with 146 additions and 32 deletions

View file

@ -65,6 +65,11 @@ DEF_HELPER_FLAGS_4(mve_vstrh_sg_os_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vstrw_sg_os_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vstrd_sg_os_ud, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vldrw_sg_wb_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vldrd_sg_wb_ud, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vstrw_sg_wb_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vstrd_sg_wb_ud, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(mve_vdup, TCG_CALL_NO_WG, void, env, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vidupb, TCG_CALL_NO_WG, i32, env, ptr, i32, i32)

View file

@ -43,6 +43,7 @@
&vmaxv qm rda size
&vabav qn qm rda size
&vldst_sg qd qm rn size msize os
&vldst_sg_imm qd qm a w imm
# scatter-gather memory size is in bits 6:4
%sg_msize 6:1 4:1
@ -54,6 +55,10 @@
@vldst_sg .... .... .... rn:4 .... ... size:2 ... ... os:1 &vldst_sg \
qd=%qd qm=%qm msize=%sg_msize
# Qm is in the fields usually labeled Qn
@vldst_sg_imm .... .... a:1 . w:1 . .... .... .... . imm:7 &vldst_sg_imm \
qd=%qd qm=%qn
@1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm
@1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0
@2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn
@ -148,6 +153,11 @@ VLDR_S_sg 111 0 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg
VLDR_U_sg 111 1 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg
VSTR_sg 111 0 1100 1 . 00 .... ... 0 111 . .... .... @vldst_sg
VLDRW_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1110 .... .... @vldst_sg_imm
VLDRD_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1111 .... .... @vldst_sg_imm
VSTRW_sg_imm 111 1 1101 ... 0 ... 0 ... 1 1110 .... .... @vldst_sg_imm
VSTRD_sg_imm 111 1 1101 ... 0 ... 0 ... 1 1111 .... .... @vldst_sg_imm
# Moves between 2 32-bit vector lanes and 2 general purpose registers
VMOV_to_2gp 1110 1100 0 . 00 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd
VMOV_from_2gp 1110 1100 0 . 01 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd

View file

@ -213,7 +213,7 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
* For loads, predicated lanes are zeroed instead of retaining
* their previous values.
*/
#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN) \
#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \
void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
uint32_t base) \
{ \
@ -230,25 +230,35 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
addr = ADDRFN(base, m[H##ESIZE(e)]); \
d[H##ESIZE(e)] = (mask & 1) ? \
cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
if (WB) { \
m[H##ESIZE(e)] = addr; \
} \
} \
mve_advance_vpt(env); \
}
/* We know here TYPE is unsigned so always the same as the offset type */
#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN) \
#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \
void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
uint32_t base) \
{ \
TYPE *d = vd; \
TYPE *m = vm; \
uint16_t mask = mve_element_mask(env); \
uint16_t eci_mask = mve_eci_mask(env); \
unsigned e; \
uint32_t addr; \
for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
if (!(eci_mask & 1)) { \
continue; \
} \
addr = ADDRFN(base, m[H##ESIZE(e)]); \
if (mask & 1) { \
cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
} \
if (WB) { \
m[H##ESIZE(e)] = addr; \
} \
} \
mve_advance_vpt(env); \
}
@ -258,8 +268,10 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
* accesses, controlled by the predicate mask for the relevant beat,
* and with a single 32-bit offset in the first of the two Qm elements.
* Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little).
* Address writeback happens on the odd beats and updates the address
* stored in the even-beat element.
*/
#define DO_VLDR64_SG(OP, ADDRFN) \
#define DO_VLDR64_SG(OP, ADDRFN, WB) \
void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
uint32_t base) \
{ \
@ -276,25 +288,35 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
addr = ADDRFN(base, m[H4(e & ~1)]); \
addr += 4 * (e & 1); \
d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \
if (WB && (e & 1)) { \
m[H4(e & ~1)] = addr - 4; \
} \
} \
mve_advance_vpt(env); \
}
#define DO_VSTR64_SG(OP, ADDRFN) \
#define DO_VSTR64_SG(OP, ADDRFN, WB) \
void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
uint32_t base) \
{ \
uint32_t *d = vd; \
uint32_t *m = vm; \
uint16_t mask = mve_element_mask(env); \
uint16_t eci_mask = mve_eci_mask(env); \
unsigned e; \
uint32_t addr; \
for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
if (!(eci_mask & 1)) { \
continue; \
} \
addr = ADDRFN(base, m[H4(e & ~1)]); \
addr += 4 * (e & 1); \
if (mask & 1) { \
cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \
} \
if (WB && (e & 1)) { \
m[H4(e & ~1)] = addr - 4; \
} \
} \
mve_advance_vpt(env); \
}
@ -304,36 +326,41 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2))
#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3))
DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD)
DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD)
DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD)
DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false)
DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false)
DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false)
DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD)
DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD)
DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD)
DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD)
DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD)
DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD)
DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD)
DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false)
DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false)
DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false)
DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false)
DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false)
DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false)
DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false)
DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH)
DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH)
DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH)
DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW)
DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD)
DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false)
DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false)
DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false)
DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false)
DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false)
DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD)
DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD)
DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD)
DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD)
DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD)
DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD)
DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD)
DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false)
DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false)
DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false)
DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false)
DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false)
DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false)
DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false)
DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH)
DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH)
DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW)
DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD)
DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false)
DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false)
DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false)
DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false)
DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true)
DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true)
DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true)
DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true)
/*
* The mergemask(D, R, M) macro performs the operation "*D = R" but

View file

@ -306,6 +306,78 @@ static bool trans_VSTR_sg(DisasContext *s, arg_vldst_sg *a)
#undef F
/*
 * Common code for the scatter-gather load/store forms that take their
 * base addresses from Qm and apply an immediate offset.
 *
 * @s:     disassembly context
 * @a:     decoded fields; a->imm is the unscaled immediate, a->a selects
 *         add (set) versus subtract (clear), a->qd / a->qm are the vector
 *         register numbers
 * @fn:    helper generator to invoke; NULL is treated as "no such insn"
 * @msize: left shift applied to a->imm to form the byte offset
 *
 * Returns false if the insn should be treated as not decoded (MVE not
 * present, the Q register bank check fails, or @fn is NULL). Otherwise
 * returns true; note that true is also returned, with no helper call
 * emitted, when the ECI check or the FP access check fails
 * (NOTE(review): presumably those checks have then already dealt with
 * the failure themselves -- confirm against their definitions).
 */
static bool do_ldst_sg_imm(DisasContext *s, arg_vldst_sg_imm *a,
                           MVEGenLdStSGFn *fn, unsigned msize)
{
    uint32_t scaled, offset;
    TCGv_ptr qd_ptr, qm_ptr;

    if (!dc_isar_feature(aa32_mve, s)) {
        return false;
    }
    if (!mve_check_qreg_bank(s, a->qd | a->qm)) {
        return false;
    }
    if (!fn) {
        return false;
    }

    if (!mve_eci_check(s)) {
        return true;
    }
    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * The helper only ever adds its offset argument to the base, so a
     * subtracting form is expressed by passing the two's complement
     * of the scaled immediate.
     */
    scaled = a->imm << msize;
    offset = a->a ? scaled : -scaled;

    qd_ptr = mve_qreg_ptr(a->qd);
    qm_ptr = mve_qreg_ptr(a->qm);
    fn(cpu_env, qd_ptr, qm_ptr, tcg_constant_i32(offset));
    tcg_temp_free_ptr(qd_ptr);
    tcg_temp_free_ptr(qm_ptr);

    mve_update_eci(s);
    return true;
}
/*
 * VLDRW (vector gather load, immediate form): 32-bit elements.
 * The 'w' field selects the helper variant that also writes the
 * computed addresses back into Qm.
 */
static bool trans_VLDRW_sg_imm(DisasContext *s, arg_vldst_sg_imm *a)
{
    MVEGenLdStSGFn *helper;

    /* Qd == Qm is UNPREDICTABLE for this load; reject the encoding. */
    if (a->qd == a->qm) {
        return false;
    }

    helper = a->w ? gen_helper_mve_vldrw_sg_wb_uw
                  : gen_helper_mve_vldrw_sg_uw;
    return do_ldst_sg_imm(s, a, helper, MO_32);
}
/*
 * VLDRD (vector gather load, immediate form): 64-bit elements.
 * The 'w' field selects the helper variant that also writes the
 * computed addresses back into Qm.
 */
static bool trans_VLDRD_sg_imm(DisasContext *s, arg_vldst_sg_imm *a)
{
    MVEGenLdStSGFn *helper;

    /* Qd == Qm is UNPREDICTABLE for this load; reject the encoding. */
    if (a->qd == a->qm) {
        return false;
    }

    helper = a->w ? gen_helper_mve_vldrd_sg_wb_ud
                  : gen_helper_mve_vldrd_sg_ud;
    return do_ldst_sg_imm(s, a, helper, MO_64);
}
/*
 * VSTRW (vector scatter store, immediate form): 32-bit elements.
 * The 'w' field selects the helper variant that also writes the
 * computed addresses back into Qm.
 */
static bool trans_VSTRW_sg_imm(DisasContext *s, arg_vldst_sg_imm *a)
{
    MVEGenLdStSGFn *helper = a->w ? gen_helper_mve_vstrw_sg_wb_uw
                                  : gen_helper_mve_vstrw_sg_uw;

    return do_ldst_sg_imm(s, a, helper, MO_32);
}
/*
 * VSTRD (vector scatter store, immediate form): 64-bit elements.
 * The 'w' field selects the helper variant that also writes the
 * computed addresses back into Qm.
 */
static bool trans_VSTRD_sg_imm(DisasContext *s, arg_vldst_sg_imm *a)
{
    MVEGenLdStSGFn *helper = a->w ? gen_helper_mve_vstrd_sg_wb_ud
                                  : gen_helper_mve_vstrd_sg_ud;

    return do_ldst_sg_imm(s, a, helper, MO_64);
}
static bool trans_VDUP(DisasContext *s, arg_VDUP *a)
{
TCGv_ptr qd;