tcg: Add write_aofs to GVecGen3i

tcg/i386: Simplify immediate 8-bit logical vector shifts
 tcg/i386: Optimize setcond of TST{EQ,NE} with 0xffffffff
 tcg/optimize: Optimize setcond with zmask
 accel/tcg: Introduce CF_BP_PAGE
 target/sh4: Update DisasContextBase.insn_start
 gitlab: Drop --static from s390x linux-user build
 gitlab: Streamline ubuntu-22.04-s390x
 -----BEGIN PGP SIGNATURE-----
 
 iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmY6OoAdHHJpY2hhcmQu
 aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV8FEwf7Bhs9bV2Kp4LxUzGq
 +dSHHc/WuCyIILLDQ4kZyXvILuI59wYhrWBUUTzBnAZ/tEf0oMG2y57F/lIcxz9w
 VvsFicMOhtjQ8iBEfl/rkkaYs9BLcxqMTAA3PxNBE6l3bzjcHSTkhey4MoPGRibn
 CkwaLzb2ebNjfgzC1IsNf/tyiMXl0tBQM7JVV4EztaOGEmqw8X0/PyVZDiC3WUNC
 tf9yqiNIlgGkn7rj3sT/rNdi4xlzQybgrb1MCFT6z5cqsW2bwqivRpxHi4yulHKI
 VhYA3kud+TX2ASukpibsSkA+9SbcH/qwOugPhPIu+KANsFUcVKL6Anzv6Ysl9kZ0
 +Wnbow==
 =FJCW
 -----END PGP SIGNATURE-----

Merge tag 'pull-tcg-20240507' of https://gitlab.com/rth7680/qemu into staging

tcg: Add write_aofs to GVecGen3i
tcg/i386: Simplify immediate 8-bit logical vector shifts
tcg/i386: Optimize setcond of TST{EQ,NE} with 0xffffffff
tcg/optimize: Optimize setcond with zmask
accel/tcg: Introduce CF_BP_PAGE
target/sh4: Update DisasContextBase.insn_start
gitlab: Drop --static from s390x linux-user build
gitlab: Streamline ubuntu-22.04-s390x

# -----BEGIN PGP SIGNATURE-----
#
# iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmY6OoAdHHJpY2hhcmQu
# aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV8FEwf7Bhs9bV2Kp4LxUzGq
# +dSHHc/WuCyIILLDQ4kZyXvILuI59wYhrWBUUTzBnAZ/tEf0oMG2y57F/lIcxz9w
# VvsFicMOhtjQ8iBEfl/rkkaYs9BLcxqMTAA3PxNBE6l3bzjcHSTkhey4MoPGRibn
# CkwaLzb2ebNjfgzC1IsNf/tyiMXl0tBQM7JVV4EztaOGEmqw8X0/PyVZDiC3WUNC
# tf9yqiNIlgGkn7rj3sT/rNdi4xlzQybgrb1MCFT6z5cqsW2bwqivRpxHi4yulHKI
# VhYA3kud+TX2ASukpibsSkA+9SbcH/qwOugPhPIu+KANsFUcVKL6Anzv6Ysl9kZ0
# +Wnbow==
# =FJCW
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 07 May 2024 07:28:16 AM PDT
# gpg:                using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg:                issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [ultimate]

* tag 'pull-tcg-20240507' of https://gitlab.com/rth7680/qemu:
  gitlab: Streamline ubuntu-22.04-s390x
  gitlab: Drop --static from s390x linux-user build
  gitlab: Drop --disable-libssh from ubuntu-22.04-s390x.yml
  target/sh4: Update DisasContextBase.insn_start
  accel/tcg: Introduce CF_BP_PAGE
  tcg/optimize: Optimize setcond with zmask
  tcg/i386: Optimize setcond of TST{EQ,NE} with 0xffffffff
  tcg/i386: Simplify immediate 8-bit logical vector shifts
  tcg: Add write_aofs to GVecGen3i

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Richard Henderson 2024-05-07 07:34:58 -07:00
commit 571882c668
8 changed files with 174 additions and 66 deletions

View file

@ -2,7 +2,7 @@
# setup by the scripts/ci/setup/build-environment.yml task
# "Install basic packages to build QEMU on Ubuntu 22.04"
ubuntu-22.04-s390x-all-linux-static:
ubuntu-22.04-s390x-all-linux:
extends: .custom_runner_template
needs: []
stage: build
@ -15,13 +15,13 @@ ubuntu-22.04-s390x-all-linux-static:
script:
- mkdir build
- cd build
- ../configure --enable-debug --static --disable-system
- ../configure --enable-debug --disable-system --disable-tools --disable-docs
|| { cat config.log meson-logs/meson-log.txt; exit 1; }
- make --output-sync -j`nproc`
- make --output-sync check-tcg
- make --output-sync -j`nproc` check
ubuntu-22.04-s390x-all:
ubuntu-22.04-s390x-all-system:
extends: .custom_runner_template
needs: []
stage: build
@ -35,7 +35,7 @@ ubuntu-22.04-s390x-all:
script:
- mkdir build
- cd build
- ../configure --disable-libssh
- ../configure --disable-user
|| { cat config.log meson-logs/meson-log.txt; exit 1; }
- make --output-sync -j`nproc`
- make --output-sync -j`nproc` check
@ -57,7 +57,7 @@ ubuntu-22.04-s390x-alldbg:
script:
- mkdir build
- cd build
- ../configure --enable-debug --disable-libssh
- ../configure --enable-debug
|| { cat config.log meson-logs/meson-log.txt; exit 1; }
- make clean
- make --output-sync -j`nproc`
@ -80,7 +80,7 @@ ubuntu-22.04-s390x-clang:
script:
- mkdir build
- cd build
- ../configure --disable-libssh --cc=clang --cxx=clang++ --enable-sanitizers
- ../configure --cc=clang --cxx=clang++ --enable-sanitizers
|| { cat config.log meson-logs/meson-log.txt; exit 1; }
- make --output-sync -j`nproc`
- make --output-sync -j`nproc` check
@ -101,7 +101,7 @@ ubuntu-22.04-s390x-tci:
script:
- mkdir build
- cd build
- ../configure --disable-libssh --enable-tcg-interpreter
- ../configure --enable-tcg-interpreter
|| { cat config.log meson-logs/meson-log.txt; exit 1; }
- make --output-sync -j`nproc`
@ -122,7 +122,7 @@ ubuntu-22.04-s390x-notcg:
script:
- mkdir build
- cd build
- ../configure --disable-libssh --disable-tcg
- ../configure --disable-tcg
|| { cat config.log meson-logs/meson-log.txt; exit 1; }
- make --output-sync -j`nproc`
- make --output-sync -j`nproc` check

View file

@ -381,7 +381,7 @@ static bool check_for_breakpoints_slow(CPUState *cpu, vaddr pc,
* breakpoints are removed.
*/
if (match_page) {
*cflags = (*cflags & ~CF_COUNT_MASK) | CF_NO_GOTO_TB | 1;
*cflags = (*cflags & ~CF_COUNT_MASK) | CF_NO_GOTO_TB | CF_BP_PAGE | 1;
}
return false;
}

View file

@ -77,6 +77,7 @@ struct TranslationBlock {
#define CF_PARALLEL 0x00008000 /* Generate code for a parallel context */
#define CF_NOIRQ 0x00010000 /* Generate an uninterruptible TB */
#define CF_PCREL 0x00020000 /* Opcodes in TB are PC-relative */
#define CF_BP_PAGE 0x00040000 /* Breakpoint present in code page */
#define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */
#define CF_CLUSTER_SHIFT 24

View file

@ -183,6 +183,8 @@ typedef struct {
bool prefer_i64;
/* Load dest as a 3rd source operand. */
bool load_dest;
/* Write aofs as a 2nd dest operand. */
bool write_aofs;
} GVecGen3i;
typedef struct {

View file

@ -2189,6 +2189,7 @@ static void decode_gusa(DisasContext *ctx, CPUSH4State *env)
*/
for (i = 1; i < max_insns; ++i) {
tcg_gen_insn_start(pc + i * 2, ctx->envflags);
ctx->base.insn_start = tcg_last_op();
}
}
#endif

View file

@ -1658,6 +1658,7 @@ static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
TCGArg dest, TCGArg arg1, TCGArg arg2,
int const_arg2, bool neg)
{
int cmp_rexw = rexw;
bool inv = false;
bool cleared;
int jcc;
@ -1674,6 +1675,18 @@ static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
}
break;
case TCG_COND_TSTNE:
inv = true;
/* fall through */
case TCG_COND_TSTEQ:
/* If arg2 is -1, convert to LTU/GEU vs 1. */
if (const_arg2 && arg2 == 0xffffffffu) {
arg2 = 1;
cmp_rexw = 0;
goto do_ltu;
}
break;
case TCG_COND_LEU:
inv = true;
/* fall through */
@ -1697,7 +1710,7 @@ static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
* We can then use NEG or INC to produce the desired result.
* This is always smaller than the SETCC expansion.
*/
tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, rexw);
tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);
/* X - X - C = -C = (C ? -1 : 0) */
tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
@ -1744,7 +1757,7 @@ static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
cleared = true;
}
jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);
if (!cleared) {
@ -3769,49 +3782,20 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
}
}
static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
static void expand_vec_shi(TCGType type, unsigned vece, bool right,
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
TCGv_vec t1, t2;
uint8_t mask;
tcg_debug_assert(vece == MO_8);
t1 = tcg_temp_new_vec(type);
t2 = tcg_temp_new_vec(type);
/*
* Unpack to W, shift, and repack. Tricky bits:
* (1) Use punpck*bw x,x to produce DDCCBBAA,
* i.e. duplicate in other half of the 16-bit lane.
* (2) For right-shift, add 8 so that the high half of the lane
* becomes zero. For left-shift, and left-rotate, we must
* shift up and down again.
* (3) Step 2 leaves high half zero such that PACKUSWB
* (pack with unsigned saturation) does not modify
* the quantity.
*/
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
if (opc != INDEX_op_rotli_vec) {
imm += 8;
}
if (opc == INDEX_op_shri_vec) {
tcg_gen_shri_vec(MO_16, t1, t1, imm);
tcg_gen_shri_vec(MO_16, t2, t2, imm);
if (right) {
mask = 0xff >> imm;
tcg_gen_shri_vec(MO_16, v0, v1, imm);
} else {
tcg_gen_shli_vec(MO_16, t1, t1, imm);
tcg_gen_shli_vec(MO_16, t2, t2, imm);
tcg_gen_shri_vec(MO_16, t1, t1, 8);
tcg_gen_shri_vec(MO_16, t2, t2, 8);
mask = 0xff << imm;
tcg_gen_shli_vec(MO_16, v0, v1, imm);
}
vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
tcg_temp_free_vec(t1);
tcg_temp_free_vec(t2);
tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
}
static void expand_vec_sari(TCGType type, unsigned vece,
@ -3821,7 +3805,7 @@ static void expand_vec_sari(TCGType type, unsigned vece,
switch (vece) {
case MO_8:
/* Unpack to W, shift, and repack, as in expand_vec_shi. */
/* Unpack to 16-bit, shift, and repack. */
t1 = tcg_temp_new_vec(type);
t2 = tcg_temp_new_vec(type);
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
@ -3874,12 +3858,7 @@ static void expand_vec_rotli(TCGType type, unsigned vece,
{
TCGv_vec t;
if (vece == MO_8) {
expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
return;
}
if (have_avx512vbmi2) {
if (vece != MO_8 && have_avx512vbmi2) {
vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
return;
@ -4155,10 +4134,11 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
switch (opc) {
case INDEX_op_shli_vec:
case INDEX_op_shri_vec:
expand_vec_shi(type, vece, opc, v0, v1, a2);
expand_vec_shi(type, vece, false, v0, v1, a2);
break;
case INDEX_op_shri_vec:
expand_vec_shi(type, vece, true, v0, v1, a2);
break;
case INDEX_op_sari_vec:
expand_vec_sari(type, vece, v0, v1, a2);
break;

View file

@ -2099,6 +2099,108 @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
return false;
}
/*
 * Try to simplify a setcond/negsetcond whose second comparison operand
 * is a constant, using the z_mask recorded for the first operand.
 *
 * @ctx: optimizer context; ctx->type selects 32-bit vs 64-bit handling.
 * @op:  the op to examine: args[0] is the destination, args[1] and
 *       args[2] are the compared operands, args[3] is the TCGCond.
 * @neg: false when called from fold_setcond, true when called from
 *       fold_negsetcond (whose result is 0 / -1 rather than 0 / 1).
 *
 * Returns the result of tcg_opt_gen_movi()/tcg_opt_gen_mov() when the op
 * was replaced by a constant load or a plain move; returns false in every
 * other case, including when the op was rewritten in place to
 * neg/add/xor.
 *
 * NOTE(review): the in-place rewrite path returns false although op no
 * longer has a setcond opcode; the callers shown in this patch then call
 * fold_setcond_tst_pow2() on the same op -- confirm that this is harmless
 * for an already rewritten op.
 */
static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
{
uint64_t a_zmask, b_val;
TCGCond cond;
/* Nothing can be done here unless the second operand is a constant. */
if (!arg_is_const(op->args[2])) {
return false;
}
/*
 * presumably z_mask is the mask of bits that may be nonzero in A,
 * so that A <= a_zmask as an unsigned value -- TODO confirm.
 */
a_zmask = arg_info(op->args[1])->z_mask;
b_val = arg_info(op->args[2])->val;
cond = op->args[3];
/* For a 32-bit operation only the low 32 bits take part in the compare. */
if (ctx->type == TCG_TYPE_I32) {
a_zmask = (uint32_t)a_zmask;
b_val = (uint32_t)b_val;
}
/*
* A with only low bits set vs B with high bits set means that A < B.
* The unsigned and equality conditions then have a known outcome.
*/
if (a_zmask < b_val) {
/* inv stays false for the conditions that are never true when A < B. */
bool inv = false;
switch (cond) {
case TCG_COND_NE:
case TCG_COND_LEU:
case TCG_COND_LTU:
/* These conditions always hold when A < B. */
inv = true;
/* fall through */
case TCG_COND_GTU:
case TCG_COND_GEU:
case TCG_COND_EQ:
/* Known result: 0 or 1 for setcond, 0 or -1 when neg is set. */
return tcg_opt_gen_movi(ctx, op, op->args[0], neg ? -inv : inv);
default:
/* Other conditions (e.g. signed or TST*) are not decided here. */
break;
}
}
/*
* A with only lsb set is already boolean.
*/
if (a_zmask <= 1) {
/* convert: the comparison result is A itself or its complement. */
bool convert = false;
/* inv: the comparison result is the complement of A. */
bool inv = false;
switch (cond) {
case TCG_COND_EQ:
inv = true;
/* fall through */
case TCG_COND_NE:
/* (A != 0) is A and (A == 0) is !A, for A in {0,1}. */
convert = (b_val == 0);
break;
case TCG_COND_LTU:
case TCG_COND_TSTEQ:
inv = true;
/* fall through */
case TCG_COND_GEU:
case TCG_COND_TSTNE:
/* (A >= 1) and (A & 1) != 0 are A; (A < 1) and (A & 1) == 0 are !A. */
convert = (b_val == 1);
break;
default:
break;
}
if (convert) {
TCGOpcode add_opc, xor_opc, neg_opc;
/* Plain setcond whose result equals A: a move is sufficient. */
if (!inv && !neg) {
return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
}
/* Select the opcodes matching the operation width. */
switch (ctx->type) {
case TCG_TYPE_I32:
add_opc = INDEX_op_add_i32;
neg_opc = INDEX_op_neg_i32;
xor_opc = INDEX_op_xor_i32;
break;
case TCG_TYPE_I64:
add_opc = INDEX_op_add_i64;
neg_opc = INDEX_op_neg_i64;
xor_opc = INDEX_op_xor_i64;
break;
default:
g_assert_not_reached();
}
/*
 * Rewrite op in place; args[0] and args[1] are kept as they are.
 * Only the opcode and, for add/xor, args[2] are updated.
 */
if (!inv) {
/* Here neg is set and the result is A: -A gives 0 / -1. */
op->opc = neg_opc;
} else if (neg) {
/* Negated complement: -(A ^ 1) == A - 1, i.e. A + (-1). */
op->opc = add_opc;
op->args[2] = arg_new_constant(ctx, -1);
} else {
/* Complement of a boolean: A ^ 1. */
op->opc = xor_opc;
op->args[2] = arg_new_constant(ctx, 1);
}
/* Returns false even though op was rewritten (see NOTE above). */
return false;
}
}
/* No simplification applied; op is unchanged. */
return false;
}
static void fold_setcond_tst_pow2(OptContext *ctx, TCGOp *op, bool neg)
{
TCGOpcode and_opc, sub_opc, xor_opc, neg_opc, shr_opc;
@ -2200,6 +2302,10 @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
if (i >= 0) {
return tcg_opt_gen_movi(ctx, op, op->args[0], i);
}
if (fold_setcond_zmask(ctx, op, false)) {
return true;
}
fold_setcond_tst_pow2(ctx, op, false);
ctx->z_mask = 1;
@ -2214,6 +2320,10 @@ static bool fold_negsetcond(OptContext *ctx, TCGOp *op)
if (i >= 0) {
return tcg_opt_gen_movi(ctx, op, op->args[0], -i);
}
if (fold_setcond_zmask(ctx, op, true)) {
return true;
}
fold_setcond_tst_pow2(ctx, op, true);
/* Value is {0,-1} so all bits are repetitions of the sign. */

View file

@ -785,7 +785,8 @@ static void expand_3_i32(uint32_t dofs, uint32_t aofs,
}
static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
uint32_t oprsz, int32_t c, bool load_dest,
uint32_t oprsz, int32_t c,
bool load_dest, bool write_aofs,
void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
TCGv_i32 t0 = tcg_temp_new_i32();
@ -801,6 +802,9 @@ static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
}
fni(t2, t0, t1, c);
tcg_gen_st_i32(t2, tcg_env, dofs + i);
if (write_aofs) {
tcg_gen_st_i32(t0, tcg_env, aofs + i);
}
}
tcg_temp_free_i32(t0);
tcg_temp_free_i32(t1);
@ -944,7 +948,8 @@ static void expand_3_i64(uint32_t dofs, uint32_t aofs,
}
static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
uint32_t oprsz, int64_t c, bool load_dest,
uint32_t oprsz, int64_t c,
bool load_dest, bool write_aofs,
void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
TCGv_i64 t0 = tcg_temp_new_i64();
@ -960,6 +965,9 @@ static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
}
fni(t2, t0, t1, c);
tcg_gen_st_i64(t2, tcg_env, dofs + i);
if (write_aofs) {
tcg_gen_st_i64(t0, tcg_env, aofs + i);
}
}
tcg_temp_free_i64(t0);
tcg_temp_free_i64(t1);
@ -1102,7 +1110,8 @@ static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
*/
static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t tysz,
TCGType type, int64_t c, bool load_dest,
TCGType type, int64_t c,
bool load_dest, bool write_aofs,
void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
int64_t))
{
@ -1118,6 +1127,9 @@ static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
}
fni(vece, t2, t0, t1, c);
tcg_gen_st_vec(t2, tcg_env, dofs + i);
if (write_aofs) {
tcg_gen_st_vec(t0, tcg_env, aofs + i);
}
}
}
@ -1471,7 +1483,7 @@ void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
*/
some = QEMU_ALIGN_DOWN(oprsz, 32);
expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
c, g->load_dest, g->fniv);
c, g->load_dest, g->write_aofs, g->fniv);
if (some == oprsz) {
break;
}
@ -1483,18 +1495,20 @@ void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
/* fallthru */
case TCG_TYPE_V128:
expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
c, g->load_dest, g->fniv);
c, g->load_dest, g->write_aofs, g->fniv);
break;
case TCG_TYPE_V64:
expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
c, g->load_dest, g->fniv);
c, g->load_dest, g->write_aofs, g->fniv);
break;
case 0:
if (g->fni8 && check_size_impl(oprsz, 8)) {
expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
expand_3i_i64(dofs, aofs, bofs, oprsz, c,
g->load_dest, g->write_aofs, g->fni8);
} else if (g->fni4 && check_size_impl(oprsz, 4)) {
expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
expand_3i_i32(dofs, aofs, bofs, oprsz, c,
g->load_dest, g->write_aofs, g->fni4);
} else {
assert(g->fno != NULL);
tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);