1
0
mirror of https://gitlab.com/qemu-project/qemu synced 2024-07-09 04:27:12 +00:00

tcg/i386: Simplify immediate 8-bit logical vector shifts

The x86 isa does not have this operation, so we need an expansion.
Use the same algorithm that we use for expanding this vector
operation with integers: perform the shift with a wider type
and then mask the bits that must be zero.

This reduces the instruction count from 5 to 2.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Richard Henderson 2024-03-12 14:28:27 -10:00
parent b3ee719e64
commit 2623ca6ac1

View File

@ -3769,49 +3769,20 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
}
}
static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
static void expand_vec_shi(TCGType type, unsigned vece, bool right,
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
TCGv_vec t1, t2;
uint8_t mask;
tcg_debug_assert(vece == MO_8);
t1 = tcg_temp_new_vec(type);
t2 = tcg_temp_new_vec(type);
/*
* Unpack to W, shift, and repack. Tricky bits:
* (1) Use punpck*bw x,x to produce DDCCBBAA,
* i.e. duplicate in other half of the 16-bit lane.
* (2) For right-shift, add 8 so that the high half of the lane
* becomes zero. For left-shift, and left-rotate, we must
* shift up and down again.
* (3) Step 2 leaves high half zero such that PACKUSWB
* (pack with unsigned saturation) does not modify
* the quantity.
*/
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
if (opc != INDEX_op_rotli_vec) {
imm += 8;
}
if (opc == INDEX_op_shri_vec) {
tcg_gen_shri_vec(MO_16, t1, t1, imm);
tcg_gen_shri_vec(MO_16, t2, t2, imm);
if (right) {
mask = 0xff >> imm;
tcg_gen_shri_vec(MO_16, v0, v1, imm);
} else {
tcg_gen_shli_vec(MO_16, t1, t1, imm);
tcg_gen_shli_vec(MO_16, t2, t2, imm);
tcg_gen_shri_vec(MO_16, t1, t1, 8);
tcg_gen_shri_vec(MO_16, t2, t2, 8);
mask = 0xff << imm;
tcg_gen_shli_vec(MO_16, v0, v1, imm);
}
vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
tcg_temp_free_vec(t1);
tcg_temp_free_vec(t2);
tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
}
static void expand_vec_sari(TCGType type, unsigned vece,
@ -3821,7 +3792,7 @@ static void expand_vec_sari(TCGType type, unsigned vece,
switch (vece) {
case MO_8:
/* Unpack to W, shift, and repack, as in expand_vec_shi. */
/* Unpack to 16-bit, shift, and repack. */
t1 = tcg_temp_new_vec(type);
t2 = tcg_temp_new_vec(type);
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
@ -3874,12 +3845,7 @@ static void expand_vec_rotli(TCGType type, unsigned vece,
{
TCGv_vec t;
if (vece == MO_8) {
expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
return;
}
if (have_avx512vbmi2) {
if (vece != MO_8 && have_avx512vbmi2) {
vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
return;
@ -4155,10 +4121,11 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
switch (opc) {
case INDEX_op_shli_vec:
case INDEX_op_shri_vec:
expand_vec_shi(type, vece, opc, v0, v1, a2);
expand_vec_shi(type, vece, false, v0, v1, a2);
break;
case INDEX_op_shri_vec:
expand_vec_shi(type, vece, true, v0, v1, a2);
break;
case INDEX_op_sari_vec:
expand_vec_sari(type, vece, v0, v1, a2);
break;