This commit is contained in:
Nekotekina 2015-04-03 19:56:57 +03:00
parent 59effb3412
commit 1c82e8b1d5
3 changed files with 79 additions and 145 deletions

View file

@ -354,12 +354,12 @@ union _CRT_ALIGN(16) u128
return from64(~_u64[0], ~_u64[1]); return from64(~_u64[0], ~_u64[1]);
} }
// Left column is the pre-commit name (test), right column the post-commit name (is_any_1).
// Returns true when at least one bit of the value is set, i.e. either 64-bit half is non-zero.
__forceinline bool test() const __forceinline bool is_any_1() const // check if any bit is 1
{ {
return _u64[0] || _u64[1]; return _u64[0] || _u64[1];
} }
// Left column is the pre-commit name (inv_test), right column the post-commit name (is_any_0).
// Returns true when at least one bit of the value is clear, i.e. the complement of either 64-bit half is non-zero.
__forceinline bool inv_test() const __forceinline bool is_any_0() const // check if any bit is 0
{ {
return ~_u64[0] || ~_u64[1]; return ~_u64[0] || ~_u64[1];
} }

View file

@ -232,7 +232,7 @@ void ppu_interpreter::VCMPBFP_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPBFP, then derives CR6 from the destination vector: 2 when no bit is set, otherwise 0.
// Left (old) and right (new) columns differ only in the helper name: test() became is_any_1().
VCMPBFP(CPU, op); VCMPBFP(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? 0 : 2; // set 2 if all in bounds CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? 0 : 2; // set 2 if all in bounds
} }
void ppu_interpreter::VCMPEQFP(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPEQFP(PPUThread& CPU, ppu_opcode_t op)
@ -244,7 +244,7 @@ void ppu_interpreter::VCMPEQFP_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPEQFP, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPEQFP(CPU, op); VCMPEQFP(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
} }
void ppu_interpreter::VCMPEQUB(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPEQUB(PPUThread& CPU, ppu_opcode_t op)
@ -256,7 +256,7 @@ void ppu_interpreter::VCMPEQUB_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPEQUB, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPEQUB(CPU, op); VCMPEQUB(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
} }
void ppu_interpreter::VCMPEQUH(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPEQUH(PPUThread& CPU, ppu_opcode_t op)
@ -268,7 +268,7 @@ void ppu_interpreter::VCMPEQUH_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPEQUH, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPEQUH(CPU, op); VCMPEQUH(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
} }
void ppu_interpreter::VCMPEQUW(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPEQUW(PPUThread& CPU, ppu_opcode_t op)
@ -280,7 +280,7 @@ void ppu_interpreter::VCMPEQUW_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPEQUW, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPEQUW(CPU, op); VCMPEQUW(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
} }
void ppu_interpreter::VCMPGEFP(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPGEFP(PPUThread& CPU, ppu_opcode_t op)
@ -292,7 +292,7 @@ void ppu_interpreter::VCMPGEFP_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPGEFP, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPGEFP(CPU, op); VCMPGEFP(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
} }
void ppu_interpreter::VCMPGTFP(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPGTFP(PPUThread& CPU, ppu_opcode_t op)
@ -304,7 +304,7 @@ void ppu_interpreter::VCMPGTFP_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPGTFP, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPGTFP(CPU, op); VCMPGTFP(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
} }
void ppu_interpreter::VCMPGTSB(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPGTSB(PPUThread& CPU, ppu_opcode_t op)
@ -316,7 +316,7 @@ void ppu_interpreter::VCMPGTSB_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPGTSB, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPGTSB(CPU, op); VCMPGTSB(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
} }
void ppu_interpreter::VCMPGTSH(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPGTSH(PPUThread& CPU, ppu_opcode_t op)
@ -328,7 +328,7 @@ void ppu_interpreter::VCMPGTSH_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPGTSH, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPGTSH(CPU, op); VCMPGTSH(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
} }
void ppu_interpreter::VCMPGTSW(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPGTSW(PPUThread& CPU, ppu_opcode_t op)
@ -340,7 +340,7 @@ void ppu_interpreter::VCMPGTSW_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPGTSW, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPGTSW(CPU, op); VCMPGTSW(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
} }
void ppu_interpreter::VCMPGTUB(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPGTUB(PPUThread& CPU, ppu_opcode_t op)
@ -352,7 +352,7 @@ void ppu_interpreter::VCMPGTUB_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPGTUB, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPGTUB(CPU, op); VCMPGTUB(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
} }
void ppu_interpreter::VCMPGTUH(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPGTUH(PPUThread& CPU, ppu_opcode_t op)
@ -364,7 +364,7 @@ void ppu_interpreter::VCMPGTUH_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPGTUH, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPGTUH(CPU, op); VCMPGTUH(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
} }
void ppu_interpreter::VCMPGTUW(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCMPGTUW(PPUThread& CPU, ppu_opcode_t op)
@ -376,7 +376,7 @@ void ppu_interpreter::VCMPGTUW_(PPUThread& CPU, ppu_opcode_t op)
{ {
// Runs VCMPGTUW, then derives CR6 from the destination vector:
// 2 when no bit is set, 8 when no bit is clear, 0 for any mixture.
// The columns differ only in the helper names (test/inv_test -> is_any_1/is_any_0).
VCMPGTUW(CPU, op); VCMPGTUW(CPU, op);
CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
} }
void ppu_interpreter::VCTSXS(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VCTSXS(PPUThread& CPU, ppu_opcode_t op)
@ -417,7 +417,7 @@ void ppu_interpreter::VMAXSB(PPUThread& CPU, ppu_opcode_t op)
// Signed byte maximum (the hunk omits the function's opening brace).
// m is all-ones in every byte lane where a > b; the result takes a in those lanes and b elsewhere:
// (m & a) | (~m & b). The columns differ only in the argument order of the commutative _mm_and_si128.
const auto a = CPU.VPR[op.va].vi; const auto a = CPU.VPR[op.va].vi;
const auto b = CPU.VPR[op.vb].vi; const auto b = CPU.VPR[op.vb].vi;
const auto m = _mm_cmpgt_epi8(a, b); const auto m = _mm_cmpgt_epi8(a, b);
CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b)); CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b));
} }
void ppu_interpreter::VMAXSH(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMAXSH(PPUThread& CPU, ppu_opcode_t op)
@ -430,7 +430,7 @@ void ppu_interpreter::VMAXSW(PPUThread& CPU, ppu_opcode_t op)
// Signed 32-bit maximum (the hunk omits the function's opening brace).
// m is all-ones in every 32-bit lane where a > b; the result takes a in those lanes and b elsewhere:
// (m & a) | (~m & b). The columns differ only in the argument order of the commutative _mm_and_si128.
const auto a = CPU.VPR[op.va].vi; const auto a = CPU.VPR[op.va].vi;
const auto b = CPU.VPR[op.vb].vi; const auto b = CPU.VPR[op.vb].vi;
const auto m = _mm_cmpgt_epi32(a, b); const auto m = _mm_cmpgt_epi32(a, b);
CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b)); CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b));
} }
void ppu_interpreter::VMAXUB(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMAXUB(PPUThread& CPU, ppu_opcode_t op)
@ -449,19 +449,27 @@ void ppu_interpreter::VMAXUW(PPUThread& CPU, ppu_opcode_t op)
// 32-bit maximum using the project helper sse_cmpgt_epu32 (the hunk omits the function's opening brace).
// NOTE(review): the helper is presumably an unsigned greater-than compare producing a lane mask - confirm at its definition.
// The result takes a where m is set and b elsewhere: (m & a) | (~m & b).
// The columns differ only in the argument order of the commutative _mm_and_si128.
const auto a = CPU.VPR[op.va].vi; const auto a = CPU.VPR[op.va].vi;
const auto b = CPU.VPR[op.vb].vi; const auto b = CPU.VPR[op.vb].vi;
const auto m = sse_cmpgt_epu32(a, b); const auto m = sse_cmpgt_epu32(a, b);
CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b)); CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b));
} }
// Per signed 16-bit lane: m holds bits 15..30 of the 32-bit product a*b (bit 15 comes from the low half,
// bits 16..30 from the high half shifted left by one); vc is then added with signed saturation.
// Left (old) column: a single saturating add of m and vc.
// Right (new) column: lanes where m equals 0x8000 are flagged in s. In those lanes m is flipped to 0x7fff
// (m ^ s), vc is added with saturation, and the remaining 1 (s >> 15) is added with saturation afterwards,
// so the product is treated as +0x8000 rather than as the int16 value -0x8000.
void ppu_interpreter::VMHADDSHS(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMHADDSHS(PPUThread& CPU, ppu_opcode_t op)
{ {
const auto a = CPU.VPR[op.va].vi; const auto a = CPU.VPR[op.va].vi;
const auto b = CPU.VPR[op.vb].vi; const auto b = CPU.VPR[op.vb].vi;
CPU.VPR[op.vd].vi = _mm_adds_epi16(_mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1)), CPU.VPR[op.vc].vi); const auto c = CPU.VPR[op.vc].vi;
const auto m = _mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1));
const auto s = _mm_cmpeq_epi16(m, _mm_set1_epi16(-0x8000)); // detect special case (positive 0x8000)
CPU.VPR[op.vd].vi = _mm_adds_epi16(_mm_adds_epi16(_mm_xor_si128(m, s), c), _mm_srli_epi16(s, 15));
} }
// Per signed 16-bit lane: m is the _mm_mulhrs_epi16 product of va and vb; vc is then added with signed saturation.
// Left (old) column: a single saturating add of m and vc.
// Right (new) column: lanes where m equals 0x8000 are flagged in s. In those lanes m is flipped to 0x7fff
// (m ^ s), vc is added with saturation, and the remaining 1 (s >> 15) is added with saturation afterwards,
// so the product is treated as +0x8000 rather than as the int16 value -0x8000.
void ppu_interpreter::VMHRADDSHS(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMHRADDSHS(PPUThread& CPU, ppu_opcode_t op)
{ {
CPU.VPR[op.vd].vi = _mm_adds_epi16(_mm_mulhrs_epi16(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi), CPU.VPR[op.vc].vi); const auto a = CPU.VPR[op.va].vi;
const auto b = CPU.VPR[op.vb].vi;
const auto c = CPU.VPR[op.vc].vi;
const auto m = _mm_mulhrs_epi16(a, b);
const auto s = _mm_cmpeq_epi16(m, _mm_set1_epi16(-0x8000)); // detect special case (positive 0x8000)
CPU.VPR[op.vd].vi = _mm_adds_epi16(_mm_adds_epi16(_mm_xor_si128(m, s), c), _mm_srli_epi16(s, 15));
} }
void ppu_interpreter::VMINFP(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMINFP(PPUThread& CPU, ppu_opcode_t op)
@ -474,7 +482,7 @@ void ppu_interpreter::VMINSB(PPUThread& CPU, ppu_opcode_t op)
// Signed byte minimum (the hunk omits the function's opening brace).
// m is all-ones in every byte lane where a > b; the minimum takes b in those lanes and a elsewhere.
// _mm_andnot_si128(x, y) computes (~x) & y, so the left (old) column's andnot(a, m) yields ~a & m
// instead of the intended ~m & a; the right (new) column passes (m, a).
const auto a = CPU.VPR[op.va].vi; const auto a = CPU.VPR[op.va].vi;
const auto b = CPU.VPR[op.vb].vi; const auto b = CPU.VPR[op.vb].vi;
const auto m = _mm_cmpgt_epi8(a, b); const auto m = _mm_cmpgt_epi8(a, b);
CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(a, m), _mm_and_si128(m, b)); CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b));
} }
void ppu_interpreter::VMINSH(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMINSH(PPUThread& CPU, ppu_opcode_t op)
@ -487,7 +495,7 @@ void ppu_interpreter::VMINSW(PPUThread& CPU, ppu_opcode_t op)
// Signed 32-bit minimum (the hunk omits the function's opening brace).
// m is all-ones in every 32-bit lane where a > b; the minimum takes b in those lanes and a elsewhere.
// _mm_andnot_si128(x, y) computes (~x) & y, so the left (old) column's andnot(a, m) yields ~a & m
// instead of the intended ~m & a; the right (new) column passes (m, a).
const auto a = CPU.VPR[op.va].vi; const auto a = CPU.VPR[op.va].vi;
const auto b = CPU.VPR[op.vb].vi; const auto b = CPU.VPR[op.vb].vi;
const auto m = _mm_cmpgt_epi32(a, b); const auto m = _mm_cmpgt_epi32(a, b);
CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(a, m), _mm_and_si128(m, b)); CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b));
} }
void ppu_interpreter::VMINUB(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMINUB(PPUThread& CPU, ppu_opcode_t op)
@ -506,7 +514,7 @@ void ppu_interpreter::VMINUW(PPUThread& CPU, ppu_opcode_t op)
// 32-bit minimum using the project helper sse_cmpgt_epu32 (the hunk omits the function's opening brace).
// NOTE(review): the helper is presumably an unsigned greater-than compare producing a lane mask - confirm at its definition.
// The minimum takes b where m is set and a elsewhere.
// _mm_andnot_si128(x, y) computes (~x) & y, so the left (old) column's andnot(a, m) yields ~a & m
// instead of the intended ~m & a; the right (new) column passes (m, a).
const auto a = CPU.VPR[op.va].vi; const auto a = CPU.VPR[op.va].vi;
const auto b = CPU.VPR[op.vb].vi; const auto b = CPU.VPR[op.vb].vi;
const auto m = sse_cmpgt_epu32(a, b); const auto m = sse_cmpgt_epu32(a, b);
CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(a, m), _mm_and_si128(m, b)); CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b));
} }
void ppu_interpreter::VMLADDUHM(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMLADDUHM(PPUThread& CPU, ppu_opcode_t op)
@ -548,13 +556,14 @@ void ppu_interpreter::VMSUMMBM(PPUThread& CPU, ppu_opcode_t op)
{ {
// Mixed-sign byte multiply-sum: a holds signed bytes, b holds unsigned bytes.
// Each 16-bit lane is split into its high byte (ah, bh) and low byte (al, bl), widened to 16 bits;
// _mm_madd_epi16 multiplies matching lanes and adds adjacent pairs into 32-bit sums, which are added to vc.
// Low signed byte extraction: the right (new) column shifts left by 8 and then arithmetic-right by 8,
// which sign-extends the low byte. The left (old) column shifted logically right and then arithmetic-right,
// which always yields 0, so the low-byte products were lost.
const auto a = CPU.VPR[op.va].vi; // signed bytes const auto a = CPU.VPR[op.va].vi; // signed bytes
const auto b = CPU.VPR[op.vb].vi; // unsigned bytes const auto b = CPU.VPR[op.vb].vi; // unsigned bytes
const auto c = CPU.VPR[op.vc].vi;
const auto ah = _mm_srai_epi16(a, 8); const auto ah = _mm_srai_epi16(a, 8);
const auto bh = _mm_srli_epi16(b, 8); const auto bh = _mm_srli_epi16(b, 8);
const auto al = _mm_srai_epi16(_mm_srli_epi16(a, 8), 8); const auto al = _mm_srai_epi16(_mm_slli_epi16(a, 8), 8);
const auto bl = _mm_and_si128(b, _mm_set1_epi16(0x00ff)); const auto bl = _mm_and_si128(b, _mm_set1_epi16(0x00ff));
const auto sh = _mm_madd_epi16(ah, bh); const auto sh = _mm_madd_epi16(ah, bh);
const auto sl = _mm_madd_epi16(al, bl); const auto sl = _mm_madd_epi16(al, bl);
CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(CPU.VPR[op.vc].vi, sh), sl); CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(c, sh), sl);
} }
void ppu_interpreter::VMSUMSHM(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMSUMSHM(PPUThread& CPU, ppu_opcode_t op)
@ -595,6 +604,7 @@ void ppu_interpreter::VMSUMUBM(PPUThread& CPU, ppu_opcode_t op)
{ {
const auto a = CPU.VPR[op.va].vi; const auto a = CPU.VPR[op.va].vi;
const auto b = CPU.VPR[op.vb].vi; const auto b = CPU.VPR[op.vb].vi;
const auto c = CPU.VPR[op.vc].vi;
const auto mask = _mm_set1_epi16(0x00ff); const auto mask = _mm_set1_epi16(0x00ff);
const auto ah = _mm_srli_epi16(a, 8); const auto ah = _mm_srli_epi16(a, 8);
const auto al = _mm_and_si128(a, mask); const auto al = _mm_and_si128(a, mask);
@ -602,18 +612,19 @@ void ppu_interpreter::VMSUMUBM(PPUThread& CPU, ppu_opcode_t op)
const auto bl = _mm_and_si128(b, mask); const auto bl = _mm_and_si128(b, mask);
const auto sh = _mm_madd_epi16(ah, bh); const auto sh = _mm_madd_epi16(ah, bh);
const auto sl = _mm_madd_epi16(al, bl); const auto sl = _mm_madd_epi16(al, bl);
CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(CPU.VPR[op.vc].vi, sh), sl); CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(c, sh), sl);
} }
// Unsigned 16-bit multiply-sum with wrapping 32-bit accumulation.
// ml / mh hold the low / high 16 bits of every unsigned 16x16 product.
// ls adds the two low halves found in each 32-bit lane; hs adds the two high halves, each placed in the
// upper 16 bits of the lane. vc + ls + hs therefore equals vc plus both full products of the lane (mod 2^32).
// The right (new) column only hoists vc into the local c.
void ppu_interpreter::VMSUMUHM(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMSUMUHM(PPUThread& CPU, ppu_opcode_t op)
{ {
const auto a = CPU.VPR[op.va].vi; const auto a = CPU.VPR[op.va].vi;
const auto b = CPU.VPR[op.vb].vi; const auto b = CPU.VPR[op.vb].vi;
const auto c = CPU.VPR[op.vc].vi;
const auto ml = _mm_mullo_epi16(a, b); // low results const auto ml = _mm_mullo_epi16(a, b); // low results
const auto mh = _mm_mulhi_epu16(a, b); // high results const auto mh = _mm_mulhi_epu16(a, b); // high results
const auto ls = _mm_add_epi32(_mm_srli_epi32(ml, 16), _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff))); const auto ls = _mm_add_epi32(_mm_srli_epi32(ml, 16), _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff)));
const auto hs = _mm_add_epi32(_mm_slli_epi32(mh, 16), _mm_and_si128(mh, _mm_set1_epi32(0xffff0000))); const auto hs = _mm_add_epi32(_mm_slli_epi32(mh, 16), _mm_and_si128(mh, _mm_set1_epi32(0xffff0000)));
CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(CPU.VPR[op.vc].vi, ls), hs); CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(c, ls), hs);
} }
void ppu_interpreter::VMSUMUHS(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMSUMUHS(PPUThread& CPU, ppu_opcode_t op)
@ -648,7 +659,7 @@ void ppu_interpreter::VMULESB(PPUThread& CPU, ppu_opcode_t op)
// Signed 16x16 -> 32 multiply of the upper halfword of every 32-bit lane.
// Shifting each 32-bit lane right by 16 leaves the upper halfword in the low position and zero above it,
// so _mm_madd_epi16 yields exactly one product per lane (the other pair term is 0 * 0).
// The left (old) column used _mm_srli_epi16 with a count of 16, which clears every 16-bit lane and made
// the result always zero; the right (new) column uses _mm_srli_epi32.
void ppu_interpreter::VMULESH(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMULESH(PPUThread& CPU, ppu_opcode_t op)
{ {
CPU.VPR[op.vd].vi = _mm_madd_epi16(_mm_srli_epi16(CPU.VPR[op.va].vi, 16), _mm_srli_epi16(CPU.VPR[op.vb].vi, 16)); CPU.VPR[op.vd].vi = _mm_madd_epi16(_mm_srli_epi32(CPU.VPR[op.va].vi, 16), _mm_srli_epi32(CPU.VPR[op.vb].vi, 16));
} }
void ppu_interpreter::VMULEUB(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VMULEUB(PPUThread& CPU, ppu_opcode_t op)
@ -708,16 +719,11 @@ void ppu_interpreter::VOR(PPUThread& CPU, ppu_opcode_t op)
// Byte permute of the 32-byte table formed by vb (table bytes 0..15) and va (table bytes 16..31), selected by vc.
// Left (old) column: scalar loop. It copies vb then va into tmpSRC and, for each of the 16 result bytes,
// stores tmpSRC[0x1f - (vc byte & 0x1f)].
// Right (new) column: SSE. index = ~vc & 0x1f, which equals 0x1f - (vc & 0x1f). Both va and vb are shuffled
// by index (_mm_shuffle_epi8 uses the low four index bits), and mask keeps the va result where index > 0xf
// and the vb result otherwise.
void ppu_interpreter::VPERM(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VPERM(PPUThread& CPU, ppu_opcode_t op)
{ {
u8 tmpSRC[32]; const auto index = _mm_andnot_si128(CPU.VPR[op.vc].vi, _mm_set1_epi8(0x1f));
memcpy(tmpSRC, CPU.VPR[op.vb]._u8, 16); const auto mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(0xf));
memcpy(tmpSRC + 16, CPU.VPR[op.va]._u8, 16); const auto sa = _mm_shuffle_epi8(CPU.VPR[op.va].vi, index);
const auto sb = _mm_shuffle_epi8(CPU.VPR[op.vb].vi, index);
for (uint b = 0; b < 16; b++) CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(mask, sa), _mm_andnot_si128(mask, sb));
{
u8 index = CPU.VPR[op.vc]._u8[b] & 0x1f;
CPU.VPR[op.vd]._u8[b] = tmpSRC[0x1f - index];
}
} }
void ppu_interpreter::VPKPX(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VPKPX(PPUThread& CPU, ppu_opcode_t op)
@ -742,108 +748,23 @@ void ppu_interpreter::VPKPX(PPUThread& CPU, ppu_opcode_t op)
// Packs signed 16-bit elements into signed bytes with saturation.
// Left (old) column: scalar loop clamping each s16 of VA to [INT8_MIN, INT8_MAX] into result bytes 8..15
// and each s16 of VB into result bytes 0..7.
// Right (new) column: a single _mm_packs_epi16(vb, va), which places the vb lanes in the low half and
// the va lanes in the high half. The remaining left-only lines are the deleted loop body.
void ppu_interpreter::VPKSHSS(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VPKSHSS(PPUThread& CPU, ppu_opcode_t op)
{ {
u128 VA = CPU.VPR[op.va]; CPU.VPR[op.vd].vi = _mm_packs_epi16(CPU.VPR[op.vb].vi, CPU.VPR[op.va].vi);
u128 VB = CPU.VPR[op.vb];
for (uint b = 0; b < 8; b++)
{
s16 result = VA._s16[b];
if (result > INT8_MAX)
{
result = INT8_MAX;
}
else if (result < INT8_MIN)
{
result = INT8_MIN;
}
CPU.VPR[op.vd]._s8[b + 8] = (s8)result;
result = VB._s16[b];
if (result > INT8_MAX)
{
result = INT8_MAX;
}
else if (result < INT8_MIN)
{
result = INT8_MIN;
}
CPU.VPR[op.vd]._s8[b] = (s8)result;
}
} }
// Packs signed 16-bit elements into unsigned bytes with saturation.
// Left (old) column: scalar loop clamping each s16 of VA to [0, UINT8_MAX] into result bytes 8..15
// and each s16 of VB into result bytes 0..7.
// Right (new) column: a single _mm_packus_epi16(vb, va), which places the vb lanes in the low half and
// the va lanes in the high half. The remaining left-only lines are the deleted loop body.
void ppu_interpreter::VPKSHUS(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VPKSHUS(PPUThread& CPU, ppu_opcode_t op)
{ {
u128 VA = CPU.VPR[op.va]; CPU.VPR[op.vd].vi = _mm_packus_epi16(CPU.VPR[op.vb].vi, CPU.VPR[op.va].vi);
u128 VB = CPU.VPR[op.vb];
for (uint b = 0; b < 8; b++)
{
s16 result = VA._s16[b];
if (result > UINT8_MAX)
{
result = UINT8_MAX;
}
else if (result < 0)
{
result = 0;
}
CPU.VPR[op.vd]._u8[b + 8] = (u8)result;
result = VB._s16[b];
if (result > UINT8_MAX)
{
result = UINT8_MAX;
}
else if (result < 0)
{
result = 0;
}
CPU.VPR[op.vd]._u8[b] = (u8)result;
}
} }
// Packs signed 32-bit elements into signed 16-bit elements with saturation.
// Left (old) column: scalar loop clamping each s32 of VA to [INT16_MIN, INT16_MAX] into result halfwords 4..7
// and each s32 of VB into result halfwords 0..3.
// Right (new) column: a single _mm_packs_epi32(vb, va), which places the vb lanes in the low half and
// the va lanes in the high half. The remaining left-only lines are the deleted loop body.
void ppu_interpreter::VPKSWSS(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VPKSWSS(PPUThread& CPU, ppu_opcode_t op)
{ {
u128 VA = CPU.VPR[op.va]; CPU.VPR[op.vd].vi = _mm_packs_epi32(CPU.VPR[op.vb].vi, CPU.VPR[op.va].vi);
u128 VB = CPU.VPR[op.vb];
for (uint h = 0; h < 4; h++)
{
s32 result = VA._s32[h];
if (result > INT16_MAX)
{
result = INT16_MAX;
}
else if (result < INT16_MIN)
{
result = INT16_MIN;
}
CPU.VPR[op.vd]._s16[h + 4] = result;
result = VB._s32[h];
if (result > INT16_MAX)
{
result = INT16_MAX;
}
else if (result < INT16_MIN)
{
result = INT16_MIN;
}
CPU.VPR[op.vd]._s16[h] = result;
}
} }
void ppu_interpreter::VPKSWUS(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::VPKSWUS(PPUThread& CPU, ppu_opcode_t op)
{ {
//CPU.VPR[op.vd].vi = _mm_packus_epi32(CPU.VPR[op.vb].vi, CPU.VPR[op.va].vi);
u128 VA = CPU.VPR[op.va]; u128 VA = CPU.VPR[op.va];
u128 VB = CPU.VPR[op.vb]; u128 VB = CPU.VPR[op.vb];
for (uint h = 0; h < 4; h++) for (uint h = 0; h < 4; h++)
@ -2047,7 +1968,8 @@ void ppu_interpreter::LBZX(PPUThread& CPU, ppu_opcode_t op)
// Vector load: the address is GPR[ra] + GPR[rb] (GPR[rb] alone when ra is 0) with the low four bits cleared;
// the value read through vm::read128 is stored in VPR[vd].
// The right (new) column names the address and passes it through vm::cast instead of a plain (u64) cast.
void ppu_interpreter::LVX(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::LVX(PPUThread& CPU, ppu_opcode_t op)
{ {
CPU.VPR[op.vd] = vm::read128((u64)((op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfULL)); const u64 addr = (op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfull;
CPU.VPR[op.vd] = vm::read128(vm::cast(addr));
} }
void ppu_interpreter::NEG(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::NEG(PPUThread& CPU, ppu_opcode_t op)
@ -2230,7 +2152,8 @@ void ppu_interpreter::STBX(PPUThread& CPU, ppu_opcode_t op)
// Vector store: the address is GPR[ra] + GPR[rb] (GPR[rb] alone when ra is 0) with the low four bits cleared;
// VPR[vs] is written through vm::write128.
// The right (new) column names the address and passes it through vm::cast instead of a plain (u64) cast.
void ppu_interpreter::STVX(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::STVX(PPUThread& CPU, ppu_opcode_t op)
{ {
vm::write128((u64)((op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfULL), CPU.VPR[op.vs]); const u64 addr = (op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfull;
vm::write128(vm::cast(addr), CPU.VPR[op.vs]);
} }
void ppu_interpreter::MULLD(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::MULLD(PPUThread& CPU, ppu_opcode_t op)
@ -2372,7 +2295,8 @@ void ppu_interpreter::LHAX(PPUThread& CPU, ppu_opcode_t op)
// Vector load, written identically to LVX in this file: the address is GPR[ra] + GPR[rb] (GPR[rb] alone when
// ra is 0) with the low four bits cleared; the value read through vm::read128 is stored in VPR[vd].
// The right (new) column names the address and passes it through vm::cast instead of a plain (u64) cast.
void ppu_interpreter::LVXL(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::LVXL(PPUThread& CPU, ppu_opcode_t op)
{ {
CPU.VPR[op.vd] = vm::read128((u64)((op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfULL)); const u64 addr = (op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfull;
CPU.VPR[op.vd] = vm::read128(vm::cast(addr));
} }
void ppu_interpreter::MFTB(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::MFTB(PPUThread& CPU, ppu_opcode_t op)
@ -2511,7 +2435,8 @@ void ppu_interpreter::NAND(PPUThread& CPU, ppu_opcode_t op)
// Vector store, written identically to STVX in this file: the address is GPR[ra] + GPR[rb] (GPR[rb] alone when
// ra is 0) with the low four bits cleared; VPR[vs] is written through vm::write128.
// The right (new) column names the address and passes it through vm::cast instead of a plain (u64) cast.
void ppu_interpreter::STVXL(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::STVXL(PPUThread& CPU, ppu_opcode_t op)
{ {
vm::write128((u64)((op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfULL), CPU.VPR[op.vs]); const u64 addr = (op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfull;
vm::write128(vm::cast(addr), CPU.VPR[op.vs]);
} }
void ppu_interpreter::DIVD(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::DIVD(PPUThread& CPU, ppu_opcode_t op)
@ -3186,7 +3111,7 @@ void ppu_interpreter::STDU(PPUThread& CPU, ppu_opcode_t op)
void ppu_interpreter::MTFSB1(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::MTFSB1(PPUThread& CPU, ppu_opcode_t op)
{ {
u64 mask = (1ULL << (31 - op.crbd)); u32 mask = 1 << (31 - op.crbd);
if ((op.crbd >= 3 && op.crbd <= 6) && !(CPU.FPSCR.FPSCR & mask)) mask |= 1ULL << 31; //FPSCR.FX if ((op.crbd >= 3 && op.crbd <= 6) && !(CPU.FPSCR.FPSCR & mask)) mask |= 1ULL << 31; //FPSCR.FX
if ((op.crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode enabled"); if ((op.crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode enabled");
CPU.SetFPSCR(CPU.FPSCR.FPSCR | mask); CPU.SetFPSCR(CPU.FPSCR.FPSCR | mask);
@ -3203,7 +3128,7 @@ void ppu_interpreter::MCRFS(PPUThread& CPU, ppu_opcode_t op)
void ppu_interpreter::MTFSB0(PPUThread& CPU, ppu_opcode_t op) void ppu_interpreter::MTFSB0(PPUThread& CPU, ppu_opcode_t op)
{ {
u64 mask = (1ULL << (31 - op.crbd)); u32 mask = 1 << (31 - op.crbd);
if ((op.crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode disabled"); if ((op.crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode disabled");
CPU.SetFPSCR(CPU.FPSCR.FPSCR & ~mask); CPU.SetFPSCR(CPU.FPSCR.FPSCR & ~mask);

View file

@ -57,6 +57,11 @@ static double SilenceNaN(double x)
return (double&)bits; return (double&)bits;
} }
// float overload (shown once per line, so it appears to exist only in the post-commit column):
// widens x to double, applies the double SilenceNaN overload, and narrows the result back to float.
static float SilenceNaN(float x)
{
return static_cast<float>(SilenceNaN(static_cast<double>(x)));
}
static void SetHostRoundingMode(u32 rn) static void SetHostRoundingMode(u32 rn)
{ {
switch (rn) switch (rn)
@ -2644,7 +2649,8 @@ private:
} }
// Vector load: the address is GPR[ra] + GPR[rb] (GPR[rb] alone when ra is 0) with the low four bits cleared;
// the value read through vm::read128 is stored in VPR[vd].
// The right (new) column names the address and passes it through vm::cast instead of a plain (u64) cast.
void LVX(u32 vd, u32 ra, u32 rb) void LVX(u32 vd, u32 ra, u32 rb)
{ {
CPU.VPR[vd] = vm::read128((u64)((ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfULL)); const u64 addr = (ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfull;
CPU.VPR[vd] = vm::read128(vm::cast(addr));
} }
void NEG(u32 rd, u32 ra, u32 oe, bool rc) void NEG(u32 rd, u32 ra, u32 oe, bool rc)
{ {
@ -2810,7 +2816,8 @@ private:
} }
// Vector store: the address is GPR[ra] + GPR[rb] (GPR[rb] alone when ra is 0) with the low four bits cleared;
// VPR[vs] is written through vm::write128.
// The right (new) column names the address and passes it through vm::cast instead of a plain (u64) cast.
void STVX(u32 vs, u32 ra, u32 rb) void STVX(u32 vs, u32 ra, u32 rb)
{ {
vm::write128((u64)((ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfULL), CPU.VPR[vs]); const u64 addr = (ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfull;
vm::write128(vm::cast(addr), CPU.VPR[vs]);
} }
void MULLD(u32 rd, u32 ra, u32 rb, u32 oe, bool rc) void MULLD(u32 rd, u32 ra, u32 rb, u32 oe, bool rc)
{ {
@ -2911,7 +2918,8 @@ private:
} }
// Vector load, written identically to LVX in this file: the address is GPR[ra] + GPR[rb] (GPR[rb] alone when
// ra is 0) with the low four bits cleared; the value read through vm::read128 is stored in VPR[vd].
// The right (new) column names the address and passes it through vm::cast instead of a plain (u64) cast.
void LVXL(u32 vd, u32 ra, u32 rb) void LVXL(u32 vd, u32 ra, u32 rb)
{ {
CPU.VPR[vd] = vm::read128((u64)((ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfULL)); const u64 addr = (ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfull;
CPU.VPR[vd] = vm::read128(vm::cast(addr));
} }
void MFTB(u32 rd, u32 spr) void MFTB(u32 rd, u32 spr)
{ {
@ -3016,7 +3024,8 @@ private:
} }
// Vector store, written identically to STVX in this file: the address is GPR[ra] + GPR[rb] (GPR[rb] alone when
// ra is 0) with the low four bits cleared; VPR[vs] is written through vm::write128.
// The right (new) column names the address and passes it through vm::cast instead of a plain (u64) cast.
void STVXL(u32 vs, u32 ra, u32 rb) void STVXL(u32 vs, u32 ra, u32 rb)
{ {
vm::write128((u64)((ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfULL), CPU.VPR[vs]); const u64 addr = (ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfull;
vm::write128(vm::cast(addr), CPU.VPR[vs]);
} }
void DIVD(u32 rd, u32 ra, u32 rb, u32 oe, bool rc) void DIVD(u32 rd, u32 ra, u32 rb, u32 oe, bool rc)
{ {
@ -3238,7 +3247,7 @@ private:
{ {
u64 bits = (u64&)val; u64 bits = (u64&)val;
u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff); u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff);
vm::get_ref<be_t<u32>>(vm::cast(addr)) = (float)bits32; vm::get_ref<be_t<u32>>(vm::cast(addr)) = bits32;
} }
} }
void STVRX(u32 vs, u32 ra, u32 rb) void STVRX(u32 vs, u32 ra, u32 rb)
@ -3260,7 +3269,7 @@ private:
{ {
u64 bits = (u64&)val; u64 bits = (u64&)val;
u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff); u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff);
vm::get_ref<be_t<u32>>(vm::cast(addr)) = (float)bits32; vm::get_ref<be_t<u32>>(vm::cast(addr)) = bits32;
} }
CPU.GPR[ra] = addr; CPU.GPR[ra] = addr;
} }
@ -3579,7 +3588,7 @@ private:
{ {
u64 bits = (u64&)val; u64 bits = (u64&)val;
u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff); u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff);
vm::get_ref<be_t<u32>>(vm::cast(addr)) = (float)bits32; vm::get_ref<be_t<u32>>(vm::cast(addr)) = bits32;
} }
} }
void STFSU(u32 frs, u32 ra, s32 d) void STFSU(u32 frs, u32 ra, s32 d)
@ -3594,7 +3603,7 @@ private:
{ {
u64 bits = (u64&)val; u64 bits = (u64&)val;
u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff); u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff);
vm::get_ref<be_t<u32>>(vm::cast(addr)) = (float)bits32; vm::get_ref<be_t<u32>>(vm::cast(addr)) = bits32;
} }
CPU.GPR[ra] = addr; CPU.GPR[ra] = addr;
} }
@ -3687,8 +3696,8 @@ private:
} }
void MTFSB1(u32 crbd, bool rc) void MTFSB1(u32 crbd, bool rc)
{ {
u64 mask = (1ULL << (31 - crbd)); u32 mask = 1 << (31 - crbd);
if ((crbd >= 3 && crbd <= 6) && !(CPU.FPSCR.FPSCR & mask)) mask |= 1ULL << 31; //FPSCR.FX if ((crbd >= 3 && crbd <= 6) && !(CPU.FPSCR.FPSCR & mask)) mask |= 1 << 31; //FPSCR.FX
if ((crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode enabled"); if ((crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode enabled");
CPU.SetFPSCR(CPU.FPSCR.FPSCR | mask); CPU.SetFPSCR(CPU.FPSCR.FPSCR | mask);
@ -3702,7 +3711,7 @@ private:
} }
void MTFSB0(u32 crbd, bool rc) void MTFSB0(u32 crbd, bool rc)
{ {
u64 mask = (1ULL << (31 - crbd)); u32 mask = 1 << (31 - crbd);
if ((crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode disabled"); if ((crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode disabled");
CPU.SetFPSCR(CPU.FPSCR.FPSCR & ~mask); CPU.SetFPSCR(CPU.FPSCR.FPSCR & ~mask);