diff --git a/Utilities/BEType.h b/Utilities/BEType.h
index 806efcbdb2..96ef78f59d 100644
--- a/Utilities/BEType.h
+++ b/Utilities/BEType.h
@@ -354,12 +354,12 @@ union _CRT_ALIGN(16) u128
 		return from64(~_u64[0], ~_u64[1]);
 	}
 
-	__forceinline bool test() const
+	__forceinline bool is_any_1() const // true when at least one bit in either 64-bit half is set
 	{
 		return _u64[0] || _u64[1];
 	}
 
-	__forceinline bool inv_test() const
+	__forceinline bool is_any_0() const // true when at least one bit in either 64-bit half is clear (value is not all ones)
 	{
 		return ~_u64[0] || ~_u64[1];
 	}
diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp
index 5546f95e41..9257b33760 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -232,7 +232,7 @@ void ppu_interpreter::VCMPBFP_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPBFP(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? 0 : 2; // set 2 if all in bounds
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? 0 : 2; // set 2 if all in bounds
 }
 
 void ppu_interpreter::VCMPEQFP(PPUThread& CPU, ppu_opcode_t op)
@@ -244,7 +244,7 @@ void ppu_interpreter::VCMPEQFP_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPEQFP(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
 }
 
 void ppu_interpreter::VCMPEQUB(PPUThread& CPU, ppu_opcode_t op)
@@ -256,7 +256,7 @@ void ppu_interpreter::VCMPEQUB_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPEQUB(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
 }
 
 void ppu_interpreter::VCMPEQUH(PPUThread& CPU, ppu_opcode_t op)
@@ -268,7 +268,7 @@ void ppu_interpreter::VCMPEQUH_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPEQUH(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
 }
 
 void ppu_interpreter::VCMPEQUW(PPUThread& CPU, ppu_opcode_t op)
@@ -280,7 +280,7 @@ void ppu_interpreter::VCMPEQUW_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPEQUW(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
 }
 
 void ppu_interpreter::VCMPGEFP(PPUThread& CPU, ppu_opcode_t op)
@@ -292,7 +292,7 @@ void ppu_interpreter::VCMPGEFP_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPGEFP(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2;
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
 }
 
 void ppu_interpreter::VCMPGTFP(PPUThread& CPU, ppu_opcode_t op)
@@ -304,7 +304,7 @@ void ppu_interpreter::VCMPGTFP_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPGTFP(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2;
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
 }
 
 void ppu_interpreter::VCMPGTSB(PPUThread& CPU, ppu_opcode_t op)
@@ -316,7 +316,7 @@ void ppu_interpreter::VCMPGTSB_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPGTSB(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2;
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
 }
 
 void ppu_interpreter::VCMPGTSH(PPUThread& CPU, ppu_opcode_t op)
@@ -328,7 +328,7 @@ void ppu_interpreter::VCMPGTSH_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPGTSH(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2;
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
 }
 
 void ppu_interpreter::VCMPGTSW(PPUThread& CPU, ppu_opcode_t op)
@@ -340,7 +340,7 @@ void ppu_interpreter::VCMPGTSW_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPGTSW(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2;
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
 }
 
 void ppu_interpreter::VCMPGTUB(PPUThread& CPU, ppu_opcode_t op)
@@ -352,7 +352,7 @@ void ppu_interpreter::VCMPGTUB_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPGTUB(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2;
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
 }
 
 void ppu_interpreter::VCMPGTUH(PPUThread& CPU, ppu_opcode_t op)
@@ -364,7 +364,7 @@ void ppu_interpreter::VCMPGTUH_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPGTUH(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2;
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
 }
 
 void ppu_interpreter::VCMPGTUW(PPUThread& CPU, ppu_opcode_t op)
@@ -376,7 +376,7 @@ void ppu_interpreter::VCMPGTUW_(PPUThread& CPU, ppu_opcode_t op)
 {
 	VCMPGTUW(CPU, op);
 
-	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2;
+	CPU.CR.cr6 = CPU.VPR[op.vd].is_any_1() ? (CPU.VPR[op.vd].is_any_0() ? 0 : 8) : 2;
 }
 
 void ppu_interpreter::VCTSXS(PPUThread& CPU, ppu_opcode_t op)
@@ -417,7 +417,7 @@ void ppu_interpreter::VMAXSB(PPUThread& CPU, ppu_opcode_t op)
 	const auto a = CPU.VPR[op.va].vi;
 	const auto b = CPU.VPR[op.vb].vi;
 	const auto m = _mm_cmpgt_epi8(a, b);
-	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b));
+	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b));
 }
 
 void ppu_interpreter::VMAXSH(PPUThread& CPU, ppu_opcode_t op)
@@ -430,7 +430,7 @@ void ppu_interpreter::VMAXSW(PPUThread& CPU, ppu_opcode_t op)
 	const auto a = CPU.VPR[op.va].vi;
 	const auto b = CPU.VPR[op.vb].vi;
 	const auto m = _mm_cmpgt_epi32(a, b);
-	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b));
+	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b));
 }
 
 void ppu_interpreter::VMAXUB(PPUThread& CPU, ppu_opcode_t op)
@@ -449,19 +449,27 @@ void ppu_interpreter::VMAXUW(PPUThread& CPU, ppu_opcode_t op)
 	const auto a = CPU.VPR[op.va].vi;
 	const auto b = CPU.VPR[op.vb].vi;
 	const auto m = sse_cmpgt_epu32(a, b);
-	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b));
+	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b));
 }
 
 void ppu_interpreter::VMHADDSHS(PPUThread& CPU, ppu_opcode_t op)
 {
 	const auto a = CPU.VPR[op.va].vi;
 	const auto b = CPU.VPR[op.vb].vi;
-	CPU.VPR[op.vd].vi = _mm_adds_epi16(_mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1)), CPU.VPR[op.vc].vi);
+	const auto c = CPU.VPR[op.vc].vi; // halfword addend
+	const auto m = _mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1)); // bits 15..30 of each signed 16x16 product, i.e. (a * b) >> 15 kept to 16 bits
+	const auto s = _mm_cmpeq_epi16(m, _mm_set1_epi16(-0x8000)); // all-ones in lanes holding 0x8000: that pattern stands for +0x8000 (from -0x8000 * -0x8000), which a signed halfword cannot represent
+	CPU.VPR[op.vd].vi = _mm_adds_epi16(_mm_adds_epi16(_mm_xor_si128(m, s), c), _mm_srli_epi16(s, 15)); // flagged lanes become 0x7fff, c is added with saturation, then the missing 1 (s >> 15) is added with saturation
 }
 
 void ppu_interpreter::VMHRADDSHS(PPUThread& CPU, ppu_opcode_t op)
 {
-	CPU.VPR[op.vd].vi = _mm_adds_epi16(_mm_mulhrs_epi16(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi), CPU.VPR[op.vc].vi);
+	const auto a = CPU.VPR[op.va].vi; // signed halfword multiplicands
+	const auto b = CPU.VPR[op.vb].vi;
+	const auto c = CPU.VPR[op.vc].vi; // halfword addend
+	const auto m = _mm_mulhrs_epi16(a, b); // rounded high product: (a * b + 0x4000) >> 15, kept to 16 bits
+	const auto s = _mm_cmpeq_epi16(m, _mm_set1_epi16(-0x8000)); // all-ones in lanes holding 0x8000: that pattern stands for +0x8000 (from -0x8000 * -0x8000), which a signed halfword cannot represent
+	CPU.VPR[op.vd].vi = _mm_adds_epi16(_mm_adds_epi16(_mm_xor_si128(m, s), c), _mm_srli_epi16(s, 15)); // flagged lanes become 0x7fff, c is added with saturation, then the missing 1 (s >> 15) is added with saturation
 }
 
 void ppu_interpreter::VMINFP(PPUThread& CPU, ppu_opcode_t op)
@@ -474,7 +482,7 @@ void ppu_interpreter::VMINSB(PPUThread& CPU, ppu_opcode_t op)
 	const auto a = CPU.VPR[op.va].vi;
 	const auto b = CPU.VPR[op.vb].vi;
 	const auto m = _mm_cmpgt_epi8(a, b);
-	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(a, m), _mm_and_si128(m, b));
+	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b));
 }
 
 void ppu_interpreter::VMINSH(PPUThread& CPU, ppu_opcode_t op)
@@ -487,7 +495,7 @@ void ppu_interpreter::VMINSW(PPUThread& CPU, ppu_opcode_t op)
 	const auto a = CPU.VPR[op.va].vi;
 	const auto b = CPU.VPR[op.vb].vi;
 	const auto m = _mm_cmpgt_epi32(a, b);
-	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(a, m), _mm_and_si128(m, b));
+	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b));
 }
 
 void ppu_interpreter::VMINUB(PPUThread& CPU, ppu_opcode_t op)
@@ -506,7 +514,7 @@ void ppu_interpreter::VMINUW(PPUThread& CPU, ppu_opcode_t op)
 	const auto a = CPU.VPR[op.va].vi;
 	const auto b = CPU.VPR[op.vb].vi;
 	const auto m = sse_cmpgt_epu32(a, b);
-	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(a, m), _mm_and_si128(m, b));
+	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b));
 }
 
 void ppu_interpreter::VMLADDUHM(PPUThread& CPU, ppu_opcode_t op)
@@ -548,13 +556,14 @@ void ppu_interpreter::VMSUMMBM(PPUThread& CPU, ppu_opcode_t op)
 {
 	const auto a = CPU.VPR[op.va].vi; // signed bytes
 	const auto b = CPU.VPR[op.vb].vi; // unsigned bytes
+	const auto c = CPU.VPR[op.vc].vi;
 	const auto ah = _mm_srai_epi16(a, 8);
 	const auto bh = _mm_srli_epi16(b, 8);
-	const auto al = _mm_srai_epi16(_mm_srli_epi16(a, 8), 8);
+	const auto al = _mm_srai_epi16(_mm_slli_epi16(a, 8), 8);
 	const auto bl = _mm_and_si128(b, _mm_set1_epi16(0x00ff));
 	const auto sh = _mm_madd_epi16(ah, bh);
 	const auto sl = _mm_madd_epi16(al, bl);
-	CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(CPU.VPR[op.vc].vi, sh), sl);
+	CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(c, sh), sl);
 }
 
 void ppu_interpreter::VMSUMSHM(PPUThread& CPU, ppu_opcode_t op)
@@ -595,6 +604,7 @@ void ppu_interpreter::VMSUMUBM(PPUThread& CPU, ppu_opcode_t op)
 {
 	const auto a = CPU.VPR[op.va].vi;
 	const auto b = CPU.VPR[op.vb].vi;
+	const auto c = CPU.VPR[op.vc].vi;
 	const auto mask = _mm_set1_epi16(0x00ff);
 	const auto ah = _mm_srli_epi16(a, 8);
 	const auto al = _mm_and_si128(a, mask);
@@ -602,18 +612,19 @@ void ppu_interpreter::VMSUMUBM(PPUThread& CPU, ppu_opcode_t op)
 	const auto bl = _mm_and_si128(b, mask);
 	const auto sh = _mm_madd_epi16(ah, bh);
 	const auto sl = _mm_madd_epi16(al, bl);
-	CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(CPU.VPR[op.vc].vi, sh), sl);
+	CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(c, sh), sl);
 }
 
 void ppu_interpreter::VMSUMUHM(PPUThread& CPU, ppu_opcode_t op)
 {
 	const auto a = CPU.VPR[op.va].vi;
 	const auto b = CPU.VPR[op.vb].vi;
+	const auto c = CPU.VPR[op.vc].vi; // 32-bit accumulator input, read before vd is written
 	const auto ml = _mm_mullo_epi16(a, b); // low results
 	const auto mh = _mm_mulhi_epu16(a, b); // high results
 	const auto ls = _mm_add_epi32(_mm_srli_epi32(ml, 16), _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff)));
 	const auto hs = _mm_add_epi32(_mm_slli_epi32(mh, 16), _mm_and_si128(mh, _mm_set1_epi32(0xffff0000)));
-	CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(CPU.VPR[op.vc].vi, ls), hs);
+	CPU.VPR[op.vd].vi = _mm_add_epi32(_mm_add_epi32(c, ls), hs); // wrapping 32-bit adds: accumulator + low-half sums + high-half sums
 }
 
 void ppu_interpreter::VMSUMUHS(PPUThread& CPU, ppu_opcode_t op)
@@ -648,7 +659,7 @@ void ppu_interpreter::VMULESB(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VMULESH(PPUThread& CPU, ppu_opcode_t op)
 {
-	CPU.VPR[op.vd].vi = _mm_madd_epi16(_mm_srli_epi16(CPU.VPR[op.va].vi, 16), _mm_srli_epi16(CPU.VPR[op.vb].vi, 16));
+	CPU.VPR[op.vd].vi = _mm_madd_epi16(_mm_srli_epi32(CPU.VPR[op.va].vi, 16), _mm_srli_epi32(CPU.VPR[op.vb].vi, 16)); // the 32-bit shift moves each word's upper halfword into the lower one and zeroes the upper, so madd yields one signed 16x16 product per word
 }
 
 void ppu_interpreter::VMULEUB(PPUThread& CPU, ppu_opcode_t op)
@@ -708,16 +719,11 @@ void ppu_interpreter::VOR(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VPERM(PPUThread& CPU, ppu_opcode_t op)
 {
-	u8 tmpSRC[32];
-	memcpy(tmpSRC, CPU.VPR[op.vb]._u8, 16);
-	memcpy(tmpSRC + 16, CPU.VPR[op.va]._u8, 16);
-
-	for (uint b = 0; b < 16; b++)
-	{
-		u8 index = CPU.VPR[op.vc]._u8[b] & 0x1f;
-
-		CPU.VPR[op.vd]._u8[b] = tmpSRC[0x1f - index];
-	}
+	const auto index = _mm_andnot_si128(CPU.VPR[op.vc].vi, _mm_set1_epi8(0x1f)); // ~vc & 0x1f, which equals 0x1f - (vc & 0x1f) for every byte
+	const auto mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(0xf)); // all-ones in bytes whose index is in 16..31
+	const auto sa = _mm_shuffle_epi8(CPU.VPR[op.va].vi, index); // shuffle consumes only the low 4 index bits; bit 7 is never set here, so no byte is zeroed
+	const auto sb = _mm_shuffle_epi8(CPU.VPR[op.vb].vi, index); // same byte selection applied to vb
+	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_and_si128(mask, sa), _mm_andnot_si128(mask, sb)); // indices 16..31 take the byte from va, indices 0..15 from vb
 }
 
 void ppu_interpreter::VPKPX(PPUThread& CPU, ppu_opcode_t op)
@@ -742,108 +748,23 @@ void ppu_interpreter::VPKPX(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VPKSHSS(PPUThread& CPU, ppu_opcode_t op)
 {
-	u128 VA = CPU.VPR[op.va];
-	u128 VB = CPU.VPR[op.vb];
-	for (uint b = 0; b < 8; b++)
-	{
-		s16 result = VA._s16[b];
-
-		if (result > INT8_MAX)
-		{
-			result = INT8_MAX;
-		}
-		else if (result < INT8_MIN)
-		{
-			result = INT8_MIN;
-		}
-
-		CPU.VPR[op.vd]._s8[b + 8] = (s8)result;
-
-		result = VB._s16[b];
-
-		if (result > INT8_MAX)
-		{
-			result = INT8_MAX;
-		}
-		else if (result < INT8_MIN)
-		{
-			result = INT8_MIN;
-		}
-
-		CPU.VPR[op.vd]._s8[b] = (s8)result;
-	}
+	CPU.VPR[op.vd].vi = _mm_packs_epi16(CPU.VPR[op.vb].vi, CPU.VPR[op.va].vi); // s16 -> s8 pack with signed saturation; vb fills bytes 0..7 and va bytes 8..15, as in the removed loop
 }
 
 void ppu_interpreter::VPKSHUS(PPUThread& CPU, ppu_opcode_t op)
 {
-	u128 VA = CPU.VPR[op.va];
-	u128 VB = CPU.VPR[op.vb];
-	for (uint b = 0; b < 8; b++)
-	{
-		s16 result = VA._s16[b];
-
-		if (result > UINT8_MAX)
-		{
-			result = UINT8_MAX;
-		}
-		else if (result < 0)
-		{
-			result = 0;
-		}
-
-		CPU.VPR[op.vd]._u8[b + 8] = (u8)result;
-
-		result = VB._s16[b];
-
-		if (result > UINT8_MAX)
-		{
-			result = UINT8_MAX;
-		}
-		else if (result < 0)
-		{
-			result = 0;
-		}
-
-		CPU.VPR[op.vd]._u8[b] = (u8)result;
-	}
+	CPU.VPR[op.vd].vi = _mm_packus_epi16(CPU.VPR[op.vb].vi, CPU.VPR[op.va].vi); // s16 -> u8 pack clamped to 0..255; vb fills bytes 0..7 and va bytes 8..15, as in the removed loop
 }
 
 void ppu_interpreter::VPKSWSS(PPUThread& CPU, ppu_opcode_t op)
 {
-	u128 VA = CPU.VPR[op.va];
-	u128 VB = CPU.VPR[op.vb];
-	for (uint h = 0; h < 4; h++)
-	{
-		s32 result = VA._s32[h];
-
-		if (result > INT16_MAX)
-		{
-			result = INT16_MAX;
-		}
-		else if (result < INT16_MIN)
-		{
-			result = INT16_MIN;
-		}
-
-		CPU.VPR[op.vd]._s16[h + 4] = result;
-
-		result = VB._s32[h];
-
-		if (result > INT16_MAX)
-		{
-			result = INT16_MAX;
-		}
-		else if (result < INT16_MIN)
-		{
-			result = INT16_MIN;
-		}
-
-		CPU.VPR[op.vd]._s16[h] = result;
-	}
+	CPU.VPR[op.vd].vi = _mm_packs_epi32(CPU.VPR[op.vb].vi, CPU.VPR[op.va].vi); // s32 -> s16 pack with signed saturation; vb fills halfwords 0..3 and va halfwords 4..7, as in the removed loop
 }
 
 void ppu_interpreter::VPKSWUS(PPUThread& CPU, ppu_opcode_t op)
 {
+	//CPU.VPR[op.vd].vi = _mm_packus_epi32(CPU.VPR[op.vb].vi, CPU.VPR[op.va].vi);
+
 	u128 VA = CPU.VPR[op.va];
 	u128 VB = CPU.VPR[op.vb];
 	for (uint h = 0; h < 4; h++)
@@ -2047,7 +1968,8 @@ void ppu_interpreter::LBZX(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::LVX(PPUThread& CPU, ppu_opcode_t op)
 {
-	CPU.VPR[op.vd] = vm::read128((u64)((op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfULL));
+	const u64 addr = ((op.ra ? CPU.GPR[op.ra] : 0) + CPU.GPR[op.rb]) & ~0xfull; // ra == 0: address is GPR[rb] alone; low 4 bits are cleared
+	CPU.VPR[op.vd] = vm::read128(vm::cast(addr)); // 128-bit load into the destination vector register
 }
 
 void ppu_interpreter::NEG(PPUThread& CPU, ppu_opcode_t op)
@@ -2230,7 +2152,8 @@ void ppu_interpreter::STBX(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::STVX(PPUThread& CPU, ppu_opcode_t op)
 {
-	vm::write128((u64)((op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfULL), CPU.VPR[op.vs]);
+	const u64 addr = ((op.ra ? CPU.GPR[op.ra] : 0) + CPU.GPR[op.rb]) & ~0xfull; // ra == 0: address is GPR[rb] alone; low 4 bits are cleared
+	vm::write128(vm::cast(addr), CPU.VPR[op.vs]); // 128-bit store of the source vector register
 }
 
 void ppu_interpreter::MULLD(PPUThread& CPU, ppu_opcode_t op)
@@ -2372,7 +2295,8 @@ void ppu_interpreter::LHAX(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::LVXL(PPUThread& CPU, ppu_opcode_t op)
 {
-	CPU.VPR[op.vd] = vm::read128((u64)((op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfULL));
+	const u64 addr = ((op.ra ? CPU.GPR[op.ra] : 0) + CPU.GPR[op.rb]) & ~0xfull; // ra == 0: address is GPR[rb] alone; low 4 bits are cleared
+	CPU.VPR[op.vd] = vm::read128(vm::cast(addr)); // 128-bit load into the destination vector register
 }
 
 void ppu_interpreter::MFTB(PPUThread& CPU, ppu_opcode_t op)
@@ -2511,7 +2435,8 @@ void ppu_interpreter::NAND(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::STVXL(PPUThread& CPU, ppu_opcode_t op)
 {
-	vm::write128((u64)((op.ra ? CPU.GPR[op.ra] + CPU.GPR[op.rb] : CPU.GPR[op.rb]) & ~0xfULL), CPU.VPR[op.vs]);
+	const u64 addr = ((op.ra ? CPU.GPR[op.ra] : 0) + CPU.GPR[op.rb]) & ~0xfull; // ra == 0: address is GPR[rb] alone; low 4 bits are cleared
+	vm::write128(vm::cast(addr), CPU.VPR[op.vs]); // 128-bit store of the source vector register
 }
 
 void ppu_interpreter::DIVD(PPUThread& CPU, ppu_opcode_t op)
@@ -3186,7 +3111,7 @@ void ppu_interpreter::STDU(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::MTFSB1(PPUThread& CPU, ppu_opcode_t op)
 {
-	u64 mask = (1ULL << (31 - op.crbd));
+	u32 mask = 1u << (31 - op.crbd); // single-bit mask for field bit crbd; unsigned literal because crbd == 0 shifts into bit 31, which overflows a signed int
 	if ((op.crbd >= 3 && op.crbd <= 6) && !(CPU.FPSCR.FPSCR & mask)) mask |= 1ULL << 31;  //FPSCR.FX
 	if ((op.crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode enabled");
 	CPU.SetFPSCR(CPU.FPSCR.FPSCR | mask);
@@ -3203,7 +3128,7 @@ void ppu_interpreter::MCRFS(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::MTFSB0(PPUThread& CPU, ppu_opcode_t op)
 {
-	u64 mask = (1ULL << (31 - op.crbd));
+	u32 mask = 1u << (31 - op.crbd); // single-bit mask for field bit crbd; unsigned literal because crbd == 0 shifts into bit 31, which overflows a signed int
 	if ((op.crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode disabled");
 	CPU.SetFPSCR(CPU.FPSCR.FPSCR & ~mask);
 
diff --git a/rpcs3/Emu/Cell/PPUInterpreter.h b/rpcs3/Emu/Cell/PPUInterpreter.h
index 9358019869..90f8acd973 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.h
+++ b/rpcs3/Emu/Cell/PPUInterpreter.h
@@ -57,6 +57,11 @@ static double SilenceNaN(double x)
 	return (double&)bits;
 }
 
+static float SilenceNaN(float x) // float overload: widens x to double, applies the double overload, then narrows the result back to float
+{
+	return static_cast<float>(SilenceNaN(static_cast<double>(x)));
+}
+
 static void SetHostRoundingMode(u32 rn)
 {
 	switch (rn)
@@ -2644,7 +2649,8 @@ private:
 	}
 	void LVX(u32 vd, u32 ra, u32 rb)
 	{
-		CPU.VPR[vd] = vm::read128((u64)((ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfULL));
+		const u64 addr = ((ra ? CPU.GPR[ra] : 0) + CPU.GPR[rb]) & ~0xfull; // ra == 0: address is GPR[rb] alone; low 4 bits are cleared
+		CPU.VPR[vd] = vm::read128(vm::cast(addr)); // 128-bit load into the destination vector register
 	}
 	void NEG(u32 rd, u32 ra, u32 oe, bool rc)
 	{
@@ -2810,7 +2816,8 @@ private:
 	}
 	void STVX(u32 vs, u32 ra, u32 rb)
 	{
-		vm::write128((u64)((ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfULL), CPU.VPR[vs]);
+		const u64 addr = ((ra ? CPU.GPR[ra] : 0) + CPU.GPR[rb]) & ~0xfull; // ra == 0: address is GPR[rb] alone; low 4 bits are cleared
+		vm::write128(vm::cast(addr), CPU.VPR[vs]); // 128-bit store of the source vector register
 	}
 	void MULLD(u32 rd, u32 ra, u32 rb, u32 oe, bool rc)
 	{
@@ -2911,7 +2918,8 @@ private:
 	}
 	void LVXL(u32 vd, u32 ra, u32 rb)
 	{
-		CPU.VPR[vd] = vm::read128((u64)((ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfULL));
+		const u64 addr = ((ra ? CPU.GPR[ra] : 0) + CPU.GPR[rb]) & ~0xfull; // ra == 0: address is GPR[rb] alone; low 4 bits are cleared
+		CPU.VPR[vd] = vm::read128(vm::cast(addr)); // 128-bit load into the destination vector register
 	}
 	void MFTB(u32 rd, u32 spr)
 	{
@@ -3016,7 +3024,8 @@ private:
 	}
 	void STVXL(u32 vs, u32 ra, u32 rb)
 	{
-		vm::write128((u64)((ra ? CPU.GPR[ra] + CPU.GPR[rb] : CPU.GPR[rb]) & ~0xfULL), CPU.VPR[vs]);
+		const u64 addr = ((ra ? CPU.GPR[ra] : 0) + CPU.GPR[rb]) & ~0xfull; // ra == 0: address is GPR[rb] alone; low 4 bits are cleared
+		vm::write128(vm::cast(addr), CPU.VPR[vs]); // 128-bit store of the source vector register
 	}
 	void DIVD(u32 rd, u32 ra, u32 rb, u32 oe, bool rc)
 	{
@@ -3238,7 +3247,7 @@ private:
 		{
 			u64 bits = (u64&)val;
 			u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff);
-			vm::get_ref<be_t<u32>>(vm::cast(addr)) = (float)bits32;
+			vm::get_ref<be_t<u32>>(vm::cast(addr)) = bits32;
 		}
 	}
 	void STVRX(u32 vs, u32 ra, u32 rb)
@@ -3260,7 +3269,7 @@ private:
 		{
 			u64 bits = (u64&)val;
 			u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff);
-			vm::get_ref<be_t<u32>>(vm::cast(addr)) = (float)bits32;
+			vm::get_ref<be_t<u32>>(vm::cast(addr)) = bits32;
 		}
 		CPU.GPR[ra] = addr;
 	}
@@ -3579,7 +3588,7 @@ private:
 		{
 			u64 bits = (u64&)val;
 			u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff);
-			vm::get_ref<be_t<u32>>(vm::cast(addr)) = (float)bits32;
+			vm::get_ref<be_t<u32>>(vm::cast(addr)) = bits32;
 		}
 	}
 	void STFSU(u32 frs, u32 ra, s32 d)
@@ -3594,7 +3603,7 @@ private:
 		{
 			u64 bits = (u64&)val;
 			u32 bits32 = (bits>>32 & 0x80000000) | (bits>>29 & 0x7fffffff);
-			vm::get_ref<be_t<u32>>(vm::cast(addr)) = (float)bits32;
+			vm::get_ref<be_t<u32>>(vm::cast(addr)) = bits32;
 		}
 		CPU.GPR[ra] = addr;
 	}
@@ -3687,8 +3696,8 @@ private:
 	}
 	void MTFSB1(u32 crbd, bool rc)
 	{
-		u64 mask = (1ULL << (31 - crbd));
-		if ((crbd >= 3 && crbd <= 6) && !(CPU.FPSCR.FPSCR & mask)) mask |= 1ULL << 31;  //FPSCR.FX
+		u32 mask = 1u << (31 - crbd); // single-bit mask for field bit crbd; unsigned literal because crbd == 0 shifts into bit 31, which overflows a signed int
+		if ((crbd >= 3 && crbd <= 6) && !(CPU.FPSCR.FPSCR & mask)) mask |= 1u << 31;  //FPSCR.FX (unsigned literal: 1 << 31 does not fit in a signed int)
 		if ((crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode enabled");
 		CPU.SetFPSCR(CPU.FPSCR.FPSCR | mask);
 
@@ -3702,7 +3711,7 @@ private:
 	}
 	void MTFSB0(u32 crbd, bool rc)
 	{
-		u64 mask = (1ULL << (31 - crbd));
+		u32 mask = 1u << (31 - crbd); // single-bit mask for field bit crbd; unsigned literal because crbd == 0 shifts into bit 31, which overflows a signed int
 		if ((crbd == 29) && !CPU.FPSCR.NI) LOG_WARNING(PPU, "Non-IEEE mode disabled");
 		CPU.SetFPSCR(CPU.FPSCR.FPSCR & ~mask);