Initial Linux Aarch64 support

* Update asmjit dependency (aarch64 branch)
* Disable USE_DISCORD_RPC by default
* Dump some JIT objects in rpcs3 cache dir
* Add SIGILL handler for all platforms
* Fix resetting zeroing denormals in thread pool
* Refactor most v128:: utils into global gv_** functions
* Refactor PPU interpreter (incomplete), remove "precise"
* - Instruction specializations with multiple accuracy flags
* - Adjust calling convention for speed
* - Removed precise/fast setting, replaced with static
* - Started refactoring interpreters for building at runtime JIT
*   (I got tired of poor compiler optimizations)
* - Expose some accuracy settings (SAT, NJ, VNAN, FPCC)
* - Add exec_bytes PPU thread variable (akin to cycle count)
* PPU LLVM: fix VCTUXS+VCTSXS instruction NaN results
* SPU interpreter: remove "precise" for now (extremely non-portable)
* - As with PPU, settings changed to static/dynamic for interpreters.
* - Precise options will be implemented later
* Fix termination after fatal error dialog
This commit is contained in:
Nekotekina 2021-12-30 19:39:18 +03:00
parent d6aa834b5f
commit 580bd2b25e
89 changed files with 20360 additions and 5612 deletions

@ -1 +1 @@
Subproject commit eae7197fce03fd52a6e71ca89207a88ce270fb1a
Subproject commit fc2a5d82f7434d7d03161275a764c051f970f41c

View file

@ -2,7 +2,7 @@
add_library(3rdparty_discordRPC INTERFACE)
# We don't want Discord Rich Presence on the BSDs and other OSes
if (USE_DISCORD_RPC AND (WIN32 OR CMAKE_SYSTEM MATCHES "Linux" OR APPLE))
if (USE_DISCORD_RPC AND (WIN32 OR CMAKE_SYSTEM MATCHES "Linux" OR APPLE) AND COMPILER_X86)
if (WIN32 AND NOT MSVC)
ExternalProject_Add(discordRPC
GIT_REPOSITORY https://github.com/discordapp/discord-rpc

14
3rdparty/llvm.cmake vendored
View file

@ -1,8 +1,10 @@
if(WITH_LLVM)
CHECK_CXX_COMPILER_FLAG("-msse -msse2 -mcx16" COMPILER_X86)
CHECK_CXX_COMPILER_FLAG("-march=armv8-a+lse" COMPILER_ARM)
if(BUILD_LLVM_SUBMODULE)
message(STATUS "LLVM will be built from the submodule.")
set(LLVM_TARGETS_TO_BUILD "X86" CACHE INTERNAL "")
option(LLVM_BUILD_RUNTIME OFF)
option(LLVM_BUILD_TOOLS OFF)
option(LLVM_INCLUDE_BENCHMARKS OFF)
@ -61,7 +63,15 @@ if(WITH_LLVM)
endif()
endif()
set(LLVM_LIBS LLVMMCJIT LLVMX86CodeGen LLVMX86AsmParser)
set(LLVM_LIBS LLVMMCJIT)
if(COMPILER_X86)
set(LLVM_LIBS ${LLVM_LIBS} LLVMX86CodeGen LLVMX86AsmParser)
endif()
if(COMPILER_ARM)
set(LLVM_LIBS ${LLVM_LIBS} LLVMX86CodeGen LLVMX86AsmParser LLVMARMCodeGen LLVMARMAsmParser)
endif()
if(WIN32 OR CMAKE_SYSTEM MATCHES "Linux")
set(LLVM_LIBS ${LLVM_LIBS} LLVMIntelJITEvents)

View file

@ -17,7 +17,7 @@ option(WITH_LLVM "Enable usage of LLVM library" ON)
option(BUILD_LLVM_SUBMODULE "Build LLVM from git submodule" ON)
option(USE_FAUDIO "FAudio audio backend" ON)
option(USE_LIBEVDEV "libevdev-based joystick support" ON)
option(USE_DISCORD_RPC "Discord rich presence integration" ON)
option(USE_DISCORD_RPC "Discord rich presence integration" OFF)
option(USE_SYSTEM_ZLIB "Prefer system ZLIB instead of the builtin one" ON)
option(USE_VULKAN "Vulkan render backend" ON)
option(USE_PRECOMPILED_HEADERS "Use precompiled headers" OFF)

View file

@ -18,6 +18,12 @@ LOG_CHANNEL(jit_log, "JIT");
void jit_announce(uptr func, usz size, std::string_view name)
{
if (!size)
{
jit_log.error("Empty function announced: %s (%p)", name, func);
return;
}
#ifdef __linux__
static const fs::file s_map(fmt::format("/tmp/perf-%d.map", getpid()), fs::rewrite + fs::append);
@ -124,15 +130,31 @@ void* jit_runtime_base::_add(asmjit::CodeHolder* code) noexcept
{
ensure(!code->flatten());
ensure(!code->resolveUnresolvedLinks());
usz codeSize = ensure(code->codeSize());
usz codeSize = code->codeSize();
if (!codeSize)
return nullptr;
auto p = ensure(this->_alloc(codeSize, 64));
ensure(!code->relocateToBase(uptr(p)));
asmjit::VirtMem::ProtectJitReadWriteScope rwScope(p, codeSize);
for (asmjit::Section* section : code->_sections)
{
std::memcpy(p + section->offset(), section->data(), section->bufferSize());
asmjit::VirtMem::ProtectJitReadWriteScope rwScope(p, codeSize);
for (asmjit::Section* section : code->_sections)
{
std::memcpy(p + section->offset(), section->data(), section->bufferSize());
}
}
if (!dump_name.empty())
{
// If directory ASMJIT doesn't exist, nothing will be written
fs::file dump(fmt::format("%s/ASMJIT/%s", fs::get_cache_dir(), dump_name), fs::rewrite);
if (dump)
{
dump.write(p, codeSize);
}
}
return p;
@ -349,8 +371,9 @@ static u64 make_null_function(const std::string& name)
using namespace asmjit;
// Build a "null" function that contains its name
const auto func = build_function_asm<void (*)()>("NULL", [&](x86::Assembler& c, auto& args)
const auto func = build_function_asm<void (*)()>("NULL", [&](native_asm& c, auto& args)
{
#if defined(ARCH_X64)
Label data = c.newLabel();
c.lea(args[0], x86::qword_ptr(data, 0));
c.jmp(Imm(&null));
@ -362,6 +385,7 @@ static u64 make_null_function(const std::string& name)
c.db(ch);
c.db(0);
c.align(AlignMode::kData, 16);
#endif
});
func_ptr = reinterpret_cast<u64>(func);

View file

@ -22,10 +22,17 @@
#pragma GCC diagnostic ignored "-Wredundant-decls"
#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
#pragma GCC diagnostic ignored "-Weffc++"
#ifndef __clang__
#ifdef __clang__
#pragma GCC diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion"
#pragma GCC diagnostic ignored "-Wcast-qual"
#else
#pragma GCC diagnostic ignored "-Wduplicated-branches"
#pragma GCC diagnostic ignored "-Wdeprecated-enum-enum-conversion"
#endif
#include <asmjit/asmjit.h>
#if defined(ARCH_ARM64)
#include <asmjit/a64.h>
#endif
#pragma GCC diagnostic pop
#endif
@ -36,6 +43,14 @@
#include <string_view>
#include <unordered_map>
#if defined(ARCH_X64)
using native_asm = asmjit::x86::Assembler;
using native_args = std::array<asmjit::x86::Gp, 4>;
#elif defined(ARCH_ARM64)
using native_asm = asmjit::a64::Assembler;
using native_args = std::array<asmjit::a64::Gp, 4>;
#endif
void jit_announce(uptr func, usz size, std::string_view name);
void jit_announce(auto* func, usz size, std::string_view name)
@ -62,6 +77,8 @@ struct jit_runtime_base
const asmjit::Environment& environment() const noexcept;
void* _add(asmjit::CodeHolder* code) noexcept;
virtual uchar* _alloc(usz size, usz align) noexcept = 0;
std::string_view dump_name;
};
// ASMJIT runtime for emitting code in a single 2G region
@ -167,11 +184,39 @@ namespace asmjit
}
}
// Emit four register moves that load args[0..3] from r13, rbp, r12 and rbx
// (in that order). On non-x64 builds nothing is emitted.
inline void build_init_args_from_ghc(native_asm& c, native_args& args)
{
#if defined(ARCH_X64)
	// Source registers, index-matched with args
	const native_args ghc_regs{x86::r13, x86::rbp, x86::r12, x86::rbx};

	// TODO (carried over): the moves are emitted one after another, so this
	// presumably only works while args do not alias r13/rbp/r12/rbx
	for (usz i = 0; i < args.size(); i++)
	{
		c.mov(args[i], ghc_regs[i]);
	}
#else
	// Not implemented for this architecture: silence unused-parameter warnings
	static_cast<void>(c);
	static_cast<void>(args);
#endif
}
// Emit four register moves that store args[0..3] into r13, rbp, r12 and rbx
// (in that order). On non-x64 builds nothing is emitted.
inline void build_init_ghc_args(native_asm& c, native_args& args)
{
#if defined(ARCH_X64)
	// Destination registers, index-matched with args
	const native_args ghc_regs{x86::r13, x86::rbp, x86::r12, x86::rbx};

	// TODO (carried over): the moves are emitted one after another, so this
	// presumably only works while args do not alias r13/rbp/r12/rbx
	for (usz i = 0; i < args.size(); i++)
	{
		c.mov(ghc_regs[i], args[i]);
	}
#else
	// Not implemented for this architecture: silence unused-parameter warnings
	static_cast<void>(c);
	static_cast<void>(args);
#endif
}
using imm_ptr = Imm;
}
// Build runtime function with the native asmjit assembler (native_asm by default)
template <typename FT, typename F>
template <typename FT, typename Asm = native_asm, typename F>
inline FT build_function_asm(std::string_view name, F&& builder)
{
using namespace asmjit;
@ -181,7 +226,8 @@ inline FT build_function_asm(std::string_view name, F&& builder)
CodeHolder code;
code.init(rt.environment());
std::array<x86::Gp, 4> args;
#if defined(ARCH_X64)
native_args args;
#ifdef _WIN32
args[0] = x86::rcx;
args[1] = x86::rdx;
@ -193,16 +239,27 @@ inline FT build_function_asm(std::string_view name, F&& builder)
args[2] = x86::rdx;
args[3] = x86::rcx;
#endif
#elif defined(ARCH_ARM64)
native_args args;
args[0] = a64::x0;
args[1] = a64::x1;
args[2] = a64::x2;
args[3] = a64::x3;
#endif
x86::Assembler compiler(&code);
Asm compiler(&code);
compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign);
builder(std::ref(compiler), args);
if constexpr (std::is_invocable_v<F, Asm&, native_args&>)
builder(compiler, args);
else
builder(compiler);
rt.dump_name = name;
const auto result = rt._add(&code);
jit_announce(result, code.codeSize(), name);
return reinterpret_cast<FT>(uptr(result));
}
#ifdef __APPLE__
#if !defined(ARCH_X64) || defined(__APPLE__)
template <typename FT, usz = 4096>
class built_function
{
@ -213,9 +270,23 @@ public:
built_function& operator=(const built_function&) = delete;
template <typename F>
built_function(std::string_view name, F&& builder)
: m_func(ensure(build_function_asm<FT>(name, std::forward<F>(builder))))
// Constructor overload selected when `builder` is callable as
// builder(native_asm&, native_args&): the builder is forwarded to
// build_function_asm<FT> under `name` and the result is stored in m_func.
// line/col/file/func default to the compiler's source-location builtins and
// are passed to ensure() along with the result (presumably so a failed check
// is reported at the caller's location — confirm against ensure()).
template <typename F> requires (std::is_invocable_v<F, native_asm&, native_args&>)
built_function(std::string_view name, F&& builder,
u32 line = __builtin_LINE(),
u32 col = __builtin_COLUMN(),
const char* file = __builtin_FILE(),
const char* func = __builtin_FUNCTION())
: m_func(ensure(build_function_asm<FT>(name, std::forward<F>(builder)), const_str(), line, col, file, func))
{
}
// Constructor overload selected when `getter` is callable with no arguments:
// nothing is assembled here, m_func is simply the value returned by getter().
// The name argument is accepted but left unnamed and unused.
// line/col/file/func default to the compiler's source-location builtins and
// are passed to ensure() along with the result (presumably so a failed check
// is reported at the caller's location — confirm against ensure()).
template <typename F> requires (std::is_invocable_v<F>)
built_function(std::string_view, F&& getter,
u32 line = __builtin_LINE(),
u32 col = __builtin_COLUMN(),
const char* file = __builtin_FILE(),
const char* func = __builtin_FUNCTION())
: m_func(ensure(getter(), const_str(), line, col, file, func))
{
}
@ -251,7 +322,8 @@ public:
CodeHolder code;
code.init(rt.environment());
std::array<x86::Gp, 4> args;
#if defined(ARCH_X64)
native_args args;
#ifdef _WIN32
args[0] = x86::rcx;
args[1] = x86::rdx;
@ -263,10 +335,18 @@ public:
args[2] = x86::rdx;
args[3] = x86::rcx;
#endif
#elif defined(ARCH_ARM64)
native_args args;
args[0] = a64::x0;
args[1] = a64::x1;
args[2] = a64::x2;
args[3] = a64::x3;
#endif
x86::Assembler compiler(&code);
native_asm compiler(&code);
compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign);
builder(std::ref(compiler), args);
builder(compiler, args);
rt.dump_name = name;
jit_announce(rt._add(&code), code.codeSize(), name);
}

View file

@ -239,7 +239,7 @@ struct fmt_class_string<T, void>
// Formats the object obtained from get_object(arg) by handing its storage to
// format_byte_array together with the output string.
static void format(std::string& out, u64 arg)
{
const auto& obj = get_object(arg);
// Block-scope declaration; the definition is not visible here (presumably in another translation unit)
void format_byte_array(std::string&, const uchar*, usz);
// NOTE(review): std::size(obj) is an element count; this assumes byte-sized elements — confirm
format_byte_array(out, reinterpret_cast<const uchar*>(std::data(obj)), std::size(obj));
}

View file

@ -77,7 +77,7 @@
#include "util/logs.hpp"
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
#include "Emu/Memory/vm_locking.h"
@ -189,6 +189,7 @@ bool IsDebuggerPresent()
}
#endif
#if defined(ARCH_X64)
enum x64_reg_t : u32
{
X64R_RAX = 0,
@ -839,6 +840,7 @@ void decode_x64_reg_op(const u8* code, x64_op_t& out_op, x64_reg_t& out_reg, usz
#ifdef _WIN32
typedef CONTEXT x64_context;
typedef CONTEXT ucontext_t;
#define X64REG(context, reg) (&(&(context)->Rax)[reg])
#define XMMREG(context, reg) (reinterpret_cast<v128*>(&(&(context)->Xmm0)[reg]))
@ -1211,12 +1213,18 @@ usz get_x64_access_size(x64_context* context, x64_op_t op, x64_reg_t reg, usz d_
return d_size;
}
#elif defined(ARCH_ARM64)
#define RIP(context) ((context)->uc_mcontext.pc)
#endif /* ARCH_ */
namespace rsx
{
extern std::function<bool(u32 addr, bool is_writing)> g_access_violation_handler;
}
bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) noexcept
bool handle_access_violation(u32 addr, bool is_writing, ucontext_t* context) noexcept
{
g_tls_fault_all++;
@ -1243,6 +1251,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
}
}
#if defined(ARCH_X64)
const u8* const code = reinterpret_cast<u8*>(RIP(context));
x64_op_t op;
@ -1382,6 +1391,9 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
g_tls_fault_spu++;
return true;
} while (0);
#else
static_cast<void>(context);
#endif /* ARCH_ */
if (vm::check_addr(addr, is_writing ? vm::page_writable : vm::page_readable))
{
@ -1545,7 +1557,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
if (!g_tls_access_violation_recovered)
{
vm_log.notice("\n%s", dump_useful_thread_info());
vm_log.error("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8);
vm_log.error("Access violation %s location 0x%x (%s)", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory");
}
// TODO:
@ -1582,7 +1594,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
// Do not log any further access violations in this case.
if (!g_tls_access_violation_recovered)
{
vm_log.fatal("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : (cpu && cpu->id_type() == 1 && cpu->get_pc() == addr ? "executing" : "reading"), addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8);
vm_log.fatal("Access violation %s location 0x%x (%s)", is_writing ? "writing" : (cpu && cpu->id_type() == 1 && cpu->get_pc() == addr ? "executing" : "reading"), addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory");
}
while (Emu.IsPaused())
@ -1754,8 +1766,9 @@ const bool s_exception_handler_set = []() -> bool
static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept
{
x64_context* context = static_cast<ucontext_t*>(uct);
ucontext_t* context = static_cast<ucontext_t*>(uct);
#if defined(ARCH_X64)
#ifdef __APPLE__
const u64 err = context->uc_mcontext->__es.__err;
#elif defined(__DragonFly__) || defined(__FreeBSD__)
@ -1770,6 +1783,23 @@ static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept
const bool is_executing = err & 0x10;
const bool is_writing = err & 0x2;
#elif defined(ARCH_ARM64)
const bool is_executing = uptr(info->si_addr) == RIP(context);
const u32 insn = is_executing ? 0 : *reinterpret_cast<u32*>(RIP(context));
const bool is_writing = (insn & 0xbfff0000) == 0x0c000000
|| (insn & 0xbfe00000) == 0x0c800000
|| (insn & 0xbfdf0000) == 0x0d000000
|| (insn & 0xbfc00000) == 0x0d800000
|| (insn & 0x3f400000) == 0x08000000
|| (insn & 0x3bc00000) == 0x39000000
|| (insn & 0x3fc00000) == 0x3d800000
|| (insn & 0x3bc00000) == 0x38000000
|| (insn & 0x3fe00000) == 0x3c800000
|| (insn & 0x3a400000) == 0x28000000;
#else
#error "signal_handler not implemented"
#endif
const u64 exec64 = (reinterpret_cast<u64>(info->si_addr) - reinterpret_cast<u64>(vm::g_exec_addr)) / 2;
const auto cause = is_executing ? "executing" : is_writing ? "writing" : "reading";
@ -1809,6 +1839,26 @@ static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept
thread_ctrl::emergency_exit(msg);
}
// SIGILL handler. Builds a message with the faulting address and the 16 bytes
// read from it (formatted as be_t<u128>), appends the thread name, then either
// hands over to an attached debugger via SIGTRAP or calls emergency_exit.
static void sigill_handler(int /*sig*/, siginfo_t* info, void* /*uct*/) noexcept
{
	void* const fault_addr = info->si_addr;

	std::string report = fmt::format("Illegal instruction at %p (%s).\n", fault_addr, *reinterpret_cast<be_t<u128>*>(fault_addr));

	append_thread_name(report);

	if (!IsDebuggerPresent())
	{
		thread_ctrl::emergency_exit(report);
		return;
	}

	// Debugger attached: log the report and thread info before trapping
	sys_log.fatal("\n%s", report);
	sys_log.notice("\n%s", dump_useful_thread_info());

	// Convert to SIGTRAP
	raise(SIGTRAP);
}
// Intentionally empty signal handler: receives the signal number and does nothing.
void sigpipe_signaling_handler(int)
{
}
@ -1834,6 +1884,13 @@ const bool s_exception_handler_set = []() -> bool
}
#endif
sa.sa_sigaction = sigill_handler;
if (::sigaction(SIGILL, &sa, NULL) == -1)
{
std::fprintf(stderr, "sigaction(SIGILL) failed (%d).\n", errno);
std::abort();
}
sa.sa_handler = sigpipe_signaling_handler;
if (::sigaction(SIGPIPE, &sa, NULL) == -1)
{
@ -1852,11 +1909,7 @@ const bool s_terminate_handler_set = []() -> bool
std::set_terminate([]()
{
if (IsDebuggerPresent())
#ifdef _MSC_VER
__debugbreak();
#else
__asm("int3;");
#endif
utils::trap();
report_fatal_error("RPCS3 has abnormally terminated.");
});
@ -1935,7 +1988,7 @@ void thread_base::initialize(void (*error_cb)())
{
if (attempts == umax)
{
g_tls_wait_time += __rdtsc() - stamp0;
g_tls_wait_time += utils::get_tsc() - stamp0;
}
else if (attempts > 1)
{
@ -2096,6 +2149,8 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept
std::fesetround(FE_TONEAREST);
gv_unset_zeroing_denormals();
static constexpr u64 s_stop_bit = 0x8000'0000'0000'0000ull;
static atomic_t<u64> s_pool_ctr = []
@ -2195,10 +2250,11 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept
thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* _base))
{
return build_function_asm<native_entry>("thread_base_trampoline", [&](asmjit::x86::Assembler& c, auto& args)
return build_function_asm<native_entry>("thread_base_trampoline", [&](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
Label _ret = c.newLabel();
c.push(x86::rbp);
c.sub(x86::rsp, 0x20);
@ -2222,6 +2278,7 @@ thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base*
c.bind(_ret);
c.add(x86::rsp, 0x28);
c.ret();
#endif
});
}
@ -2364,7 +2421,7 @@ bool thread_base::join(bool dtor) const
// Hacked for too sleepy threads (1ms) TODO: make sure it's unneeded and remove
const auto timeout = dtor && Emu.IsStopped() ? atomic_wait_timeout{1'000'000} : atomic_wait_timeout::inf;
auto stamp0 = __rdtsc();
auto stamp0 = utils::get_tsc();
for (u64 i = 0; (m_sync & 3) <= 1; i++)
{
@ -2377,7 +2434,7 @@ bool thread_base::join(bool dtor) const
if (i >= 16 && !(i & (i - 1)) && timeout != atomic_wait_timeout::inf)
{
sig_log.error(u8"Thread [%s] is too sleepy. Waiting for it %.3fµs already!", *m_tname.load(), (__rdtsc() - stamp0) / (utils::get_tsc_freq() / 1000000.));
sig_log.error(u8"Thread [%s] is too sleepy. Waiting for it %.3fµs already!", *m_tname.load(), (utils::get_tsc() - stamp0) / (utils::get_tsc_freq() / 1000000.));
}
}
@ -2522,17 +2579,8 @@ void thread_base::exec()
sig_log.fatal("Thread terminated due to fatal error: %s", reason);
#ifdef _WIN32
if (IsDebuggerPresent())
{
__debugbreak();
}
#else
if (IsDebuggerPresent())
{
__asm("int3;");
}
#endif
utils::trap();
if (const auto _this = g_tls_this_thread)
{

View file

@ -478,7 +478,19 @@ class named_thread final : public Context, result_storage<Context>, thread_base
return thread::finalize(thread_state::finished);
}
#if defined(ARCH_X64)
static inline thread::native_entry trampoline = thread::make_trampoline(entry_point);
#else
static void* trampoline(void* arg)
{
if (const auto next = thread_base::finalize(entry_point(static_cast<thread_base*>(arg))))
{
return next(thread_ctrl::get_current());
}
return nullptr;
}
#endif
friend class thread_ctrl;

View file

@ -20,11 +20,20 @@ else()
# Some distros have the compilers set to use PIE by default, but RPCS3 doesn't work with PIE, so we need to disable it.
CHECK_CXX_COMPILER_FLAG("-no-pie" HAS_NO_PIE)
CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
CHECK_CXX_COMPILER_FLAG("-msse -msse2 -mcx16" COMPILER_X86)
CHECK_CXX_COMPILER_FLAG("-march=armv8.1-a" COMPILER_ARM)
add_compile_options(-Wall)
add_compile_options(-fno-exceptions)
add_compile_options(-fstack-protector)
add_compile_options(-msse -msse2 -mcx16)
if (COMPILER_X86)
add_compile_options(-msse -msse2 -mcx16)
endif()
if (COMPILER_ARM)
add_compile_options(-march=armv8.1-a)
endif()
add_compile_options(-Werror=old-style-cast)
add_compile_options(-Werror=sign-compare)

View file

@ -461,8 +461,10 @@ int aes_setkey_enc( aes_context *ctx, const unsigned char *key, unsigned int key
ctx->rk = RK = ctx->buf;
#if defined(__SSE2__) || defined(_M_X64)
if( aesni_supports( POLARSSL_AESNI_AES ) )
return( aesni_setkey_enc( reinterpret_cast<unsigned char*>(ctx->rk), key, keysize ) );
#endif
for( i = 0; i < (keysize >> 5); i++ )
{
@ -564,12 +566,14 @@ int aes_setkey_dec( aes_context *ctx, const unsigned char *key, unsigned int key
if( ret != 0 )
return( ret );
#if defined(__SSE2__) || defined(_M_X64)
if( aesni_supports( POLARSSL_AESNI_AES ) )
{
aesni_inverse_key( reinterpret_cast<unsigned char*>(ctx->rk),
reinterpret_cast<const unsigned char*>(cty.rk), ctx->nr );
goto done;
}
#endif
SK = cty.rk + cty.nr * 4;
@ -658,8 +662,10 @@ int aes_crypt_ecb( aes_context *ctx,
int i;
uint32_t *RK, X0, X1, X2, X3, Y0, Y1, Y2, Y3;
#if defined(__SSE2__) || defined(_M_X64)
if( aesni_supports( POLARSSL_AESNI_AES ) )
return( aesni_crypt_ecb( ctx, mode, input, output ) );
#endif
RK = ctx->rk;

View file

@ -1,3 +1,5 @@
#if defined(__SSE2__) || defined(_M_X64)
/*
* AES-NI support functions
*
@ -680,3 +682,5 @@ int aesni_setkey_enc( unsigned char *rk,
return( 0 );
}
#endif

View file

@ -17,7 +17,9 @@
#include <unordered_map>
#include <map>
#if defined(ARCH_X64)
#include <emmintrin.h>
#endif
DECLARE(cpu_thread::g_threads_created){0};
DECLARE(cpu_thread::g_threads_deleted){0};
@ -410,20 +412,6 @@ void cpu_thread::operator()()
{
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(id_type() == 1 ? thread_class::ppu : thread_class::spu));
}
if (id_type() == 2)
{
// force input/output denormals to zero for SPU threads (FTZ/DAZ)
_mm_setcsr( _mm_getcsr() | 0x8040 );
const volatile int a = 0x1fc00000;
__m128 b = _mm_castsi128_ps(_mm_set1_epi32(a));
int c = _mm_cvtsi128_si32(_mm_castps_si128(_mm_mul_ps(b,b)));
if (c != 0)
{
sys_log.fatal("Could not disable denormals.");
}
}
while (!g_fxo->is_init<cpu_profiler>())
{

View file

@ -3,7 +3,7 @@
#include "CPUTranslator.h"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
llvm::LLVMContext g_llvm_ctx;

View file

@ -2961,11 +2961,11 @@ public:
}
// Call external function: provide name and function pointer
template <typename RT, typename... FArgs, LLVMValue... Args>
template <typename RetT = void, typename RT, typename... FArgs, LLVMValue... Args>
llvm::CallInst* call(std::string_view lame, RT(*_func)(FArgs...), Args... args)
{
static_assert(sizeof...(FArgs) == sizeof...(Args), "spu_llvm_recompiler::call(): unexpected arg number");
const auto type = llvm::FunctionType::get(get_type<RT>(), {args->getType()...}, false);
const auto type = llvm::FunctionType::get(get_type<std::conditional_t<std::is_void_v<RetT>, RT, RetT>>(), {args->getType()...}, false);
const auto func = llvm::cast<llvm::Function>(m_module->getOrInsertFunction({lame.data(), lame.size()}, type).getCallee());
#ifdef _WIN32
func->setCallingConv(llvm::CallingConv::Win64);
@ -3680,31 +3680,4 @@ struct fmt_unveil<llvm::TypeSize, void>
}
};
#ifndef _MSC_VER
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
template <>
struct llvm_value_t<__m128> : llvm_value_t<f32[4]>
{
};
template <>
struct llvm_value_t<__m128d> : llvm_value_t<f64[2]>
{
};
template <>
struct llvm_value_t<__m128i> : llvm_value_t<u8[16]>
{
};
#ifndef _MSC_VER
#pragma GCC diagnostic pop
#endif
#endif

8776
rpcs3/Emu/CPU/sse2neon.h Normal file

File diff suppressed because it is too large Load diff

View file

@ -6,9 +6,12 @@
#include "Emu/Cell/lv2/sys_event.h"
#include "cellAudio.h"
#include "emmintrin.h"
#include <cmath>
#if defined(ARCH_X64)
#include "emmintrin.h"
#endif
LOG_CHANNEL(cellAudio);
vm::gvar<char, AUDIO_PORT_OFFSET * AUDIO_PORT_COUNT> g_audio_buffer;
@ -1118,6 +1121,7 @@ void cell_audio_thread::mix(float *out_buffer, s32 offset)
// 2x CVTPS2DQ (converts float to s32)
// PACKSSDW (converts s32 to s16 with signed saturation)
#if defined(ARCH_X64)
for (usz i = 0; i < out_buffer_sz; i += 8)
{
const auto scale = _mm_set1_ps(0x8000);
@ -1125,6 +1129,9 @@ void cell_audio_thread::mix(float *out_buffer, s32 offset)
_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(out_buffer + i), scale)),
_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(out_buffer + i + 4), scale)))));
}
#else
fmt::throw_exception("Not supported");
#endif
}
}

View file

@ -17,7 +17,7 @@
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
LOG_CHANNEL(cellSpurs);
@ -738,7 +738,7 @@ s32 _spurs::create_handler(vm::ptr<CellSpurs> spurs, u32 ppuPriority)
void non_task()
{
BIND_FUNC(_spurs::handler_entry)(*this);
//BIND_FUNC(_spurs::handler_entry)(*this);
}
};
@ -933,7 +933,7 @@ s32 _spurs::create_event_helper(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 p
void non_task()
{
BIND_FUNC(_spurs::event_helper_entry)(*this);
//BIND_FUNC(_spurs::event_helper_entry)(*this);
}
};

View file

@ -11,7 +11,7 @@
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
LOG_CHANNEL(cellSpurs);
@ -1434,7 +1434,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
// Verify taskset state is valid
if ((waiting & running) != v128{} || (ready & pready) != v128{} ||
(v128::andnot(enabled, running | ready | pready | signalled | waiting) != v128{}))
(gv_andn(enabled, running | ready | pready | signalled | waiting) != v128{}))
{
spu_log.error("Invalid taskset state");
spursHalt(spu);
@ -1442,7 +1442,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
// Find the number of tasks that have become ready since the last iteration
{
v128 newlyReadyTasks = v128::andnot(ready, signalled | pready);
v128 newlyReadyTasks = gv_andn(ready, signalled | pready);
numNewlyReadyTasks = utils::popcnt128(newlyReadyTasks._u);
}
@ -1491,7 +1491,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
}
case SPURS_TASKSET_REQUEST_POLL:
{
readyButNotRunning = v128::andnot(running, ready0);
readyButNotRunning = gv_andn(running, ready0);
if (taskset->wkl_flag_wait_task < CELL_SPURS_MAX_TASK)
{
readyButNotRunning._u &= ~(u128{1} << (~taskset->wkl_flag_wait_task & 127));
@ -1526,7 +1526,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
}
case SPURS_TASKSET_REQUEST_SELECT_TASK:
{
readyButNotRunning = v128::andnot(running, ready0);
readyButNotRunning = gv_andn(running, ready0);
if (taskset->wkl_flag_wait_task < CELL_SPURS_MAX_TASK)
{
readyButNotRunning._u &= ~(u128{1} << (~taskset->wkl_flag_wait_task & 127));

View file

@ -203,18 +203,31 @@ struct ppu_itype
VCFSX,
VCFUX,
VCMPBFP,
VCMPBFP_,
VCMPEQFP,
VCMPEQFP_,
VCMPEQUB,
VCMPEQUB_,
VCMPEQUH,
VCMPEQUH_,
VCMPEQUW,
VCMPEQUW_,
VCMPGEFP,
VCMPGEFP_,
VCMPGTFP,
VCMPGTFP_,
VCMPGTSB,
VCMPGTSB_,
VCMPGTSH,
VCMPGTSH_,
VCMPGTSW,
VCMPGTSW_,
VCMPGTUB,
VCMPGTUB_,
VCMPGTUH,
VCMPGTUH_,
VCMPGTUW,
VCMPGTUW_,
VCTSXS,
VCTUXS,
VEXPTEFP,
@ -367,7 +380,9 @@ struct ppu_itype
LVSL,
LVEBX,
SUBFC,
SUBFCO,
ADDC,
ADDCO,
MULHDU,
MULHWU,
MFOCRF,
@ -382,6 +397,7 @@ struct ppu_itype
LVSR,
LVEHX,
SUBF,
SUBFO,
LDUX,
DCBST,
LWZUX,
@ -396,11 +412,14 @@ struct ppu_itype
LBZX,
LVX,
NEG,
NEGO,
LBZUX,
NOR,
STVEBX,
SUBFE,
SUBFEO,
ADDE,
ADDEO,
MTOCRF,
STDX,
STWCX,
@ -410,17 +429,24 @@ struct ppu_itype
STWUX,
STVEWX,
SUBFZE,
SUBFZEO,
ADDZE,
ADDZEO,
STDCX,
STBX,
STVX,
SUBFME,
SUBFMEO,
MULLD,
MULLDO,
ADDME,
ADDMEO,
MULLW,
MULLWO,
DCBTST,
STBUX,
ADD,
ADDO,
DCBT,
LHZX,
EQV,
@ -442,13 +468,17 @@ struct ppu_itype
STHUX,
OR,
DIVDU,
DIVDUO,
DIVWU,
DIVWUO,
MTSPR,
DCBI,
NAND,
STVXL,
DIVD,
DIVDO,
DIVW,
DIVWO,
LVLX,
LDBRX,
LSWX,
@ -558,6 +588,112 @@ struct ppu_itype
FCTID,
FCTIDZ,
FCFID,
SUBFCO_,
ADDCO_,
SUBFO_,
NEGO_,
SUBFEO_,
ADDEO_,
SUBFZEO_,
ADDZEO_,
SUBFMEO_,
MULLDO_,
ADDMEO_,
MULLWO_,
ADDO_,
DIVDUO_,
DIVWUO_,
DIVDO_,
DIVWO_,
RLWIMI_,
RLWINM_,
RLWNM_,
RLDICL_,
RLDICR_,
RLDIC_,
RLDIMI_,
RLDCL_,
RLDCR_,
SUBFC_,
MULHDU_,
ADDC_,
MULHWU_,
SLW_,
CNTLZW_,
SLD_,
AND_,
SUBF_,
CNTLZD_,
ANDC_,
MULHD_,
MULHW_,
NEG_,
NOR_,
SUBFE_,
ADDE_,
SUBFZE_,
ADDZE_,
MULLD_,
SUBFME_,
ADDME_,
MULLW_,
ADD_,
EQV_,
XOR_,
ORC_,
OR_,
DIVDU_,
DIVWU_,
NAND_,
DIVD_,
DIVW_,
SRW_,
SRD_,
SRAW_,
SRAD_,
SRAWI_,
SRADI_,
EXTSH_,
EXTSB_,
EXTSW_,
FDIVS_,
FSUBS_,
FADDS_,
FSQRTS_,
FRES_,
FMULS_,
FMADDS_,
FMSUBS_,
FNMSUBS_,
FNMADDS_,
MTFSB1_,
MTFSB0_,
MTFSFI_,
MFFS_,
MTFSF_,
FRSP_,
FCTIW_,
FCTIWZ_,
FDIV_,
FSUB_,
FADD_,
FSQRT_,
FSEL_,
FMUL_,
FRSQRTE_,
FMSUB_,
FMADD_,
FNMSUB_,
FNMADD_,
FNEG_,
FMR_,
FNABS_,
FABS_,
FCTID_,
FCTIDZ_,
FCFID_,
};
// Enable address-of operator for ppu_decoder<>
@ -570,6 +706,7 @@ struct ppu_itype
struct ppu_iname
{
#define NAME(x) static constexpr const char& x = *#x;
#define NAME_(x) static constexpr const char& x##_ = *#x ".";
NAME(UNK)
NAME(MFVSCR)
NAME(MTVSCR)
@ -595,18 +732,31 @@ struct ppu_iname
NAME(VCFSX)
NAME(VCFUX)
NAME(VCMPBFP)
NAME_(VCMPBFP)
NAME(VCMPEQFP)
NAME_(VCMPEQFP)
NAME(VCMPEQUB)
NAME_(VCMPEQUB)
NAME(VCMPEQUH)
NAME_(VCMPEQUH)
NAME(VCMPEQUW)
NAME_(VCMPEQUW)
NAME(VCMPGEFP)
NAME_(VCMPGEFP)
NAME(VCMPGTFP)
NAME_(VCMPGTFP)
NAME(VCMPGTSB)
NAME_(VCMPGTSB)
NAME(VCMPGTSH)
NAME_(VCMPGTSH)
NAME(VCMPGTSW)
NAME_(VCMPGTSW)
NAME(VCMPGTUB)
NAME_(VCMPGTUB)
NAME(VCMPGTUH)
NAME_(VCMPGTUH)
NAME(VCMPGTUW)
NAME_(VCMPGTUW)
NAME(VCTSXS)
NAME(VCTUXS)
NAME(VEXPTEFP)
@ -950,7 +1100,132 @@ struct ppu_iname
NAME(FCTID)
NAME(FCTIDZ)
NAME(FCFID)
NAME(SUBFCO)
NAME(ADDCO)
NAME(SUBFO)
NAME(NEGO)
NAME(SUBFEO)
NAME(ADDEO)
NAME(SUBFZEO)
NAME(ADDZEO)
NAME(SUBFMEO)
NAME(MULLDO)
NAME(ADDMEO)
NAME(MULLWO)
NAME(ADDO)
NAME(DIVDUO)
NAME(DIVWUO)
NAME(DIVDO)
NAME(DIVWO)
NAME_(SUBFCO)
NAME_(ADDCO)
NAME_(SUBFO)
NAME_(NEGO)
NAME_(SUBFEO)
NAME_(ADDEO)
NAME_(SUBFZEO)
NAME_(ADDZEO)
NAME_(SUBFMEO)
NAME_(MULLDO)
NAME_(ADDMEO)
NAME_(MULLWO)
NAME_(ADDO)
NAME_(DIVDUO)
NAME_(DIVWUO)
NAME_(DIVDO)
NAME_(DIVWO)
NAME_(RLWIMI)
NAME_(RLWINM)
NAME_(RLWNM)
NAME_(RLDICL)
NAME_(RLDICR)
NAME_(RLDIC)
NAME_(RLDIMI)
NAME_(RLDCL)
NAME_(RLDCR)
NAME_(SUBFC)
NAME_(MULHDU)
NAME_(ADDC)
NAME_(MULHWU)
NAME_(SLW)
NAME_(CNTLZW)
NAME_(SLD)
NAME_(AND)
NAME_(SUBF)
NAME_(CNTLZD)
NAME_(ANDC)
NAME_(MULHD)
NAME_(MULHW)
NAME_(NEG)
NAME_(NOR)
NAME_(SUBFE)
NAME_(ADDE)
NAME_(SUBFZE)
NAME_(ADDZE)
NAME_(MULLD)
NAME_(SUBFME)
NAME_(ADDME)
NAME_(MULLW)
NAME_(ADD)
NAME_(EQV)
NAME_(XOR)
NAME_(ORC)
NAME_(OR)
NAME_(DIVDU)
NAME_(DIVWU)
NAME_(NAND)
NAME_(DIVD)
NAME_(DIVW)
NAME_(SRW)
NAME_(SRD)
NAME_(SRAW)
NAME_(SRAD)
NAME_(SRAWI)
NAME_(SRADI)
NAME_(EXTSH)
NAME_(EXTSB)
NAME_(EXTSW)
NAME_(FDIVS)
NAME_(FSUBS)
NAME_(FADDS)
NAME_(FSQRTS)
NAME_(FRES)
NAME_(FMULS)
NAME_(FMADDS)
NAME_(FMSUBS)
NAME_(FNMSUBS)
NAME_(FNMADDS)
NAME_(MTFSB1)
NAME_(MTFSB0)
NAME_(MTFSFI)
NAME_(MFFS)
NAME_(MTFSF)
NAME_(FRSP)
NAME_(FCTIW)
NAME_(FCTIWZ)
NAME_(FDIV)
NAME_(FSUB)
NAME_(FADD)
NAME_(FSQRT)
NAME_(FSEL)
NAME_(FMUL)
NAME_(FRSQRTE)
NAME_(FMSUB)
NAME_(FMADD)
NAME_(FNMSUB)
NAME_(FNMADD)
NAME_(FNEG)
NAME_(FMR)
NAME_(FNABS)
NAME_(FABS)
NAME_(FCTID)
NAME_(FCTIDZ)
NAME_(FCFID)
#undef NAME
#undef NAME_
};
// PPU Analyser Context

View file

@ -351,18 +351,31 @@ public:
void VCFSX(ppu_opcode_t op);
void VCFUX(ppu_opcode_t op);
void VCMPBFP(ppu_opcode_t op);
void VCMPBFP_(ppu_opcode_t op) { return VCMPBFP(op); }
void VCMPEQFP(ppu_opcode_t op);
void VCMPEQFP_(ppu_opcode_t op) { return VCMPEQFP(op); }
void VCMPEQUB(ppu_opcode_t op);
void VCMPEQUB_(ppu_opcode_t op) { return VCMPEQUB(op); }
void VCMPEQUH(ppu_opcode_t op);
void VCMPEQUH_(ppu_opcode_t op) { return VCMPEQUH(op); }
void VCMPEQUW(ppu_opcode_t op);
void VCMPEQUW_(ppu_opcode_t op) { return VCMPEQUW(op); }
void VCMPGEFP(ppu_opcode_t op);
void VCMPGEFP_(ppu_opcode_t op) { return VCMPGEFP(op); }
void VCMPGTFP(ppu_opcode_t op);
void VCMPGTFP_(ppu_opcode_t op) { return VCMPGTFP(op); }
void VCMPGTSB(ppu_opcode_t op);
void VCMPGTSB_(ppu_opcode_t op) { return VCMPGTSB(op); }
void VCMPGTSH(ppu_opcode_t op);
void VCMPGTSH_(ppu_opcode_t op) { return VCMPGTSH(op); }
void VCMPGTSW(ppu_opcode_t op);
void VCMPGTSW_(ppu_opcode_t op) { return VCMPGTSW(op); }
void VCMPGTUB(ppu_opcode_t op);
void VCMPGTUB_(ppu_opcode_t op) { return VCMPGTUB(op); }
void VCMPGTUH(ppu_opcode_t op);
void VCMPGTUH_(ppu_opcode_t op) { return VCMPGTUH(op); }
void VCMPGTUW(ppu_opcode_t op);
void VCMPGTUW_(ppu_opcode_t op) { return VCMPGTUW(op); }
void VCTSXS(ppu_opcode_t op);
void VCTUXS(ppu_opcode_t op);
void VEXPTEFP(ppu_opcode_t op);
@ -708,4 +721,128 @@ public:
void FCFID(ppu_opcode_t op);
void UNK(ppu_opcode_t op);
// "O"-suffixed arithmetic handlers: each one forwards to the unsuffixed handler
// (e.g. SUBFCO -> SUBFC), so this class treats both forms the same way.
// NOTE(review): the "O" suffix presumably denotes the overflow-enabled (OE=1) encoding - confirm.
void SUBFCO(ppu_opcode_t op) { return SUBFC(op); }
void ADDCO(ppu_opcode_t op) { return ADDC(op); }
void SUBFO(ppu_opcode_t op) { return SUBF(op); }
void NEGO(ppu_opcode_t op) { return NEG(op); }
void SUBFEO(ppu_opcode_t op) { return SUBFE(op); }
void ADDEO(ppu_opcode_t op) { return ADDE(op); }
void SUBFZEO(ppu_opcode_t op) { return SUBFZE(op); }
void ADDZEO(ppu_opcode_t op) { return ADDZE(op); }
void SUBFMEO(ppu_opcode_t op) { return SUBFME(op); }
void MULLDO(ppu_opcode_t op) { return MULLD(op); }
void ADDMEO(ppu_opcode_t op) { return ADDME(op); }
void MULLWO(ppu_opcode_t op) { return MULLW(op); }
void ADDO(ppu_opcode_t op) { return ADD(op); }
void DIVDUO(ppu_opcode_t op) { return DIVDU(op); }
void DIVWUO(ppu_opcode_t op) { return DIVWU(op); }
void DIVDO(ppu_opcode_t op) { return DIVD(op); }
void DIVWO(ppu_opcode_t op) { return DIVW(op); }
// "O_"-suffixed arithmetic handlers: each one forwards straight to the plain,
// unsuffixed handler (e.g. SUBFCO_ -> SUBFC), not to an intermediate suffixed form.
// NOTE(review): presumably the combined overflow-enabled + record encodings - confirm.
void SUBFCO_(ppu_opcode_t op) { return SUBFC(op); }
void ADDCO_(ppu_opcode_t op) { return ADDC(op); }
void SUBFO_(ppu_opcode_t op) { return SUBF(op); }
void NEGO_(ppu_opcode_t op) { return NEG(op); }
void SUBFEO_(ppu_opcode_t op) { return SUBFE(op); }
void ADDEO_(ppu_opcode_t op) { return ADDE(op); }
void SUBFZEO_(ppu_opcode_t op) { return SUBFZE(op); }
void ADDZEO_(ppu_opcode_t op) { return ADDZE(op); }
void SUBFMEO_(ppu_opcode_t op) { return SUBFME(op); }
void MULLDO_(ppu_opcode_t op) { return MULLD(op); }
void ADDMEO_(ppu_opcode_t op) { return ADDME(op); }
void MULLWO_(ppu_opcode_t op) { return MULLW(op); }
void ADDO_(ppu_opcode_t op) { return ADD(op); }
void DIVDUO_(ppu_opcode_t op) { return DIVDU(op); }
void DIVWUO_(ppu_opcode_t op) { return DIVWU(op); }
void DIVDO_(ppu_opcode_t op) { return DIVD(op); }
void DIVWO_(ppu_opcode_t op) { return DIVW(op); }
// Underscore-suffixed rotate, arithmetic, logical, shift and sign-extend handlers.
// Each forwards to the unsuffixed handler of the same name, so the suffixed
// form needs no separate implementation in this class.
// NOTE(review): the "_" suffix presumably marks the record (Rc=1) encoding - confirm.
void RLWIMI_(ppu_opcode_t op) { return RLWIMI(op); }
void RLWINM_(ppu_opcode_t op) { return RLWINM(op); }
void RLWNM_(ppu_opcode_t op) { return RLWNM(op); }
void RLDICL_(ppu_opcode_t op) { return RLDICL(op); }
void RLDICR_(ppu_opcode_t op) { return RLDICR(op); }
void RLDIC_(ppu_opcode_t op) { return RLDIC(op); }
void RLDIMI_(ppu_opcode_t op) { return RLDIMI(op); }
void RLDCL_(ppu_opcode_t op) { return RLDCL(op); }
void RLDCR_(ppu_opcode_t op) { return RLDCR(op); }
void SUBFC_(ppu_opcode_t op) { return SUBFC(op); }
void MULHDU_(ppu_opcode_t op) { return MULHDU(op); }
void ADDC_(ppu_opcode_t op) { return ADDC(op); }
void MULHWU_(ppu_opcode_t op) { return MULHWU(op); }
void SLW_(ppu_opcode_t op) { return SLW(op); }
void CNTLZW_(ppu_opcode_t op) { return CNTLZW(op); }
void SLD_(ppu_opcode_t op) { return SLD(op); }
void AND_(ppu_opcode_t op) { return AND(op); }
void SUBF_(ppu_opcode_t op) { return SUBF(op); }
void CNTLZD_(ppu_opcode_t op) { return CNTLZD(op); }
void ANDC_(ppu_opcode_t op) { return ANDC(op); }
void MULHD_(ppu_opcode_t op) { return MULHD(op); }
void MULHW_(ppu_opcode_t op) { return MULHW(op); }
void NEG_(ppu_opcode_t op) { return NEG(op); }
void NOR_(ppu_opcode_t op) { return NOR(op); }
void SUBFE_(ppu_opcode_t op) { return SUBFE(op); }
void ADDE_(ppu_opcode_t op) { return ADDE(op); }
void SUBFZE_(ppu_opcode_t op) { return SUBFZE(op); }
void ADDZE_(ppu_opcode_t op) { return ADDZE(op); }
void MULLD_(ppu_opcode_t op) { return MULLD(op); }
void SUBFME_(ppu_opcode_t op) { return SUBFME(op); }
void ADDME_(ppu_opcode_t op) { return ADDME(op); }
void MULLW_(ppu_opcode_t op) { return MULLW(op); }
void ADD_(ppu_opcode_t op) { return ADD(op); }
void EQV_(ppu_opcode_t op) { return EQV(op); }
void XOR_(ppu_opcode_t op) { return XOR(op); }
void ORC_(ppu_opcode_t op) { return ORC(op); }
void OR_(ppu_opcode_t op) { return OR(op); }
void DIVDU_(ppu_opcode_t op) { return DIVDU(op); }
void DIVWU_(ppu_opcode_t op) { return DIVWU(op); }
void NAND_(ppu_opcode_t op) { return NAND(op); }
void DIVD_(ppu_opcode_t op) { return DIVD(op); }
void DIVW_(ppu_opcode_t op) { return DIVW(op); }
void SRW_(ppu_opcode_t op) { return SRW(op); }
void SRD_(ppu_opcode_t op) { return SRD(op); }
void SRAW_(ppu_opcode_t op) { return SRAW(op); }
void SRAD_(ppu_opcode_t op) { return SRAD(op); }
void SRAWI_(ppu_opcode_t op) { return SRAWI(op); }
void SRADI_(ppu_opcode_t op) { return SRADI(op); }
void EXTSH_(ppu_opcode_t op) { return EXTSH(op); }
void EXTSB_(ppu_opcode_t op) { return EXTSB(op); }
void EXTSW_(ppu_opcode_t op) { return EXTSW(op); }
// Underscore-suffixed F* / MTFS* / MFFS handlers.
// Each forwards to the unsuffixed handler of the same name.
// NOTE(review): the "_" suffix presumably marks the record (Rc=1) encoding - confirm.
void FDIVS_(ppu_opcode_t op) { return FDIVS(op); }
void FSUBS_(ppu_opcode_t op) { return FSUBS(op); }
void FADDS_(ppu_opcode_t op) { return FADDS(op); }
void FSQRTS_(ppu_opcode_t op) { return FSQRTS(op); }
void FRES_(ppu_opcode_t op) { return FRES(op); }
void FMULS_(ppu_opcode_t op) { return FMULS(op); }
void FMADDS_(ppu_opcode_t op) { return FMADDS(op); }
void FMSUBS_(ppu_opcode_t op) { return FMSUBS(op); }
void FNMSUBS_(ppu_opcode_t op) { return FNMSUBS(op); }
void FNMADDS_(ppu_opcode_t op) { return FNMADDS(op); }
void MTFSB1_(ppu_opcode_t op) { return MTFSB1(op); }
void MTFSB0_(ppu_opcode_t op) { return MTFSB0(op); }
void MTFSFI_(ppu_opcode_t op) { return MTFSFI(op); }
void MFFS_(ppu_opcode_t op) { return MFFS(op); }
void MTFSF_(ppu_opcode_t op) { return MTFSF(op); }
void FRSP_(ppu_opcode_t op) { return FRSP(op); }
void FCTIW_(ppu_opcode_t op) { return FCTIW(op); }
void FCTIWZ_(ppu_opcode_t op) { return FCTIWZ(op); }
void FDIV_(ppu_opcode_t op) { return FDIV(op); }
void FSUB_(ppu_opcode_t op) { return FSUB(op); }
void FADD_(ppu_opcode_t op) { return FADD(op); }
void FSQRT_(ppu_opcode_t op) { return FSQRT(op); }
void FSEL_(ppu_opcode_t op) { return FSEL(op); }
void FMUL_(ppu_opcode_t op) { return FMUL(op); }
void FRSQRTE_(ppu_opcode_t op) { return FRSQRTE(op); }
void FMSUB_(ppu_opcode_t op) { return FMSUB(op); }
void FMADD_(ppu_opcode_t op) { return FMADD(op); }
void FNMSUB_(ppu_opcode_t op) { return FNMSUB(op); }
void FNMADD_(ppu_opcode_t op) { return FNMADD(op); }
void FNEG_(ppu_opcode_t op) { return FNEG(op); }
void FMR_(ppu_opcode_t op) { return FMR(op); }
void FNABS_(ppu_opcode_t op) { return FNABS(op); }
void FABS_(ppu_opcode_t op) { return FABS(op); }
void FCTID_(ppu_opcode_t op) { return FCTID(op); }
void FCTIDZ_(ppu_opcode_t op) { return FCTIDZ(op); }
void FCFID_(ppu_opcode_t op) { return FCFID(op); }
};

View file

@ -1889,47 +1889,56 @@ extern std::string ppu_get_variable_name(const std::string& _module, u32 vnid)
return fmt::format("0x%08X", vnid);
}
// Returns one of two function-local static handler lists: `list` when ghc is
// false, `list_ghc` when ghc is true. Both start with the same two entries:
//   [0] handler invoked for an unregistered function,
//   [1] handler that flags the thread with cpu_flag::ret.
// With ARCH_X64, list_ghc holds trampolines produced by build_function_asm that
// jump to the matching `list` entry; with ARCH_ARM64, list_ghc is constructed as
// a copy of `list`.
// NOTE(review): no #else branch exists here, so other architectures leave
// list_ghc undeclared - confirm whether an explicit #error is wanted.
std::vector<ppu_function_t>& ppu_function_manager::access(bool ghc)
std::vector<ppu_intrp_func_t>& ppu_function_manager::access(bool ghc)
{
static std::vector<ppu_function_t> list
static std::vector<ppu_intrp_func_t> list
{
// Entry 0: logs the error, clears gpr[3] and redirects cia to LR with the low two bits masked off
[](ppu_thread& ppu) -> bool
[](ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*)
{
ppu.cia = vm::get_addr(this_op);
ppu_log.error("Unregistered function called (LR=0x%x)", ppu.lr);
ppu.gpr[3] = 0;
ppu.cia = static_cast<u32>(ppu.lr) & ~3;
return false;
},
// Entry 1: raises cpu_flag::ret on the thread and moves cia one instruction (4 bytes) forward
[](ppu_thread& ppu) -> bool
[](ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*)
{
ppu.state += cpu_flag::ret;
ppu.cia += 4;
return false;
ppu.cia = vm::get_addr(this_op) + 4;
},
};
static std::vector<ppu_function_t> list_ghc
#if defined(ARCH_X64)
static std::vector<ppu_intrp_func_t> list_ghc
{
build_function_asm<ppu_function_t>("ppu_unregistered", [](asmjit::x86::Assembler& c, auto& args)
build_function_asm<ppu_intrp_func_t>("ppu_unregistered", [](native_asm& c, auto& args)
{
using namespace asmjit;
// Take second ghc arg
c.mov(args[0], x86::rbp);
// Load the 32-bit cia field from the thread object and add the value stored at vm::g_base_addr
// (presumably rebuilding the host pointer that the handler receives as this_op - confirm)
c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia)));
c.add(args[2], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
c.jmp(imm_ptr(list[0]));
}),
build_function_asm<ppu_function_t>("ppu_return", [](asmjit::x86::Assembler& c, auto& args)
build_function_asm<ppu_intrp_func_t>("ppu_return", [](native_asm& c, auto& args)
{
using namespace asmjit;
// Take second ghc arg
c.mov(args[0], x86::rbp);
// Same argument setup as the "ppu_unregistered" trampoline, then tail-jump to entry 1
c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia)));
c.add(args[2], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
c.jmp(imm_ptr(list[1]));
}),
};
#elif defined(ARCH_ARM64)
static std::vector<ppu_intrp_func_t> list_ghc(list);
#endif
return ghc ? list_ghc : list;
}
// Appends `function` to the plain handler list and a matching entry to the
// ghc list, then returns the index of the new entry (size of the plain list - 1).
// With ARCH_X64 the ghc entry is a generated trampoline that sets up args[0] and
// args[2] and jumps to `function`; with ARCH_ARM64 the function pointer itself is
// stored; any other architecture fails the build via #error.
u32 ppu_function_manager::add_function(ppu_function_t function)
u32 ppu_function_manager::add_function(ppu_intrp_func_t function)
{
auto& list = access();
auto& list2 = access(true);
@ -1937,13 +1946,22 @@ u32 ppu_function_manager::add_function(ppu_function_t function)
list.push_back(function);
// Generate trampoline
list2.push_back(build_function_asm<ppu_function_t>("ppu_trampolinea", [&](asmjit::x86::Assembler& c, auto& args)
#if defined(ARCH_X64)
list2.push_back(build_function_asm<ppu_intrp_func_t>("ppu_trampolinea", [&](native_asm& c, auto& args)
{
using namespace asmjit;
// Take second ghc arg
c.mov(args[0], x86::rbp);
// args[2] = 32-bit cia read from the thread object, plus the value stored at vm::g_base_addr
c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia)));
c.add(args[2], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
c.jmp(imm_ptr(function));
}));
#elif defined(ARCH_ARM64)
list2.push_back(function);
#else
#error "Not implemented"
#endif
return ::size32(list) - 1;
}

View file

@ -1,23 +1,22 @@
#pragma once
#include "PPUThread.h"
#include "PPUInterpreter.h"
#include "util/v128.hpp"
using ppu_function_t = bool(*)(ppu_thread&);
// BIND_FUNC(func, ...) expands to a captureless lambda cast to the handler
// function-pointer type. The generated handler, in order:
//   - remembers ppu.current_function and, if it was null, records #func in ppu.last_function;
//   - sets ppu.current_function to the stringized name of func;
//   - sets ppu.cia from this_op via vm::get_addr;
//   - copies sizeof(ppu.syscall_args) bytes starting at ppu.gpr[3] into ppu.syscall_args;
//   - invokes func through ppu_func_detail::do_call;
//   - calls ppu.test_stopped() and discards its result;
//   - restores ppu.current_function and advances ppu.cia by 4;
//   - finally evaluates the optional extra statements passed as __VA_ARGS__.
// BIND_FUNC macro "converts" any appropriate HLE function to ppu_function_t, binding it to PPU thread context.
#define BIND_FUNC(func, ...) (static_cast<ppu_function_t>([](ppu_thread& ppu) -> bool {\
// BIND_FUNC macro "converts" any appropriate HLE function to ppu_intrp_func_t, binding it to PPU thread context.
#define BIND_FUNC(func, ...) (static_cast<ppu_intrp_func_t>([](ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*) {\
const auto old_f = ppu.current_function;\
if (!old_f) ppu.last_function = #func;\
ppu.current_function = #func;\
ppu.cia = vm::get_addr(this_op); \
std::memcpy(ppu.syscall_args, ppu.gpr + 3, sizeof(ppu.syscall_args)); \
ppu_func_detail::do_call(ppu, func);\
static_cast<void>(ppu.test_stopped());\
ppu.current_function = old_f;\
ppu.cia += 4;\
__VA_ARGS__;\
return false;\
}))
struct ppu_va_args_t
@ -257,9 +256,9 @@ class ppu_function_manager
};
// Access global function list
// (`ghc` selects which of the two handler lists is returned; defaults to false)
static std::vector<ppu_function_t>& access(bool ghc = false);
static std::vector<ppu_intrp_func_t>& access(bool ghc = false);
// Add a handler to the global function lists and return its index
static u32 add_function(ppu_function_t function);
static u32 add_function(ppu_intrp_func_t function);
public:
ppu_function_manager() = default;
@ -270,7 +269,7 @@ public:
// Register function (shall only be called during global initialization)
// Passes `func` to add_function, stores the returned index in
// registered<T, Func>::index and returns that same index.
template<typename T, T Func>
static inline u32 register_function(ppu_function_t func)
static inline u32 register_function(ppu_intrp_func_t func)
{
return registered<T, Func>::index = add_function(func);
}

File diff suppressed because it is too large Load diff

View file

@ -4,454 +4,41 @@
class ppu_thread;
// Function-pointer types for PPU instruction handlers.
// ppu_inter_func_t: takes the thread and the opcode, returns bool.
// ppu_intrp_func_t: returns nothing and additionally receives `this_op`, a pointer to a
// big-endian 32-bit word, and `next_fn`, a pointer to a ppu_intrp_func.
// NOTE(review): this_op presumably points at the instruction word being executed and
// next_fn at the entry for the following instruction - confirm against the interpreter loop.
using ppu_inter_func_t = bool(*)(ppu_thread& ppu, ppu_opcode_t op);
using ppu_intrp_func_t = void(*)(ppu_thread& ppu_, ppu_opcode_t op, be_t<u32>* this_op, struct ppu_intrp_func* next_fn);
struct ppu_interpreter
struct ppu_intrp_func
{
static bool MFVSCR(ppu_thread&, ppu_opcode_t);
static bool MTVSCR(ppu_thread&, ppu_opcode_t);
static bool VADDCUW(ppu_thread&, ppu_opcode_t);
static bool VADDFP(ppu_thread&, ppu_opcode_t);
static bool VADDUBM(ppu_thread&, ppu_opcode_t);
static bool VADDUHM(ppu_thread&, ppu_opcode_t);
static bool VADDUWM(ppu_thread&, ppu_opcode_t);
static bool VAND(ppu_thread&, ppu_opcode_t);
static bool VANDC(ppu_thread&, ppu_opcode_t);
static bool VAVGSB(ppu_thread&, ppu_opcode_t);
static bool VAVGSH(ppu_thread&, ppu_opcode_t);
static bool VAVGSW(ppu_thread&, ppu_opcode_t);
static bool VAVGUB(ppu_thread&, ppu_opcode_t);
static bool VAVGUH(ppu_thread&, ppu_opcode_t);
static bool VAVGUW(ppu_thread&, ppu_opcode_t);
static bool VCFSX(ppu_thread&, ppu_opcode_t);
static bool VCFUX(ppu_thread&, ppu_opcode_t);
static bool VCMPBFP(ppu_thread&, ppu_opcode_t);
static bool VCMPEQFP(ppu_thread&, ppu_opcode_t);
static bool VCMPEQUB(ppu_thread&, ppu_opcode_t);
static bool VCMPEQUH(ppu_thread&, ppu_opcode_t);
static bool VCMPEQUW(ppu_thread&, ppu_opcode_t);
static bool VCMPGEFP(ppu_thread&, ppu_opcode_t);
static bool VCMPGTFP(ppu_thread&, ppu_opcode_t);
static bool VCMPGTSB(ppu_thread&, ppu_opcode_t);
static bool VCMPGTSH(ppu_thread&, ppu_opcode_t);
static bool VCMPGTSW(ppu_thread&, ppu_opcode_t);
static bool VCMPGTUB(ppu_thread&, ppu_opcode_t);
static bool VCMPGTUH(ppu_thread&, ppu_opcode_t);
static bool VCMPGTUW(ppu_thread&, ppu_opcode_t);
static bool VEXPTEFP(ppu_thread&, ppu_opcode_t);
static bool VLOGEFP(ppu_thread&, ppu_opcode_t);
static bool VMAXFP(ppu_thread&, ppu_opcode_t);
static bool VMAXSB(ppu_thread&, ppu_opcode_t);
static bool VMAXSH(ppu_thread&, ppu_opcode_t);
static bool VMAXSW(ppu_thread&, ppu_opcode_t);
static bool VMAXUB(ppu_thread&, ppu_opcode_t);
static bool VMAXUH(ppu_thread&, ppu_opcode_t);
static bool VMAXUW(ppu_thread&, ppu_opcode_t);
static bool VMINFP(ppu_thread&, ppu_opcode_t);
static bool VMINSB(ppu_thread&, ppu_opcode_t);
static bool VMINSH(ppu_thread&, ppu_opcode_t);
static bool VMINSW(ppu_thread&, ppu_opcode_t);
static bool VMINUB(ppu_thread&, ppu_opcode_t);
static bool VMINUH(ppu_thread&, ppu_opcode_t);
static bool VMINUW(ppu_thread&, ppu_opcode_t);
static bool VMLADDUHM(ppu_thread&, ppu_opcode_t);
static bool VMRGHB(ppu_thread&, ppu_opcode_t);
static bool VMRGHH(ppu_thread&, ppu_opcode_t);
static bool VMRGHW(ppu_thread&, ppu_opcode_t);
static bool VMRGLB(ppu_thread&, ppu_opcode_t);
static bool VMRGLH(ppu_thread&, ppu_opcode_t);
static bool VMRGLW(ppu_thread&, ppu_opcode_t);
static bool VMSUMMBM(ppu_thread&, ppu_opcode_t);
static bool VMSUMSHM(ppu_thread&, ppu_opcode_t);
static bool VMSUMUBM(ppu_thread&, ppu_opcode_t);
static bool VMSUMUHM(ppu_thread&, ppu_opcode_t);
static bool VMULESB(ppu_thread&, ppu_opcode_t);
static bool VMULESH(ppu_thread&, ppu_opcode_t);
static bool VMULEUB(ppu_thread&, ppu_opcode_t);
static bool VMULEUH(ppu_thread&, ppu_opcode_t);
static bool VMULOSB(ppu_thread&, ppu_opcode_t);
static bool VMULOSH(ppu_thread&, ppu_opcode_t);
static bool VMULOUB(ppu_thread&, ppu_opcode_t);
static bool VMULOUH(ppu_thread&, ppu_opcode_t);
static bool VNOR(ppu_thread&, ppu_opcode_t);
static bool VOR(ppu_thread&, ppu_opcode_t);
static bool VPERM(ppu_thread&, ppu_opcode_t);
static bool VPKPX(ppu_thread&, ppu_opcode_t);
static bool VPKUHUM(ppu_thread&, ppu_opcode_t);
static bool VPKUWUM(ppu_thread&, ppu_opcode_t);
static bool VREFP(ppu_thread&, ppu_opcode_t);
static bool VRFIM(ppu_thread&, ppu_opcode_t);
static bool VRFIN(ppu_thread&, ppu_opcode_t);
static bool VRFIP(ppu_thread&, ppu_opcode_t);
static bool VRFIZ(ppu_thread&, ppu_opcode_t);
static bool VRLB(ppu_thread&, ppu_opcode_t);
static bool VRLH(ppu_thread&, ppu_opcode_t);
static bool VRLW(ppu_thread&, ppu_opcode_t);
static bool VRSQRTEFP(ppu_thread&, ppu_opcode_t);
static bool VSEL(ppu_thread&, ppu_opcode_t);
static bool VSL(ppu_thread&, ppu_opcode_t);
static bool VSLB(ppu_thread&, ppu_opcode_t);
static bool VSLDOI(ppu_thread&, ppu_opcode_t);
static bool VSLH(ppu_thread&, ppu_opcode_t);
static bool VSLO(ppu_thread&, ppu_opcode_t);
static bool VSLW(ppu_thread&, ppu_opcode_t);
static bool VSPLTB(ppu_thread&, ppu_opcode_t);
static bool VSPLTH(ppu_thread&, ppu_opcode_t);
static bool VSPLTISB(ppu_thread&, ppu_opcode_t);
static bool VSPLTISH(ppu_thread&, ppu_opcode_t);
static bool VSPLTISW(ppu_thread&, ppu_opcode_t);
static bool VSPLTW(ppu_thread&, ppu_opcode_t);
static bool VSR(ppu_thread&, ppu_opcode_t);
static bool VSRAB(ppu_thread&, ppu_opcode_t);
static bool VSRAH(ppu_thread&, ppu_opcode_t);
static bool VSRAW(ppu_thread&, ppu_opcode_t);
static bool VSRB(ppu_thread&, ppu_opcode_t);
static bool VSRH(ppu_thread&, ppu_opcode_t);
static bool VSRO(ppu_thread&, ppu_opcode_t);
static bool VSRW(ppu_thread&, ppu_opcode_t);
static bool VSUBCUW(ppu_thread&, ppu_opcode_t);
static bool VSUBFP(ppu_thread&, ppu_opcode_t);
static bool VSUBUBM(ppu_thread&, ppu_opcode_t);
static bool VSUBUHM(ppu_thread&, ppu_opcode_t);
static bool VSUBUWM(ppu_thread&, ppu_opcode_t);
static bool VUPKHPX(ppu_thread&, ppu_opcode_t);
static bool VUPKHSB(ppu_thread&, ppu_opcode_t);
static bool VUPKHSH(ppu_thread&, ppu_opcode_t);
static bool VUPKLPX(ppu_thread&, ppu_opcode_t);
static bool VUPKLSB(ppu_thread&, ppu_opcode_t);
static bool VUPKLSH(ppu_thread&, ppu_opcode_t);
static bool VXOR(ppu_thread&, ppu_opcode_t);
static bool TDI(ppu_thread&, ppu_opcode_t);
static bool TWI(ppu_thread&, ppu_opcode_t);
static bool MULLI(ppu_thread&, ppu_opcode_t);
static bool SUBFIC(ppu_thread&, ppu_opcode_t);
static bool CMPLI(ppu_thread&, ppu_opcode_t);
static bool CMPI(ppu_thread&, ppu_opcode_t);
static bool ADDIC(ppu_thread&, ppu_opcode_t);
static bool ADDI(ppu_thread&, ppu_opcode_t);
static bool ADDIS(ppu_thread&, ppu_opcode_t);
static bool BC(ppu_thread&, ppu_opcode_t);
static bool SC(ppu_thread&, ppu_opcode_t);
static bool B(ppu_thread&, ppu_opcode_t);
static bool MCRF(ppu_thread&, ppu_opcode_t);
static bool BCLR(ppu_thread&, ppu_opcode_t);
static bool CRNOR(ppu_thread&, ppu_opcode_t);
static bool CRANDC(ppu_thread&, ppu_opcode_t);
static bool ISYNC(ppu_thread&, ppu_opcode_t);
static bool CRXOR(ppu_thread&, ppu_opcode_t);
static bool CRNAND(ppu_thread&, ppu_opcode_t);
static bool CRAND(ppu_thread&, ppu_opcode_t);
static bool CREQV(ppu_thread&, ppu_opcode_t);
static bool CRORC(ppu_thread&, ppu_opcode_t);
static bool CROR(ppu_thread&, ppu_opcode_t);
static bool BCCTR(ppu_thread&, ppu_opcode_t);
static bool RLWIMI(ppu_thread&, ppu_opcode_t);
static bool RLWINM(ppu_thread&, ppu_opcode_t);
static bool RLWNM(ppu_thread&, ppu_opcode_t);
static bool ORI(ppu_thread&, ppu_opcode_t);
static bool ORIS(ppu_thread&, ppu_opcode_t);
static bool XORI(ppu_thread&, ppu_opcode_t);
static bool XORIS(ppu_thread&, ppu_opcode_t);
static bool ANDI(ppu_thread&, ppu_opcode_t);
static bool ANDIS(ppu_thread&, ppu_opcode_t);
static bool RLDICL(ppu_thread&, ppu_opcode_t);
static bool RLDICR(ppu_thread&, ppu_opcode_t);
static bool RLDIC(ppu_thread&, ppu_opcode_t);
static bool RLDIMI(ppu_thread&, ppu_opcode_t);
static bool RLDCL(ppu_thread&, ppu_opcode_t);
static bool RLDCR(ppu_thread&, ppu_opcode_t);
static bool CMP(ppu_thread&, ppu_opcode_t);
static bool TW(ppu_thread&, ppu_opcode_t);
static bool LVSL(ppu_thread&, ppu_opcode_t);
static bool LVEBX(ppu_thread&, ppu_opcode_t);
static bool SUBFC(ppu_thread&, ppu_opcode_t);
static bool MULHDU(ppu_thread&, ppu_opcode_t);
static bool ADDC(ppu_thread&, ppu_opcode_t);
static bool MULHWU(ppu_thread&, ppu_opcode_t);
static bool MFOCRF(ppu_thread&, ppu_opcode_t);
static bool LWARX(ppu_thread&, ppu_opcode_t);
static bool LDX(ppu_thread&, ppu_opcode_t);
static bool LWZX(ppu_thread&, ppu_opcode_t);
static bool SLW(ppu_thread&, ppu_opcode_t);
static bool CNTLZW(ppu_thread&, ppu_opcode_t);
static bool SLD(ppu_thread&, ppu_opcode_t);
static bool AND(ppu_thread&, ppu_opcode_t);
static bool CMPL(ppu_thread&, ppu_opcode_t);
static bool LVSR(ppu_thread&, ppu_opcode_t);
static bool LVEHX(ppu_thread&, ppu_opcode_t);
static bool SUBF(ppu_thread&, ppu_opcode_t);
static bool LDUX(ppu_thread&, ppu_opcode_t);
static bool DCBST(ppu_thread&, ppu_opcode_t);
static bool LWZUX(ppu_thread&, ppu_opcode_t);
static bool CNTLZD(ppu_thread&, ppu_opcode_t);
static bool ANDC(ppu_thread&, ppu_opcode_t);
static bool TD(ppu_thread&, ppu_opcode_t);
static bool LVEWX(ppu_thread&, ppu_opcode_t);
static bool MULHD(ppu_thread&, ppu_opcode_t);
static bool MULHW(ppu_thread&, ppu_opcode_t);
static bool LDARX(ppu_thread&, ppu_opcode_t);
static bool DCBF(ppu_thread&, ppu_opcode_t);
static bool LBZX(ppu_thread&, ppu_opcode_t);
static bool LVX(ppu_thread&, ppu_opcode_t);
static bool NEG(ppu_thread&, ppu_opcode_t);
static bool LBZUX(ppu_thread&, ppu_opcode_t);
static bool NOR(ppu_thread&, ppu_opcode_t);
static bool STVEBX(ppu_thread&, ppu_opcode_t);
static bool SUBFE(ppu_thread&, ppu_opcode_t);
static bool ADDE(ppu_thread&, ppu_opcode_t);
static bool MTOCRF(ppu_thread&, ppu_opcode_t);
static bool STDX(ppu_thread&, ppu_opcode_t);
static bool STWCX(ppu_thread&, ppu_opcode_t);
static bool STWX(ppu_thread&, ppu_opcode_t);
static bool STVEHX(ppu_thread&, ppu_opcode_t);
static bool STDUX(ppu_thread&, ppu_opcode_t);
static bool STWUX(ppu_thread&, ppu_opcode_t);
static bool STVEWX(ppu_thread&, ppu_opcode_t);
static bool SUBFZE(ppu_thread&, ppu_opcode_t);
static bool ADDZE(ppu_thread&, ppu_opcode_t);
static bool STDCX(ppu_thread&, ppu_opcode_t);
static bool STBX(ppu_thread&, ppu_opcode_t);
static bool STVX(ppu_thread&, ppu_opcode_t);
static bool MULLD(ppu_thread&, ppu_opcode_t);
static bool SUBFME(ppu_thread&, ppu_opcode_t);
static bool ADDME(ppu_thread&, ppu_opcode_t);
static bool MULLW(ppu_thread&, ppu_opcode_t);
static bool DCBTST(ppu_thread&, ppu_opcode_t);
static bool STBUX(ppu_thread&, ppu_opcode_t);
static bool ADD(ppu_thread&, ppu_opcode_t);
static bool DCBT(ppu_thread&, ppu_opcode_t);
static bool LHZX(ppu_thread&, ppu_opcode_t);
static bool EQV(ppu_thread&, ppu_opcode_t);
static bool ECIWX(ppu_thread&, ppu_opcode_t);
static bool LHZUX(ppu_thread&, ppu_opcode_t);
static bool XOR(ppu_thread&, ppu_opcode_t);
static bool MFSPR(ppu_thread&, ppu_opcode_t);
static bool LWAX(ppu_thread&, ppu_opcode_t);
static bool DST(ppu_thread&, ppu_opcode_t);
static bool LHAX(ppu_thread&, ppu_opcode_t);
static bool LVXL(ppu_thread&, ppu_opcode_t);
static bool MFTB(ppu_thread&, ppu_opcode_t);
static bool LWAUX(ppu_thread&, ppu_opcode_t);
static bool DSTST(ppu_thread&, ppu_opcode_t);
static bool LHAUX(ppu_thread&, ppu_opcode_t);
static bool STHX(ppu_thread&, ppu_opcode_t);
static bool ORC(ppu_thread&, ppu_opcode_t);
static bool ECOWX(ppu_thread&, ppu_opcode_t);
static bool STHUX(ppu_thread&, ppu_opcode_t);
static bool OR(ppu_thread&, ppu_opcode_t);
static bool DIVDU(ppu_thread&, ppu_opcode_t);
static bool DIVWU(ppu_thread&, ppu_opcode_t);
static bool MTSPR(ppu_thread&, ppu_opcode_t);
static bool DCBI(ppu_thread&, ppu_opcode_t);
static bool NAND(ppu_thread&, ppu_opcode_t);
static bool STVXL(ppu_thread&, ppu_opcode_t);
static bool DIVD(ppu_thread&, ppu_opcode_t);
static bool DIVW(ppu_thread&, ppu_opcode_t);
static bool LDBRX(ppu_thread&, ppu_opcode_t);
static bool LSWX(ppu_thread&, ppu_opcode_t);
static bool LWBRX(ppu_thread&, ppu_opcode_t);
static bool LFSX(ppu_thread&, ppu_opcode_t);
static bool SRW(ppu_thread&, ppu_opcode_t);
static bool SRD(ppu_thread&, ppu_opcode_t);
static bool LSWI(ppu_thread&, ppu_opcode_t);
static bool LFSUX(ppu_thread&, ppu_opcode_t);
static bool SYNC(ppu_thread&, ppu_opcode_t);
static bool LFDX(ppu_thread&, ppu_opcode_t);
static bool LFDUX(ppu_thread&, ppu_opcode_t);
static bool STDBRX(ppu_thread&, ppu_opcode_t);
static bool STSWX(ppu_thread&, ppu_opcode_t);
static bool STWBRX(ppu_thread&, ppu_opcode_t);
static bool STFSX(ppu_thread&, ppu_opcode_t);
static bool STFSUX(ppu_thread&, ppu_opcode_t);
static bool STSWI(ppu_thread&, ppu_opcode_t);
static bool STFDX(ppu_thread&, ppu_opcode_t);
static bool STFDUX(ppu_thread&, ppu_opcode_t);
static bool LHBRX(ppu_thread&, ppu_opcode_t);
static bool SRAW(ppu_thread&, ppu_opcode_t);
static bool SRAD(ppu_thread&, ppu_opcode_t);
static bool DSS(ppu_thread&, ppu_opcode_t);
static bool SRAWI(ppu_thread&, ppu_opcode_t);
static bool SRADI(ppu_thread&, ppu_opcode_t);
static bool EIEIO(ppu_thread&, ppu_opcode_t);
static bool STHBRX(ppu_thread&, ppu_opcode_t);
static bool EXTSH(ppu_thread&, ppu_opcode_t);
static bool EXTSB(ppu_thread&, ppu_opcode_t);
static bool STFIWX(ppu_thread&, ppu_opcode_t);
static bool EXTSW(ppu_thread&, ppu_opcode_t);
static bool ICBI(ppu_thread&, ppu_opcode_t);
static bool DCBZ(ppu_thread&, ppu_opcode_t);
static bool LWZ(ppu_thread&, ppu_opcode_t);
static bool LWZU(ppu_thread&, ppu_opcode_t);
static bool LBZ(ppu_thread&, ppu_opcode_t);
static bool LBZU(ppu_thread&, ppu_opcode_t);
static bool STW(ppu_thread&, ppu_opcode_t);
static bool STWU(ppu_thread&, ppu_opcode_t);
static bool STB(ppu_thread&, ppu_opcode_t);
static bool STBU(ppu_thread&, ppu_opcode_t);
static bool LHZ(ppu_thread&, ppu_opcode_t);
static bool LHZU(ppu_thread&, ppu_opcode_t);
static bool LHA(ppu_thread&, ppu_opcode_t);
static bool LHAU(ppu_thread&, ppu_opcode_t);
static bool STH(ppu_thread&, ppu_opcode_t);
static bool STHU(ppu_thread&, ppu_opcode_t);
static bool LMW(ppu_thread&, ppu_opcode_t);
static bool STMW(ppu_thread&, ppu_opcode_t);
static bool LFS(ppu_thread&, ppu_opcode_t);
static bool LFSU(ppu_thread&, ppu_opcode_t);
static bool LFD(ppu_thread&, ppu_opcode_t);
static bool LFDU(ppu_thread&, ppu_opcode_t);
static bool STFS(ppu_thread&, ppu_opcode_t);
static bool STFSU(ppu_thread&, ppu_opcode_t);
static bool STFD(ppu_thread&, ppu_opcode_t);
static bool STFDU(ppu_thread&, ppu_opcode_t);
static bool LD(ppu_thread&, ppu_opcode_t);
static bool LDU(ppu_thread&, ppu_opcode_t);
static bool LWA(ppu_thread&, ppu_opcode_t);
static bool STD(ppu_thread&, ppu_opcode_t);
static bool STDU(ppu_thread&, ppu_opcode_t);
static bool MTFSB1(ppu_thread&, ppu_opcode_t);
static bool MCRFS(ppu_thread&, ppu_opcode_t);
static bool MTFSB0(ppu_thread&, ppu_opcode_t);
static bool MTFSFI(ppu_thread&, ppu_opcode_t);
static bool MFFS(ppu_thread&, ppu_opcode_t);
static bool MTFSF(ppu_thread&, ppu_opcode_t);
static bool FCMPU(ppu_thread&, ppu_opcode_t);
static bool FCTIW(ppu_thread&, ppu_opcode_t);
static bool FCTIWZ(ppu_thread&, ppu_opcode_t);
static bool FSEL(ppu_thread&, ppu_opcode_t);
static bool FCMPO(ppu_thread&, ppu_opcode_t);
static bool FNEG(ppu_thread&, ppu_opcode_t);
static bool FMR(ppu_thread&, ppu_opcode_t);
static bool FNABS(ppu_thread&, ppu_opcode_t);
static bool FABS(ppu_thread&, ppu_opcode_t);
static bool FCTID(ppu_thread&, ppu_opcode_t);
static bool FCTIDZ(ppu_thread&, ppu_opcode_t);
static bool FCFID(ppu_thread&, ppu_opcode_t);
static bool LVLX(ppu_thread&, ppu_opcode_t);
static bool LVLXL(ppu_thread&, ppu_opcode_t);
static bool LVRX(ppu_thread&, ppu_opcode_t);
static bool LVRXL(ppu_thread&, ppu_opcode_t);
static bool STVLX(ppu_thread&, ppu_opcode_t);
static bool STVLXL(ppu_thread&, ppu_opcode_t);
static bool STVRX(ppu_thread&, ppu_opcode_t);
static bool STVRXL(ppu_thread&, ppu_opcode_t);
static bool UNK(ppu_thread&, ppu_opcode_t);
ppu_intrp_func_t fn;
};
struct ppu_interpreter_precise final : ppu_interpreter
template <typename IT>
struct ppu_interpreter_t;
namespace asmjit
{
static bool VPKSHSS(ppu_thread&, ppu_opcode_t);
static bool VPKSHUS(ppu_thread&, ppu_opcode_t);
static bool VPKSWSS(ppu_thread&, ppu_opcode_t);
static bool VPKSWUS(ppu_thread&, ppu_opcode_t);
static bool VPKUHUS(ppu_thread&, ppu_opcode_t);
static bool VPKUWUS(ppu_thread&, ppu_opcode_t);
static bool VADDSBS(ppu_thread&, ppu_opcode_t);
static bool VADDSHS(ppu_thread&, ppu_opcode_t);
static bool VADDSWS(ppu_thread&, ppu_opcode_t);
static bool VADDUBS(ppu_thread&, ppu_opcode_t);
static bool VADDUHS(ppu_thread&, ppu_opcode_t);
static bool VADDUWS(ppu_thread&, ppu_opcode_t);
static bool VSUBSBS(ppu_thread&, ppu_opcode_t);
static bool VSUBSHS(ppu_thread&, ppu_opcode_t);
static bool VSUBSWS(ppu_thread&, ppu_opcode_t);
static bool VSUBUBS(ppu_thread&, ppu_opcode_t);
static bool VSUBUHS(ppu_thread&, ppu_opcode_t);
static bool VSUBUWS(ppu_thread&, ppu_opcode_t);
static bool VMHADDSHS(ppu_thread&, ppu_opcode_t);
static bool VMHRADDSHS(ppu_thread&, ppu_opcode_t);
static bool VMSUMSHS(ppu_thread&, ppu_opcode_t);
static bool VMSUMUHS(ppu_thread&, ppu_opcode_t);
static bool VSUMSWS(ppu_thread&, ppu_opcode_t);
static bool VSUM2SWS(ppu_thread&, ppu_opcode_t);
static bool VSUM4SBS(ppu_thread&, ppu_opcode_t);
static bool VSUM4SHS(ppu_thread&, ppu_opcode_t);
static bool VSUM4UBS(ppu_thread&, ppu_opcode_t);
static bool VCTSXS(ppu_thread&, ppu_opcode_t);
static bool VCTUXS(ppu_thread&, ppu_opcode_t);
static bool VMADDFP(ppu_thread&, ppu_opcode_t);
static bool VNMSUBFP(ppu_thread&, ppu_opcode_t);
struct ppu_builder;
}
static bool FDIVS(ppu_thread&, ppu_opcode_t);
static bool FSUBS(ppu_thread&, ppu_opcode_t);
static bool FADDS(ppu_thread&, ppu_opcode_t);
static bool FSQRTS(ppu_thread&, ppu_opcode_t);
static bool FRES(ppu_thread&, ppu_opcode_t);
static bool FMULS(ppu_thread&, ppu_opcode_t);
static bool FMADDS(ppu_thread&, ppu_opcode_t);
static bool FMSUBS(ppu_thread&, ppu_opcode_t);
static bool FNMSUBS(ppu_thread&, ppu_opcode_t);
static bool FNMADDS(ppu_thread&, ppu_opcode_t);
struct ppu_interpreter_rt_base
{
protected:
std::unique_ptr<ppu_interpreter_t<ppu_intrp_func_t>> ptrs;
static bool FRSP(ppu_thread&, ppu_opcode_t);
static bool FDIV(ppu_thread&, ppu_opcode_t);
static bool FSUB(ppu_thread&, ppu_opcode_t);
static bool FADD(ppu_thread&, ppu_opcode_t);
static bool FSQRT(ppu_thread&, ppu_opcode_t);
static bool FMUL(ppu_thread&, ppu_opcode_t);
static bool FRSQRTE(ppu_thread&, ppu_opcode_t);
static bool FMSUB(ppu_thread&, ppu_opcode_t);
static bool FMADD(ppu_thread&, ppu_opcode_t);
static bool FNMSUB(ppu_thread&, ppu_opcode_t);
static bool FNMADD(ppu_thread&, ppu_opcode_t);
ppu_interpreter_rt_base() noexcept;
ppu_interpreter_rt_base(const ppu_interpreter_rt_base&) = delete;
ppu_interpreter_rt_base& operator=(const ppu_interpreter_rt_base&) = delete;
virtual ~ppu_interpreter_rt_base();
};
struct ppu_interpreter_fast final : ppu_interpreter
struct ppu_interpreter_rt : ppu_interpreter_rt_base
{
static bool VPKSHSS(ppu_thread&, ppu_opcode_t);
static bool VPKSHUS(ppu_thread&, ppu_opcode_t);
static bool VPKSWSS(ppu_thread&, ppu_opcode_t);
static bool VPKSWUS(ppu_thread&, ppu_opcode_t);
static bool VPKUHUS(ppu_thread&, ppu_opcode_t);
static bool VPKUWUS(ppu_thread&, ppu_opcode_t);
static bool VADDSBS(ppu_thread&, ppu_opcode_t);
static bool VADDSHS(ppu_thread&, ppu_opcode_t);
static bool VADDSWS(ppu_thread&, ppu_opcode_t);
static bool VADDUBS(ppu_thread&, ppu_opcode_t);
static bool VADDUHS(ppu_thread&, ppu_opcode_t);
static bool VADDUWS(ppu_thread&, ppu_opcode_t);
static bool VSUBSBS(ppu_thread&, ppu_opcode_t);
static bool VSUBSHS(ppu_thread&, ppu_opcode_t);
static bool VSUBSWS(ppu_thread&, ppu_opcode_t);
static bool VSUBUBS(ppu_thread&, ppu_opcode_t);
static bool VSUBUHS(ppu_thread&, ppu_opcode_t);
static bool VSUBUWS(ppu_thread&, ppu_opcode_t);
static bool VMHADDSHS(ppu_thread&, ppu_opcode_t);
static bool VMHRADDSHS(ppu_thread&, ppu_opcode_t);
static bool VMSUMSHS(ppu_thread&, ppu_opcode_t);
static bool VMSUMUHS(ppu_thread&, ppu_opcode_t);
static bool VSUMSWS(ppu_thread&, ppu_opcode_t);
static bool VSUM2SWS(ppu_thread&, ppu_opcode_t);
static bool VSUM4SBS(ppu_thread&, ppu_opcode_t);
static bool VSUM4SHS(ppu_thread&, ppu_opcode_t);
static bool VSUM4UBS(ppu_thread&, ppu_opcode_t);
static bool VCTSXS(ppu_thread&, ppu_opcode_t);
static bool VCTUXS(ppu_thread&, ppu_opcode_t);
static bool VMADDFP(ppu_thread&, ppu_opcode_t);
static bool VNMSUBFP(ppu_thread&, ppu_opcode_t);
ppu_interpreter_rt() noexcept;
static bool FDIVS(ppu_thread&, ppu_opcode_t);
static bool FSUBS(ppu_thread&, ppu_opcode_t);
static bool FADDS(ppu_thread&, ppu_opcode_t);
static bool FSQRTS(ppu_thread&, ppu_opcode_t);
static bool FRES(ppu_thread&, ppu_opcode_t);
static bool FMULS(ppu_thread&, ppu_opcode_t);
static bool FMADDS(ppu_thread&, ppu_opcode_t);
static bool FMSUBS(ppu_thread&, ppu_opcode_t);
static bool FNMSUBS(ppu_thread&, ppu_opcode_t);
static bool FNMADDS(ppu_thread&, ppu_opcode_t);
ppu_intrp_func_t decode(u32 op) const noexcept;
static bool FRSP(ppu_thread&, ppu_opcode_t);
static bool FDIV(ppu_thread&, ppu_opcode_t);
static bool FSUB(ppu_thread&, ppu_opcode_t);
static bool FADD(ppu_thread&, ppu_opcode_t);
static bool FSQRT(ppu_thread&, ppu_opcode_t);
static bool FMUL(ppu_thread&, ppu_opcode_t);
static bool FRSQRTE(ppu_thread&, ppu_opcode_t);
static bool FMSUB(ppu_thread&, ppu_opcode_t);
static bool FMADD(ppu_thread&, ppu_opcode_t);
static bool FNMSUB(ppu_thread&, ppu_opcode_t);
static bool FNMADD(ppu_thread&, ppu_opcode_t);
private:
ppu_decoder<ppu_interpreter_t<ppu_intrp_func_t>, ppu_intrp_func_t> table;
};

View file

@ -30,7 +30,7 @@ LOG_CHANNEL(ppu_loader);
extern std::string ppu_get_function_name(const std::string& _module, u32 fnid);
extern std::string ppu_get_variable_name(const std::string& _module, u32 vnid);
extern void ppu_register_range(u32 addr, u32 size);
extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr);
extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr);
extern void sys_initialize_tls(ppu_thread&, u64, u32, u32, u32);
@ -275,7 +275,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link)
};
// Initialize double-purpose fake OPD array for HLE functions
const auto& hle_funcs = ppu_function_manager::get(g_cfg.core.ppu_decoder == ppu_decoder_type::llvm);
const auto& hle_funcs = ppu_function_manager::get(g_cfg.core.ppu_decoder != ppu_decoder_type::_static);
u32& hle_funcs_addr = g_fxo->get<ppu_function_manager>().addr;

View file

@ -123,7 +123,7 @@ public:
static void initialize_modules();
template <auto* Func>
static auto& register_static_function(const char* _module, const char* name, ppu_function_t func, u32 fnid)
static auto& register_static_function(const char* _module, const char* name, ppu_intrp_func_t func, u32 fnid)
{
auto& info = access_static_function(_module, fnid);

View file

@ -84,19 +84,22 @@ class ppu_decoder
struct instruction_info
{
u32 value;
T pointer;
T ptr0;
T ptr_rc;
u32 magn; // Non-zero for "columns" (effectively, number of most significant bits "eaten")
// Builds a table entry from handler values.
//   v    - opcode value within the group being filled
//   p    - handler stored for table slots whose low index bit is 0
//   p_rc - handler stored for table slots whose low index bit is 1
//          (fill_table selects with `k & 1` / `i & 1`; presumably this is the
//          PowerPC Rc/record bit, as the GETRC macro name suggests - confirm)
//   m    - "magn": number of most significant bits treated as don't-care
// The two signature lines and the `pointer(p)` initializer are the variants
// without and with the separate p_rc handler.
constexpr instruction_info(u32 v, T p, u32 m = 0)
constexpr instruction_info(u32 v, T p, T p_rc, u32 m = 0)
: value(v)
, pointer(p)
, ptr0(p)
, ptr_rc(p_rc)
, magn(m)
{
}
// Same as the by-value constructor, but takes pointers to the handler values
// and dereferences them on construction (`*p`, `*p_rc`). Both pointers are
// dereferenced unconditionally, so neither may be null.
constexpr instruction_info(u32 v, const T* p, u32 m = 0)
constexpr instruction_info(u32 v, const T* p, const T* p_rc, u32 m = 0)
: value(v)
, pointer(*p)
, ptr0(*p)
, ptr_rc(*p_rc)
, magn(m)
{
}
@ -113,7 +116,8 @@ class ppu_decoder
{
for (u32 j = 0; j < 1u << sh; j++)
{
m_table.at((((((i << (count - v.magn)) | v.value) << sh) | j) << 6) | main_op) = v.pointer;
const u32 k = (((i << (count - v.magn)) | v.value) << sh) | j;
m_table.at((k << 6) | main_op) = k & 1 ? v.ptr_rc : v.ptr0;
}
}
}
@ -125,454 +129,498 @@ class ppu_decoder
{
for (u32 i = 0; i < 1u << 11; i++)
{
m_table.at(i << 6 | v.value) = v.pointer;
m_table.at(i << 6 | v.value) = i & 1 ? v.ptr_rc : v.ptr0;
}
}
}
}
public:
ppu_decoder() noexcept
// Helper: identity function returning its single argument unchanged.
// The GET_ macro calls it as `_first(args...)` so that the constructor's
// parameter pack (at most one element) can be used as a plain `const D&`.
static const D& _first(const D& arg)
{
return arg;
}
public:
template <typename... Args>
ppu_decoder(const Args&... args) noexcept
{
// If an object is passed to the constructor, assign values from that object
// GET_(name): immediately-invoked lambda that yields `<object>.name` when a
// constructor argument was supplied, or `&D::name` when the pack is empty.
#define GET_(name) [&]{ if constexpr (sizeof...(Args) > 0) return _first(args...).name; else return &D::name; }()
// GET(name): expands to two comma-separated copies of the same handler, i.e.
// the same value for both handler slots of an instruction_info entry.
#define GET(name) GET_(name), GET_(name)
// GETRC(name): expands to `name` followed by `name_` (token-pasted trailing
// underscore), i.e. distinct handlers for the two slots.
#define GETRC(name) GET_(name), GET_(name##_)
// The constructor accepts either no argument or exactly one source object.
static_assert(sizeof...(Args) <= 1);
for (auto& x : m_table)
{
x = &D::UNK;
x = GET(UNK);
}
// Main opcodes (field 0..5)
fill_table(0x00, 6, -1,
{
{ 0x02, &D::TDI },
{ 0x03, &D::TWI },
{ 0x07, &D::MULLI },
{ 0x08, &D::SUBFIC },
{ 0x0a, &D::CMPLI },
{ 0x0b, &D::CMPI },
{ 0x0c, &D::ADDIC },
{ 0x0d, &D::ADDIC },
{ 0x0e, &D::ADDI },
{ 0x0f, &D::ADDIS },
{ 0x10, &D::BC },
{ 0x11, &D::SC },
{ 0x12, &D::B },
{ 0x14, &D::RLWIMI },
{ 0x15, &D::RLWINM },
{ 0x17, &D::RLWNM },
{ 0x18, &D::ORI },
{ 0x19, &D::ORIS },
{ 0x1a, &D::XORI },
{ 0x1b, &D::XORIS },
{ 0x1c, &D::ANDI },
{ 0x1d, &D::ANDIS },
{ 0x20, &D::LWZ },
{ 0x21, &D::LWZU },
{ 0x22, &D::LBZ },
{ 0x23, &D::LBZU },
{ 0x24, &D::STW },
{ 0x25, &D::STWU },
{ 0x26, &D::STB },
{ 0x27, &D::STBU },
{ 0x28, &D::LHZ },
{ 0x29, &D::LHZU },
{ 0x2a, &D::LHA },
{ 0x2b, &D::LHAU },
{ 0x2c, &D::STH },
{ 0x2d, &D::STHU },
{ 0x2e, &D::LMW },
{ 0x2f, &D::STMW },
{ 0x30, &D::LFS },
{ 0x31, &D::LFSU },
{ 0x32, &D::LFD },
{ 0x33, &D::LFDU },
{ 0x34, &D::STFS },
{ 0x35, &D::STFSU },
{ 0x36, &D::STFD },
{ 0x37, &D::STFDU },
{ 0x02, GET(TDI) },
{ 0x03, GET(TWI) },
{ 0x07, GET(MULLI) },
{ 0x08, GET(SUBFIC) },
{ 0x0a, GET(CMPLI) },
{ 0x0b, GET(CMPI) },
{ 0x0c, GET(ADDIC) },
{ 0x0d, GET(ADDIC) },
{ 0x0e, GET(ADDI) },
{ 0x0f, GET(ADDIS) },
{ 0x10, GET(BC) },
{ 0x11, GET(SC) },
{ 0x12, GET(B) },
{ 0x14, GETRC(RLWIMI) },
{ 0x15, GETRC(RLWINM) },
{ 0x17, GETRC(RLWNM) },
{ 0x18, GET(ORI) },
{ 0x19, GET(ORIS) },
{ 0x1a, GET(XORI) },
{ 0x1b, GET(XORIS) },
{ 0x1c, GET(ANDI) },
{ 0x1d, GET(ANDIS) },
{ 0x20, GET(LWZ) },
{ 0x21, GET(LWZU) },
{ 0x22, GET(LBZ) },
{ 0x23, GET(LBZU) },
{ 0x24, GET(STW) },
{ 0x25, GET(STWU) },
{ 0x26, GET(STB) },
{ 0x27, GET(STBU) },
{ 0x28, GET(LHZ) },
{ 0x29, GET(LHZU) },
{ 0x2a, GET(LHA) },
{ 0x2b, GET(LHAU) },
{ 0x2c, GET(STH) },
{ 0x2d, GET(STHU) },
{ 0x2e, GET(LMW) },
{ 0x2f, GET(STMW) },
{ 0x30, GET(LFS) },
{ 0x31, GET(LFSU) },
{ 0x32, GET(LFD) },
{ 0x33, GET(LFDU) },
{ 0x34, GET(STFS) },
{ 0x35, GET(STFSU) },
{ 0x36, GET(STFD) },
{ 0x37, GET(STFDU) },
});
// Group 0x04 opcodes (field 21..31)
fill_table(0x04, 11, 0,
{
{ 0x0, &D::VADDUBM },
{ 0x2, &D::VMAXUB },
{ 0x4, &D::VRLB },
{ 0x6, &D::VCMPEQUB, 1 },
{ 0x8, &D::VMULOUB },
{ 0xa, &D::VADDFP },
{ 0xc, &D::VMRGHB },
{ 0xe, &D::VPKUHUM },
{ 0x0, GET(VADDUBM) },
{ 0x2, GET(VMAXUB) },
{ 0x4, GET(VRLB) },
{ 0x006, GET(VCMPEQUB) },
{ 0x406, GET(VCMPEQUB_) },
{ 0x8, GET(VMULOUB) },
{ 0xa, GET(VADDFP) },
{ 0xc, GET(VMRGHB) },
{ 0xe, GET(VPKUHUM) },
{ 0x20, &D::VMHADDSHS, 5 },
{ 0x21, &D::VMHRADDSHS, 5 },
{ 0x22, &D::VMLADDUHM, 5 },
{ 0x24, &D::VMSUMUBM, 5 },
{ 0x25, &D::VMSUMMBM, 5 },
{ 0x26, &D::VMSUMUHM, 5 },
{ 0x27, &D::VMSUMUHS, 5 },
{ 0x28, &D::VMSUMSHM, 5 },
{ 0x29, &D::VMSUMSHS, 5 },
{ 0x2a, &D::VSEL, 5 },
{ 0x2b, &D::VPERM, 5 },
{ 0x2c, &D::VSLDOI, 5 },
{ 0x2e, &D::VMADDFP, 5 },
{ 0x2f, &D::VNMSUBFP, 5 },
{ 0x20, GET(VMHADDSHS), 5 },
{ 0x21, GET(VMHRADDSHS), 5 },
{ 0x22, GET(VMLADDUHM), 5 },
{ 0x24, GET(VMSUMUBM), 5 },
{ 0x25, GET(VMSUMMBM), 5 },
{ 0x26, GET(VMSUMUHM), 5 },
{ 0x27, GET(VMSUMUHS), 5 },
{ 0x28, GET(VMSUMSHM), 5 },
{ 0x29, GET(VMSUMSHS), 5 },
{ 0x2a, GET(VSEL), 5 },
{ 0x2b, GET(VPERM), 5 },
{ 0x2c, GET(VSLDOI), 5 },
{ 0x2e, GET(VMADDFP), 5 },
{ 0x2f, GET(VNMSUBFP), 5 },
{ 0x40, &D::VADDUHM },
{ 0x42, &D::VMAXUH },
{ 0x44, &D::VRLH },
{ 0x46, &D::VCMPEQUH, 1 },
{ 0x48, &D::VMULOUH },
{ 0x4a, &D::VSUBFP },
{ 0x4c, &D::VMRGHH },
{ 0x4e, &D::VPKUWUM },
{ 0x80, &D::VADDUWM },
{ 0x82, &D::VMAXUW },
{ 0x84, &D::VRLW },
{ 0x86, &D::VCMPEQUW, 1 },
{ 0x8c, &D::VMRGHW },
{ 0x8e, &D::VPKUHUS },
{ 0xc6, &D::VCMPEQFP, 1 },
{ 0xce, &D::VPKUWUS },
{ 0x40, GET(VADDUHM) },
{ 0x42, GET(VMAXUH) },
{ 0x44, GET(VRLH) },
{ 0x046, GET(VCMPEQUH) },
{ 0x446, GET(VCMPEQUH_) },
{ 0x48, GET(VMULOUH) },
{ 0x4a, GET(VSUBFP) },
{ 0x4c, GET(VMRGHH) },
{ 0x4e, GET(VPKUWUM) },
{ 0x80, GET(VADDUWM) },
{ 0x82, GET(VMAXUW) },
{ 0x84, GET(VRLW) },
{ 0x086, GET(VCMPEQUW) },
{ 0x486, GET(VCMPEQUW_) },
{ 0x8c, GET(VMRGHW) },
{ 0x8e, GET(VPKUHUS) },
{ 0x0c6, GET(VCMPEQFP) },
{ 0x4c6, GET(VCMPEQFP_) },
{ 0xce, GET(VPKUWUS) },
{ 0x102, &D::VMAXSB },
{ 0x104, &D::VSLB },
{ 0x108, &D::VMULOSB },
{ 0x10a, &D::VREFP },
{ 0x10c, &D::VMRGLB },
{ 0x10e, &D::VPKSHUS },
{ 0x142, &D::VMAXSH },
{ 0x144, &D::VSLH },
{ 0x148, &D::VMULOSH },
{ 0x14a, &D::VRSQRTEFP },
{ 0x14c, &D::VMRGLH },
{ 0x14e, &D::VPKSWUS },
{ 0x180, &D::VADDCUW },
{ 0x182, &D::VMAXSW },
{ 0x184, &D::VSLW },
{ 0x18a, &D::VEXPTEFP },
{ 0x18c, &D::VMRGLW },
{ 0x18e, &D::VPKSHSS },
{ 0x1c4, &D::VSL },
{ 0x1c6, &D::VCMPGEFP, 1 },
{ 0x1ca, &D::VLOGEFP },
{ 0x1ce, &D::VPKSWSS },
{ 0x200, &D::VADDUBS },
{ 0x202, &D::VMINUB },
{ 0x204, &D::VSRB },
{ 0x206, &D::VCMPGTUB, 1 },
{ 0x208, &D::VMULEUB },
{ 0x20a, &D::VRFIN },
{ 0x20c, &D::VSPLTB },
{ 0x20e, &D::VUPKHSB },
{ 0x240, &D::VADDUHS },
{ 0x242, &D::VMINUH },
{ 0x244, &D::VSRH },
{ 0x246, &D::VCMPGTUH, 1 },
{ 0x248, &D::VMULEUH },
{ 0x24a, &D::VRFIZ },
{ 0x24c, &D::VSPLTH },
{ 0x24e, &D::VUPKHSH },
{ 0x280, &D::VADDUWS },
{ 0x282, &D::VMINUW },
{ 0x284, &D::VSRW },
{ 0x286, &D::VCMPGTUW, 1 },
{ 0x28a, &D::VRFIP },
{ 0x28c, &D::VSPLTW },
{ 0x28e, &D::VUPKLSB },
{ 0x2c4, &D::VSR },
{ 0x2c6, &D::VCMPGTFP, 1 },
{ 0x2ca, &D::VRFIM },
{ 0x2ce, &D::VUPKLSH },
{ 0x300, &D::VADDSBS },
{ 0x302, &D::VMINSB },
{ 0x304, &D::VSRAB },
{ 0x306, &D::VCMPGTSB, 1 },
{ 0x308, &D::VMULESB },
{ 0x30a, &D::VCFUX },
{ 0x30c, &D::VSPLTISB },
{ 0x30e, &D::VPKPX },
{ 0x340, &D::VADDSHS },
{ 0x342, &D::VMINSH },
{ 0x344, &D::VSRAH },
{ 0x346, &D::VCMPGTSH, 1 },
{ 0x348, &D::VMULESH },
{ 0x34a, &D::VCFSX },
{ 0x34c, &D::VSPLTISH },
{ 0x34e, &D::VUPKHPX },
{ 0x380, &D::VADDSWS },
{ 0x382, &D::VMINSW },
{ 0x384, &D::VSRAW },
{ 0x386, &D::VCMPGTSW, 1 },
{ 0x38a, &D::VCTUXS },
{ 0x38c, &D::VSPLTISW },
{ 0x3c6, &D::VCMPBFP, 1 },
{ 0x3ca, &D::VCTSXS },
{ 0x3ce, &D::VUPKLPX },
{ 0x400, &D::VSUBUBM },
{ 0x402, &D::VAVGUB },
{ 0x404, &D::VAND },
{ 0x40a, &D::VMAXFP },
{ 0x40c, &D::VSLO },
{ 0x440, &D::VSUBUHM },
{ 0x442, &D::VAVGUH },
{ 0x444, &D::VANDC },
{ 0x44a, &D::VMINFP },
{ 0x44c, &D::VSRO },
{ 0x480, &D::VSUBUWM },
{ 0x482, &D::VAVGUW },
{ 0x484, &D::VOR },
{ 0x4c4, &D::VXOR },
{ 0x502, &D::VAVGSB },
{ 0x504, &D::VNOR },
{ 0x542, &D::VAVGSH },
{ 0x580, &D::VSUBCUW },
{ 0x582, &D::VAVGSW },
{ 0x600, &D::VSUBUBS },
{ 0x604, &D::MFVSCR },
{ 0x608, &D::VSUM4UBS },
{ 0x640, &D::VSUBUHS },
{ 0x644, &D::MTVSCR },
{ 0x648, &D::VSUM4SHS },
{ 0x680, &D::VSUBUWS },
{ 0x688, &D::VSUM2SWS },
{ 0x700, &D::VSUBSBS },
{ 0x708, &D::VSUM4SBS },
{ 0x740, &D::VSUBSHS },
{ 0x780, &D::VSUBSWS },
{ 0x788, &D::VSUMSWS },
{ 0x102, GET(VMAXSB) },
{ 0x104, GET(VSLB) },
{ 0x108, GET(VMULOSB) },
{ 0x10a, GET(VREFP) },
{ 0x10c, GET(VMRGLB) },
{ 0x10e, GET(VPKSHUS) },
{ 0x142, GET(VMAXSH) },
{ 0x144, GET(VSLH) },
{ 0x148, GET(VMULOSH) },
{ 0x14a, GET(VRSQRTEFP) },
{ 0x14c, GET(VMRGLH) },
{ 0x14e, GET(VPKSWUS) },
{ 0x180, GET(VADDCUW) },
{ 0x182, GET(VMAXSW) },
{ 0x184, GET(VSLW) },
{ 0x18a, GET(VEXPTEFP) },
{ 0x18c, GET(VMRGLW) },
{ 0x18e, GET(VPKSHSS) },
{ 0x1c4, GET(VSL) },
{ 0x1c6, GET(VCMPGEFP) },
{ 0x5c6, GET(VCMPGEFP_) },
{ 0x1ca, GET(VLOGEFP) },
{ 0x1ce, GET(VPKSWSS) },
{ 0x200, GET(VADDUBS) },
{ 0x202, GET(VMINUB) },
{ 0x204, GET(VSRB) },
{ 0x206, GET(VCMPGTUB) },
{ 0x606, GET(VCMPGTUB_) },
{ 0x208, GET(VMULEUB) },
{ 0x20a, GET(VRFIN) },
{ 0x20c, GET(VSPLTB) },
{ 0x20e, GET(VUPKHSB) },
{ 0x240, GET(VADDUHS) },
{ 0x242, GET(VMINUH) },
{ 0x244, GET(VSRH) },
{ 0x246, GET(VCMPGTUH) },
{ 0x646, GET(VCMPGTUH_) },
{ 0x248, GET(VMULEUH) },
{ 0x24a, GET(VRFIZ) },
{ 0x24c, GET(VSPLTH) },
{ 0x24e, GET(VUPKHSH) },
{ 0x280, GET(VADDUWS) },
{ 0x282, GET(VMINUW) },
{ 0x284, GET(VSRW) },
{ 0x286, GET(VCMPGTUW) },
{ 0x686, GET(VCMPGTUW_) },
{ 0x28a, GET(VRFIP) },
{ 0x28c, GET(VSPLTW) },
{ 0x28e, GET(VUPKLSB) },
{ 0x2c4, GET(VSR) },
{ 0x2c6, GET(VCMPGTFP) },
{ 0x6c6, GET(VCMPGTFP_) },
{ 0x2ca, GET(VRFIM) },
{ 0x2ce, GET(VUPKLSH) },
{ 0x300, GET(VADDSBS) },
{ 0x302, GET(VMINSB) },
{ 0x304, GET(VSRAB) },
{ 0x306, GET(VCMPGTSB) },
{ 0x706, GET(VCMPGTSB_) },
{ 0x308, GET(VMULESB) },
{ 0x30a, GET(VCFUX) },
{ 0x30c, GET(VSPLTISB) },
{ 0x30e, GET(VPKPX) },
{ 0x340, GET(VADDSHS) },
{ 0x342, GET(VMINSH) },
{ 0x344, GET(VSRAH) },
{ 0x346, GET(VCMPGTSH) },
{ 0x746, GET(VCMPGTSH_) },
{ 0x348, GET(VMULESH) },
{ 0x34a, GET(VCFSX) },
{ 0x34c, GET(VSPLTISH) },
{ 0x34e, GET(VUPKHPX) },
{ 0x380, GET(VADDSWS) },
{ 0x382, GET(VMINSW) },
{ 0x384, GET(VSRAW) },
{ 0x386, GET(VCMPGTSW) },
{ 0x786, GET(VCMPGTSW_) },
{ 0x38a, GET(VCTUXS) },
{ 0x38c, GET(VSPLTISW) },
{ 0x3c6, GET(VCMPBFP) },
{ 0x7c6, GET(VCMPBFP_) },
{ 0x3ca, GET(VCTSXS) },
{ 0x3ce, GET(VUPKLPX) },
{ 0x400, GET(VSUBUBM) },
{ 0x402, GET(VAVGUB) },
{ 0x404, GET(VAND) },
{ 0x40a, GET(VMAXFP) },
{ 0x40c, GET(VSLO) },
{ 0x440, GET(VSUBUHM) },
{ 0x442, GET(VAVGUH) },
{ 0x444, GET(VANDC) },
{ 0x44a, GET(VMINFP) },
{ 0x44c, GET(VSRO) },
{ 0x480, GET(VSUBUWM) },
{ 0x482, GET(VAVGUW) },
{ 0x484, GET(VOR) },
{ 0x4c4, GET(VXOR) },
{ 0x502, GET(VAVGSB) },
{ 0x504, GET(VNOR) },
{ 0x542, GET(VAVGSH) },
{ 0x580, GET(VSUBCUW) },
{ 0x582, GET(VAVGSW) },
{ 0x600, GET(VSUBUBS) },
{ 0x604, GET(MFVSCR) },
{ 0x608, GET(VSUM4UBS) },
{ 0x640, GET(VSUBUHS) },
{ 0x644, GET(MTVSCR) },
{ 0x648, GET(VSUM4SHS) },
{ 0x680, GET(VSUBUWS) },
{ 0x688, GET(VSUM2SWS) },
{ 0x700, GET(VSUBSBS) },
{ 0x708, GET(VSUM4SBS) },
{ 0x740, GET(VSUBSHS) },
{ 0x780, GET(VSUBSWS) },
{ 0x788, GET(VSUMSWS) },
});
// Group 0x13 opcodes (field 21..30)
fill_table(0x13, 10, 1,
{
{ 0x000, &D::MCRF },
{ 0x010, &D::BCLR },
{ 0x021, &D::CRNOR },
{ 0x081, &D::CRANDC },
{ 0x096, &D::ISYNC },
{ 0x0c1, &D::CRXOR },
{ 0x0e1, &D::CRNAND },
{ 0x101, &D::CRAND },
{ 0x121, &D::CREQV },
{ 0x1a1, &D::CRORC },
{ 0x1c1, &D::CROR },
{ 0x210, &D::BCCTR },
{ 0x000, GET(MCRF) },
{ 0x010, GET(BCLR) },
{ 0x021, GET(CRNOR) },
{ 0x081, GET(CRANDC) },
{ 0x096, GET(ISYNC) },
{ 0x0c1, GET(CRXOR) },
{ 0x0e1, GET(CRNAND) },
{ 0x101, GET(CRAND) },
{ 0x121, GET(CREQV) },
{ 0x1a1, GET(CRORC) },
{ 0x1c1, GET(CROR) },
{ 0x210, GET(BCCTR) },
});
// Group 0x1e opcodes (field 27..30)
fill_table(0x1e, 4, 1,
{
{ 0x0, &D::RLDICL },
{ 0x1, &D::RLDICL },
{ 0x2, &D::RLDICR },
{ 0x3, &D::RLDICR },
{ 0x4, &D::RLDIC },
{ 0x5, &D::RLDIC },
{ 0x6, &D::RLDIMI },
{ 0x7, &D::RLDIMI },
{ 0x8, &D::RLDCL },
{ 0x9, &D::RLDCR },
{ 0x0, GETRC(RLDICL) },
{ 0x1, GETRC(RLDICL) },
{ 0x2, GETRC(RLDICR) },
{ 0x3, GETRC(RLDICR) },
{ 0x4, GETRC(RLDIC) },
{ 0x5, GETRC(RLDIC) },
{ 0x6, GETRC(RLDIMI) },
{ 0x7, GETRC(RLDIMI) },
{ 0x8, GETRC(RLDCL) },
{ 0x9, GETRC(RLDCR) },
});
// Group 0x1f opcodes (field 21..30)
fill_table(0x1f, 10, 1,
{
{ 0x000, &D::CMP },
{ 0x004, &D::TW },
{ 0x006, &D::LVSL },
{ 0x007, &D::LVEBX },
{ 0x008, &D::SUBFC, 1 },
{ 0x009, &D::MULHDU },
{ 0x00a, &D::ADDC, 1 },
{ 0x00b, &D::MULHWU },
{ 0x013, &D::MFOCRF },
{ 0x014, &D::LWARX },
{ 0x015, &D::LDX },
{ 0x017, &D::LWZX },
{ 0x018, &D::SLW },
{ 0x01a, &D::CNTLZW },
{ 0x01b, &D::SLD },
{ 0x01c, &D::AND },
{ 0x020, &D::CMPL },
{ 0x026, &D::LVSR },
{ 0x027, &D::LVEHX },
{ 0x028, &D::SUBF, 1 },
{ 0x035, &D::LDUX },
{ 0x036, &D::DCBST },
{ 0x037, &D::LWZUX },
{ 0x03a, &D::CNTLZD },
{ 0x03c, &D::ANDC },
{ 0x044, &D::TD },
{ 0x047, &D::LVEWX },
{ 0x049, &D::MULHD },
{ 0x04b, &D::MULHW },
{ 0x054, &D::LDARX },
{ 0x056, &D::DCBF },
{ 0x057, &D::LBZX },
{ 0x067, &D::LVX },
{ 0x068, &D::NEG, 1 },
{ 0x077, &D::LBZUX },
{ 0x07c, &D::NOR },
{ 0x087, &D::STVEBX },
{ 0x088, &D::SUBFE, 1 },
{ 0x08a, &D::ADDE, 1 },
{ 0x090, &D::MTOCRF },
{ 0x095, &D::STDX },
{ 0x096, &D::STWCX },
{ 0x097, &D::STWX },
{ 0x0a7, &D::STVEHX },
{ 0x0b5, &D::STDUX },
{ 0x0b7, &D::STWUX },
{ 0x0c7, &D::STVEWX },
{ 0x0c8, &D::SUBFZE, 1 },
{ 0x0ca, &D::ADDZE, 1 },
{ 0x0d6, &D::STDCX },
{ 0x0d7, &D::STBX },
{ 0x0e7, &D::STVX },
{ 0x0e8, &D::SUBFME, 1 },
{ 0x0e9, &D::MULLD, 1 },
{ 0x0ea, &D::ADDME, 1 },
{ 0x0eb, &D::MULLW, 1 },
{ 0x0f6, &D::DCBTST },
{ 0x0f7, &D::STBUX },
{ 0x10a, &D::ADD, 1 },
{ 0x116, &D::DCBT },
{ 0x117, &D::LHZX },
{ 0x11c, &D::EQV },
{ 0x136, &D::ECIWX },
{ 0x137, &D::LHZUX },
{ 0x13c, &D::XOR },
{ 0x153, &D::MFSPR },
{ 0x155, &D::LWAX },
{ 0x156, &D::DST },
{ 0x157, &D::LHAX },
{ 0x167, &D::LVXL },
{ 0x173, &D::MFTB },
{ 0x175, &D::LWAUX },
{ 0x176, &D::DSTST },
{ 0x177, &D::LHAUX },
{ 0x197, &D::STHX },
{ 0x19c, &D::ORC },
{ 0x1b6, &D::ECOWX },
{ 0x1b7, &D::STHUX },
{ 0x1bc, &D::OR },
{ 0x1c9, &D::DIVDU, 1 },
{ 0x1cb, &D::DIVWU, 1 },
{ 0x1d3, &D::MTSPR },
{ 0x1d6, &D::DCBI },
{ 0x1dc, &D::NAND },
{ 0x1e7, &D::STVXL },
{ 0x1e9, &D::DIVD, 1 },
{ 0x1eb, &D::DIVW, 1 },
{ 0x207, &D::LVLX },
{ 0x214, &D::LDBRX },
{ 0x215, &D::LSWX },
{ 0x216, &D::LWBRX },
{ 0x217, &D::LFSX },
{ 0x218, &D::SRW },
{ 0x21b, &D::SRD },
{ 0x227, &D::LVRX },
{ 0x237, &D::LFSUX },
{ 0x255, &D::LSWI },
{ 0x256, &D::SYNC },
{ 0x257, &D::LFDX },
{ 0x277, &D::LFDUX },
{ 0x287, &D::STVLX },
{ 0x294, &D::STDBRX },
{ 0x295, &D::STSWX },
{ 0x296, &D::STWBRX },
{ 0x297, &D::STFSX },
{ 0x2a7, &D::STVRX },
{ 0x2b7, &D::STFSUX },
{ 0x2d5, &D::STSWI },
{ 0x2d7, &D::STFDX },
{ 0x2f7, &D::STFDUX },
{ 0x307, &D::LVLXL },
{ 0x316, &D::LHBRX },
{ 0x318, &D::SRAW },
{ 0x31a, &D::SRAD },
{ 0x327, &D::LVRXL },
{ 0x336, &D::DSS },
{ 0x338, &D::SRAWI },
{ 0x33a, &D::SRADI },
{ 0x33b, &D::SRADI },
{ 0x356, &D::EIEIO },
{ 0x387, &D::STVLXL },
{ 0x396, &D::STHBRX },
{ 0x39a, &D::EXTSH },
{ 0x3a7, &D::STVRXL },
{ 0x3ba, &D::EXTSB },
{ 0x3d7, &D::STFIWX },
{ 0x3da, &D::EXTSW },
{ 0x3d6, &D::ICBI },
{ 0x3f6, &D::DCBZ },
{ 0x000, GET(CMP) },
{ 0x004, GET(TW) },
{ 0x006, GET(LVSL) },
{ 0x007, GET(LVEBX) },
{ 0x008, GETRC(SUBFC) },
{ 0x208, GETRC(SUBFCO) },
{ 0x009, GETRC(MULHDU) },
{ 0x00a, GETRC(ADDC) },
{ 0x20a, GETRC(ADDCO) },
{ 0x00b, GETRC(MULHWU) },
{ 0x013, GET(MFOCRF) },
{ 0x014, GET(LWARX) },
{ 0x015, GET(LDX) },
{ 0x017, GET(LWZX) },
{ 0x018, GETRC(SLW) },
{ 0x01a, GETRC(CNTLZW) },
{ 0x01b, GETRC(SLD) },
{ 0x01c, GETRC(AND) },
{ 0x020, GET(CMPL) },
{ 0x026, GET(LVSR) },
{ 0x027, GET(LVEHX) },
{ 0x028, GETRC(SUBF) },
{ 0x228, GETRC(SUBFO) },
{ 0x035, GET(LDUX) },
{ 0x036, GET(DCBST) },
{ 0x037, GET(LWZUX) },
{ 0x03a, GETRC(CNTLZD) },
{ 0x03c, GETRC(ANDC) },
{ 0x044, GET(TD) },
{ 0x047, GET(LVEWX) },
{ 0x049, GETRC(MULHD) },
{ 0x04b, GETRC(MULHW) },
{ 0x054, GET(LDARX) },
{ 0x056, GET(DCBF) },
{ 0x057, GET(LBZX) },
{ 0x067, GET(LVX) },
{ 0x068, GETRC(NEG) },
{ 0x268, GETRC(NEGO) },
{ 0x077, GET(LBZUX) },
{ 0x07c, GETRC(NOR) },
{ 0x087, GET(STVEBX) },
{ 0x088, GETRC(SUBFE) },
{ 0x288, GETRC(SUBFEO) },
{ 0x08a, GETRC(ADDE) },
{ 0x28a, GETRC(ADDEO) },
{ 0x090, GET(MTOCRF) },
{ 0x095, GET(STDX) },
{ 0x096, GET(STWCX) },
{ 0x097, GET(STWX) },
{ 0x0a7, GET(STVEHX) },
{ 0x0b5, GET(STDUX) },
{ 0x0b7, GET(STWUX) },
{ 0x0c7, GET(STVEWX) },
{ 0x0c8, GETRC(SUBFZE) },
{ 0x2c8, GETRC(SUBFZEO) },
{ 0x0ca, GETRC(ADDZE) },
{ 0x2ca, GETRC(ADDZEO) },
{ 0x0d6, GET(STDCX) },
{ 0x0d7, GET(STBX) },
{ 0x0e7, GET(STVX) },
{ 0x0e8, GETRC(SUBFME) },
{ 0x2e8, GETRC(SUBFMEO) },
{ 0x0e9, GETRC(MULLD) },
{ 0x2e9, GETRC(MULLDO) },
{ 0x0ea, GETRC(ADDME) },
{ 0x2ea, GETRC(ADDMEO) },
{ 0x0eb, GETRC(MULLW) },
{ 0x2eb, GETRC(MULLWO) },
{ 0x0f6, GET(DCBTST) },
{ 0x0f7, GET(STBUX) },
{ 0x10a, GETRC(ADD) },
{ 0x30a, GETRC(ADDO) },
{ 0x116, GET(DCBT) },
{ 0x117, GET(LHZX) },
{ 0x11c, GETRC(EQV) },
{ 0x136, GET(ECIWX) },
{ 0x137, GET(LHZUX) },
{ 0x13c, GETRC(XOR) },
{ 0x153, GET(MFSPR) },
{ 0x155, GET(LWAX) },
{ 0x156, GET(DST) },
{ 0x157, GET(LHAX) },
{ 0x167, GET(LVXL) },
{ 0x173, GET(MFTB) },
{ 0x175, GET(LWAUX) },
{ 0x176, GET(DSTST) },
{ 0x177, GET(LHAUX) },
{ 0x197, GET(STHX) },
{ 0x19c, GETRC(ORC) },
{ 0x1b6, GET(ECOWX) },
{ 0x1b7, GET(STHUX) },
{ 0x1bc, GETRC(OR) },
{ 0x1c9, GETRC(DIVDU) },
{ 0x3c9, GETRC(DIVDUO) },
{ 0x1cb, GETRC(DIVWU) },
{ 0x3cb, GETRC(DIVWUO) },
{ 0x1d3, GET(MTSPR) },
{ 0x1d6, GET(DCBI) },
{ 0x1dc, GETRC(NAND) },
{ 0x1e7, GET(STVXL) },
{ 0x1e9, GETRC(DIVD) },
{ 0x3e9, GETRC(DIVDO) },
{ 0x1eb, GETRC(DIVW) },
{ 0x3eb, GETRC(DIVWO) },
{ 0x207, GET(LVLX) },
{ 0x214, GET(LDBRX) },
{ 0x215, GET(LSWX) },
{ 0x216, GET(LWBRX) },
{ 0x217, GET(LFSX) },
{ 0x218, GETRC(SRW) },
{ 0x21b, GETRC(SRD) },
{ 0x227, GET(LVRX) },
{ 0x237, GET(LFSUX) },
{ 0x255, GET(LSWI) },
{ 0x256, GET(SYNC) },
{ 0x257, GET(LFDX) },
{ 0x277, GET(LFDUX) },
{ 0x287, GET(STVLX) },
{ 0x294, GET(STDBRX) },
{ 0x295, GET(STSWX) },
{ 0x296, GET(STWBRX) },
{ 0x297, GET(STFSX) },
{ 0x2a7, GET(STVRX) },
{ 0x2b7, GET(STFSUX) },
{ 0x2d5, GET(STSWI) },
{ 0x2d7, GET(STFDX) },
{ 0x2f7, GET(STFDUX) },
{ 0x307, GET(LVLXL) },
{ 0x316, GET(LHBRX) },
{ 0x318, GETRC(SRAW) },
{ 0x31a, GETRC(SRAD) },
{ 0x327, GET(LVRXL) },
{ 0x336, GET(DSS) },
{ 0x338, GETRC(SRAWI) },
{ 0x33a, GETRC(SRADI) },
{ 0x33b, GETRC(SRADI) },
{ 0x356, GET(EIEIO) },
{ 0x387, GET(STVLXL) },
{ 0x396, GET(STHBRX) },
{ 0x39a, GETRC(EXTSH) },
{ 0x3a7, GET(STVRXL) },
{ 0x3ba, GETRC(EXTSB) },
{ 0x3d7, GET(STFIWX) },
{ 0x3da, GETRC(EXTSW) },
{ 0x3d6, GET(ICBI) },
{ 0x3f6, GET(DCBZ) },
});
// Group 0x3a opcodes (field 30..31)
fill_table(0x3a, 2, 0,
{
{ 0x0, &D::LD },
{ 0x1, &D::LDU },
{ 0x2, &D::LWA },
{ 0x0, GET(LD) },
{ 0x1, GET(LDU) },
{ 0x2, GET(LWA) },
});
// Group 0x3b opcodes (field 21..30)
fill_table(0x3b, 10, 1,
{
{ 0x12, &D::FDIVS, 5 },
{ 0x14, &D::FSUBS, 5 },
{ 0x15, &D::FADDS, 5 },
{ 0x16, &D::FSQRTS, 5 },
{ 0x18, &D::FRES, 5 },
{ 0x19, &D::FMULS, 5 },
{ 0x1c, &D::FMSUBS, 5 },
{ 0x1d, &D::FMADDS, 5 },
{ 0x1e, &D::FNMSUBS, 5 },
{ 0x1f, &D::FNMADDS, 5 },
{ 0x12, GETRC(FDIVS), 5 },
{ 0x14, GETRC(FSUBS), 5 },
{ 0x15, GETRC(FADDS), 5 },
{ 0x16, GETRC(FSQRTS), 5 },
{ 0x18, GETRC(FRES), 5 },
{ 0x19, GETRC(FMULS), 5 },
{ 0x1c, GETRC(FMSUBS), 5 },
{ 0x1d, GETRC(FMADDS), 5 },
{ 0x1e, GETRC(FNMSUBS), 5 },
{ 0x1f, GETRC(FNMADDS), 5 },
});
// Group 0x3e opcodes (field 30..31)
fill_table(0x3e, 2, 0,
{
{ 0x0, &D::STD },
{ 0x1, &D::STDU },
{ 0x0, GET(STD) },
{ 0x1, GET(STDU) },
});
// Group 0x3f opcodes (field 21..30)
fill_table(0x3f, 10, 1,
{
{ 0x026, &D::MTFSB1 },
{ 0x040, &D::MCRFS },
{ 0x046, &D::MTFSB0 },
{ 0x086, &D::MTFSFI },
{ 0x247, &D::MFFS },
{ 0x2c7, &D::MTFSF },
{ 0x026, GETRC(MTFSB1) },
{ 0x040, GET(MCRFS) },
{ 0x046, GETRC(MTFSB0) },
{ 0x086, GETRC(MTFSFI) },
{ 0x247, GETRC(MFFS) },
{ 0x2c7, GETRC(MTFSF) },
{ 0x000, &D::FCMPU },
{ 0x00c, &D::FRSP },
{ 0x00e, &D::FCTIW },
{ 0x00f, &D::FCTIWZ },
{ 0x000, GET(FCMPU) },
{ 0x00c, GETRC(FRSP) },
{ 0x00e, GETRC(FCTIW) },
{ 0x00f, GETRC(FCTIWZ) },
{ 0x012, &D::FDIV, 5 },
{ 0x014, &D::FSUB, 5 },
{ 0x015, &D::FADD, 5 },
{ 0x016, &D::FSQRT, 5 },
{ 0x017, &D::FSEL, 5 },
{ 0x019, &D::FMUL, 5 },
{ 0x01a, &D::FRSQRTE, 5 },
{ 0x01c, &D::FMSUB, 5 },
{ 0x01d, &D::FMADD, 5 },
{ 0x01e, &D::FNMSUB, 5 },
{ 0x01f, &D::FNMADD, 5 },
{ 0x012, GETRC(FDIV), 5 },
{ 0x014, GETRC(FSUB), 5 },
{ 0x015, GETRC(FADD), 5 },
{ 0x016, GETRC(FSQRT), 5 },
{ 0x017, GETRC(FSEL), 5 },
{ 0x019, GETRC(FMUL), 5 },
{ 0x01a, GETRC(FRSQRTE), 5 },
{ 0x01c, GETRC(FMSUB), 5 },
{ 0x01d, GETRC(FMADD), 5 },
{ 0x01e, GETRC(FNMSUB), 5 },
{ 0x01f, GETRC(FNMADD), 5 },
{ 0x020, &D::FCMPO },
{ 0x028, &D::FNEG },
{ 0x048, &D::FMR },
{ 0x088, &D::FNABS },
{ 0x108, &D::FABS },
{ 0x32e, &D::FCTID },
{ 0x32f, &D::FCTIDZ },
{ 0x34e, &D::FCFID },
{ 0x020, GET(FCMPO) },
{ 0x028, GETRC(FNEG) },
{ 0x048, GETRC(FMR) },
{ 0x088, GETRC(FNABS) },
{ 0x108, GETRC(FABS) },
{ 0x32e, GETRC(FCTID) },
{ 0x32f, GETRC(FCTIDZ) },
{ 0x34e, GETRC(FCFID) },
});
}
@ -587,6 +635,10 @@ public:
}
};
#undef GET_
#undef GET
#undef GETRC
namespace ppu_instructions
{
namespace fields

View file

@ -62,7 +62,7 @@
#include "util/asm.hpp"
#include "util/vm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
extern atomic_t<u64> g_watchdog_hold_ctr;
@ -131,9 +131,8 @@ void fmt_class_string<typename ppu_thread::call_history_t>::format(std::string&
}
}
// Global ppu_decoder objects; each one's table is populated by the
// ppu_decoder constructor. This diff view lists both the interpreter-table
// globals (precise/fast) and the itype/iname globals.
// In the `extern const ... {}` form, `extern` gives the const object external
// linkage while the `{}` initializer still makes the line a definition.
const ppu_decoder<ppu_interpreter_precise> g_ppu_interpreter_precise;
const ppu_decoder<ppu_interpreter_fast> g_ppu_interpreter_fast;
const ppu_decoder<ppu_itype> g_ppu_itype;
extern const ppu_decoder<ppu_itype> g_ppu_itype{};
extern const ppu_decoder<ppu_iname> g_ppu_iname{};
extern void ppu_initialize();
extern void ppu_finalize(const ppu_module& info);
@ -143,15 +142,16 @@ extern std::pair<std::shared_ptr<lv2_overlay>, CellError> ppu_load_overlay(const
extern void ppu_unload_prx(const lv2_prx&);
extern std::shared_ptr<lv2_prx> ppu_load_prx(const ppu_prx_object&, const std::string&, s64 file_offset);
extern void ppu_execute_syscall(ppu_thread& ppu, u64 code);
static bool ppu_break(ppu_thread& ppu, ppu_opcode_t op);
static void ppu_break(ppu_thread&, ppu_opcode_t, be_t<u32>*, ppu_intrp_func*);
extern void do_cell_atomic_128_store(u32 addr, const void* to_write);
const auto ppu_gateway = built_function<void(*)(ppu_thread*)>("ppu_gateway", [](asmjit::x86::Assembler& c, auto& args)
const auto ppu_gateway = built_function<void(*)(ppu_thread*)>("ppu_gateway", [](native_asm& c, auto& args)
{
// Gateway for PPU, converts from native to GHC calling convention, also saves RSP value for escape
using namespace asmjit;
#if defined(ARCH_X64)
#ifdef _WIN32
c.push(x86::r15);
c.push(x86::r14);
@ -192,10 +192,10 @@ const auto ppu_gateway = built_function<void(*)(ppu_thread*)>("ppu_gateway", [](
c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::edx, 1, 0)); // Load call target
c.mov(x86::rdx, x86::rax);
c.shl(x86::rax, 17);
c.shr(x86::rax, 17);
c.shr(x86::rdx, 47);
c.shl(x86::rdx, 12);
c.shl(x86::rax, 16);
c.shr(x86::rax, 16);
c.shr(x86::rdx, 48);
c.shl(x86::edx, 13);
c.mov(x86::r12d, x86::edx); // Load relocation base
c.mov(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
@ -246,116 +246,113 @@ const auto ppu_gateway = built_function<void(*)(ppu_thread*)>("ppu_gateway", [](
#endif
c.ret();
#else
c.ret(a64::x30);
#endif
});
// ppu_escape: runtime-assembled routine with signature void(ppu_thread*).
// It reloads the native stack pointer from ppu_thread::saved_native_sp and
// returns from there, abandoning whatever frames were above it.
// Only the ARCH_X64 path emits instructions in this body; for other
// architectures nothing is emitted here.
// The two opening lines differ only in the assembler parameter type
// (asmjit::x86::Assembler vs native_asm).
const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_escape", [](asmjit::x86::Assembler& c, auto& args)
const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_escape", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
// Restore native stack pointer (longjmp emulation)
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)));
// Return to the return location
// RSP is moved down by 8 so that `ret` pops the slot just below the saved
// value - presumably the return address of the call made right after the SP
// was saved; confirm against ppu_gateway.
c.sub(x86::rsp, 8);
c.ret();
#endif
});
// Forward declaration; the definition appears further down in this listing.
void ppu_recompiler_fallback(ppu_thread& ppu);
// ppu_recompiler_fallback_ghc: entry point stored in the executable cache for
// addresses that have no recompiled function yet (see ppu_register_range).
// x86-64: a tiny assembled trampoline that copies RBP into the first native
// argument register and tail-jumps to ppu_recompiler_fallback.
// NOTE(review): presumably RBP carries the ppu_thread pointer in the calling
// convention used by recompiled code - confirm against ppu_gateway.
// ARM64: no trampoline is generated; the symbol is simply the address of
// ppu_recompiler_fallback.
const auto ppu_recompiler_fallback_ghc = build_function_asm<void(*)(ppu_thread& ppu)>("ppu_trampolineb", [](asmjit::x86::Assembler& c, auto& args)
#if defined(ARCH_X64)
const auto ppu_recompiler_fallback_ghc = build_function_asm<void(*)(ppu_thread& ppu)>("ppu_trampolineb", [](native_asm& c, auto& args)
{
using namespace asmjit;
c.mov(args[0], x86::rbp);
c.jmp(imm_ptr(ppu_recompiler_fallback));
});
#elif defined(ARCH_ARM64)
const auto ppu_recompiler_fallback_ghc = &ppu_recompiler_fallback;
#endif
// Get pointer to executable cache
// Returns a mutable reference to the cache slot for guest address `addr`,
// located at vm::g_exec_addr + addr * 2. `addr` is widened to u64 before the
// multiplication so the offset cannot wrap in 32 bits.
// NOTE(review): the factor 2 presumably maps each 4-byte guest instruction to
// one 8-byte slot - confirm against the g_exec_addr allocation.
// The two variants differ only in the slot type (raw u64 vs ppu_intrp_func_t).
static u64& ppu_ref(u32 addr)
static ppu_intrp_func_t& ppu_ref(u32 addr)
{
return *reinterpret_cast<u64*>(vm::g_exec_addr + u64{addr} * 2);
return *reinterpret_cast<ppu_intrp_func_t*>(vm::g_exec_addr + u64{addr} * 2);
}
// Get interpreter cache value
// Reads the 32-bit instruction word at guest address `addr` and returns the
// interpreter handler for it. Throws via fmt::throw_exception when the
// configured PPU decoder is not an interpreter type accepted by the guard.
static u64 ppu_cache(u32 addr)
static ppu_intrp_func_t ppu_cache(u32 addr)
{
// Guard: one variant rejects decoders ordered above `fast`, the other rejects
// everything except `_static`.
if (g_cfg.core.ppu_decoder > ppu_decoder_type::fast)
if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static)
{
fmt::throw_exception("Invalid PPU decoder");
}
// Select opcode table
const auto& table = *(
g_cfg.core.ppu_decoder == ppu_decoder_type::precise
? &g_ppu_interpreter_precise.get_table()
: &g_ppu_interpreter_fast.get_table());
// Table-lookup variant: index the selected table by the decoded opcode and
// return the handler address as an integer.
return reinterpret_cast<uptr>(table[ppu_decode(vm::read32(addr))]);
// Runtime-interpreter variant: ask the ppu_interpreter_rt instance held in
// g_fxo to decode the instruction word.
return g_fxo->get<ppu_interpreter_rt>().decode(vm::read32(addr));
}
// This span interleaves two things in the diff view:
//  * the bool-returning ppu_fallback (logs the unregistered opcode when
//    ppu_debug is set, stores ppu_cache(ppu.cia) into the slot, returns false);
//  * ppu_ret plus the void-returning ppu_fallback that takes this_op/next_fn.
static bool ppu_fallback(ppu_thread& ppu, ppu_opcode_t op)
// ppu_ret: handler object whose function sets ppu.cia to the guest address of
// `this_op` and returns without calling anything else. The fallback loops in
// this file pass &ppu_ret as the `next_fn` argument of a handler call.
static ppu_intrp_func ppu_ret = {[](ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*)
{
if (g_cfg.core.ppu_debug)
{
ppu_log.error("Unregistered instruction: 0x%08x", op.opcode);
}
// Fix PC and return (step execution)
ppu.cia = vm::get_addr(this_op);
return;
}};
ppu_ref(ppu.cia) = ppu_cache(ppu.cia);
return false;
// ppu_fallback (void variant): resolves the real handler for the instruction
// at `this_op`, writes it into the executable-cache slot for that address so
// later lookups of the slot get the handler directly, then calls it with the
// same arguments.
static void ppu_fallback(ppu_thread& ppu, ppu_opcode_t op, be_t<u32>* this_op, ppu_intrp_func* next_fn)
{
const auto _pc = vm::get_addr(this_op);
const auto _fn = ppu_cache(_pc);
ppu_ref(_pc) = _fn;
return _fn(ppu, op, this_op, next_fn);
}
// TODO: Make this a dispatch call
// Interprets instructions starting at ppu.cia until the executable-cache slot
// for the current address no longer refers to ppu_recompiler_fallback_ghc, or
// until ppu.test_stopped() reports true. The span interleaves the variant that
// indexes g_ppu_interpreter_fast's table and counts instructions in `ctr`
// with the variant that decodes through ppu_interpreter_rt.
void ppu_recompiler_fallback(ppu_thread& ppu)
{
perf_meter<"PPUFALL1"_u64> perf0;
if (g_cfg.core.ppu_debug)
{
ppu_log.error("Unregistered PPU Function (LR=0x%llx)", ppu.lr);
ppu_log.error("Unregistered PPU Function (LR=0x%x)", ppu.lr);
}
const auto& table = g_ppu_interpreter_fast.get_table();
u64 ctr = 0;
const auto& table = g_fxo->get<ppu_interpreter_rt>();
while (true)
{
// The shift pair discards the top bits of the slot value (17 in one variant,
// 16 in the other) before comparing it with the trampoline address;
// ppu_register_range ORs a segment base into those upper bits.
if (uptr func = ppu_ref(ppu.cia); (func << 17 >> 17) != reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
if (uptr func = uptr(ppu_ref(ppu.cia)); (func << 16 >> 16) != reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
{
// We found a recompiler function at cia, return
break;
}
// Run instructions in interpreter
// Table variant: a handler returning true means "advance to the next
// instruction", so cia is bumped by 4 here.
if (const u32 op = vm::read32(ppu.cia); ctr++, table[ppu_decode(op)](ppu, {op})) [[likely]]
{
ppu.cia += 4;
continue;
}
// Run one instruction in interpreter (TODO)
// Runtime-interpreter variant: this loop does not modify ppu.cia itself;
// &ppu_ret is passed as next_fn, and ppu_ret sets ppu.cia from the this_op
// pointer it receives.
const u32 op = vm::read32(ppu.cia);
table.decode(op)(ppu, {op}, vm::_ptr<u32>(ppu.cia), &ppu_ret);
if (ppu.test_stopped())
{
break;
}
}
if (g_cfg.core.ppu_debug)
{
ppu_log.warning("Exiting interpreter at 0x%x (executed %u functions)", ppu.cia, ctr);
}
}
void ppu_reservation_fallback(ppu_thread& ppu)
{
const auto& table = g_ppu_interpreter_fast.get_table();
perf_meter<"PPUFALL2"_u64> perf0;
const auto& table = g_fxo->get<ppu_interpreter_rt>();
while (true)
{
// Run instructions in interpreter
// Run one instruction in interpreter (TODO)
const u32 op = vm::read32(ppu.cia);
if (table[ppu_decode(op)](ppu, {op})) [[likely]]
{
ppu.cia += 4;
}
table.decode(op)(ppu, {op}, vm::_ptr<u32>(ppu.cia), &ppu_ret);
if (!ppu.raddr || !ppu.use_full_rdata)
{
@ -372,7 +369,7 @@ void ppu_reservation_fallback(ppu_thread& ppu)
static std::unordered_map<u32, u32>* s_ppu_toc;
static bool ppu_check_toc(ppu_thread& ppu, ppu_opcode_t)
static void ppu_check_toc(ppu_thread& ppu, ppu_opcode_t op, be_t<u32>* this_op, ppu_intrp_func* next_fn)
{
// Compare TOC with expected value
const auto found = s_ppu_toc->find(ppu.cia);
@ -383,18 +380,12 @@ static bool ppu_check_toc(ppu_thread& ppu, ppu_opcode_t)
if (!ppu.state.test_and_set(cpu_flag::dbg_pause) && ppu.check_state())
{
return false;
return;
}
}
// Fallback to the interpreter function
const u64 val = ppu_cache(ppu.cia);
if (reinterpret_cast<decltype(&ppu_interpreter::UNK)>(val & 0xffffffff)(ppu, {static_cast<u32>(val >> 32)}))
{
ppu.cia += 4;
}
return false;
return ppu_cache(ppu.cia)(ppu, op, this_op, next_fn);
}
extern void ppu_register_range(u32 addr, u32 size)
@ -417,7 +408,6 @@ extern void ppu_register_range(u32 addr, u32 size)
utils::memory_commit(vm::g_stat_addr + addr, size);
}
const u64 fallback = reinterpret_cast<uptr>(ppu_fallback);
const u64 seg_base = addr;
while (size)
@ -425,11 +415,11 @@ extern void ppu_register_range(u32 addr, u32 size)
if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm)
{
// Assume addr is the start of first segment of PRX
ppu_ref(addr) = reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3));
ppu_ref(addr) = reinterpret_cast<ppu_intrp_func_t>(reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3)));
}
else
{
ppu_ref(addr) = fallback;
ppu_ref(addr) = ppu_fallback;
}
addr += 4;
@ -437,14 +427,14 @@ extern void ppu_register_range(u32 addr, u32 size)
}
}
static bool ppu_far_jump(ppu_thread& ppu);
static void ppu_far_jump(ppu_thread&, ppu_opcode_t, be_t<u32>*, ppu_intrp_func*);
extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr = nullptr)
extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr = nullptr)
{
// Initialize specific function
if (ptr)
{
ppu_ref(addr) = (reinterpret_cast<uptr>(ptr) & 0x7fff'ffff'ffffu) | (ppu_ref(addr) & ~0x7fff'ffff'ffffu);
ppu_ref(addr) = reinterpret_cast<ppu_intrp_func_t>((reinterpret_cast<uptr>(ptr) & 0xffff'ffff'ffffu) | (uptr(ppu_ref(addr)) & ~0xffff'ffff'ffffu));
return;
}
@ -464,12 +454,9 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr = nu
}
// Initialize interpreter cache
const u64 _break = reinterpret_cast<uptr>(ppu_break);
const u64 far_jump = reinterpret_cast<uptr>(ppu_far_jump);
while (size)
{
if (ppu_ref(addr) != _break && ppu_ref(addr) != far_jump)
if (ppu_ref(addr) != ppu_break && ppu_ref(addr) != ppu_far_jump)
{
ppu_ref(addr) = ppu_cache(addr);
}
@ -481,12 +468,12 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr = nu
// Overload taking the handler as a raw 64-bit integer: reinterprets `ptr` as
// a function pointer and forwards to the pointer-taking overload. The two
// return lines differ only in the target pointer type.
extern void ppu_register_function_at(u32 addr, u32 size, u64 ptr)
{
return ppu_register_function_at(addr, size, reinterpret_cast<ppu_function_t>(ptr));
return ppu_register_function_at(addr, size, reinterpret_cast<ppu_intrp_func_t>(ptr));
}
u32 ppu_get_exported_func_addr(u32 fnid, const std::string& module_name);
bool ppu_return_from_far_jump(ppu_thread& ppu)
void ppu_return_from_far_jump(ppu_thread& ppu, ppu_opcode_t, be_t<u32>*, ppu_intrp_func*)
{
auto& calls_info = ppu.hle_func_calls_with_toc_info;
ensure(!calls_info.empty());
@ -498,7 +485,6 @@ bool ppu_return_from_far_jump(ppu_thread& ppu)
ppu.gpr[2] = restore_info->saved_r2;
calls_info.pop_back();
return false;
}
static const bool s_init_return_far_jump_func = []
@ -586,9 +572,9 @@ u32 ppu_get_far_jump(u32 pc)
return g_fxo->get<ppu_far_jumps_t>().get_target(pc);
}
static bool ppu_far_jump(ppu_thread& ppu)
static void ppu_far_jump(ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*)
{
const u32 cia = g_fxo->get<ppu_far_jumps_t>().get_target(ppu.cia, &ppu);
const u32 cia = g_fxo->get<ppu_far_jumps_t>().get_target(vm::get_addr(this_op), &ppu);
if (!vm::check_addr(cia, vm::page_executable))
{
@ -596,7 +582,6 @@ static bool ppu_far_jump(ppu_thread& ppu)
}
ppu.cia = cia;
return false;
}
bool ppu_form_branch_to_code(u32 entry, u32 target, bool link, bool with_toc, std::string module_name)
@ -658,7 +643,7 @@ bool ppu_form_branch_to_code(u32 entry, u32 target, bool link, bool with_toc, st
auto& jumps = g_fxo->get<ppu_far_jumps_t>();
std::lock_guard lock(jumps.mutex);
jumps.vals.insert_or_assign(entry, std::type_identity_t<typename ppu_far_jumps_t::all_info_t>{target, link, with_toc, std::move(module_name)});
jumps.vals.insert_or_assign(entry, ppu_far_jumps_t::all_info_t{target, link, with_toc, std::move(module_name)});
ppu_register_function_at(entry, 4, &ppu_far_jump);
return true;
@ -702,10 +687,13 @@ void ppu_remove_hle_instructions(u32 addr, u32 size)
atomic_t<bool> g_debugger_pause_all_threads_on_bp = true;
// Breakpoint entry point
static bool ppu_break(ppu_thread& ppu, ppu_opcode_t)
static void ppu_break(ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func* next_fn)
{
const bool pause_all = g_debugger_pause_all_threads_on_bp;
const u32 old_cia = vm::get_addr(this_op);
ppu.cia = old_cia;
// Pause
ppu.state.atomic_op([&](bs_t<cpu_flag>& state)
{
@ -719,19 +707,14 @@ static bool ppu_break(ppu_thread& ppu, ppu_opcode_t)
Emu.CallAfter([]() { Emu.Pause(); });
}
if (ppu.check_state())
if (ppu.check_state() || old_cia != atomic_storage<u32>::load(ppu.cia))
{
return false;
// Do not execute if PC changed
return;
}
// Fallback to the interpreter function
const u64 val = ppu_cache(ppu.cia);
if (reinterpret_cast<decltype(&ppu_interpreter::UNK)>(val)(ppu, {vm::read32(ppu.cia).get()}))
{
ppu.cia += 4;
}
return false;
return ppu_cache(ppu.cia)(ppu, {*this_op}, this_op, next_fn);
}
// Set or remove breakpoint
@ -742,11 +725,9 @@ extern bool ppu_breakpoint(u32 addr, bool is_adding)
return false;
}
const u64 _break = reinterpret_cast<uptr>(&ppu_break);
// Remove breakpoint parameters
u64 to_set = 0;
u64 expected = _break;
ppu_intrp_func_t to_set = 0;
ppu_intrp_func_t expected = &ppu_break;
if (u32 hle_addr{}; g_fxo->is_init<ppu_function_manager>() && (hle_addr = g_fxo->get<ppu_function_manager>().addr))
{
@ -756,7 +737,7 @@ extern bool ppu_breakpoint(u32 addr, bool is_adding)
if (addr % 8 == 4 && index < ppu_function_manager::get().size())
{
// HLE function placement
to_set = reinterpret_cast<uptr>(ppu_function_manager::get()[index]);
to_set = ppu_function_manager::get()[index];
}
}
@ -766,23 +747,21 @@ extern bool ppu_breakpoint(u32 addr, bool is_adding)
to_set = ppu_cache(addr);
}
u64& _ref = ppu_ref(addr);
ppu_intrp_func_t& _ref = ppu_ref(addr);
if (is_adding)
{
// Swap if adding
std::swap(to_set, expected);
const u64 _fall = reinterpret_cast<uptr>(&ppu_fallback);
if (_ref == _fall)
if (_ref == &ppu_fallback)
{
ppu_log.error("Unregistered instruction replaced with a breakpoint at 0x%08x", addr);
expected = _fall;
expected = ppu_fallback;
}
}
return atomic_storage<u64>::compare_exchange(_ref, expected, to_set);
return atomic_storage<ppu_intrp_func_t>::compare_exchange(_ref, expected, to_set);
}
extern bool ppu_patch(u32 addr, u32 value)
@ -812,12 +791,9 @@ extern bool ppu_patch(u32 addr, u32 value)
*vm::get_super_ptr<u32>(addr) = value;
const u64 _break = reinterpret_cast<uptr>(&ppu_break);
const u64 fallback = reinterpret_cast<uptr>(&ppu_fallback);
if (is_exec)
{
if (ppu_ref(addr) != _break && ppu_ref(addr) != fallback)
if (ppu_ref(addr) != ppu_break && ppu_ref(addr) != ppu_fallback)
{
ppu_ref(addr) = ppu_cache(addr);
}
@ -1182,10 +1158,13 @@ void ppu_thread::cpu_task()
{
std::fesetround(FE_TONEAREST);
if (g_cfg.core.set_daz_and_ftz && g_cfg.core.ppu_decoder != ppu_decoder_type::precise)
if (g_cfg.core.set_daz_and_ftz)
{
// Set DAZ and FTZ
_mm_setcsr(_mm_getcsr() | 0x8840);
gv_set_zeroing_denormals();
}
else
{
gv_unset_zeroing_denormals();
}
// Execute cmd_queue
@ -1197,9 +1176,7 @@ void ppu_thread::cpu_task()
{
case ppu_cmd::opcode:
{
cmd_pop(), g_cfg.core.ppu_decoder == ppu_decoder_type::precise
? g_ppu_interpreter_precise.decode(arg)(*this, {arg})
: g_ppu_interpreter_fast.decode(arg)(*this, {arg});
cmd_pop(), g_fxo->get<ppu_interpreter_rt>().decode(arg)(*this, {arg}, vm::_ptr<u32>(cia - 4), &ppu_ret);
break;
}
case ppu_cmd::set_gpr:
@ -1236,7 +1213,7 @@ void ppu_thread::cpu_task()
}
case ppu_cmd::hle_call:
{
cmd_pop(), ppu_function_manager::get().at(arg)(*this);
cmd_pop(), ppu_function_manager::get().at(arg)(*this, {arg}, vm::_ptr<u32>(cia - 4), &ppu_ret);
break;
}
case ppu_cmd::opd_call:
@ -1247,8 +1224,8 @@ void ppu_thread::cpu_task()
}
case ppu_cmd::ptr_call:
{
const ppu_function_t func = cmd_get(1).as<ppu_function_t>();
cmd_pop(1), func(*this);
const ppu_intrp_func_t func = cmd_get(1).as<ppu_intrp_func_t>();
cmd_pop(1), func(*this, {}, vm::_ptr<u32>(cia - 4), &ppu_ret);
break;
}
case ppu_cmd::initialize:
@ -1323,7 +1300,7 @@ void ppu_thread::cpu_on_stop()
void ppu_thread::exec_task()
{
if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm)
if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static)
{
while (true)
{
@ -1340,79 +1317,28 @@ void ppu_thread::exec_task()
}
const auto cache = vm::g_exec_addr;
using func_t = decltype(&ppu_interpreter::UNK);
const auto mem_ = vm::g_base_addr;
while (true)
{
const auto exec_op = [this](u64 op)
if (test_stopped()) [[unlikely]]
{
return reinterpret_cast<func_t>(op)(*this, {vm::read32(cia).get()});
};
if (cia % 8 || state) [[unlikely]]
{
if (test_stopped()) return;
// Decode single instruction (may be step)
if (exec_op(*reinterpret_cast<u64*>(cache + u64{cia} * 2))) { cia += 4; }
continue;
return;
}
u64 op0, op1, op2, op3;
u64 _pos = u64{cia} * 2;
gv_zeroupper();
// Reinitialize
{
const v128 _op0 = *reinterpret_cast<const v128*>(cache + _pos);
const v128 _op1 = *reinterpret_cast<const v128*>(cache + _pos + 16);
op0 = _op0._u64[0];
op1 = _op0._u64[1];
op2 = _op1._u64[0];
op3 = _op1._u64[1];
}
while (exec_op(op0)) [[likely]]
{
cia += 4;
if (exec_op(op1)) [[likely]]
{
cia += 4;
if (exec_op(op2)) [[likely]]
{
cia += 4;
if (exec_op(op3)) [[likely]]
{
cia += 4;
if (state) [[unlikely]]
{
break;
}
_pos += 32;
const v128 _op0 = *reinterpret_cast<const v128*>(cache + _pos);
const v128 _op1 = *reinterpret_cast<const v128*>(cache + _pos + 16);
op0 = _op0._u64[0];
op1 = _op0._u64[1];
op2 = _op1._u64[0];
op3 = _op1._u64[1];
continue;
}
break;
}
break;
}
break;
}
// Execute instruction (may be step; execute only one instruction if state)
const auto op = reinterpret_cast<be_t<u32>*>(mem_ + u64{cia});
const auto fn = reinterpret_cast<ppu_intrp_func*>(cache + u64{cia} * 2);
fn->fn(*this, {*op}, op, state ? &ppu_ret : fn + 1);
}
}
ppu_thread::~ppu_thread()
{
perf_log.notice("Perf stats for STCX reload: successs %u, failure %u", last_succ, last_fail);
perf_log.notice("Perf stats for instructions: total %u", exec_bytes / 4);
}
ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u32 prio, int detached)
@ -1638,7 +1564,7 @@ void ppu_thread::stack_pop_verbose(u32 addr, u32 size) noexcept
ppu_log.error("Invalid thread");
}
extern ppu_function_t ppu_get_syscall(u64 code);
extern ppu_intrp_func_t ppu_get_syscall(u64 code);
void ppu_trap(ppu_thread& ppu, u64 addr)
{
@ -1728,7 +1654,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
{
const auto _inst = v128::loadu(inst + i) & mask_vec;
if (_mm_movemask_epi8(v128::eq32(_inst, store_vec).vi))
if (!gv_testz(gv_eq32(_inst, store_vec)))
{
return false;
}
@ -1817,10 +1743,11 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
return ppu_load_acquire_reservation<u64>(ppu, addr);
}
const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>("ppu_stcx_accurate_tx", [](asmjit::x86::Assembler& c, auto& args)
const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>("ppu_stcx_accurate_tx", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
Label fall = c.newLabel();
Label fail = c.newLabel();
Label _ret = c.newLabel();
@ -2024,6 +1951,9 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
c.bind(ret2);
#endif
c.ret();
#else
c.ret(a64::x30);
#endif
});
template <typename T>
@ -2147,7 +2077,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
utils::prefetch_read(ppu.rdata + 64);
ppu.last_faddr = addr;
ppu.last_ftime = res.load() & -128;
ppu.last_ftsc = __rdtsc();
ppu.last_ftsc = utils::get_tsc();
return false;
}
default:
@ -2249,7 +2179,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
ppu.last_faddr = addr;
ppu.last_ftime = old_rtime & -128;
ppu.last_ftsc = __rdtsc();
ppu.last_ftsc = utils::get_tsc();
std::memcpy(&ppu.rdata[addr & 0x78], &old_data, 8);
}
@ -2286,7 +2216,7 @@ namespace
// Compiled PPU module info
struct jit_module
{
std::vector<ppu_function_t> funcs;
std::vector<ppu_intrp_func_t> funcs;
std::shared_ptr<jit_compiler> pjit;
bool init = false;
};
@ -2829,7 +2759,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
if (g_cfg.core.ppu_debug && func.size && func.toc != umax)
{
s_ppu_toc->emplace(func.addr, func.toc);
ppu_ref(func.addr) = reinterpret_cast<uptr>(&ppu_check_toc);
ppu_ref(func.addr) = &ppu_check_toc;
}
}
@ -3022,7 +2952,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
// Fixup some information
entry.name = fmt::format("__0x%x", entry.addr - reloc);
if (has_mfvscr)
if (has_mfvscr && g_cfg.core.ppu_set_sat_bit)
{
// TODO
entry.attr += ppu_attr::has_mfvscr;
@ -3139,13 +3069,15 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
enum class ppu_settings : u32
{
non_win32,
accurate_fma,
accurate_ppu_vector_nan,
java_mode_handling,
accurate_dfma,
fixup_vnan,
accurate_jm,
accurate_cache_line_stores,
reservations_128_byte,
greedy_mode,
has_mfvscr,
accurate_sat,
accurate_fpcc,
accurate_vnan,
__bitset_enum_max
};
@ -3155,20 +3087,24 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
#ifndef _WIN32
settings += ppu_settings::non_win32;
#endif
if (g_cfg.core.llvm_accurate_dfma)
settings += ppu_settings::accurate_fma;
if (g_cfg.core.llvm_ppu_accurate_vector_nan)
settings += ppu_settings::accurate_ppu_vector_nan;
if (g_cfg.core.llvm_ppu_jm_handling)
settings += ppu_settings::java_mode_handling;
if (g_cfg.core.use_accurate_dfma)
settings += ppu_settings::accurate_dfma;
if (g_cfg.core.ppu_fix_vnan)
settings += ppu_settings::fixup_vnan;
if (g_cfg.core.ppu_use_nj_bit)
settings += ppu_settings::accurate_jm;
if (has_dcbz == 2)
settings += ppu_settings::accurate_cache_line_stores;
if (g_cfg.core.ppu_128_reservations_loop_max_length)
settings += ppu_settings::reservations_128_byte;
if (g_cfg.core.ppu_llvm_greedy_mode)
settings += ppu_settings::greedy_mode;
if (has_mfvscr)
settings += ppu_settings::has_mfvscr;
if (has_mfvscr && g_cfg.core.ppu_set_sat_bit)
settings += ppu_settings::accurate_sat;
if (g_cfg.core.ppu_set_fpcc)
settings += ppu_settings::accurate_fpcc, fmt::throw_exception("FPCC Not implemented");
if (g_cfg.core.ppu_set_vnan)
settings += ppu_settings::accurate_vnan, fmt::throw_exception("VNAN Not implemented");
// Write version, hash, CPU, settings
fmt::append(obj_name, "v5-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
@ -3319,10 +3255,10 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
if (!func.size) continue;
const auto name = fmt::format("__0x%x", func.addr - reloc);
const auto addr = ensure(reinterpret_cast<ppu_function_t>(jit->get(name)));
const auto addr = ensure(reinterpret_cast<ppu_intrp_func_t>(jit->get(name)));
jit_mod.funcs.emplace_back(addr);
if (ppu_ref(func.addr) != reinterpret_cast<u64>(ppu_far_jump))
if (ppu_ref(func.addr) != ppu_far_jump)
ppu_register_function_at(func.addr, 4, addr);
if (g_cfg.core.ppu_debug)
@ -3342,7 +3278,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
const u64 addr = reinterpret_cast<uptr>(ensure(jit_mod.funcs[index++]));
if (ppu_ref(func.addr) != reinterpret_cast<u64>(ppu_far_jump))
if (ppu_ref(func.addr) != ppu_far_jump)
ppu_register_function_at(func.addr, 4, addr);
if (g_cfg.core.ppu_debug)

View file

@ -276,6 +276,7 @@ public:
u32 last_faddr = 0;
u64 last_fail = 0;
u64 last_succ = 0;
u64 exec_bytes = 0; // Amount of "bytes" executed (4 for each instruction)
u32 dbg_step_pc = 0;

View file

@ -3,20 +3,19 @@
#include "Emu/system_config.h"
#include "PPUTranslator.h"
#include "PPUThread.h"
#include "PPUInterpreter.h"
#include "util/types.hpp"
#include "util/endian.hpp"
#include "util/logs.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
#include <algorithm>
using namespace llvm;
const ppu_decoder<PPUTranslator> s_ppu_decoder;
const ppu_decoder<ppu_itype> s_ppu_itype;
const ppu_decoder<ppu_iname> s_ppu_iname;
extern const ppu_decoder<ppu_itype> g_ppu_itype;
extern const ppu_decoder<ppu_iname> g_ppu_iname;
PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_module& info, ExecutionEngine& engine)
: cpu_translator(_module, false)
@ -151,7 +150,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
{
const u32 op = vm::read32(vm::cast(addr + base));
switch (s_ppu_itype.decode(op))
switch (g_ppu_itype.decode(op))
{
case ppu_itype::UNK:
case ppu_itype::ECIWX:
@ -251,7 +250,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
if (m_rel)
{
// This is very bad. m_rel is normally set to nullptr after a relocation is handled (so it wasn't)
ppu_log.error("LLVM: [0x%x] Unsupported relocation(%u) in '%s' (opcode=0x%x '%s'). Please report.", rel_found->first, m_rel->type, m_info.name, op, s_ppu_iname.decode(op));
ppu_log.error("LLVM: [0x%x] Unsupported relocation(%u) in '%s' (opcode=0x%x '%s'). Please report.", rel_found->first, m_rel->type, m_info.name, op, g_ppu_iname.decode(op));
return nullptr;
}
}
@ -291,8 +290,8 @@ Value* PPUTranslator::VecHandleDenormal(Value* val)
Value* PPUTranslator::VecHandleResult(Value* val)
{
val = g_cfg.core.llvm_ppu_accurate_vector_nan ? VecHandleNan(val) : val;
val = g_cfg.core.llvm_ppu_jm_handling ? VecHandleDenormal(val) : val;
val = g_cfg.core.ppu_fix_vnan ? VecHandleNan(val) : val;
val = g_cfg.core.ppu_use_nj_bit ? VecHandleDenormal(val) : val;
return val;
}
@ -391,10 +390,10 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
const auto pos = m_ir->CreateShl(indirect, 1);
const auto ptr = m_ir->CreateGEP(m_exec, pos);
const auto val = m_ir->CreateLoad(m_ir->CreateBitCast(ptr, get_type<u64*>()));
callee = FunctionCallee(type, m_ir->CreateIntToPtr(m_ir->CreateAnd(val, 0x7fff'ffff'ffff), type->getPointerTo()));
callee = FunctionCallee(type, m_ir->CreateIntToPtr(m_ir->CreateAnd(val, 0xffff'ffff'ffff), type->getPointerTo()));
// Load new segment address
seg0 = m_ir->CreateShl(m_ir->CreateLShr(val, 47), 12);
seg0 = m_ir->CreateShl(m_ir->CreateLShr(val, 48), 13);
}
m_ir->SetInsertPoint(block);
@ -640,7 +639,8 @@ void PPUTranslator::CompilationError(const std::string& error)
void PPUTranslator::MFVSCR(ppu_opcode_t op)
{
const auto vscr = m_ir->CreateOr(ZExt(IsNotZero(RegLoad(m_sat)), GetType<u32>()), m_ir->CreateShl(ZExt(RegLoad(m_nj), GetType<u32>()), 16));
const auto vsat = g_cfg.core.ppu_set_sat_bit ? ZExt(IsNotZero(RegLoad(m_sat)), GetType<u32>()) : m_ir->getInt32(0);
const auto vscr = m_ir->CreateOr(vsat, m_ir->CreateShl(ZExt(RegLoad(m_nj), GetType<u32>()), 16));
SetVr(op.vd, m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType<u32[4]>()), vscr, m_ir->getInt32(m_is_be ? 3 : 0)));
}
@ -649,8 +649,10 @@ void PPUTranslator::MTVSCR(ppu_opcode_t op)
const auto vscr = m_ir->CreateExtractElement(GetVr(op.vb, VrType::vi32), m_ir->getInt32(m_is_be ? 3 : 0));
const auto nj = Trunc(m_ir->CreateLShr(vscr, 16), GetType<bool>());
RegStore(nj, m_nj);
if (g_cfg.core.llvm_ppu_jm_handling) RegStore(m_ir->CreateSelect(nj, m_ir->getInt32(0x7f80'0000), m_ir->getInt32(0x7fff'ffff)), m_jm_mask);
RegStore(m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType<u32[4]>()), m_ir->CreateAnd(vscr, 1), m_ir->getInt32(0)), m_sat);
if (g_cfg.core.ppu_use_nj_bit)
RegStore(m_ir->CreateSelect(nj, m_ir->getInt32(0x7f80'0000), m_ir->getInt32(0x7fff'ffff)), m_jm_mask);
if (g_cfg.core.ppu_set_sat_bit)
RegStore(m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType<u32[4]>()), m_ir->CreateAnd(vscr, 1), m_ir->getInt32(0)), m_sat);
}
void PPUTranslator::VADDCUW(ppu_opcode_t op)
@ -902,10 +904,12 @@ void PPUTranslator::VCTSXS(ppu_opcode_t op)
const auto b = get_vr<f32[4]>(op.vb);
const auto scaled = b * fsplat<f32[4]>(std::pow(2, 0 + op.vuimm));
const auto const1 = fsplat<f32[4]>(-std::pow(2, 31));
//const auto is_nan = fcmp_uno(b == b); // NaN -> 0.0
const auto sat_l = fcmp_ord(scaled < const1); // TODO ???
const auto is_nan = fcmp_uno(b != b);
const auto sat_l = fcmp_ord(scaled < const1);
const auto sat_h = fcmp_ord(scaled >= fsplat<f32[4]>(std::pow(2, 31)));
const auto converted = fpcast<s32[4]>(select(sat_l, const1, scaled));
value_t<s32[4]> converted = eval(fpcast<s32[4]>(select(sat_l, const1, scaled)));
if (g_cfg.core.ppu_fix_vnan)
converted = eval(select(is_nan, splat<s32[4]>(0), converted)); // NaN -> 0
set_vr(op.vd, select(sat_h, splat<s32[4]>(0x7fff'ffff), converted));
set_sat(sext<s32[4]>(sat_l) | sext<s32[4]>(sat_h));
}
@ -915,10 +919,12 @@ void PPUTranslator::VCTUXS(ppu_opcode_t op)
const auto b = get_vr<f32[4]>(op.vb);
const auto scaled = b * fsplat<f32[4]>(std::pow(2, 0 + op.vuimm));
const auto const0 = fsplat<f32[4]>(0.);
//const auto is_nan = fcmp_uno(b == b); // NaN -> 0.0
const auto is_nan = fcmp_uno(b != b);
const auto sat_l = fcmp_ord(scaled < const0);
const auto sat_h = fcmp_ord(scaled >= fsplat<f32[4]>(std::pow(2, 32))); // TODO ???
const auto converted = fpcast<u32[4]>(select(sat_l, const0, scaled));
const auto sat_h = fcmp_ord(scaled >= fsplat<f32[4]>(std::pow(2, 32)));
value_t<u32[4]> converted = eval(fpcast<u32[4]>(select(sat_l, const0, scaled)));
if (g_cfg.core.ppu_fix_vnan)
converted = eval(select(is_nan, splat<u32[4]>(0), converted)); // NaN -> 0
set_vr(op.vd, select(sat_h, splat<u32[4]>(0xffff'ffff), converted));
set_sat(sext<s32[4]>(sat_l) | sext<s32[4]>(sat_h));
}
@ -1334,7 +1340,7 @@ void PPUTranslator::VPKSHSS(ppu_opcode_t op)
const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
const auto r = trunc<u8[16]>(min(max(ab, splat<s16[16]>(-0x80)), splat<s16[16]>(0x7f)));
set_vr(op.vd, r);
set_sat(((a + 0x80) | (b + 0x80)) >> 8);
set_sat(bitcast<u16[8]>((a + 0x80) | (b + 0x80)) >> 8);
}
void PPUTranslator::VPKSHUS(ppu_opcode_t op)
@ -1344,7 +1350,7 @@ void PPUTranslator::VPKSHUS(ppu_opcode_t op)
const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
const auto r = trunc<u8[16]>(min(max(ab, splat<s16[16]>(0)), splat<s16[16]>(0xff)));
set_vr(op.vd, r);
set_sat((a | b) >> 8);
set_sat(bitcast<u16[8]>(a | b) >> 8);
}
void PPUTranslator::VPKSWSS(ppu_opcode_t op)
@ -1354,7 +1360,7 @@ void PPUTranslator::VPKSWSS(ppu_opcode_t op)
const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7);
const auto r = trunc<u16[8]>(min(max(ab, splat<s32[8]>(-0x8000)), splat<s32[8]>(0x7fff)));
set_vr(op.vd, r);
set_sat(((a + 0x8000) | (b + 0x8000)) >> 16);
set_sat(bitcast<u32[4]>((a + 0x8000) | (b + 0x8000)) >> 16);
}
void PPUTranslator::VPKSWUS(ppu_opcode_t op)
@ -1364,7 +1370,7 @@ void PPUTranslator::VPKSWUS(ppu_opcode_t op)
const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7);
const auto r = trunc<u16[8]>(min(max(ab, splat<s32[8]>(0)), splat<s32[8]>(0xffff)));
set_vr(op.vd, r);
set_sat((a | b) >> 16);
set_sat(bitcast<u32[4]>(a | b) >> 16);
}
void PPUTranslator::VPKUHUM(ppu_opcode_t op)
@ -1741,7 +1747,7 @@ void PPUTranslator::VSUMSWS(ppu_opcode_t op)
const auto s = eval(x + y + z);
const auto r = min(max(zshuffle(s, 0, 2) + zshuffle(s, 1, 2), splat<s64[2]>(-0x8000'0000ll)), splat<s64[2]>(0x7fff'ffff));
set_vr(op.vd, zshuffle(bitcast<u32[4]>(r), 0, 4, 4, 4));
set_sat((r + 0x8000'0000) >> 32);
set_sat(bitcast<u64[2]>(r + 0x8000'0000) >> 32);
}
void PPUTranslator::VSUM2SWS(ppu_opcode_t op)
@ -1752,18 +1758,15 @@ void PPUTranslator::VSUM2SWS(ppu_opcode_t op)
const auto z = b >> 32;
const auto r = min(max(x + y + z, splat<s64[2]>(-0x8000'0000ll)), splat<s64[2]>(0x7fff'ffff));
set_vr(op.vd, zshuffle(bitcast<u32[4]>(r), 0, 4, 2, 4));
set_sat((r + 0x8000'0000) >> 32);
set_sat(bitcast<u64[2]>(r + 0x8000'0000) >> 32);
}
void PPUTranslator::VSUM4SBS(ppu_opcode_t op)
{
const auto a = get_vr<s32[4]>(op.va);
const auto a = get_vr<s16[8]>(op.va);
const auto b = get_vr<s32[4]>(op.vb);
const auto x = a << 24 >> 24;
const auto y = a << 16 >> 24;
const auto z = a << 8 >> 24;
const auto w = a >> 24;
const auto s = eval(x + y + z + w); // Can't overflow
const auto x = eval(bitcast<s32[4]>((a << 8 >> 8) + (a >> 8)));
const auto s = eval((x << 16 >> 16) + (x >> 16));
const auto r = add_sat(s, b);
set_vr(op.vd, r);
set_sat(r ^ (s + b));
@ -1773,9 +1776,7 @@ void PPUTranslator::VSUM4SHS(ppu_opcode_t op)
{
const auto a = get_vr<s32[4]>(op.va);
const auto b = get_vr<s32[4]>(op.vb);
const auto x = a << 16 >> 16;
const auto y = a >> 16;
const auto s = eval(x + y); // Can't overflow
const auto s = eval((a << 16 >> 16) + (a >> 16));
const auto r = add_sat(s, b);
set_vr(op.vd, r);
set_sat(r ^ (s + b));
@ -1783,13 +1784,10 @@ void PPUTranslator::VSUM4SHS(ppu_opcode_t op)
void PPUTranslator::VSUM4UBS(ppu_opcode_t op)
{
const auto a = get_vr<u32[4]>(op.va);
const auto a = get_vr<u16[8]>(op.va);
const auto b = get_vr<u32[4]>(op.vb);
const auto x = a & 0xff;
const auto y = a << 16 >> 24;
const auto z = a << 8 >> 24;
const auto w = a >> 24;
const auto s = eval(x + y + z + w); // Can't overflow
const auto x = eval(bitcast<u32[4]>((a & 0xff) + (a >> 8)));
const auto s = eval((x & 0xffff) + (x >> 16));
const auto r = add_sat(s, b);
set_vr(op.vd, r);
set_sat(r ^ (s + b));
@ -4047,7 +4045,7 @@ void PPUTranslator::FMADDS(ppu_opcode_t op)
const auto c = GetFpr(op.frc);
llvm::Value* result;
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
{
result = m_ir->CreateCall(get_intrinsic<f64>(llvm::Intrinsic::fma), {a, c, b});
}
@ -4075,7 +4073,7 @@ void PPUTranslator::FMSUBS(ppu_opcode_t op)
const auto c = GetFpr(op.frc);
llvm::Value* result;
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
{
result = m_ir->CreateCall(get_intrinsic<f64>(llvm::Intrinsic::fma), {a, c, m_ir->CreateFNeg(b)});
}
@ -4103,7 +4101,7 @@ void PPUTranslator::FNMSUBS(ppu_opcode_t op)
const auto c = GetFpr(op.frc);
llvm::Value* result;
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
{
result = m_ir->CreateCall(get_intrinsic<f64>(llvm::Intrinsic::fma), {a, c, m_ir->CreateFNeg(b)});
}
@ -4131,7 +4129,7 @@ void PPUTranslator::FNMADDS(ppu_opcode_t op)
const auto c = GetFpr(op.frc);
llvm::Value* result;
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
{
result = m_ir->CreateCall(get_intrinsic<f64>(llvm::Intrinsic::fma), {a, c, b});
}
@ -4384,7 +4382,7 @@ void PPUTranslator::FMSUB(ppu_opcode_t op)
const auto c = GetFpr(op.frc);
llvm::Value* result;
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
{
result = m_ir->CreateCall(get_intrinsic<f64>(llvm::Intrinsic::fma), {a, c, m_ir->CreateFNeg(b)});
}
@ -4412,7 +4410,7 @@ void PPUTranslator::FMADD(ppu_opcode_t op)
const auto c = GetFpr(op.frc);
llvm::Value* result;
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
{
result = m_ir->CreateCall(get_intrinsic<f64>(llvm::Intrinsic::fma), { a, c, b });
}
@ -4440,7 +4438,7 @@ void PPUTranslator::FNMSUB(ppu_opcode_t op)
const auto c = GetFpr(op.frc);
llvm::Value* result;
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
{
result = m_ir->CreateCall(get_intrinsic<f64>(llvm::Intrinsic::fma), {a, c, m_ir->CreateFNeg(b)});
}
@ -4468,7 +4466,7 @@ void PPUTranslator::FNMADD(ppu_opcode_t op)
const auto c = GetFpr(op.frc);
llvm::Value* result;
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
{
result = m_ir->CreateCall(get_intrinsic<f64>(llvm::Intrinsic::fma), {a, c, b});
}

View file

@ -358,18 +358,31 @@ public:
void VCFSX(ppu_opcode_t op);
void VCFUX(ppu_opcode_t op);
void VCMPBFP(ppu_opcode_t op);
void VCMPBFP_(ppu_opcode_t op) { return VCMPBFP(op); }
void VCMPEQFP(ppu_opcode_t op);
void VCMPEQFP_(ppu_opcode_t op) { return VCMPEQFP(op); }
void VCMPEQUB(ppu_opcode_t op);
void VCMPEQUB_(ppu_opcode_t op) { return VCMPEQUB(op); }
void VCMPEQUH(ppu_opcode_t op);
void VCMPEQUH_(ppu_opcode_t op) { return VCMPEQUH(op); }
void VCMPEQUW(ppu_opcode_t op);
void VCMPEQUW_(ppu_opcode_t op) { return VCMPEQUW(op); }
void VCMPGEFP(ppu_opcode_t op);
void VCMPGEFP_(ppu_opcode_t op) { return VCMPGEFP(op); }
void VCMPGTFP(ppu_opcode_t op);
void VCMPGTFP_(ppu_opcode_t op) { return VCMPGTFP(op); }
void VCMPGTSB(ppu_opcode_t op);
void VCMPGTSB_(ppu_opcode_t op) { return VCMPGTSB(op); }
void VCMPGTSH(ppu_opcode_t op);
void VCMPGTSH_(ppu_opcode_t op) { return VCMPGTSH(op); }
void VCMPGTSW(ppu_opcode_t op);
void VCMPGTSW_(ppu_opcode_t op) { return VCMPGTSW(op); }
void VCMPGTUB(ppu_opcode_t op);
void VCMPGTUB_(ppu_opcode_t op) { return VCMPGTUB(op); }
void VCMPGTUH(ppu_opcode_t op);
void VCMPGTUH_(ppu_opcode_t op) { return VCMPGTUH(op); }
void VCMPGTUW(ppu_opcode_t op);
void VCMPGTUW_(ppu_opcode_t op) { return VCMPGTUW(op); }
void VCTSXS(ppu_opcode_t op);
void VCTUXS(ppu_opcode_t op);
void VEXPTEFP(ppu_opcode_t op);
@ -717,6 +730,130 @@ public:
void FCFID(ppu_opcode_t op);
void UNK(ppu_opcode_t op);
void SUBFCO(ppu_opcode_t op) { return SUBFC(op); }
void ADDCO(ppu_opcode_t op) { return ADDC(op); }
void SUBFO(ppu_opcode_t op) { return SUBF(op); }
void NEGO(ppu_opcode_t op) { return NEG(op); }
void SUBFEO(ppu_opcode_t op) { return SUBFE(op); }
void ADDEO(ppu_opcode_t op) { return ADDE(op); }
void SUBFZEO(ppu_opcode_t op) { return SUBFZE(op); }
void ADDZEO(ppu_opcode_t op) { return ADDZE(op); }
void SUBFMEO(ppu_opcode_t op) { return SUBFME(op); }
void MULLDO(ppu_opcode_t op) { return MULLD(op); }
void ADDMEO(ppu_opcode_t op) { return ADDME(op); }
void MULLWO(ppu_opcode_t op) { return MULLW(op); }
void ADDO(ppu_opcode_t op) { return ADD(op); }
void DIVDUO(ppu_opcode_t op) { return DIVDU(op); }
void DIVWUO(ppu_opcode_t op) { return DIVWU(op); }
void DIVDO(ppu_opcode_t op) { return DIVD(op); }
void DIVWO(ppu_opcode_t op) { return DIVW(op); }
void SUBFCO_(ppu_opcode_t op) { return SUBFC(op); }
void ADDCO_(ppu_opcode_t op) { return ADDC(op); }
void SUBFO_(ppu_opcode_t op) { return SUBF(op); }
void NEGO_(ppu_opcode_t op) { return NEG(op); }
void SUBFEO_(ppu_opcode_t op) { return SUBFE(op); }
void ADDEO_(ppu_opcode_t op) { return ADDE(op); }
void SUBFZEO_(ppu_opcode_t op) { return SUBFZE(op); }
void ADDZEO_(ppu_opcode_t op) { return ADDZE(op); }
void SUBFMEO_(ppu_opcode_t op) { return SUBFME(op); }
void MULLDO_(ppu_opcode_t op) { return MULLD(op); }
void ADDMEO_(ppu_opcode_t op) { return ADDME(op); }
void MULLWO_(ppu_opcode_t op) { return MULLW(op); }
void ADDO_(ppu_opcode_t op) { return ADD(op); }
void DIVDUO_(ppu_opcode_t op) { return DIVDU(op); }
void DIVWUO_(ppu_opcode_t op) { return DIVWU(op); }
void DIVDO_(ppu_opcode_t op) { return DIVD(op); }
void DIVWO_(ppu_opcode_t op) { return DIVW(op); }
void RLWIMI_(ppu_opcode_t op) { return RLWIMI(op); }
void RLWINM_(ppu_opcode_t op) { return RLWINM(op); }
void RLWNM_(ppu_opcode_t op) { return RLWNM(op); }
void RLDICL_(ppu_opcode_t op) { return RLDICL(op); }
void RLDICR_(ppu_opcode_t op) { return RLDICR(op); }
void RLDIC_(ppu_opcode_t op) { return RLDIC(op); }
void RLDIMI_(ppu_opcode_t op) { return RLDIMI(op); }
void RLDCL_(ppu_opcode_t op) { return RLDCL(op); }
void RLDCR_(ppu_opcode_t op) { return RLDCR(op); }
void SUBFC_(ppu_opcode_t op) { return SUBFC(op); }
void MULHDU_(ppu_opcode_t op) { return MULHDU(op); }
void ADDC_(ppu_opcode_t op) { return ADDC(op); }
void MULHWU_(ppu_opcode_t op) { return MULHWU(op); }
void SLW_(ppu_opcode_t op) { return SLW(op); }
void CNTLZW_(ppu_opcode_t op) { return CNTLZW(op); }
void SLD_(ppu_opcode_t op) { return SLD(op); }
void AND_(ppu_opcode_t op) { return AND(op); }
void SUBF_(ppu_opcode_t op) { return SUBF(op); }
void CNTLZD_(ppu_opcode_t op) { return CNTLZD(op); }
void ANDC_(ppu_opcode_t op) { return ANDC(op); }
void MULHD_(ppu_opcode_t op) { return MULHD(op); }
void MULHW_(ppu_opcode_t op) { return MULHW(op); }
void NEG_(ppu_opcode_t op) { return NEG(op); }
void NOR_(ppu_opcode_t op) { return NOR(op); }
void SUBFE_(ppu_opcode_t op) { return SUBFE(op); }
void ADDE_(ppu_opcode_t op) { return ADDE(op); }
void SUBFZE_(ppu_opcode_t op) { return SUBFZE(op); }
void ADDZE_(ppu_opcode_t op) { return ADDZE(op); }
void MULLD_(ppu_opcode_t op) { return MULLD(op); }
void SUBFME_(ppu_opcode_t op) { return SUBFME(op); }
void ADDME_(ppu_opcode_t op) { return ADDME(op); }
void MULLW_(ppu_opcode_t op) { return MULLW(op); }
void ADD_(ppu_opcode_t op) { return ADD(op); }
void EQV_(ppu_opcode_t op) { return EQV(op); }
void XOR_(ppu_opcode_t op) { return XOR(op); }
void ORC_(ppu_opcode_t op) { return ORC(op); }
void OR_(ppu_opcode_t op) { return OR(op); }
void DIVDU_(ppu_opcode_t op) { return DIVDU(op); }
void DIVWU_(ppu_opcode_t op) { return DIVWU(op); }
void NAND_(ppu_opcode_t op) { return NAND(op); }
void DIVD_(ppu_opcode_t op) { return DIVD(op); }
void DIVW_(ppu_opcode_t op) { return DIVW(op); }
void SRW_(ppu_opcode_t op) { return SRW(op); }
void SRD_(ppu_opcode_t op) { return SRD(op); }
void SRAW_(ppu_opcode_t op) { return SRAW(op); }
void SRAD_(ppu_opcode_t op) { return SRAD(op); }
void SRAWI_(ppu_opcode_t op) { return SRAWI(op); }
void SRADI_(ppu_opcode_t op) { return SRADI(op); }
void EXTSH_(ppu_opcode_t op) { return EXTSH(op); }
void EXTSB_(ppu_opcode_t op) { return EXTSB(op); }
void EXTSW_(ppu_opcode_t op) { return EXTSW(op); }
void FDIVS_(ppu_opcode_t op) { return FDIVS(op); }
void FSUBS_(ppu_opcode_t op) { return FSUBS(op); }
void FADDS_(ppu_opcode_t op) { return FADDS(op); }
void FSQRTS_(ppu_opcode_t op) { return FSQRTS(op); }
void FRES_(ppu_opcode_t op) { return FRES(op); }
void FMULS_(ppu_opcode_t op) { return FMULS(op); }
void FMADDS_(ppu_opcode_t op) { return FMADDS(op); }
void FMSUBS_(ppu_opcode_t op) { return FMSUBS(op); }
void FNMSUBS_(ppu_opcode_t op) { return FNMSUBS(op); }
void FNMADDS_(ppu_opcode_t op) { return FNMADDS(op); }
void MTFSB1_(ppu_opcode_t op) { return MTFSB1(op); }
void MTFSB0_(ppu_opcode_t op) { return MTFSB0(op); }
void MTFSFI_(ppu_opcode_t op) { return MTFSFI(op); }
void MFFS_(ppu_opcode_t op) { return MFFS(op); }
void MTFSF_(ppu_opcode_t op) { return MTFSF(op); }
void FRSP_(ppu_opcode_t op) { return FRSP(op); }
void FCTIW_(ppu_opcode_t op) { return FCTIW(op); }
void FCTIWZ_(ppu_opcode_t op) { return FCTIWZ(op); }
void FDIV_(ppu_opcode_t op) { return FDIV(op); }
void FSUB_(ppu_opcode_t op) { return FSUB(op); }
void FADD_(ppu_opcode_t op) { return FADD(op); }
void FSQRT_(ppu_opcode_t op) { return FSQRT(op); }
void FSEL_(ppu_opcode_t op) { return FSEL(op); }
void FMUL_(ppu_opcode_t op) { return FMUL(op); }
void FRSQRTE_(ppu_opcode_t op) { return FRSQRTE(op); }
void FMSUB_(ppu_opcode_t op) { return FMSUB(op); }
void FMADD_(ppu_opcode_t op) { return FMADD(op); }
void FNMSUB_(ppu_opcode_t op) { return FNMSUB(op); }
void FNMADD_(ppu_opcode_t op) { return FNMADD(op); }
void FNEG_(ppu_opcode_t op) { return FNEG(op); }
void FMR_(ppu_opcode_t op) { return FMR(op); }
void FNABS_(ppu_opcode_t op) { return FNABS(op); }
void FABS_(ppu_opcode_t op) { return FABS(op); }
void FCTID_(ppu_opcode_t op) { return FCTID(op); }
void FCTIDZ_(ppu_opcode_t op) { return FCTIDZ(op); }
void FCFID_(ppu_opcode_t op) { return FCFID(op); }
};
#endif

File diff suppressed because it is too large Load diff

View file

@ -88,8 +88,6 @@ private:
XmmLink XmmGet(s8 reg, XmmType type);
asmjit::x86::Mem XmmConst(const v128& data);
asmjit::x86::Mem XmmConst(const __m128& data);
asmjit::x86::Mem XmmConst(const __m128i& data);
asmjit::x86::Mem get_pc(u32 addr);
void branch_fixed(u32 target, bool absolute = false);

View file

@ -1,2 +1,7 @@
#include "stdafx.h"
#include "SPUAnalyser.h"
#include "SPUOpcodes.h"
const extern spu_decoder<spu_itype> g_spu_itype{};
const extern spu_decoder<spu_iname> g_spu_iname{};
const extern spu_decoder<spu_iflag> g_spu_iflag{};

View file

@ -4,11 +4,12 @@
#include "SPUThread.h"
const spu_decoder<SPUDisAsm> s_spu_disasm;
const spu_decoder<spu_itype> s_spu_itype;
const spu_decoder<spu_iflag> s_spu_iflag;
const extern spu_decoder<spu_itype> g_spu_itype;
const extern spu_decoder<spu_iname> g_spu_iname;
const extern spu_decoder<spu_iflag> g_spu_iflag;
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
u32 SPUDisAsm::disasm(u32 pc)
{
@ -49,7 +50,7 @@ std::pair<bool, v128> SPUDisAsm::try_get_const_value(u32 reg, u32 pc, u32 TTL) c
if (pc == umax)
{
// Default arg: choose pc of previous instruction
// Default arg: choose pc of previous instruction
if (dump_pc == 0)
{
@ -68,7 +69,7 @@ std::pair<bool, v128> SPUDisAsm::try_get_const_value(u32 reg, u32 pc, u32 TTL) c
const u32 opcode = *reinterpret_cast<const be_t<u32>*>(m_offset + i);
const spu_opcode_t op0{ opcode };
const auto type = s_spu_itype.decode(opcode);
const auto type = g_spu_itype.decode(opcode);
if (type & spu_itype::branch || type == spu_itype::UNK || !opcode)
{
@ -101,7 +102,7 @@ std::pair<bool, v128> SPUDisAsm::try_get_const_value(u32 reg, u32 pc, u32 TTL) c
var = value;\
} void() /*<- Require a semicolon*/
//const auto flag = s_spu_iflag.decode(opcode);
//const auto flag = g_spu_iflag.decode(opcode);
// TODO: It detects spurious register modifications
if (u32 dst = type & spu_itype::_quadrop ? +op0.rt4 : +op0.rt; dst == reg)
@ -203,14 +204,14 @@ std::pair<bool, v128> SPUDisAsm::try_get_const_value(u32 reg, u32 pc, u32 TTL) c
v128 reg_val{};
GET_CONST_REG(reg_val, op0.ra);
return { true, reg_val };
return { true, reg_val };
}
case spu_itype::ORI:
{
v128 reg_val{};
GET_CONST_REG(reg_val, op0.ra);
return { true, reg_val | v128::from32p(op0.si10) };
return { true, reg_val | v128::from32p(op0.si10) };
}
default: return {};
}

File diff suppressed because it is too large Load diff

View file

@ -4,246 +4,39 @@
class spu_thread;
using spu_inter_func_t = bool(*)(spu_thread& spu, spu_opcode_t op);
using spu_intrp_func_t = bool(*)(spu_thread& spu, spu_opcode_t op);
template <typename IT>
struct spu_interpreter_t;
struct spu_interpreter
{
static bool UNK(spu_thread&, spu_opcode_t);
static void set_interrupt_status(spu_thread&, spu_opcode_t);
static bool STOP(spu_thread&, spu_opcode_t);
static bool LNOP(spu_thread&, spu_opcode_t);
static bool SYNC(spu_thread&, spu_opcode_t);
static bool DSYNC(spu_thread&, spu_opcode_t);
static bool MFSPR(spu_thread&, spu_opcode_t);
static bool RDCH(spu_thread&, spu_opcode_t);
static bool RCHCNT(spu_thread&, spu_opcode_t);
static bool SF(spu_thread&, spu_opcode_t);
static bool OR(spu_thread&, spu_opcode_t);
static bool BG(spu_thread&, spu_opcode_t);
static bool SFH(spu_thread&, spu_opcode_t);
static bool NOR(spu_thread&, spu_opcode_t);
static bool ABSDB(spu_thread&, spu_opcode_t);
static bool ROT(spu_thread&, spu_opcode_t);
static bool ROTM(spu_thread&, spu_opcode_t);
static bool ROTMA(spu_thread&, spu_opcode_t);
static bool SHL(spu_thread&, spu_opcode_t);
static bool ROTH(spu_thread&, spu_opcode_t);
static bool ROTHM(spu_thread&, spu_opcode_t);
static bool ROTMAH(spu_thread&, spu_opcode_t);
static bool SHLH(spu_thread&, spu_opcode_t);
static bool ROTI(spu_thread&, spu_opcode_t);
static bool ROTMI(spu_thread&, spu_opcode_t);
static bool ROTMAI(spu_thread&, spu_opcode_t);
static bool SHLI(spu_thread&, spu_opcode_t);
static bool ROTHI(spu_thread&, spu_opcode_t);
static bool ROTHMI(spu_thread&, spu_opcode_t);
static bool ROTMAHI(spu_thread&, spu_opcode_t);
static bool SHLHI(spu_thread&, spu_opcode_t);
static bool A(spu_thread&, spu_opcode_t);
static bool AND(spu_thread&, spu_opcode_t);
static bool CG(spu_thread&, spu_opcode_t);
static bool AH(spu_thread&, spu_opcode_t);
static bool NAND(spu_thread&, spu_opcode_t);
static bool AVGB(spu_thread&, spu_opcode_t);
static bool MTSPR(spu_thread&, spu_opcode_t);
static bool WRCH(spu_thread&, spu_opcode_t);
static bool BIZ(spu_thread&, spu_opcode_t);
static bool BINZ(spu_thread&, spu_opcode_t);
static bool BIHZ(spu_thread&, spu_opcode_t);
static bool BIHNZ(spu_thread&, spu_opcode_t);
static bool STOPD(spu_thread&, spu_opcode_t);
static bool STQX(spu_thread&, spu_opcode_t);
static bool BI(spu_thread&, spu_opcode_t);
static bool BISL(spu_thread&, spu_opcode_t);
static bool IRET(spu_thread&, spu_opcode_t);
static bool BISLED(spu_thread&, spu_opcode_t);
static bool HBR(spu_thread&, spu_opcode_t);
static bool GB(spu_thread&, spu_opcode_t);
static bool GBH(spu_thread&, spu_opcode_t);
static bool GBB(spu_thread&, spu_opcode_t);
static bool FSM(spu_thread&, spu_opcode_t);
static bool FSMH(spu_thread&, spu_opcode_t);
static bool FSMB(spu_thread&, spu_opcode_t);
static bool LQX(spu_thread&, spu_opcode_t);
static bool ROTQBYBI(spu_thread&, spu_opcode_t);
static bool ROTQMBYBI(spu_thread&, spu_opcode_t);
static bool SHLQBYBI(spu_thread&, spu_opcode_t);
static bool CBX(spu_thread&, spu_opcode_t);
static bool CHX(spu_thread&, spu_opcode_t);
static bool CWX(spu_thread&, spu_opcode_t);
static bool CDX(spu_thread&, spu_opcode_t);
static bool ROTQBI(spu_thread&, spu_opcode_t);
static bool ROTQMBI(spu_thread&, spu_opcode_t);
static bool SHLQBI(spu_thread&, spu_opcode_t);
static bool ROTQBY(spu_thread&, spu_opcode_t);
static bool ROTQMBY(spu_thread&, spu_opcode_t);
static bool SHLQBY(spu_thread&, spu_opcode_t);
static bool ORX(spu_thread&, spu_opcode_t);
static bool CBD(spu_thread&, spu_opcode_t);
static bool CHD(spu_thread&, spu_opcode_t);
static bool CWD(spu_thread&, spu_opcode_t);
static bool CDD(spu_thread&, spu_opcode_t);
static bool ROTQBII(spu_thread&, spu_opcode_t);
static bool ROTQMBII(spu_thread&, spu_opcode_t);
static bool SHLQBII(spu_thread&, spu_opcode_t);
static bool ROTQBYI(spu_thread&, spu_opcode_t);
static bool ROTQMBYI(spu_thread&, spu_opcode_t);
static bool SHLQBYI(spu_thread&, spu_opcode_t);
static bool NOP(spu_thread&, spu_opcode_t);
static bool CGT(spu_thread&, spu_opcode_t);
static bool XOR(spu_thread&, spu_opcode_t);
static bool CGTH(spu_thread&, spu_opcode_t);
static bool EQV(spu_thread&, spu_opcode_t);
static bool CGTB(spu_thread&, spu_opcode_t);
static bool SUMB(spu_thread&, spu_opcode_t);
static bool HGT(spu_thread&, spu_opcode_t);
static bool CLZ(spu_thread&, spu_opcode_t);
static bool XSWD(spu_thread&, spu_opcode_t);
static bool XSHW(spu_thread&, spu_opcode_t);
static bool CNTB(spu_thread&, spu_opcode_t);
static bool XSBH(spu_thread&, spu_opcode_t);
static bool CLGT(spu_thread&, spu_opcode_t);
static bool ANDC(spu_thread&, spu_opcode_t);
static bool CLGTH(spu_thread&, spu_opcode_t);
static bool ORC(spu_thread&, spu_opcode_t);
static bool CLGTB(spu_thread&, spu_opcode_t);
static bool HLGT(spu_thread&, spu_opcode_t);
static bool CEQ(spu_thread&, spu_opcode_t);
static bool MPYHHU(spu_thread&, spu_opcode_t);
static bool ADDX(spu_thread&, spu_opcode_t);
static bool SFX(spu_thread&, spu_opcode_t);
static bool CGX(spu_thread&, spu_opcode_t);
static bool BGX(spu_thread&, spu_opcode_t);
static bool MPYHHA(spu_thread&, spu_opcode_t);
static bool MPYHHAU(spu_thread&, spu_opcode_t);
static bool MPY(spu_thread&, spu_opcode_t);
static bool MPYH(spu_thread&, spu_opcode_t);
static bool MPYHH(spu_thread&, spu_opcode_t);
static bool MPYS(spu_thread&, spu_opcode_t);
static bool CEQH(spu_thread&, spu_opcode_t);
static bool MPYU(spu_thread&, spu_opcode_t);
static bool CEQB(spu_thread&, spu_opcode_t);
static bool HEQ(spu_thread&, spu_opcode_t);
static bool BRZ(spu_thread&, spu_opcode_t);
static bool STQA(spu_thread&, spu_opcode_t);
static bool BRNZ(spu_thread&, spu_opcode_t);
static bool BRHZ(spu_thread&, spu_opcode_t);
static bool BRHNZ(spu_thread&, spu_opcode_t);
static bool STQR(spu_thread&, spu_opcode_t);
static bool BRA(spu_thread&, spu_opcode_t);
static bool LQA(spu_thread&, spu_opcode_t);
static bool BRASL(spu_thread&, spu_opcode_t);
static bool BR(spu_thread&, spu_opcode_t);
static bool FSMBI(spu_thread&, spu_opcode_t);
static bool BRSL(spu_thread&, spu_opcode_t);
static bool LQR(spu_thread&, spu_opcode_t);
static bool IL(spu_thread&, spu_opcode_t);
static bool ILHU(spu_thread&, spu_opcode_t);
static bool ILH(spu_thread&, spu_opcode_t);
static bool IOHL(spu_thread&, spu_opcode_t);
static bool ORI(spu_thread&, spu_opcode_t);
static bool ORHI(spu_thread&, spu_opcode_t);
static bool ORBI(spu_thread&, spu_opcode_t);
static bool SFI(spu_thread&, spu_opcode_t);
static bool SFHI(spu_thread&, spu_opcode_t);
static bool ANDI(spu_thread&, spu_opcode_t);
static bool ANDHI(spu_thread&, spu_opcode_t);
static bool ANDBI(spu_thread&, spu_opcode_t);
static bool AI(spu_thread&, spu_opcode_t);
static bool AHI(spu_thread&, spu_opcode_t);
static bool STQD(spu_thread&, spu_opcode_t);
static bool LQD(spu_thread&, spu_opcode_t);
static bool XORI(spu_thread&, spu_opcode_t);
static bool XORHI(spu_thread&, spu_opcode_t);
static bool XORBI(spu_thread&, spu_opcode_t);
static bool CGTI(spu_thread&, spu_opcode_t);
static bool CGTHI(spu_thread&, spu_opcode_t);
static bool CGTBI(spu_thread&, spu_opcode_t);
static bool HGTI(spu_thread&, spu_opcode_t);
static bool CLGTI(spu_thread&, spu_opcode_t);
static bool CLGTHI(spu_thread&, spu_opcode_t);
static bool CLGTBI(spu_thread&, spu_opcode_t);
static bool HLGTI(spu_thread&, spu_opcode_t);
static bool MPYI(spu_thread&, spu_opcode_t);
static bool MPYUI(spu_thread&, spu_opcode_t);
static bool CEQI(spu_thread&, spu_opcode_t);
static bool CEQHI(spu_thread&, spu_opcode_t);
static bool CEQBI(spu_thread&, spu_opcode_t);
static bool HEQI(spu_thread&, spu_opcode_t);
static bool HBRA(spu_thread&, spu_opcode_t);
static bool HBRR(spu_thread&, spu_opcode_t);
static bool ILA(spu_thread&, spu_opcode_t);
static bool SELB(spu_thread&, spu_opcode_t);
static bool SHUFB(spu_thread&, spu_opcode_t);
static bool MPYA(spu_thread&, spu_opcode_t);
static bool DFCGT(spu_thread&, spu_opcode_t);
static bool DFCMGT(spu_thread&, spu_opcode_t);
static bool DFTSV(spu_thread&, spu_opcode_t);
static bool DFCEQ(spu_thread&, spu_opcode_t);
static bool DFCMEQ(spu_thread&, spu_opcode_t);
};
struct spu_interpreter_fast final : spu_interpreter
struct spu_interpreter_rt_base
{
static bool FREST(spu_thread&, spu_opcode_t);
static bool FRSQEST(spu_thread&, spu_opcode_t);
static bool FCGT(spu_thread&, spu_opcode_t);
static bool FA(spu_thread&, spu_opcode_t);
static bool FS(spu_thread&, spu_opcode_t);
static bool FM(spu_thread&, spu_opcode_t);
static bool FCMGT(spu_thread&, spu_opcode_t);
static bool DFA(spu_thread&, spu_opcode_t);
static bool DFS(spu_thread&, spu_opcode_t);
static bool DFM(spu_thread&, spu_opcode_t);
static bool DFMA(spu_thread&, spu_opcode_t);
static bool DFMS(spu_thread&, spu_opcode_t);
static bool DFNMS(spu_thread&, spu_opcode_t);
static bool DFNMA(spu_thread&, spu_opcode_t);
static bool FSCRRD(spu_thread&, spu_opcode_t);
static bool FESD(spu_thread&, spu_opcode_t);
static bool FRDS(spu_thread&, spu_opcode_t);
static bool FSCRWR(spu_thread&, spu_opcode_t);
static bool FCEQ(spu_thread&, spu_opcode_t);
static bool FCMEQ(spu_thread&, spu_opcode_t);
static bool FI(spu_thread&, spu_opcode_t);
static bool CFLTS(spu_thread&, spu_opcode_t);
static bool CFLTU(spu_thread&, spu_opcode_t);
static bool CSFLT(spu_thread&, spu_opcode_t);
static bool CUFLT(spu_thread&, spu_opcode_t);
static bool FNMS(spu_thread&, spu_opcode_t);
static bool FMA(spu_thread&, spu_opcode_t);
static bool FMS(spu_thread&, spu_opcode_t);
protected:
std::unique_ptr<spu_interpreter_t<spu_intrp_func_t>> ptrs;
spu_interpreter_rt_base() noexcept;
spu_interpreter_rt_base(const spu_interpreter_rt_base&) = delete;
spu_interpreter_rt_base& operator=(const spu_interpreter_rt_base&) = delete;
virtual ~spu_interpreter_rt_base();
};
struct spu_interpreter_precise final : spu_interpreter
struct spu_interpreter_rt : spu_interpreter_rt_base
{
static bool FREST(spu_thread&, spu_opcode_t);
static bool FRSQEST(spu_thread&, spu_opcode_t);
static bool FCGT(spu_thread&, spu_opcode_t);
static bool FA(spu_thread&, spu_opcode_t);
static bool FS(spu_thread&, spu_opcode_t);
static bool FM(spu_thread&, spu_opcode_t);
static bool FCMGT(spu_thread&, spu_opcode_t);
static bool DFA(spu_thread&, spu_opcode_t);
static bool DFS(spu_thread&, spu_opcode_t);
static bool DFM(spu_thread&, spu_opcode_t);
static bool DFMA(spu_thread&, spu_opcode_t);
static bool DFMS(spu_thread&, spu_opcode_t);
static bool DFNMS(spu_thread&, spu_opcode_t);
static bool DFNMA(spu_thread&, spu_opcode_t);
static bool FSCRRD(spu_thread&, spu_opcode_t);
static bool FESD(spu_thread&, spu_opcode_t);
static bool FRDS(spu_thread&, spu_opcode_t);
static bool FSCRWR(spu_thread&, spu_opcode_t);
static bool FCEQ(spu_thread&, spu_opcode_t);
static bool FCMEQ(spu_thread&, spu_opcode_t);
static bool FI(spu_thread&, spu_opcode_t);
static bool CFLTS(spu_thread&, spu_opcode_t);
static bool CFLTU(spu_thread&, spu_opcode_t);
static bool CSFLT(spu_thread&, spu_opcode_t);
static bool CUFLT(spu_thread&, spu_opcode_t);
static bool FNMS(spu_thread&, spu_opcode_t);
static bool FMA(spu_thread&, spu_opcode_t);
static bool FMS(spu_thread&, spu_opcode_t);
spu_interpreter_rt() noexcept;
// Return the handler that the private decoder `table` yields for the raw opcode `op`.
spu_intrp_func_t decode(u32 op) const noexcept
{
return table.decode(op);
}
private:
spu_decoder<spu_interpreter_t<spu_intrp_func_t>, spu_intrp_func_t> table;
};

View file

@ -71,215 +71,227 @@ class spu_decoder
}
};
public:
spu_decoder() noexcept
// Helper: returns the given object unchanged. The GET macro in the constructor
// invokes it as _first(args...) to name the single optional constructor argument.
static const D& _first(const D& arg)
{
return arg;
}
public:
template <typename... Args>
spu_decoder(const Args&... args) noexcept
{
// If an object is passed to the constructor, assign values from that object
#define GET(name) [&]{ if constexpr (sizeof...(Args) > 0) return _first(args...).name; else return &D::name; }()
static_assert(sizeof...(Args) <= 1);
const std::initializer_list<instruction_info> instructions
{
{ 0, 0x0, &D::STOP },
{ 0, 0x1, &D::LNOP },
{ 0, 0x2, &D::SYNC },
{ 0, 0x3, &D::DSYNC },
{ 0, 0xc, &D::MFSPR },
{ 0, 0xd, &D::RDCH },
{ 0, 0xf, &D::RCHCNT },
{ 0, 0x40, &D::SF },
{ 0, 0x41, &D::OR },
{ 0, 0x42, &D::BG },
{ 0, 0x48, &D::SFH },
{ 0, 0x49, &D::NOR },
{ 0, 0x53, &D::ABSDB },
{ 0, 0x58, &D::ROT },
{ 0, 0x59, &D::ROTM },
{ 0, 0x5a, &D::ROTMA },
{ 0, 0x5b, &D::SHL },
{ 0, 0x5c, &D::ROTH },
{ 0, 0x5d, &D::ROTHM },
{ 0, 0x5e, &D::ROTMAH },
{ 0, 0x5f, &D::SHLH },
{ 0, 0x78, &D::ROTI },
{ 0, 0x79, &D::ROTMI },
{ 0, 0x7a, &D::ROTMAI },
{ 0, 0x7b, &D::SHLI },
{ 0, 0x7c, &D::ROTHI },
{ 0, 0x7d, &D::ROTHMI },
{ 0, 0x7e, &D::ROTMAHI },
{ 0, 0x7f, &D::SHLHI },
{ 0, 0xc0, &D::A },
{ 0, 0xc1, &D::AND },
{ 0, 0xc2, &D::CG },
{ 0, 0xc8, &D::AH },
{ 0, 0xc9, &D::NAND },
{ 0, 0xd3, &D::AVGB },
{ 0, 0x10c, &D::MTSPR },
{ 0, 0x10d, &D::WRCH },
{ 0, 0x128, &D::BIZ },
{ 0, 0x129, &D::BINZ },
{ 0, 0x12a, &D::BIHZ },
{ 0, 0x12b, &D::BIHNZ },
{ 0, 0x140, &D::STOPD },
{ 0, 0x144, &D::STQX },
{ 0, 0x1a8, &D::BI },
{ 0, 0x1a9, &D::BISL },
{ 0, 0x1aa, &D::IRET },
{ 0, 0x1ab, &D::BISLED },
{ 0, 0x1ac, &D::HBR },
{ 0, 0x1b0, &D::GB },
{ 0, 0x1b1, &D::GBH },
{ 0, 0x1b2, &D::GBB },
{ 0, 0x1b4, &D::FSM },
{ 0, 0x1b5, &D::FSMH },
{ 0, 0x1b6, &D::FSMB },
{ 0, 0x1b8, &D::FREST },
{ 0, 0x1b9, &D::FRSQEST },
{ 0, 0x1c4, &D::LQX },
{ 0, 0x1cc, &D::ROTQBYBI },
{ 0, 0x1cd, &D::ROTQMBYBI },
{ 0, 0x1cf, &D::SHLQBYBI },
{ 0, 0x1d4, &D::CBX },
{ 0, 0x1d5, &D::CHX },
{ 0, 0x1d6, &D::CWX },
{ 0, 0x1d7, &D::CDX },
{ 0, 0x1d8, &D::ROTQBI },
{ 0, 0x1d9, &D::ROTQMBI },
{ 0, 0x1db, &D::SHLQBI },
{ 0, 0x1dc, &D::ROTQBY },
{ 0, 0x1dd, &D::ROTQMBY },
{ 0, 0x1df, &D::SHLQBY },
{ 0, 0x1f0, &D::ORX },
{ 0, 0x1f4, &D::CBD },
{ 0, 0x1f5, &D::CHD },
{ 0, 0x1f6, &D::CWD },
{ 0, 0x1f7, &D::CDD },
{ 0, 0x1f8, &D::ROTQBII },
{ 0, 0x1f9, &D::ROTQMBII },
{ 0, 0x1fb, &D::SHLQBII },
{ 0, 0x1fc, &D::ROTQBYI },
{ 0, 0x1fd, &D::ROTQMBYI },
{ 0, 0x1ff, &D::SHLQBYI },
{ 0, 0x201, &D::NOP },
{ 0, 0x240, &D::CGT },
{ 0, 0x241, &D::XOR },
{ 0, 0x248, &D::CGTH },
{ 0, 0x249, &D::EQV },
{ 0, 0x250, &D::CGTB },
{ 0, 0x253, &D::SUMB },
{ 0, 0x258, &D::HGT },
{ 0, 0x2a5, &D::CLZ },
{ 0, 0x2a6, &D::XSWD },
{ 0, 0x2ae, &D::XSHW },
{ 0, 0x2b4, &D::CNTB },
{ 0, 0x2b6, &D::XSBH },
{ 0, 0x2c0, &D::CLGT },
{ 0, 0x2c1, &D::ANDC },
{ 0, 0x2c2, &D::FCGT },
{ 0, 0x2c3, &D::DFCGT },
{ 0, 0x2c4, &D::FA },
{ 0, 0x2c5, &D::FS },
{ 0, 0x2c6, &D::FM },
{ 0, 0x2c8, &D::CLGTH },
{ 0, 0x2c9, &D::ORC },
{ 0, 0x2ca, &D::FCMGT },
{ 0, 0x2cb, &D::DFCMGT },
{ 0, 0x2cc, &D::DFA },
{ 0, 0x2cd, &D::DFS },
{ 0, 0x2ce, &D::DFM },
{ 0, 0x2d0, &D::CLGTB },
{ 0, 0x2d8, &D::HLGT },
{ 0, 0x35c, &D::DFMA },
{ 0, 0x35d, &D::DFMS },
{ 0, 0x35e, &D::DFNMS },
{ 0, 0x35f, &D::DFNMA },
{ 0, 0x3c0, &D::CEQ },
{ 0, 0x3ce, &D::MPYHHU },
{ 0, 0x340, &D::ADDX },
{ 0, 0x341, &D::SFX },
{ 0, 0x342, &D::CGX },
{ 0, 0x343, &D::BGX },
{ 0, 0x346, &D::MPYHHA },
{ 0, 0x34e, &D::MPYHHAU },
{ 0, 0x398, &D::FSCRRD },
{ 0, 0x3b8, &D::FESD },
{ 0, 0x3b9, &D::FRDS },
{ 0, 0x3ba, &D::FSCRWR },
{ 0, 0x3bf, &D::DFTSV },
{ 0, 0x3c2, &D::FCEQ },
{ 0, 0x3c3, &D::DFCEQ },
{ 0, 0x3c4, &D::MPY },
{ 0, 0x3c5, &D::MPYH },
{ 0, 0x3c6, &D::MPYHH },
{ 0, 0x3c7, &D::MPYS },
{ 0, 0x3c8, &D::CEQH },
{ 0, 0x3ca, &D::FCMEQ },
{ 0, 0x3cb, &D::DFCMEQ },
{ 0, 0x3cc, &D::MPYU },
{ 0, 0x3d0, &D::CEQB },
{ 0, 0x3d4, &D::FI },
{ 0, 0x3d8, &D::HEQ },
{ 1, 0x1d8, &D::CFLTS },
{ 1, 0x1d9, &D::CFLTU },
{ 1, 0x1da, &D::CSFLT },
{ 1, 0x1db, &D::CUFLT },
{ 2, 0x40, &D::BRZ },
{ 2, 0x41, &D::STQA },
{ 2, 0x42, &D::BRNZ },
{ 2, 0x44, &D::BRHZ },
{ 2, 0x46, &D::BRHNZ },
{ 2, 0x47, &D::STQR },
{ 2, 0x60, &D::BRA },
{ 2, 0x61, &D::LQA },
{ 2, 0x62, &D::BRASL },
{ 2, 0x64, &D::BR },
{ 2, 0x65, &D::FSMBI },
{ 2, 0x66, &D::BRSL },
{ 2, 0x67, &D::LQR },
{ 2, 0x81, &D::IL },
{ 2, 0x82, &D::ILHU },
{ 2, 0x83, &D::ILH },
{ 2, 0xc1, &D::IOHL },
{ 3, 0x4, &D::ORI },
{ 3, 0x5, &D::ORHI },
{ 3, 0x6, &D::ORBI },
{ 3, 0xc, &D::SFI },
{ 3, 0xd, &D::SFHI },
{ 3, 0x14, &D::ANDI },
{ 3, 0x15, &D::ANDHI },
{ 3, 0x16, &D::ANDBI },
{ 3, 0x1c, &D::AI },
{ 3, 0x1d, &D::AHI },
{ 3, 0x24, &D::STQD },
{ 3, 0x34, &D::LQD },
{ 3, 0x44, &D::XORI },
{ 3, 0x45, &D::XORHI },
{ 3, 0x46, &D::XORBI },
{ 3, 0x4c, &D::CGTI },
{ 3, 0x4d, &D::CGTHI },
{ 3, 0x4e, &D::CGTBI },
{ 3, 0x4f, &D::HGTI },
{ 3, 0x5c, &D::CLGTI },
{ 3, 0x5d, &D::CLGTHI },
{ 3, 0x5e, &D::CLGTBI },
{ 3, 0x5f, &D::HLGTI },
{ 3, 0x74, &D::MPYI },
{ 3, 0x75, &D::MPYUI },
{ 3, 0x7c, &D::CEQI },
{ 3, 0x7d, &D::CEQHI },
{ 3, 0x7e, &D::CEQBI },
{ 3, 0x7f, &D::HEQI },
{ 4, 0x8, &D::HBRA },
{ 4, 0x9, &D::HBRR },
{ 4, 0x21, &D::ILA },
{ 7, 0x8, &D::SELB },
{ 7, 0xb, &D::SHUFB },
{ 7, 0xc, &D::MPYA },
{ 7, 0xd, &D::FNMS },
{ 7, 0xe, &D::FMA },
{ 7, 0xf, &D::FMS },
{ 0, 0x0, GET(STOP) },
{ 0, 0x1, GET(LNOP) },
{ 0, 0x2, GET(SYNC) },
{ 0, 0x3, GET(DSYNC) },
{ 0, 0xc, GET(MFSPR) },
{ 0, 0xd, GET(RDCH) },
{ 0, 0xf, GET(RCHCNT) },
{ 0, 0x40, GET(SF) },
{ 0, 0x41, GET(OR) },
{ 0, 0x42, GET(BG) },
{ 0, 0x48, GET(SFH) },
{ 0, 0x49, GET(NOR) },
{ 0, 0x53, GET(ABSDB) },
{ 0, 0x58, GET(ROT) },
{ 0, 0x59, GET(ROTM) },
{ 0, 0x5a, GET(ROTMA) },
{ 0, 0x5b, GET(SHL) },
{ 0, 0x5c, GET(ROTH) },
{ 0, 0x5d, GET(ROTHM) },
{ 0, 0x5e, GET(ROTMAH) },
{ 0, 0x5f, GET(SHLH) },
{ 0, 0x78, GET(ROTI) },
{ 0, 0x79, GET(ROTMI) },
{ 0, 0x7a, GET(ROTMAI) },
{ 0, 0x7b, GET(SHLI) },
{ 0, 0x7c, GET(ROTHI) },
{ 0, 0x7d, GET(ROTHMI) },
{ 0, 0x7e, GET(ROTMAHI) },
{ 0, 0x7f, GET(SHLHI) },
{ 0, 0xc0, GET(A) },
{ 0, 0xc1, GET(AND) },
{ 0, 0xc2, GET(CG) },
{ 0, 0xc8, GET(AH) },
{ 0, 0xc9, GET(NAND) },
{ 0, 0xd3, GET(AVGB) },
{ 0, 0x10c, GET(MTSPR) },
{ 0, 0x10d, GET(WRCH) },
{ 0, 0x128, GET(BIZ) },
{ 0, 0x129, GET(BINZ) },
{ 0, 0x12a, GET(BIHZ) },
{ 0, 0x12b, GET(BIHNZ) },
{ 0, 0x140, GET(STOPD) },
{ 0, 0x144, GET(STQX) },
{ 0, 0x1a8, GET(BI) },
{ 0, 0x1a9, GET(BISL) },
{ 0, 0x1aa, GET(IRET) },
{ 0, 0x1ab, GET(BISLED) },
{ 0, 0x1ac, GET(HBR) },
{ 0, 0x1b0, GET(GB) },
{ 0, 0x1b1, GET(GBH) },
{ 0, 0x1b2, GET(GBB) },
{ 0, 0x1b4, GET(FSM) },
{ 0, 0x1b5, GET(FSMH) },
{ 0, 0x1b6, GET(FSMB) },
{ 0, 0x1b8, GET(FREST) },
{ 0, 0x1b9, GET(FRSQEST) },
{ 0, 0x1c4, GET(LQX) },
{ 0, 0x1cc, GET(ROTQBYBI) },
{ 0, 0x1cd, GET(ROTQMBYBI) },
{ 0, 0x1cf, GET(SHLQBYBI) },
{ 0, 0x1d4, GET(CBX) },
{ 0, 0x1d5, GET(CHX) },
{ 0, 0x1d6, GET(CWX) },
{ 0, 0x1d7, GET(CDX) },
{ 0, 0x1d8, GET(ROTQBI) },
{ 0, 0x1d9, GET(ROTQMBI) },
{ 0, 0x1db, GET(SHLQBI) },
{ 0, 0x1dc, GET(ROTQBY) },
{ 0, 0x1dd, GET(ROTQMBY) },
{ 0, 0x1df, GET(SHLQBY) },
{ 0, 0x1f0, GET(ORX) },
{ 0, 0x1f4, GET(CBD) },
{ 0, 0x1f5, GET(CHD) },
{ 0, 0x1f6, GET(CWD) },
{ 0, 0x1f7, GET(CDD) },
{ 0, 0x1f8, GET(ROTQBII) },
{ 0, 0x1f9, GET(ROTQMBII) },
{ 0, 0x1fb, GET(SHLQBII) },
{ 0, 0x1fc, GET(ROTQBYI) },
{ 0, 0x1fd, GET(ROTQMBYI) },
{ 0, 0x1ff, GET(SHLQBYI) },
{ 0, 0x201, GET(NOP) },
{ 0, 0x240, GET(CGT) },
{ 0, 0x241, GET(XOR) },
{ 0, 0x248, GET(CGTH) },
{ 0, 0x249, GET(EQV) },
{ 0, 0x250, GET(CGTB) },
{ 0, 0x253, GET(SUMB) },
{ 0, 0x258, GET(HGT) },
{ 0, 0x2a5, GET(CLZ) },
{ 0, 0x2a6, GET(XSWD) },
{ 0, 0x2ae, GET(XSHW) },
{ 0, 0x2b4, GET(CNTB) },
{ 0, 0x2b6, GET(XSBH) },
{ 0, 0x2c0, GET(CLGT) },
{ 0, 0x2c1, GET(ANDC) },
{ 0, 0x2c2, GET(FCGT) },
{ 0, 0x2c3, GET(DFCGT) },
{ 0, 0x2c4, GET(FA) },
{ 0, 0x2c5, GET(FS) },
{ 0, 0x2c6, GET(FM) },
{ 0, 0x2c8, GET(CLGTH) },
{ 0, 0x2c9, GET(ORC) },
{ 0, 0x2ca, GET(FCMGT) },
{ 0, 0x2cb, GET(DFCMGT) },
{ 0, 0x2cc, GET(DFA) },
{ 0, 0x2cd, GET(DFS) },
{ 0, 0x2ce, GET(DFM) },
{ 0, 0x2d0, GET(CLGTB) },
{ 0, 0x2d8, GET(HLGT) },
{ 0, 0x35c, GET(DFMA) },
{ 0, 0x35d, GET(DFMS) },
{ 0, 0x35e, GET(DFNMS) },
{ 0, 0x35f, GET(DFNMA) },
{ 0, 0x3c0, GET(CEQ) },
{ 0, 0x3ce, GET(MPYHHU) },
{ 0, 0x340, GET(ADDX) },
{ 0, 0x341, GET(SFX) },
{ 0, 0x342, GET(CGX) },
{ 0, 0x343, GET(BGX) },
{ 0, 0x346, GET(MPYHHA) },
{ 0, 0x34e, GET(MPYHHAU) },
{ 0, 0x398, GET(FSCRRD) },
{ 0, 0x3b8, GET(FESD) },
{ 0, 0x3b9, GET(FRDS) },
{ 0, 0x3ba, GET(FSCRWR) },
{ 0, 0x3bf, GET(DFTSV) },
{ 0, 0x3c2, GET(FCEQ) },
{ 0, 0x3c3, GET(DFCEQ) },
{ 0, 0x3c4, GET(MPY) },
{ 0, 0x3c5, GET(MPYH) },
{ 0, 0x3c6, GET(MPYHH) },
{ 0, 0x3c7, GET(MPYS) },
{ 0, 0x3c8, GET(CEQH) },
{ 0, 0x3ca, GET(FCMEQ) },
{ 0, 0x3cb, GET(DFCMEQ) },
{ 0, 0x3cc, GET(MPYU) },
{ 0, 0x3d0, GET(CEQB) },
{ 0, 0x3d4, GET(FI) },
{ 0, 0x3d8, GET(HEQ) },
{ 1, 0x1d8, GET(CFLTS) },
{ 1, 0x1d9, GET(CFLTU) },
{ 1, 0x1da, GET(CSFLT) },
{ 1, 0x1db, GET(CUFLT) },
{ 2, 0x40, GET(BRZ) },
{ 2, 0x41, GET(STQA) },
{ 2, 0x42, GET(BRNZ) },
{ 2, 0x44, GET(BRHZ) },
{ 2, 0x46, GET(BRHNZ) },
{ 2, 0x47, GET(STQR) },
{ 2, 0x60, GET(BRA) },
{ 2, 0x61, GET(LQA) },
{ 2, 0x62, GET(BRASL) },
{ 2, 0x64, GET(BR) },
{ 2, 0x65, GET(FSMBI) },
{ 2, 0x66, GET(BRSL) },
{ 2, 0x67, GET(LQR) },
{ 2, 0x81, GET(IL) },
{ 2, 0x82, GET(ILHU) },
{ 2, 0x83, GET(ILH) },
{ 2, 0xc1, GET(IOHL) },
{ 3, 0x4, GET(ORI) },
{ 3, 0x5, GET(ORHI) },
{ 3, 0x6, GET(ORBI) },
{ 3, 0xc, GET(SFI) },
{ 3, 0xd, GET(SFHI) },
{ 3, 0x14, GET(ANDI) },
{ 3, 0x15, GET(ANDHI) },
{ 3, 0x16, GET(ANDBI) },
{ 3, 0x1c, GET(AI) },
{ 3, 0x1d, GET(AHI) },
{ 3, 0x24, GET(STQD) },
{ 3, 0x34, GET(LQD) },
{ 3, 0x44, GET(XORI) },
{ 3, 0x45, GET(XORHI) },
{ 3, 0x46, GET(XORBI) },
{ 3, 0x4c, GET(CGTI) },
{ 3, 0x4d, GET(CGTHI) },
{ 3, 0x4e, GET(CGTBI) },
{ 3, 0x4f, GET(HGTI) },
{ 3, 0x5c, GET(CLGTI) },
{ 3, 0x5d, GET(CLGTHI) },
{ 3, 0x5e, GET(CLGTBI) },
{ 3, 0x5f, GET(HLGTI) },
{ 3, 0x74, GET(MPYI) },
{ 3, 0x75, GET(MPYUI) },
{ 3, 0x7c, GET(CEQI) },
{ 3, 0x7d, GET(CEQHI) },
{ 3, 0x7e, GET(CEQBI) },
{ 3, 0x7f, GET(HEQI) },
{ 4, 0x8, GET(HBRA) },
{ 4, 0x9, GET(HBRR) },
{ 4, 0x21, GET(ILA) },
{ 7, 0x8, GET(SELB) },
{ 7, 0xb, GET(SHUFB) },
{ 7, 0xc, GET(MPYA) },
{ 7, 0xd, GET(FNMS) },
{ 7, 0xe, GET(FMA) },
{ 7, 0xf, GET(FMS) },
};
for (auto& x : m_table)
{
x = &D::UNK;
x = GET(UNK);
}
for (auto& entry : instructions)
@ -301,3 +313,5 @@ public:
return m_table[spu_decode(inst)];
}
};
#undef GET

View file

@ -24,15 +24,12 @@
#include <unordered_set>
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
const spu_decoder<spu_itype> s_spu_itype;
const spu_decoder<spu_iname> s_spu_iname;
const spu_decoder<spu_iflag> s_spu_iflag;
extern const spu_decoder<spu_interpreter_precise> g_spu_interpreter_precise{};
extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast;
const extern spu_decoder<spu_itype> g_spu_itype;
const extern spu_decoder<spu_iname> g_spu_iname;
const extern spu_decoder<spu_iflag> g_spu_iflag;
// Move 4 args for calling native function from a GHC calling convention function
static u8* move_args_ghc_to_native(u8* raw)
@ -160,11 +157,12 @@ DECLARE(spu_runtime::tr_all) = []
return reinterpret_cast<spu_function_t>(trptr);
}();
DECLARE(spu_runtime::g_gateway) = built_function<spu_function_t>("spu_gateway", [](asmjit::x86::Assembler& c, auto& args)
DECLARE(spu_runtime::g_gateway) = built_function<spu_function_t>("spu_gateway", [](native_asm& c, auto& args)
{
// Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape
using namespace asmjit;
#if defined(ARCH_X64)
#ifdef _WIN32
c.push(x86::r15);
c.push(x86::r14);
@ -247,24 +245,30 @@ DECLARE(spu_runtime::g_gateway) = built_function<spu_function_t>("spu_gateway",
#endif
c.ret();
#else
c.ret(a64::x30);
#endif
});
DECLARE(spu_runtime::g_escape) = build_function_asm<void(*)(spu_thread*)>("spu_escape", [](asmjit::x86::Assembler& c, auto& args)
DECLARE(spu_runtime::g_escape) = build_function_asm<void(*)(spu_thread*)>("spu_escape", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
// Restore native stack pointer (longjmp emulation)
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)));
// Return to the return location
c.sub(x86::rsp, 8);
c.ret();
#endif
});
DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, spu_function_t, u8*)>("spu_tail_escape", [](asmjit::x86::Assembler& c, auto& args)
DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, spu_function_t, u8*)>("spu_tail_escape", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
// Restore native stack pointer (longjmp emulation)
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)));
@ -278,6 +282,7 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, sp
c.xor_(x86::ebx, x86::ebx);
c.mov(x86::qword_ptr(x86::rsp), args[1]);
c.ret();
#endif
});
DECLARE(spu_runtime::g_interpreter_table) = {};
@ -364,7 +369,7 @@ void spu_cache::initialize()
{
spu_runtime::g_interpreter = spu_runtime::g_gateway;
if (g_cfg.core.spu_decoder == spu_decoder_type::precise || g_cfg.core.spu_decoder == spu_decoder_type::fast)
if (g_cfg.core.spu_decoder == spu_decoder_type::_static || g_cfg.core.spu_decoder == spu_decoder_type::dynamic)
{
for (auto& x : *spu_runtime::g_dispatcher)
{
@ -395,7 +400,7 @@ void spu_cache::initialize()
atomic_t<usz> fnext{};
atomic_t<u8> fail_flag{0};
if (g_cfg.core.spu_decoder == spu_decoder_type::fast || g_cfg.core.spu_decoder == spu_decoder_type::llvm)
if (g_cfg.core.spu_decoder == spu_decoder_type::dynamic || g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
if (auto compiler = spu_recompiler_base::make_llvm_recompiler(11))
{
@ -634,7 +639,7 @@ void spu_cache::initialize()
for (u32 i = 0; i < f->data.size(); i++)
{
fmt::append(dump, "%-10s", s_spu_iname.decode(std::bit_cast<be_t<u32>>(f->data[i])));
fmt::append(dump, "%-10s", g_spu_iname.decode(std::bit_cast<be_t<u32>>(f->data[i])));
}
n_max = std::max(n_max, ::size32(depth_n));
@ -1289,15 +1294,13 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* /*rip*/)
{
if (g_cfg.core.spu_decoder > spu_decoder_type::fast)
if (g_cfg.core.spu_decoder != spu_decoder_type::_static)
{
fmt::throw_exception("Invalid SPU decoder");
}
// Select opcode table
const auto& table = *(g_cfg.core.spu_decoder == spu_decoder_type::precise
? &g_spu_interpreter_precise.get_table()
: &g_spu_interpreter_fast.get_table());
const auto& table = g_fxo->get<spu_interpreter_rt>();
// LS pointer
const auto base = static_cast<const u8*>(ls);
@ -1311,7 +1314,7 @@ void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* /*rip*/
}
const u32 op = *reinterpret_cast<const be_t<u32>*>(base + spu.pc);
if (table[spu_decode(op)](spu, {op}))
if (table.decode(op)(spu, {op}))
spu.pc += 4;
}
}
@ -1430,7 +1433,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
m_targets.erase(pos);
// Fill register access info
if (auto iflags = s_spu_iflag.decode(data))
if (auto iflags = g_spu_iflag.decode(data))
{
if (+iflags & +spu_iflag::use_ra)
m_use_ra[pos / 4] = op.ra;
@ -1441,7 +1444,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
}
// Analyse instruction
switch (const auto type = s_spu_itype.decode(data))
switch (const auto type = g_spu_itype.decode(data))
{
case spu_itype::UNK:
case spu_itype::DFCEQ:
@ -2297,7 +2300,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
// Decode instruction
const spu_opcode_t op{std::bit_cast<be_t<u32>>(result.data[(ia - lsa) / 4])};
const auto type = s_spu_itype.decode(op.opcode);
const auto type = g_spu_itype.decode(op.opcode);
u8 reg_save = 255;
@ -2790,7 +2793,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
{
// Decode instruction again (result.data holds one entry per instruction, so the byte offset is divided by 4)
op.opcode = std::bit_cast<be_t<u32>>(result.data[(ia - lsa) / 4]);
last_inst = s_spu_itype.decode(op.opcode);
last_inst = g_spu_itype.decode(op.opcode);
// Propagate some constants
switch (last_inst)
@ -5035,7 +5038,7 @@ public:
// Execute interpreter instruction
const u32 op = *reinterpret_cast<const be_t<u32>*>(_spu->_ptr<u8>(0) + _spu->pc);
if (!g_spu_interpreter_fast.decode(op)(*_spu, {op}))
if (!g_fxo->get<spu_interpreter_rt>().decode(op)(*_spu, {op}))
spu_log.fatal("Bad instruction");
// Swap state
@ -5151,10 +5154,10 @@ public:
const u32 op = i << (32u - m_interp_magn);
// Instruction type
const auto itype = s_spu_itype.decode(op);
const auto itype = g_spu_itype.decode(op);
// Function name
std::string fname = fmt::format("spu_%s", s_spu_iname.decode(op));
std::string fname = fmt::format("spu_%s", g_spu_iname.decode(op));
if (last_itype != itype)
{
@ -5460,7 +5463,7 @@ public:
return _spu->check_state();
}
template <spu_inter_func_t F>
template <spu_intrp_func_t F>
static void exec_fall(spu_thread* _spu, spu_opcode_t op)
{
if (F(*_spu, op))
@ -5469,10 +5472,10 @@ public:
}
}
template <spu_inter_func_t F>
template <spu_intrp_func_t F>
void fall(spu_opcode_t op)
{
std::string name = fmt::format("spu_%s", s_spu_iname.decode(op.opcode));
std::string name = fmt::format("spu_%s", g_spu_iname.decode(op.opcode));
if (m_interp_magn)
{
@ -6808,11 +6811,21 @@ public:
set_vr(op.rt, fshl(a, zshuffle(a, 4, 0, 1, 2), b));
}
#if defined(ARCH_X64)
// Byte-rotate helper for ROTQBY (x64 build), called at runtime as "spu_rotqby".
// The vector is stored twice back to back and 16 bytes are loaded (unaligned) starting at
// offset 16 - (b & 0xf), so result byte i equals byte (i - (b & 0xf)) mod 16 of `a`.
// Only the low 4 bits of `b` are used; a count of 0 reads the second copy and returns `a` unchanged.
static __m128i exec_rotqby(__m128i a, u8 b)
{
alignas(32) const __m128i buf[2]{a, a};
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const u8*>(buf) + (16 - (b & 0xf))));
}
#else
// Byte-rotate helper for ROTQBY (non-x64 build), called at runtime as "spu_rotqby".
// The vector is stored twice back to back and 16 bytes are copied out starting at
// offset 16 - (b & 0xf), so result byte i equals byte (i - (b & 0xf)) mod 16 of `a`.
// Only the low 4 bits of `b` are used; memcpy stands in for the unaligned SSE load of the x64 variant.
static v128 exec_rotqby(v128 a, u8 b)
{
alignas(32) const v128 buf[2]{a, a};
alignas(16) v128 res;
std::memcpy(&res, reinterpret_cast<const u8*>(buf) + (16 - (b & 0xf)), 16);
return res;
}
#endif
void ROTQBY(spu_opcode_t op)
{
@ -6822,7 +6835,7 @@ public:
if (!m_use_ssse3)
{
value_t<u8[16]> r;
r.value = call("spu_rotqby", &exec_rotqby, a.value, eval(extract(b, 12)).value);
r.value = call<u8[16]>("spu_rotqby", &exec_rotqby, a.value, eval(extract(b, 12)).value);
set_vr(op.rt, r);
return;
}
@ -7805,7 +7818,7 @@ public:
{
const auto [a, b, c] = get_vrs<f64[2]>(op.ra, op.rb, op.rt);
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
set_vr(op.rt, fmuladd(a, b, c, true));
else
set_vr(op.rt, a * b + c);
@ -7815,7 +7828,7 @@ public:
{
const auto [a, b, c] = get_vrs<f64[2]>(op.ra, op.rb, op.rt);
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
set_vr(op.rt, fmuladd(a, b, -c, true));
else
set_vr(op.rt, a * b - c);
@ -7825,7 +7838,7 @@ public:
{
const auto [a, b, c] = get_vrs<f64[2]>(op.ra, op.rb, op.rt);
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
set_vr(op.rt, fmuladd(-a, b, c, true));
else
set_vr(op.rt, c - (a * b));
@ -7835,7 +7848,7 @@ public:
{
const auto [a, b, c] = get_vrs<f64[2]>(op.ra, op.rb, op.rt);
if (g_cfg.core.llvm_accurate_dfma)
if (g_cfg.core.use_accurate_dfma)
set_vr(op.rt, -fmuladd(a, b, c, true));
else
set_vr(op.rt, -(a * b + c));
@ -9894,11 +9907,11 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_llvm_recompiler(u
return std::make_unique<spu_llvm_recompiler>(magn);
}
const spu_decoder<spu_llvm_recompiler> g_spu_llvm_decoder;
const spu_decoder<spu_llvm_recompiler> s_spu_llvm_decoder;
decltype(&spu_llvm_recompiler::UNK) spu_llvm_recompiler::decode(u32 op)
{
return g_spu_llvm_decoder.decode(op);
return s_spu_llvm_decoder.decode(op);
}
#else
@ -10025,6 +10038,11 @@ struct spu_llvm
void operator()()
{
if (g_cfg.core.spu_decoder != spu_decoder_type::llvm)
{
return;
}
// To compile (hash -> item)
std::unordered_multimap<u64, spu_item*, value_hash<u64>> enqueued;
@ -10345,7 +10363,7 @@ struct spu_fast : public spu_recompiler_base
// Fix endianness
const spu_opcode_t op{std::bit_cast<be_t<u32>>(func.data[i])};
switch (auto type = s_spu_itype.decode(op.opcode))
switch (auto type = g_spu_itype.decode(op.opcode))
{
case spu_itype::BRZ:
case spu_itype::BRHZ:

View file

@ -30,7 +30,7 @@
#include "util/vm.hpp"
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
using spu_rdata_t = decltype(spu_thread::rdata);
@ -87,14 +87,13 @@ void fmt_class_string<spu_type>::format(std::string& out, u64 arg)
// Verify AVX availability for TSX transactions
static const bool s_tsx_avx = utils::has_avx();
// For special case
static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx();
// Threshold for when rep movsb is expected to outperform simd copies
// The threshold will be 0xFFFFFFFF when the performance of rep movsb is expected to be bad
static const u32 s_rep_movsb_threshold = utils::get_rep_movsb_threshold();
#ifndef _MSC_VER
#if defined(_M_X64)
extern "C" void __movsb(uchar*, const uchar*, size_t);
#elif defined(ARCH_X64)
static FORCE_INLINE void __movsb(unsigned char * Dst, const unsigned char * Src, size_t Size)
{
__asm__ __volatile__
@ -104,8 +103,12 @@ static FORCE_INLINE void __movsb(unsigned char * Dst, const unsigned char * Src,
"[Dst]" (Dst), "[Src]" (Src), "[Size]" (Size)
);
}
#else
#define s_rep_movsb_threshold umax
#define __movsb std::memcpy
#endif
#if defined(ARCH_X64)
static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
{
#if defined(_MSC_VER) || defined(__AVX__)
@ -145,18 +148,21 @@ static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
return result;
#endif
}
#endif
#ifdef _MSC_VER
__forceinline
#endif
extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs)
{
#if defined(ARCH_X64)
#ifndef __AVX__
if (s_tsx_avx) [[likely]]
#endif
{
return cmp_rdata_avx(reinterpret_cast<const __m256i*>(_lhs), reinterpret_cast<const __m256i*>(_rhs));
}
#endif
const auto lhs = reinterpret_cast<const v128*>(_lhs);
const auto rhs = reinterpret_cast<const v128*>(_rhs);
@ -165,9 +171,10 @@ extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs)
const v128 c = (lhs[4] ^ rhs[4]) | (lhs[5] ^ rhs[5]);
const v128 d = (lhs[6] ^ rhs[6]) | (lhs[7] ^ rhs[7]);
const v128 r = (a | b) | (c | d);
return r == v128{};
return gv_testz(r);
}
#if defined(ARCH_X64)
static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src)
{
#ifdef _MSC_VER
@ -199,12 +206,14 @@ static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src)
);
#endif
}
#endif
#ifdef _MSC_VER
__forceinline
#endif
extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src)
{
#if defined(ARCH_X64)
#ifndef __AVX__
if (s_tsx_avx) [[likely]]
#endif
@ -232,8 +241,12 @@ extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src)
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 80), v1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 96), v2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 112), v3);
#else
std::memcpy(_dst, _src, 128);
#endif
}
#if defined(ARCH_X64)
static FORCE_INLINE void mov_rdata_nt_avx(__m256i* dst, const __m256i* src)
{
#ifdef _MSC_VER
@ -265,9 +278,11 @@ static FORCE_INLINE void mov_rdata_nt_avx(__m256i* dst, const __m256i* src)
);
#endif
}
#endif
extern void mov_rdata_nt(spu_rdata_t& _dst, const spu_rdata_t& _src)
{
#if defined(ARCH_X64)
#ifndef __AVX__
if (s_tsx_avx) [[likely]]
#endif
@ -295,6 +310,9 @@ extern void mov_rdata_nt(spu_rdata_t& _dst, const spu_rdata_t& _src)
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 80), v1);
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 96), v2);
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 112), v3);
#else
std::memcpy(_dst, _src, 128);
#endif
}
void do_cell_atomic_128_store(u32 addr, const void* to_write);
@ -421,10 +439,11 @@ std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
return res;
}
const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>("spu_putllc_tx", [](asmjit::x86::Assembler& c, auto& args)
const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>("spu_putllc_tx", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
Label fall = c.newLabel();
Label fail = c.newLabel();
Label _ret = c.newLabel();
@ -677,12 +696,16 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
c.bind(ret2);
#endif
c.ret();
#else
c.ret(a64::x30);
#endif
});
const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata, u64* _stx, u64* _ftx)>("spu_putlluc_tx", [](asmjit::x86::Assembler& c, auto& args)
const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata, u64* _stx, u64* _ftx)>("spu_putlluc_tx", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
Label fall = c.newLabel();
Label _ret = c.newLabel();
@ -803,12 +826,16 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
c.bind(ret2);
#endif
c.ret();
#else
c.ret(a64::x30);
#endif
});
const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>("spu_getllar_tx", [](asmjit::x86::Assembler& c, auto& args)
const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>("spu_getllar_tx", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
Label fall = c.newLabel();
Label _ret = c.newLabel();
@ -938,6 +965,9 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
c.bind(ret2);
#endif
c.ret();
#else
c.ret(a64::x30);
#endif
});
void spu_int_ctrl_t::set(u64 ints)
@ -967,7 +997,7 @@ spu_imm_table_t::scale_table_t::scale_table_t()
{
for (s32 i = -155; i < 174; i++)
{
m_data[i + 155].vf = _mm_set1_ps(static_cast<float>(std::exp2(i)));
m_data[i + 155] = v128::fromf32p(static_cast<float>(std::exp2(i)));
}
}
@ -1385,6 +1415,8 @@ void spu_thread::cpu_task()
std::fesetround(FE_TOWARDZERO);
gv_set_zeroing_denormals();
g_tls_log_prefix = []
{
const auto cpu = static_cast<spu_thread*>(get_current_cpu_thread());
@ -1622,7 +1654,7 @@ spu_thread::spu_thread(lv2_spu_group* group, u32 index, std::string_view name, u
jit = spu_recompiler_base::make_fast_llvm_recompiler();
}
if (g_cfg.core.spu_decoder != spu_decoder_type::fast && g_cfg.core.spu_decoder != spu_decoder_type::precise)
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit || g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
if (g_cfg.core.spu_block_size != spu_block_size_type::safe)
{
@ -2640,7 +2672,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
return false;
});
const u64 count2 = __rdtsc() - perf2.get();
const u64 count2 = utils::get_tsc() - perf2.get();
if (count2 > 20000 && g_cfg.core.perf_report) [[unlikely]]
{
@ -2672,7 +2704,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
utils::prefetch_read(rdata + 64);
last_faddr = addr;
last_ftime = res.load() & -128;
last_ftsc = __rdtsc();
last_ftsc = utils::get_tsc();
return false;
}
default:
@ -2854,7 +2886,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
});
vm::reservation_acquire(addr) += 32;
result = __rdtsc() - perf0.get();
result = utils::get_tsc() - perf0.get();
}
if (result > 20000 && g_cfg.core.perf_report) [[unlikely]]
@ -3007,7 +3039,7 @@ bool spu_thread::do_mfc(bool can_escape, bool must_finish)
{
// Get commands' execution mask
// Mask bits are always set when mfc_transfers_shuffling is 0
return static_cast<u16>((0 - (1u << std::min<u32>(g_cfg.core.mfc_transfers_shuffling, size))) | __rdtsc());
return static_cast<u16>((0 - (1u << std::min<u32>(g_cfg.core.mfc_transfers_shuffling, size))) | utils::get_tsc());
};
// Process enqueued commands
@ -3684,9 +3716,9 @@ void spu_thread::set_interrupt_status(bool enable)
// Detect enabling interrupts with events masked
if (auto mask = ch_events.load().mask; mask & SPU_EVENT_INTR_BUSY_CHECK)
{
if (g_cfg.core.spu_decoder != spu_decoder_type::precise && g_cfg.core.spu_decoder != spu_decoder_type::fast)
if (g_cfg.core.spu_decoder != spu_decoder_type::_static)
{
fmt::throw_exception("SPU Interrupts not implemented (mask=0x%x): Use interpreterts", mask);
fmt::throw_exception("SPU Interrupts not implemented (mask=0x%x): Use static interpreter", mask);
}
spu_log.trace("SPU Interrupts (mask=0x%x) are using CPU busy checking mode", mask);

View file

@ -503,9 +503,9 @@ struct spu_imm_table_t
public:
scale_table_t();
FORCE_INLINE const auto& operator [](s32 scale) const
FORCE_INLINE const v128& operator [](s32 scale) const
{
return m_data[scale + 155].vf;
return m_data[scale + 155];
}
}
const scale;

View file

@ -75,30 +75,28 @@ void fmt_class_string<lv2_protocol>::format(std::string& out, u64 arg)
});
}
static bool null_func_(ppu_thread& ppu)
static void null_func_(ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*)
{
ppu_log.todo("Unimplemented syscall %s -> CELL_OK (r3=0x%llx, r4=0x%llx, r5=0x%llx, r6=0x%llx, r7=0x%llx, r8=0x%llx, r9=0x%llx, r10=0x%llx)", ppu_syscall_code(ppu.gpr[11]),
ppu.gpr[3], ppu.gpr[4], ppu.gpr[5], ppu.gpr[6], ppu.gpr[7], ppu.gpr[8], ppu.gpr[9], ppu.gpr[10]);
ppu.gpr[3] = 0;
ppu.cia += 4;
return false;
ppu.cia = vm::get_addr(this_op) + 4;
}
static bool uns_func_(ppu_thread& ppu)
static void uns_func_(ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*)
{
ppu_log.trace("Unused syscall %d -> ENOSYS", ppu.gpr[11]);
ppu.gpr[3] = CELL_ENOSYS;
ppu.cia += 4;
return false;
ppu.cia = vm::get_addr(this_op) + 4;
}
// Bind Syscall
#define BIND_SYSC(func) {BIND_FUNC(func), #func}
#define NULL_FUNC(name) {null_func_, #name}
constexpr std::pair<ppu_function_t, std::string_view> null_func{null_func_, ""};
constexpr std::pair<ppu_function_t, std::string_view> uns_func{uns_func_, ""};
constexpr std::pair<ppu_intrp_func_t, std::string_view> null_func{null_func_, ""};
constexpr std::pair<ppu_intrp_func_t, std::string_view> uns_func{uns_func_, ""};
// UNS = Unused
// ROOT = Root
@ -106,7 +104,7 @@ constexpr std::pair<ppu_function_t, std::string_view> uns_func{uns_func_, ""};
// DEX..DECR = Unavailable on retail consoles
// PM = Product Mode
// AuthID = Authentication ID
const std::array<std::pair<ppu_function_t, std::string_view>, 1024> g_ppu_syscall_table
const std::array<std::pair<ppu_intrp_func_t, std::string_view>, 1024> g_ppu_syscall_table
{
null_func,
BIND_SYSC(sys_process_getpid), //1 (0x001)
@ -1151,7 +1149,7 @@ extern void ppu_execute_syscall(ppu_thread& ppu, u64 code)
if (const auto func = g_ppu_syscall_table[code].first)
{
func(ppu);
func(ppu, {}, vm::_ptr<u32>(ppu.cia), nullptr);
ppu_log.trace("Syscall '%s' (%llu) finished, r3=0x%llx", ppu_syscall_code(code), code, ppu.gpr[3]);
return;
}
@ -1160,7 +1158,7 @@ extern void ppu_execute_syscall(ppu_thread& ppu, u64 code)
fmt::throw_exception("Invalid syscall number (%llu)", code);
}
extern ppu_function_t ppu_get_syscall(u64 code)
extern ppu_intrp_func_t ppu_get_syscall(u64 code)
{
if (code < g_ppu_syscall_table.size())
{

View file

@ -11,6 +11,10 @@
#include <winsock2.h>
#include <WS2tcpip.h>
#else
#ifdef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
#include <errno.h>
#include <sys/time.h>
#include <sys/types.h>
@ -22,6 +26,9 @@
#include <unistd.h>
#include <fcntl.h>
#include <poll.h>
#ifdef __clang__
#pragma GCC diagnostic pop
#endif
#endif
#include "Emu/NP/np_handler.h"

View file

@ -317,7 +317,7 @@ void usb_handler_thread::operator()()
{
timeval lusb_tv{0, 200};
while (thread_ctrl::state() != thread_state::aborting)
while (ctx && thread_ctrl::state() != thread_state::aborting)
{
// Todo: Hotplug here?

View file

@ -15,6 +15,10 @@
#include <WS2tcpip.h>
#include <afunix.h> // sockaddr_un
#else
#ifdef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
@ -25,6 +29,9 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/un.h> // sockaddr_un
#ifdef __clang__
#pragma GCC diagnostic pop
#endif
#endif
#include <charconv>

View file

@ -684,7 +684,7 @@ namespace vm
// 1. To simplify range_lock logic
// 2. To make sure it never overlaps with 32-bit addresses
// Also check that it's aligned (lowest 16 bits)
ensure((shm_self & 0xffff'8000'0000'ffff) == range_locked);
ensure((shm_self & 0xffff'0000'0000'ffff) == range_locked);
// Find another mirror and map it as shareable too
for (auto& ploc : g_locations)
@ -714,7 +714,7 @@ namespace vm
u64 shm_self = reinterpret_cast<u64>(shm->get()) ^ range_locked;
// Check (see above)
ensure((shm_self & 0xffff'8000'0000'ffff) == range_locked);
ensure((shm_self & 0xffff'0000'0000'ffff) == range_locked);
// Map range as shareable
for (u32 i = addr / 65536; i < addr / 65536 + size / 65536; i++)
@ -1129,13 +1129,16 @@ namespace vm
{
auto fill64 = [](u8* ptr, u64 data, usz count)
{
#ifdef _MSC_VER
#ifdef _M_X64
__stosq(reinterpret_cast<u64*>(ptr), data, count);
#else
#elif defined(ARCH_X64)
__asm__ ("mov %0, %%rdi; mov %1, %%rax; mov %2, %%rcx; rep stosq;"
:
: "r" (ptr), "r" (data), "r" (count)
: "rdi", "rax", "rcx", "memory");
#else
for (usz i = 0; i < count; i++)
reinterpret_cast<u64*>(ptr)[i] = data;
#endif
};

View file

@ -200,16 +200,10 @@ namespace vm
return {};
}
// Unsafely convert a host pointer to a PS3 VM address (keeps only the low 32 bits, assuming the VM base is 4GiB-aligned)
inline vm::addr_t get_addr(const void* ptr)
{
const auto [addr, ok] = try_get_addr(ptr);
if (!ok)
{
fmt::throw_exception("Not a virtual memory pointer (%p)", ptr);
}
return addr;
return vm::addr_t{static_cast<u32>(uptr(ptr))};
}
template<typename T>

View file

@ -3,6 +3,7 @@
#include "vm.h"
#include "vm_locking.h"
#include "util/atomic.hpp"
#include "util/tsc.hpp"
#include <functional>
extern bool g_use_rtm;
@ -11,7 +12,6 @@ extern u64 g_rtm_tx_limit2;
#ifdef _MSC_VER
extern "C"
{
u64 __rdtsc();
u32 _xbegin();
void _xend();
}
@ -19,15 +19,6 @@ extern "C"
namespace vm
{
inline u64 get_tsc()
{
#ifdef _MSC_VER
return __rdtsc();
#else
return __builtin_ia32_rdtsc();
#endif
}
enum : u64
{
rsrv_lock_mask = 127,
@ -108,13 +99,14 @@ namespace vm
auto& res = vm::reservation_acquire(addr);
//_m_prefetchw(&res);
#if defined(ARCH_X64)
if (g_use_rtm)
{
// Stage 1: single optimistic transaction attempt
unsigned status = -1;
u64 _old = 0;
auto stamp0 = get_tsc(), stamp1 = stamp0, stamp2 = stamp0;
auto stamp0 = utils::get_tsc(), stamp1 = stamp0, stamp2 = stamp0;
#ifndef _MSC_VER
__asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
@ -176,16 +168,16 @@ namespace vm
#ifndef _MSC_VER
__asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory");
#endif
stamp1 = get_tsc();
stamp1 = utils::get_tsc();
// Stage 2: try to lock reservation first
_old = res.fetch_add(1);
// Compute stamps excluding memory touch
stamp2 = get_tsc() - (stamp1 - stamp0);
stamp2 = utils::get_tsc() - (stamp1 - stamp0);
// Start lightened transaction
for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = get_tsc())
for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = utils::get_tsc())
{
if (cpu.has_pause_flag())
{
@ -285,6 +277,9 @@ namespace vm
return result;
}
}
#else
static_cast<void>(cpu);
#endif /* ARCH_X64 */
// Lock reservation and perform heavyweight lock
reservation_shared_lock_internal(res);

View file

@ -8,8 +8,15 @@
#ifdef _WIN32
#include <WS2tcpip.h>
#else
#ifdef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
#include <sys/socket.h>
#include <arpa/inet.h>
#ifdef __clang__
#pragma GCC diagnostic pop
#endif
#endif
LOG_CHANNEL(dnshook_log, "DnsHook");

View file

@ -19,12 +19,19 @@
#include <WS2tcpip.h>
#include <iphlpapi.h>
#else
#ifdef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
#ifdef __clang__
#pragma GCC diagnostic pop
#endif
#endif
#if defined(__FreeBSD__) || defined(__APPLE__)

View file

@ -21,6 +21,10 @@
#include <winsock2.h>
#include <WS2tcpip.h>
#else
#ifdef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
#include <errno.h>
#include <sys/time.h>
#include <sys/types.h>
@ -32,6 +36,9 @@
#include <fcntl.h>
#include <poll.h>
#include <netdb.h>
#ifdef __clang__
#pragma GCC diagnostic pop
#endif
#endif
LOG_CHANNEL(rpcn_log, "rpcn");

View file

@ -11,9 +11,16 @@
#ifdef _WIN32
#include <winsock2.h>
#else
#ifdef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#ifdef __clang__
#pragma GCC diagnostic pop
#endif
#endif
#include "Emu/Cell/Modules/sceNp.h"

View file

@ -7,15 +7,25 @@
#include "util/sysinfo.hpp"
#include "util/asm.hpp"
#if defined(ARCH_X64)
#include "emmintrin.h"
#include "immintrin.h"
#endif
#if !defined(_MSC_VER) && defined(__clang__)
#if !defined(_MSC_VER)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
#if defined(_MSC_VER)
#ifdef ARCH_ARM64
#if !defined(_MSC_VER)
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif
#undef FORCE_INLINE
#include "Emu/CPU/sse2neon.h"
#endif
#if defined(_MSC_VER) || !defined(__SSE2__)
#define PLAIN_FUNC
#define SSSE3_FUNC
#define SSE4_1_FUNC
@ -57,7 +67,7 @@ constexpr bool s_use_ssse3 = true;
constexpr bool s_use_sse4_1 = true;
constexpr bool s_use_avx2 = true;
constexpr bool s_use_avx3 = false;
#elif defined(__SSE41__)
#elif defined(__SSE4_1__)
constexpr bool s_use_ssse3 = true;
constexpr bool s_use_sse4_1 = true;
constexpr bool s_use_avx2 = false;
@ -67,11 +77,16 @@ constexpr bool s_use_ssse3 = true;
constexpr bool s_use_sse4_1 = false;
constexpr bool s_use_avx2 = false;
constexpr bool s_use_avx3 = false;
#else
#elif defined(ARCH_X64)
const bool s_use_ssse3 = utils::has_ssse3();
const bool s_use_sse4_1 = utils::has_sse41();
const bool s_use_avx2 = utils::has_avx2();
const bool s_use_avx3 = utils::has_avx512();
#else
constexpr bool s_use_ssse3 = true; // Non x86
constexpr bool s_use_sse4_1 = true; // Non x86
constexpr bool s_use_avx2 = false;
constexpr bool s_use_avx3 = false;
#endif
const __m128i s_bswap_u32_mask = _mm_set_epi8(
@ -98,7 +113,7 @@ namespace utils
namespace
{
template <bool Compare>
PLAIN_FUNC bool copy_data_swap_u32_naive(u32* dst, const u32* src, u32 count)
PLAIN_FUNC auto copy_data_swap_u32_naive(u32* dst, const u32* src, u32 count)
{
u32 result = 0;
@ -117,11 +132,14 @@ namespace
dst[i] = data;
}
return static_cast<bool>(result);
if constexpr (Compare)
{
return static_cast<bool>(result);
}
}
template <bool Compare>
SSSE3_FUNC bool copy_data_swap_u32_ssse3(u32* dst, const u32* src, u32 count)
SSSE3_FUNC auto copy_data_swap_u32_ssse3(u32* dst, const u32* src, u32 count)
{
u32 result = 0;
@ -140,9 +158,13 @@ namespace
dst[i] = data;
}
return static_cast<bool>(result);
if constexpr (Compare)
{
return static_cast<bool>(result);
}
}
#if defined(ARCH_X64)
template <bool Compare, int Size, typename RT>
void build_copy_data_swap_u32_avx3(asmjit::x86::Assembler& c, std::array<asmjit::x86::Gp, 4>& args, const RT& rmask, const RT& rload, const RT& rtest)
{
@ -199,8 +221,7 @@ namespace
c.jmp(loop);
c.bind(tail);
c.shlx(x86::eax, x86::eax, args[2].r32());
c.not_(x86::eax);
c.bzhi(x86::eax, x86::eax, args[2].r32());
c.kmovw(x86::k1, x86::eax);
c.k(x86::k1).z().vmovdqu32(rload, x86::Mem(args[1], 0, Size * 4u));
c.vpshufb(rload, rload, rmask);
@ -230,7 +251,7 @@ namespace
}
template <bool Compare>
void build_copy_data_swap_u32(asmjit::x86::Assembler& c, std::array<asmjit::x86::Gp, 4>& args)
void build_copy_data_swap_u32(native_asm& c, native_args& args)
{
using namespace asmjit;
@ -254,11 +275,18 @@ namespace
c.jmp(asmjit::imm_ptr(&copy_data_swap_u32_naive<Compare>));
}
#else
// Fallback used when ARCH_X64 is not defined: instead of emitting code with an
// assembler, this simply yields the address of the naive implementation.
// Compare selects the copy_data_swap_u32_naive<Compare> instantiation to return.
template <bool Compare>
constexpr auto build_copy_data_swap_u32()
{
	return &copy_data_swap_u32_naive<Compare>;
}
#endif
}
built_function<void(*)(void*, const void*, u32)> copy_data_swap_u32("copy_data_swap_u32", &build_copy_data_swap_u32<false>);
built_function<void(*)(u32*, const u32*, u32)> copy_data_swap_u32("copy_data_swap_u32", &build_copy_data_swap_u32<false>);
built_function<bool(*)(void*, const void*, u32)> copy_data_swap_u32_cmp("copy_data_swap_u32_cmp", &build_copy_data_swap_u32<true>);
built_function<bool(*)(u32*, const u32*, u32)> copy_data_swap_u32_cmp("copy_data_swap_u32_cmp", &build_copy_data_swap_u32<true>);
namespace
{
@ -390,6 +418,7 @@ namespace
struct primitive_restart_impl
{
#if defined(ARCH_X64)
AVX2_FUNC
static
std::tuple<u16, u16> upload_u16_swapped_avx2(const void *src, void *dst, u32 iterations, u16 restart_index)
@ -428,6 +457,7 @@ namespace
return std::make_tuple(min_index, max_index);
}
#endif
SSE4_1_FUNC
static
@ -512,9 +542,11 @@ namespace
{
if (s_use_avx2)
{
#if defined(ARCH_X64)
u32 iterations = length >> 4;
written = length & ~0xF;
std::tie(min_index, max_index) = upload_u16_swapped_avx2(src.data(), dst.data(), iterations, restart_index);
#endif
}
else if (s_use_sse4_1)
{

View file

@ -51,7 +51,7 @@ void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w);
void stream_vector_from_memory(void *dst, void *src);
// Copy and swap data in 32-bit units
extern built_function<void(*)(void*, const void*, u32)> copy_data_swap_u32;
extern built_function<void(*)(u32*, const u32*, u32)> copy_data_swap_u32;
// Copy and swap data in 32-bit units, return true if changed
extern built_function<bool(*)(void*, const void*, u32)> copy_data_swap_u32_cmp;
extern built_function<bool(*)(u32*, const u32*, u32)> copy_data_swap_u32_cmp;

View file

@ -38,16 +38,16 @@ namespace gl
ensure(real_pitch == (width * 4));
if (rsx_pitch == real_pitch) [[likely]]
{
copy_data_swap_u32(dst, dst, valid_length / 4);
copy_data_swap_u32(static_cast<u32*>(dst), static_cast<u32*>(dst), valid_length / 4);
}
else
{
const u32 num_rows = utils::align(valid_length, rsx_pitch) / rsx_pitch;
u8* data = static_cast<u8*>(dst);
u32* data = static_cast<u32*>(dst);
for (u32 row = 0; row < num_rows; ++row)
{
copy_data_swap_u32(data, data, width);
data += rsx_pitch;
data += rsx_pitch / 4;
}
}
break;

View file

@ -2,9 +2,12 @@
#include "ProgramStateCache.h"
#include "emmintrin.h"
#include "util/asm.hpp"
#if defined(ARCH_X64)
#include "emmintrin.h"
#endif
template <typename Traits>
void program_state_cache<Traits>::fill_fragment_constants_buffer(std::span<f32> dst_buffer, const RSXFragmentProgram &fragment_program, bool sanitize) const
{
@ -19,12 +22,23 @@ void program_state_cache<Traits>::fill_fragment_constants_buffer(std::span<f32>
for (usz offset_in_fragment_program : I->second.FragmentConstantOffsetCache)
{
char* data = static_cast<char*>(fragment_program.get_data()) + offset_in_fragment_program;
#if defined(ARCH_X64)
const __m128i vector = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8));
#else
for (u32 i = 0; i < 4; i++)
{
const u32 value = reinterpret_cast<u32*>(data)[i];
tmp[i] = std::bit_cast<f32, u32>(((value >> 8) & 0xff00ff) | ((value << 8) & 0xff00ff00));
}
#endif
if (!patch_table.is_empty())
{
#if defined(ARCH_X64)
_mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector));
#endif
for (int i = 0; i < 4; ++i)
{
@ -47,15 +61,29 @@ void program_state_cache<Traits>::fill_fragment_constants_buffer(std::span<f32>
}
else if (sanitize)
{
#if defined(ARCH_X64)
//Convert NaNs and Infs to 0
const auto masked = _mm_and_si128(shuffled_vector, _mm_set1_epi32(0x7fffffff));
const auto valid = _mm_cmplt_epi32(masked, _mm_set1_epi32(0x7f800000));
const auto result = _mm_and_si128(shuffled_vector, valid);
_mm_stream_si128(utils::bless<__m128i>(dst), result);
#else
// Convert NaNs and Infs to 0 (scalar counterpart of the SSE path)
for (u32 i = 0; i < 4; i++)
{
	// Classify by raw bit pattern: an all-ones exponent means Inf or NaN
	const u32 value = std::bit_cast<u32>(tmp[i]);

	// Only overwrite invalid lanes. Writing `value` (a u32) back into the f32
	// slot would convert the integer numerically instead of keeping its bits,
	// so valid lanes are simply left untouched.
	if ((value & 0x7fffffff) >= 0x7f800000)
	{
		tmp[i] = 0.f;
	}
}
std::memcpy(dst, tmp, 16);
#endif
}
else
{
#if defined(ARCH_X64)
_mm_stream_si128(utils::bless<__m128i>(dst), shuffled_vector);
#else
std::memcpy(dst, tmp, 16);
#endif
}
dst += 4;

View file

@ -20,6 +20,7 @@
#include "VKShaderInterpreter.h"
#include "VKQueryPool.h"
#include "../GCM.h"
#include "util/asm.hpp"
#include <thread>
#include <optional>
@ -310,11 +311,7 @@ namespace vk
{
while (num_waiters.load() != 0)
{
#ifdef _MSC_VER
_mm_pause();
#else
__builtin_ia32_pause();
#endif
utils::pause();
}
}

View file

@ -452,6 +452,18 @@ namespace vk
enabled_features.shaderStorageImageWriteWithoutFormat = VK_FALSE;
}
if (!pgpu->features.shaderClipDistance)
{
rsx_log.error("Your GPU does not support shader clip distance. Graphics will not render correctly.");
enabled_features.shaderClipDistance = VK_FALSE;
}
if (!pgpu->features.shaderStorageBufferArrayDynamicIndexing)
{
rsx_log.error("Your GPU does not support shader storage buffer array dynamic indexing. Graphics will not render correctly.");
enabled_features.shaderStorageBufferArrayDynamicIndexing = VK_FALSE;
}
if (!pgpu->features.samplerAnisotropy)
{
rsx_log.error("Your GPU does not support anisotropic filtering. Graphics may not render correctly.");

View file

@ -12,10 +12,6 @@
namespace vk
{
#ifdef _MSC_VER
extern "C" void _mm_pause();
#endif
fence::fence(VkDevice dev)
{
owner = dev;
@ -48,11 +44,7 @@ namespace vk
{
while (!flushed)
{
#ifdef _MSC_VER
_mm_pause();
#else
__builtin_ia32_pause();
#endif
utils::pause();
}
}
@ -218,11 +210,7 @@ namespace vk
}
}
#ifdef _MSC_VER
_mm_pause();
#else
__builtin_ia32_pause();
#endif
utils::pause();
}
}
}

View file

@ -42,8 +42,10 @@ namespace rsx
{
rsx->sync();
// Write ref+get atomically (get will be written again with the same value at command end)
vm::_ref<atomic_be_t<u64>>(rsx->dma_address + ::offset32(&RsxDmaControl::get)).store(u64{rsx->fifo_ctrl->get_pos()} << 32 | arg);
// Write ref+get (get will be written again with the same value at command end)
auto& dma = vm::_ref<RsxDmaControl>(rsx->dma_address);
dma.get.release(rsx->fifo_ctrl->get_pos());
dma.ref.store(arg);
}
void semaphore_acquire(thread* rsx, u32 /*reg*/, u32 arg)
@ -436,11 +438,11 @@ namespace rsx
if (rsx->m_graphics_state & rsx::pipeline_state::transform_constants_dirty)
{
// Minor optimization: don't compare values if we already know we need invalidation
copy_data_swap_u32(values, vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount);
copy_data_swap_u32(values, static_cast<u32*>(vm::base(rsx->fifo_ctrl->get_current_arg_ptr())), rcount);
}
else
{
if (copy_data_swap_u32_cmp(values, vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount))
if (copy_data_swap_u32_cmp(values, static_cast<u32*>(vm::base(rsx->fifo_ctrl->get_current_arg_ptr())), rcount))
{
// Transform constants invalidation is expensive (~8k bytes per update)
rsx->m_graphics_state |= rsx::pipeline_state::transform_constants_dirty;
@ -472,7 +474,7 @@ namespace rsx
rcount -= max - (max_vertex_program_instructions * 4);
}
copy_data_swap_u32(&rsx::method_registers.transform_program[load_pos * 4 + index % 4], vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount);
copy_data_swap_u32(&rsx::method_registers.transform_program[load_pos * 4 + index % 4], static_cast<u32*>(vm::base(rsx->fifo_ctrl->get_current_arg_ptr())), rcount);
rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_ucode_dirty;
rsx::method_registers.transform_program_load_set(load_pos + ((rcount + index % 4) / 4));

View file

@ -2,6 +2,8 @@
#include "perf_meter.hpp"
#include "util/sysinfo.hpp"
#include "util/fence.hpp"
#include "util/tsc.hpp"
#include "Utilities/Thread.h"
#include <map>
@ -68,18 +70,10 @@ void perf_stat_base::print(const char* name) const noexcept
}
}
#ifdef _MSC_VER
extern "C" void _mm_lfence();
#endif
SAFE_BUFFERS(void) perf_stat_base::push(u64 data[66], u64 start_time, const char* name) noexcept
{
// Event end
#ifdef _MSC_VER
const u64 end_time = (_mm_lfence(), get_tsc());
#else
const u64 end_time = (__builtin_ia32_lfence(), get_tsc());
#endif
const u64 end_time = (utils::lfence(), utils::get_tsc());
// Compute difference in seconds
const f64 diff = (end_time - start_time) * 1. / utils::get_tsc_freq();

View file

@ -2,26 +2,13 @@
#include "util/types.hpp"
#include "util/logs.hpp"
#include "util/tsc.hpp"
#include "system_config.h"
#include <array>
#include <cmath>
LOG_CHANNEL(perf_log, "PERF");
#ifdef _MSC_VER
extern "C" u64 __rdtsc();
inline u64 get_tsc()
{
return __rdtsc();
}
#else
inline u64 get_tsc()
{
return __builtin_ia32_rdtsc();
}
#endif
// TODO: constexpr with the help of bitcast
template <auto Name>
inline const auto perf_name = []
@ -145,7 +132,7 @@ public:
if constexpr (std::array<bool, sizeof...(SubEvents)>{(SubEvents == Event)...}[Index])
{
// Push actual timestamp into an array
m_timestamps[Index + 1] = get_tsc();
m_timestamps[Index + 1] = utils::get_tsc();
}
else if constexpr (Index < sizeof...(SubEvents))
{
@ -169,7 +156,7 @@ public:
// Re-initialize first timestamp
FORCE_INLINE SAFE_BUFFERS(void) restart() noexcept
{
m_timestamps[0] = get_tsc();
m_timestamps[0] = utils::get_tsc();
std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64));
}

View file

@ -52,12 +52,15 @@ struct cfg_root : cfg::node
cfg::_enum<tsx_usage> enable_TSX{ this, "Enable TSX", enable_tsx_by_default() ? tsx_usage::enabled : tsx_usage::disabled }; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully
cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false };
cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true };
cfg::_bool llvm_accurate_dfma{ this, "LLVM Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively
cfg::_bool llvm_ppu_jm_handling{ this, "PPU LLVM Java Mode Handling", true }; // Respect current Java Mode for alti-vec ops by PPU LLVM
cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
cfg::_bool llvm_ppu_accurate_vector_nan{ this, "PPU LLVM Accurate Vector NaN values", false };
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false};
cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false };
cfg::_bool use_accurate_dfma{ this, "Use Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively
cfg::_bool ppu_set_sat_bit{ this, "PPU Set Saturation Bit", false }; // Accuracy. If unset, completely disable saturation flag handling.
cfg::_bool ppu_use_nj_bit{ this, "PPU Use Non-Java Mode Bit", false }; // Accuracy. If unset, ignore NJ flag completely.
cfg::_bool ppu_fix_vnan{ this, "PPU Fixup Vector NaN Values", false }; // Accuracy. Partial.
cfg::_bool ppu_set_vnan{ this, "PPU Accurate Vector NaN Values", false }; // Accuracy. Implies ppu_fix_vnan.
cfg::_bool ppu_set_fpcc{ this, "PPU Set FPCC Bits", false }; // Accuracy.
cfg::_bool debug_console_mode{ this, "Debug Console Mode", false }; // Debug console emulation, not recommended
cfg::_bool hook_functions{ this, "Hook static functions" };

View file

@ -256,8 +256,8 @@ void fmt_class_string<spu_decoder_type>::format(std::string& out, u64 arg)
{
switch (type)
{
case spu_decoder_type::precise: return "Interpreter (precise)";
case spu_decoder_type::fast: return "Interpreter (fast)";
case spu_decoder_type::_static: return "Interpreter (static)";
case spu_decoder_type::dynamic: return "Interpreter (dynamic)";
case spu_decoder_type::asmjit: return "Recompiler (ASMJIT)";
case spu_decoder_type::llvm: return "Recompiler (LLVM)";
}
@ -440,8 +440,8 @@ void fmt_class_string<ppu_decoder_type>::format(std::string& out, u64 arg)
{
switch (type)
{
case ppu_decoder_type::precise: return "Interpreter (precise)";
case ppu_decoder_type::fast: return "Interpreter (fast)";
case ppu_decoder_type::_static: return "Interpreter (static)";
case ppu_decoder_type::dynamic: return "Interpreter (dynamic)";
case ppu_decoder_type::llvm: return "Recompiler (LLVM)";
}

View file

@ -2,15 +2,15 @@
enum class ppu_decoder_type : unsigned
{
precise = 0, // Don't change (0)
fast, // Don't change (1)
_static,
dynamic,
llvm,
};
enum class spu_decoder_type : unsigned
{
precise = 0, // Don't change (0)
fast, // Don't change (1)
_static,
dynamic,
asmjit,
llvm,
};

View file

@ -509,7 +509,7 @@
<ClInclude Include="util\media_utils.h" />
<ClInclude Include="util\serialization.hpp" />
<ClInclude Include="util\v128.hpp" />
<ClInclude Include="util\v128sse.hpp" />
<ClInclude Include="util\simd.hpp" />
<ClInclude Include="util\to_endian.hpp" />
<ClInclude Include="..\Utilities\bin_patch.h" />
<ClInclude Include="..\Utilities\BitField.h" />

View file

@ -1122,7 +1122,7 @@
<ClInclude Include="util\v128.hpp">
<Filter>Utilities</Filter>
</ClInclude>
<ClInclude Include="util\v128sse.hpp">
<ClInclude Include="util\simd.hpp">
<Filter>Utilities</Filter>
</ClInclude>
<ClInclude Include="util\to_endian.hpp">

View file

@ -42,6 +42,7 @@ DYNAMIC_IMPORT("ntdll.dll", NtSetTimerResolution, NTSTATUS(ULONG DesiredResoluti
#include <spawn.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <signal.h>
#endif
#ifdef __linux__
@ -49,7 +50,7 @@ DYNAMIC_IMPORT("ntdll.dll", NtSetTimerResolution, NTSTATUS(ULONG DesiredResoluti
#include <sys/resource.h>
#endif
#if defined(__APPLE__) && defined(BLOCKS) // BLOCKS is required for dispatch_sync, but GCC-11 does not support it
#if defined(__APPLE__)
#include <dispatch/dispatch.h>
#endif
@ -96,7 +97,7 @@ LOG_CHANNEL(q_debug, "QDEBUG");
fmt::append(buf, "\nThread id = %s.", std::this_thread::get_id());
}
const std::string_view text = buf.empty() ? _text : buf;
std::string_view text = buf.empty() ? _text : buf;
if (s_headless)
{
@ -124,18 +125,16 @@ LOG_CHANNEL(q_debug, "QDEBUG");
std::cerr << fmt::format("RPCS3: %s\n", text);
}
auto show_report = [](std::string_view text)
static auto show_report = [](std::string_view text)
{
fatal_error_dialog dlg(text);
dlg.exec();
};
#if defined(__APPLE__) && defined(BLOCKS) // BLOCKS is required for dispatch_sync, but GCC-11 does not support it
// Cocoa access is not allowed outside of the main thread
// Prevents crash dialogs from freezing the program
#if defined(__APPLE__)
if (!pthread_main_np())
{
dispatch_sync(dispatch_get_main_queue(), ^ { show_report(text); });
dispatch_sync_f(dispatch_get_main_queue(), &text, [](void* text){ show_report(*static_cast<std::string_view*>(text)); });
}
else
#endif
@ -143,9 +142,12 @@ LOG_CHANNEL(q_debug, "QDEBUG");
// If Qt is already initialized, spawn a new RPCS3 process with an --error argument
if (local)
{
// Since we only show an error, we can hope for a graceful exit
show_report(text);
std::exit(0);
#ifdef _WIN32
ExitProcess(0);
#else
kill(getpid(), SIGKILL);
#endif
}
#ifdef _WIN32

View file

@ -408,7 +408,7 @@ bool cheat_engine::set_value(const u32 offset, const T value)
if (exec_code_at_end || exec_code_at_start)
{
extern void ppu_register_function_at(u32, u32, ppu_function_t);
extern void ppu_register_function_at(u32, u32, ppu_intrp_func_t);
u32 addr = offset, size = sizeof(T);

View file

@ -45,8 +45,8 @@ extern bool is_using_interpreter(u32 id_type)
switch (id_type)
{
case 1: return g_cfg.core.ppu_decoder != ppu_decoder_type::llvm;
case 2: return g_cfg.core.spu_decoder == spu_decoder_type::fast || g_cfg.core.spu_decoder == spu_decoder_type::precise;
default: return true;
case 2: return g_cfg.core.spu_decoder != spu_decoder_type::asmjit && g_cfg.core.spu_decoder != spu_decoder_type::llvm;
default: return true;
}
}
@ -528,7 +528,7 @@ void debugger_frame::keyPressEvent(QKeyEvent* event)
dis_asm.disasm(*it);
fmt::append(ret, "\n(%u) 0x%08x: %s", i, *it, dis_asm.last_opcode);
}
if (ret.empty())
{
ret = "No PPU calls have been logged";
@ -1134,7 +1134,7 @@ void debugger_frame::EnableButtons(bool enable)
if (!cpu) enable = false;
const bool step = enable && is_using_interpreter(cpu->id_type());
m_go_to_addr->setEnabled(enable);
m_go_to_pc->setEnabled(enable);
m_btn_step->setEnabled(step);

View file

@ -1104,16 +1104,16 @@ QString emu_settings::GetLocalizedSetting(const QString& original, emu_settings_
case emu_settings_type::PPUDecoder:
switch (static_cast<ppu_decoder_type>(index))
{
case ppu_decoder_type::precise: return tr("Interpreter (precise)", "PPU decoder");
case ppu_decoder_type::fast: return tr("Interpreter (fast)", "PPU decoder");
case ppu_decoder_type::_static: return tr("Interpreter (static)", "PPU decoder");
case ppu_decoder_type::dynamic: return tr("Interpreter (dynamic)", "PPU decoder");
case ppu_decoder_type::llvm: return tr("Recompiler (LLVM)", "PPU decoder");
}
break;
case emu_settings_type::SPUDecoder:
switch (static_cast<spu_decoder_type>(index))
{
case spu_decoder_type::precise: return tr("Interpreter (precise)", "SPU decoder");
case spu_decoder_type::fast: return tr("Interpreter (fast)", "SPU decoder");
case spu_decoder_type::_static: return tr("Interpreter (static)", "SPU decoder");
case spu_decoder_type::dynamic: return tr("Interpreter (dynamic)", "SPU decoder");
case spu_decoder_type::asmjit: return tr("Recompiler (ASMJIT)", "SPU decoder");
case spu_decoder_type::llvm: return tr("Recompiler (LLVM)", "SPU decoder");
}

View file

@ -23,8 +23,6 @@ enum class emu_settings_type
AccurateGETLLAR,
AccurateSpuDMA,
AccurateClineStores,
AccurateLLVMdfma,
AccurateVectorNaN,
AccurateRSXAccess,
AccurateXFloat,
AccuratePPU128Loop,
@ -40,7 +38,12 @@ enum class emu_settings_type
ClocksScale,
PerformanceReport,
FullWidthAVX512,
PPULLVMJavaModeHandling,
AccurateDFMA,
AccuratePPUSAT,
AccuratePPUNJ,
FixupPPUVNAN,
AccuratePPUVNAN,
AccuratePPUFPCC,
// Graphics
Renderer,
@ -178,8 +181,6 @@ inline static const QMap<emu_settings_type, cfg_location> settings_location =
{ emu_settings_type::AccurateGETLLAR, { "Core", "Accurate GETLLAR"}},
{ emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}},
{ emu_settings_type::AccurateClineStores, { "Core", "Accurate Cache Line Stores"}},
{ emu_settings_type::AccurateLLVMdfma, { "Core", "LLVM Accurate DFMA"}},
{ emu_settings_type::AccurateVectorNaN, { "Core", "PPU LLVM Accurate Vector NaN values"}},
{ emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}},
{ emu_settings_type::AccurateXFloat, { "Core", "Accurate xfloat"}},
{ emu_settings_type::MFCCommandsShuffling, { "Core", "MFC Commands Shuffling Limit"}},
@ -194,7 +195,12 @@ inline static const QMap<emu_settings_type, cfg_location> settings_location =
{ emu_settings_type::PerformanceReport, { "Core", "Enable Performance Report"}},
{ emu_settings_type::FullWidthAVX512, { "Core", "Full Width AVX-512"}},
{ emu_settings_type::NumPPUThreads, { "Core", "PPU Threads"}},
{ emu_settings_type::PPULLVMJavaModeHandling, { "Core", "PPU LLVM Java Mode Handling"}},
{ emu_settings_type::AccurateDFMA, { "Core", "Use Accurate DFMA"}},
{ emu_settings_type::AccuratePPUSAT, { "Core", "PPU Set Saturation Bit"}},
{ emu_settings_type::AccuratePPUNJ, { "Core", "PPU Use Non-Java Mode Bit"}},
{ emu_settings_type::FixupPPUVNAN, { "Core", "PPU Fixup Vector NaN Values"}},
{ emu_settings_type::AccuratePPUVNAN, { "Core", "PPU Accurate Vector NaN Values"}},
{ emu_settings_type::AccuratePPUFPCC, { "Core", "PPU Set FPCC Bits"}},
// Graphics Tab
{ emu_settings_type::Renderer, { "Video", "Renderer"}},

View file

@ -319,26 +319,26 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
}
// PPU tool tips
SubscribeTooltip(ui->ppu_precise, tooltips.settings.ppu_precise);
SubscribeTooltip(ui->ppu_fast, tooltips.settings.ppu_fast);
SubscribeTooltip(ui->ppu__static, tooltips.settings.ppu__static);
SubscribeTooltip(ui->ppu_dynamic, tooltips.settings.ppu_dynamic);
SubscribeTooltip(ui->ppu_llvm, tooltips.settings.ppu_llvm);
QButtonGroup *ppu_bg = new QButtonGroup(this);
ppu_bg->addButton(ui->ppu_precise, static_cast<int>(ppu_decoder_type::precise));
ppu_bg->addButton(ui->ppu_fast, static_cast<int>(ppu_decoder_type::fast));
ppu_bg->addButton(ui->ppu__static, static_cast<int>(ppu_decoder_type::_static));
ppu_bg->addButton(ui->ppu_dynamic, static_cast<int>(ppu_decoder_type::dynamic));
ppu_bg->addButton(ui->ppu_llvm, static_cast<int>(ppu_decoder_type::llvm));
m_emu_settings->EnhanceRadioButton(ppu_bg, emu_settings_type::PPUDecoder);
// SPU tool tips
SubscribeTooltip(ui->spu_precise, tooltips.settings.spu_precise);
SubscribeTooltip(ui->spu_fast, tooltips.settings.spu_fast);
SubscribeTooltip(ui->spu__static, tooltips.settings.spu__static);
SubscribeTooltip(ui->spu_dynamic, tooltips.settings.spu_dynamic);
SubscribeTooltip(ui->spu_asmjit, tooltips.settings.spu_asmjit);
SubscribeTooltip(ui->spu_llvm, tooltips.settings.spu_llvm);
QButtonGroup *spu_bg = new QButtonGroup(this);
spu_bg->addButton(ui->spu_precise, static_cast<int>(spu_decoder_type::precise));
spu_bg->addButton(ui->spu_fast, static_cast<int>(spu_decoder_type::fast));
spu_bg->addButton(ui->spu__static, static_cast<int>(spu_decoder_type::_static));
spu_bg->addButton(ui->spu_dynamic, static_cast<int>(spu_decoder_type::dynamic));
spu_bg->addButton(ui->spu_asmjit, static_cast<int>(spu_decoder_type::asmjit));
spu_bg->addButton(ui->spu_llvm, static_cast<int>(spu_decoder_type::llvm));
@ -349,17 +349,24 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
ui->accurateXFloat->setEnabled(checked);
});
connect(ui->spu_fast, &QAbstractButton::toggled, [this](bool checked)
connect(ui->spu__static, &QAbstractButton::toggled, [this](bool checked)
{
ui->accurateXFloat->setEnabled(checked);
});
ui->accurateXFloat->setEnabled(ui->spu_llvm->isChecked() || ui->spu_fast->isChecked());
connect(ui->spu_dynamic, &QAbstractButton::toggled, [this](bool checked)
{
ui->accurateXFloat->setEnabled(checked);
});
ui->accurateXFloat->setEnabled(ui->spu_llvm->isChecked() || ui->spu_dynamic->isChecked());
#ifndef LLVM_AVAILABLE
ui->ppu_llvm->setEnabled(false);
ui->spu_llvm->setEnabled(false);
ui->spu_dynamic->setEnabled(false);
#endif
ui->ppu_dynamic->setEnabled(false);
// _____ _____ _ _ _______ _
// / ____| __ \| | | | |__ __| | |
@ -1138,12 +1145,24 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
m_emu_settings->EnhanceCheckBox(ui->debugConsoleMode, emu_settings_type::DebugConsoleMode);
SubscribeTooltip(ui->debugConsoleMode, tooltips.settings.debug_console_mode);
m_emu_settings->EnhanceCheckBox(ui->accurateLLVMdfma, emu_settings_type::AccurateLLVMdfma);
SubscribeTooltip(ui->accurateLLVMdfma, tooltips.settings.accurate_llvm_dfma);
ui->accurateLLVMdfma->setDisabled(utils::has_fma3() || utils::has_fma4());
m_emu_settings->EnhanceCheckBox(ui->accurateDFMA, emu_settings_type::AccurateDFMA);
SubscribeTooltip(ui->accurateDFMA, tooltips.settings.accurate_dfma);
ui->accurateDFMA->setDisabled(utils::has_fma3() || utils::has_fma4());
m_emu_settings->EnhanceCheckBox(ui->AccurateVectorNaN, emu_settings_type::AccurateVectorNaN);
SubscribeTooltip(ui->AccurateVectorNaN, tooltips.settings.accurate_vector_nan);
m_emu_settings->EnhanceCheckBox(ui->accuratePPUSAT, emu_settings_type::AccuratePPUSAT);
SubscribeTooltip(ui->accuratePPUSAT, tooltips.settings.accurate_ppusat);
m_emu_settings->EnhanceCheckBox(ui->accuratePPUNJ, emu_settings_type::AccuratePPUNJ);
SubscribeTooltip(ui->accuratePPUNJ, tooltips.settings.accurate_ppunj);
m_emu_settings->EnhanceCheckBox(ui->fixupPPUVNAN, emu_settings_type::FixupPPUVNAN);
SubscribeTooltip(ui->fixupPPUVNAN, tooltips.settings.fixup_ppuvnan);
m_emu_settings->EnhanceCheckBox(ui->accuratePPUVNAN, emu_settings_type::AccuratePPUVNAN);
SubscribeTooltip(ui->accuratePPUVNAN, tooltips.settings.accurate_ppuvnan);
m_emu_settings->EnhanceCheckBox(ui->accuratePPUFPCC, emu_settings_type::AccuratePPUFPCC);
SubscribeTooltip(ui->accuratePPUFPCC, tooltips.settings.accurate_ppufpcc);
m_emu_settings->EnhanceCheckBox(ui->silenceAllLogs, emu_settings_type::SilenceAllLogs);
SubscribeTooltip(ui->silenceAllLogs, tooltips.settings.silence_all_logs);
@ -1927,9 +1946,6 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
m_emu_settings->EnhanceCheckBox(ui->accurateRSXAccess, emu_settings_type::AccurateRSXAccess);
SubscribeTooltip(ui->accurateRSXAccess, tooltips.settings.accurate_rsx_access);
m_emu_settings->EnhanceCheckBox(ui->ppuLlvmJavaModeHandling, emu_settings_type::PPULLVMJavaModeHandling);
SubscribeTooltip(ui->ppuLlvmJavaModeHandling, tooltips.settings.ppu_llvm_java_mode_handling);
m_emu_settings->EnhanceCheckBox(ui->ppuPrecompilation, emu_settings_type::PPULLVMPrecompilation);
SubscribeTooltip(ui->ppuPrecompilation, tooltips.settings.ppu_precompilation);

View file

@ -57,16 +57,16 @@
</property>
<layout class="QVBoxLayout" name="ppu_layout">
<item>
<widget class="QRadioButton" name="ppu_precise">
<widget class="QRadioButton" name="ppu__static">
<property name="text">
<string notr="true">Interpreter (precise)</string>
<string notr="true">Interpreter (static)</string>
</property>
</widget>
</item>
<item>
<widget class="QRadioButton" name="ppu_fast">
<widget class="QRadioButton" name="ppu_dynamic">
<property name="text">
<string notr="true">Interpreter (fast)</string>
<string notr="true">Interpreter (dynamic)</string>
</property>
</widget>
</item>
@ -87,16 +87,16 @@
</property>
<layout class="QVBoxLayout" name="spu_layout">
<item>
<widget class="QRadioButton" name="spu_precise">
<widget class="QRadioButton" name="spu__static">
<property name="text">
<string notr="true">Interpreter (precise)</string>
<string notr="true">Interpreter (static)</string>
</property>
</widget>
</item>
<item>
<widget class="QRadioButton" name="spu_fast">
<widget class="QRadioButton" name="spu_dynamic">
<property name="text">
<string notr="true">Interpreter (fast)</string>
<string notr="true">Interpreter (dynamic)</string>
</property>
</widget>
</item>
@ -2050,9 +2050,9 @@
</widget>
</item>
<item>
<widget class="QCheckBox" name="accurateLLVMdfma">
<widget class="QCheckBox" name="accurateDFMA">
<property name="text">
<string>Accurate LLVM DFMA</string>
<string>Accurate DFMA</string>
</property>
</widget>
</item>
@ -2064,16 +2064,37 @@
</widget>
</item>
<item>
<widget class="QCheckBox" name="AccurateVectorNaN">
<widget class="QCheckBox" name="accuratePPUSAT">
<property name="text">
<string>PPU LLVM Accurate Vector NaNs</string>
<string>Accurate PPU Saturation Bit</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="ppuLlvmJavaModeHandling">
<widget class="QCheckBox" name="accuratePPUNJ">
<property name="text">
<string>PPU LLVM Java Mode Handling</string>
<string>Accurate PPU Non-Java Mode</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="fixupPPUVNAN">
<property name="text">
<string>PPU Vector NaN Fixup</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="accuratePPUVNAN">
<property name="text">
<string>Accurate PPU Vector NaN Handling</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="accuratePPUFPCC">
<property name="text">
<string>Accurate PPU Float Condition Control</string>
</property>
</widget>
</item>

View file

@ -55,12 +55,12 @@ public:
// cpu
const QString ppu_precise = tr("Interprets PPU code with absolute accuracy.\nThis is the most accurate Interpreter, but very slow to play games with.\nYou may try this as a last resort if you encounter odd bugs or crashes.\nIf unsure, use PPU Interpreter Fast or PPU Recompiler (LLVM).");
const QString ppu_fast = tr("Interprets PPU code with sacrificed accuracy in order to achieve better performance.\nThis is the fastest interpreter.\nIt very rarely breaks games even in comparison to the Precise option.\nTry this if PPU Recompiler (LLVM) fails.");
const QString ppu__static = tr("Interpreter (slow). Try this if PPU Recompiler (LLVM) doesn't work.");
const QString ppu_dynamic = tr("Alternative interpreter (slow). May be faster than static interpreter. Try this if PPU Recompiler (LLVM) doesn't work.");
const QString ppu_llvm = tr("Recompiles and caches the game's PPU code using the LLVM Recompiler once before running it for the first time.\nThis is by far the fastest option and should always be used.\nShould you face compatibility issues, fall back to one of the Interpreters and retry.\nIf unsure, use this option.");
const QString ppu_precompilation = tr("Searches the game's directory and precompiles extra PPU modules during boot.\nIf disabled, these modules will only be compiled when needed. Depending on the game, this might interrupt the gameplay unexpectedly and possibly frequently.\nOnly disable this if you want to get ingame more quickly.");
const QString spu_precise = tr("Interprets SPU code with absolute accuracy.\nThis is extremely slow but may fix broken graphics in some games.");
const QString spu_fast = tr("Interprets SPU code with sacrificed accuracy in order to achieve better performance.\nThis is slower than the SPU Recompiler but significantly faster than the precise interpreter.\nHowever, games rarely need this.");
const QString spu__static = tr("Interpreter (slow). Try this if SPU Recompiler (LLVM) doesn't work.");
const QString spu_dynamic = tr("Alternative interpreter (slow). May be faster than static interpreter. Try this if SPU Recompiler (LLVM) doesn't work.");
const QString spu_asmjit = tr("Recompiles the game's SPU code using the ASMJIT Recompiler.\nThis is the fast option with very good compatibility.\nIf unsure, use this option.");
const QString spu_llvm = tr("Recompiles and caches the game's SPU code using the LLVM Recompiler before running which adds extra start-up time.\nThis is the fastest option with very good compatibility.\nIf you experience issues, use the ASMJIT Recompiler.");
const QString accurate_xfloat = tr("Adds extra accuracy to SPU float vectors processing.\nFixes bugs in various games at the cost of performance.\nThis setting is only applied when SPU Decoder is set to Fast or LLVM.");
@ -70,6 +70,12 @@ public:
const QString spu_block_size = tr("This option controls the SPU analyser, particularly the size of compiled units. The Mega and Giga modes may improve performance by tying smaller units together, decreasing the number of compiled units but increasing their size.\nUse the Safe mode for maximum compatibility.");
const QString preferred_spu_threads = tr("Some SPU stages are sensitive to race conditions and allowing a limited number at a time helps alleviate performance stalls.\nSetting this to a smaller value might improve performance and reduce stuttering in some games.\nLeave this on auto if performance is negatively affected when setting a small value.");
const QString full_width_avx512 = tr("Enables the use of code with full width AVX-512.\nThis code can be executed much faster, but may cause a loss in performance if your CPU model experiences downclocking on wide AVX-512 loads.\nNote that AVX-512 instructions will be used regardless of this option, just at 128 and 256 bit width.");
const QString accurate_dfma = tr("Use accurate double-precision FMA instructions in PPU and SPU backends.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou shouldn't disable it if your CPU supports FMA.");
const QString accurate_ppusat = tr("Accurately set Saturation Bit values in PPU backends.\nIf unsure, do not modify this setting.");
const QString accurate_ppunj = tr("Respect Non-Java Mode Bit values for vector ops in PPU backends.\nIf unsure, do not modify this setting.");
const QString fixup_ppuvnan = tr("Fixup NaN results in vector instructions in PPU backends.\nIf unsure, do not modify this setting.");
const QString accurate_ppuvnan = tr("Accurately set NaN results in vector instructions in PPU backends.\nIf unsure, do not modify this setting.");
const QString accurate_ppufpcc = tr("Accurately set FPCC Bits in PPU backends.\nIf unsure, do not modify this setting.");
// debug
@ -80,8 +86,6 @@ public:
const QString accurate_getllar = tr("Accurately processes SPU MFC_GETLLAR operation.");
const QString accurate_spu_dma = tr("Accurately processes SPU DMA operations.");
const QString accurate_cache_line_stores = tr("Accurately processes PPU DCBZ instruction.\nIn addition, when combined with Accurate SPU DMA, SPU PUT cache line accesses will be processed atomically.");
const QString accurate_llvm_dfma = tr("Provides extra accuracy on FMA instructions at the cost of performance.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou can't disable it if your CPU supports FMA.");
const QString accurate_vector_nan = tr("Forces the floating point NaN (Not A Number) values outputted from PPU vector instructions to be accurate to the real hardware. (0x7FC00000)");
const QString accurate_rsx_access = tr("Forces RSX pauses on SPU MFC_GETLLAR and SPU MFC_PUTLLUC operations.");
const QString mfc_delay_command = tr("Forces delaying any odd MFC command, waits for at least 2 pending commands to execute them in a random order.\nMust be used with either SPU interpreters currently.\nSeverely degrades performance! If unsure, don't use this option.");
const QString hook_static_functions = tr("Allows to hook some functions like 'memcpy' replacing them with high-level implementations. May do nothing or break things. Experimental.");
@ -101,7 +105,6 @@ public:
const QString accurate_ppu_128_loop = tr("When enabled, PPU atomic operations will operate on entire cache line data, as opposed to a single 64bit block of memory when disabled.\nNumerical values control whether or not to enable the accurate version based on the atomic operation's length.");
const QString enable_performance_report = tr("Measure certain events and print a chart after the emulator is stopped. Don't enable if not asked to.");
const QString num_ppu_threads = tr("Affects maximum amount of PPU threads running concurrently, the value of 1 has very low compatibility with games.\n2 is the default, if unsure do not modify this setting.");
const QString ppu_llvm_java_mode_handling = tr("Respect current Java Mode for alti-vec ops by PPU LLVM.\nIf unsure, do not modify this setting.");
// emulator

View file

@ -1,15 +1,15 @@
#pragma once
#include "util/types.hpp"
#include "util/tsc.hpp"
#include <functional>
extern bool g_use_rtm;
extern u64 g_rtm_tx_limit1;
#ifdef _MSC_VER
#ifdef _M_X64
extern "C"
{
u64 __rdtsc();
u32 _xbegin();
void _xend();
void _mm_pause();
@ -27,24 +27,17 @@ extern "C"
s64 _div128(s64, s64, s64, s64*);
u64 _udiv128(u64, u64, u64, u64*);
void __debugbreak();
}
#endif
namespace utils
{
inline u64 get_tsc()
{
#ifdef _MSC_VER
return __rdtsc();
#else
return __builtin_ia32_rdtsc();
#endif
}
// Transaction helper (result = pair of success and op result, or just bool)
template <typename F, typename R = std::invoke_result_t<F>>
inline auto tx_start(F op)
{
#if defined(ARCH_X64)
uint status = -1;
for (auto stamp0 = get_tsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = get_tsc())
@ -90,6 +83,9 @@ namespace utils
break;
}
}
#else
static_cast<void>(op);
#endif
if constexpr (std::is_void_v<R>)
{
@ -113,7 +109,7 @@ namespace utils
const u64 value = reinterpret_cast<u64>(func);
const void* ptr = reinterpret_cast<const void*>(value);
#ifdef _MSC_VER
#ifdef _M_X64
return _mm_prefetch(static_cast<const char*>(ptr), 2);
#else
return __builtin_prefetch(ptr, 0, 2);
@ -128,7 +124,7 @@ namespace utils
return;
}
#ifdef _MSC_VER
#ifdef _M_X64
return _mm_prefetch(static_cast<const char*>(ptr), 3);
#else
return __builtin_prefetch(ptr, 0, 3);
@ -142,7 +138,7 @@ namespace utils
return;
}
#ifdef _MSC_VER
#ifdef _M_X64
return _m_prefetchw(ptr);
#else
return __builtin_prefetch(ptr, 1, 0);
@ -160,8 +156,10 @@ namespace utils
return _rotl8(x, n);
#elif defined(__clang__)
return __builtin_rotateleft8(x, n);
#else
#elif defined(ARCH_X64)
return __builtin_ia32_rolqi(x, n);
#else
return (x << (n & 7)) | (x >> ((-n & 7)));
#endif
}
@ -176,8 +174,10 @@ namespace utils
return _rotl16(x, static_cast<uchar>(n));
#elif defined(__clang__)
return __builtin_rotateleft16(x, n);
#else
#elif defined(ARCH_X64)
return __builtin_ia32_rolhi(x, n);
#else
return (x << (n & 15)) | (x >> ((-n & 15)));
#endif
}
@ -344,10 +344,14 @@ namespace utils
inline void pause()
{
#ifdef _MSC_VER
#if defined(ARCH_ARM64)
__asm__ volatile("yield");
#elif defined(_M_X64)
_mm_pause();
#else
#elif defined(ARCH_X64)
__builtin_ia32_pause();
#else
#error "Missing utils::pause() implementation"
#endif
}
@ -391,10 +395,27 @@ namespace utils
{
#ifdef _MSC_VER
return (T*)ptr;
#else
#elif defined(ARCH_X64)
T* result;
__asm__("movq %1, %0;" : "=r" (result) : "r" (ptr) : "memory");
return result;
#elif defined(ARCH_ARM64)
T* result;
__asm__("mov %0, %1" : "=r" (result) : "r" (ptr) : "memory");
return result;
#endif
}
// Execute a breakpoint instruction at the call site.
// One implementation is selected at compile time:
// - _M_X64: the __debugbreak() intrinsic
// - ARCH_X64: the x86 "int3" instruction
// - ARCH_ARM64: the AArch64 "brk" instruction with immediate 0x42
// Any other target fails the build instead of silently doing nothing.
inline void trap()
{
#ifdef _M_X64
__debugbreak();
#elif defined(ARCH_X64)
__asm__ volatile("int3");
#elif defined(ARCH_ARM64)
// NOTE(review): the 0x42 immediate looks arbitrary; no consumer of it is visible here
__asm__ volatile("brk 0x42");
#else
#error "Missing utils::trap() implementation"
#endif
}
} // namespace utils

View file

@ -35,6 +35,7 @@ namespace utils
#include "asm.hpp"
#include "endian.hpp"
#include "tsc.hpp"
// Total number of entries.
static constexpr usz s_hashtable_size = 1u << 17;
@ -804,17 +805,9 @@ namespace
};
}
#ifdef _MSC_VER
extern "C" u64 __rdtsc();
#endif
u64 utils::get_unique_tsc()
{
#ifdef _MSC_VER
const u64 stamp0 = __rdtsc();
#else
const u64 stamp0 = __builtin_ia32_rdtsc();
#endif
const u64 stamp0 = utils::get_tsc();
return s_min_tsc.atomic_op([&](u64& tsc)
{

View file

@ -4,7 +4,7 @@
#include <functional>
#include <mutex>
#ifdef _MSC_VER
#ifdef _M_X64
#pragma warning(push)
#pragma warning(disable: 4996)
@ -67,7 +67,7 @@ namespace utils
FORCE_INLINE void atomic_fence_consume()
{
#ifdef _MSC_VER
#ifdef _M_X64
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_CONSUME);
@ -76,7 +76,7 @@ FORCE_INLINE void atomic_fence_consume()
FORCE_INLINE void atomic_fence_acquire()
{
#ifdef _MSC_VER
#ifdef _M_X64
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_ACQUIRE);
@ -85,7 +85,7 @@ FORCE_INLINE void atomic_fence_acquire()
FORCE_INLINE void atomic_fence_release()
{
#ifdef _MSC_VER
#ifdef _M_X64
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_RELEASE);
@ -94,7 +94,7 @@ FORCE_INLINE void atomic_fence_release()
FORCE_INLINE void atomic_fence_acq_rel()
{
#ifdef _MSC_VER
#ifdef _M_X64
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_ACQ_REL);
@ -103,16 +103,18 @@ FORCE_INLINE void atomic_fence_acq_rel()
FORCE_INLINE void atomic_fence_seq_cst()
{
#ifdef _MSC_VER
#ifdef _M_X64
_ReadWriteBarrier();
_InterlockedOr(static_cast<long*>(_AddressOfReturnAddress()), 0);
_ReadWriteBarrier();
#else
#elif defined(ARCH_X64)
__asm__ volatile ("lock orl $0, 0(%%rsp);" ::: "cc", "memory");
#else
__atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
}
#ifdef _MSC_VER
#ifdef _M_X64
#pragma warning(pop)
#endif
@ -342,7 +344,7 @@ struct atomic_storage
using type = get_uint_t<sizeof(T)>;
#ifndef _MSC_VER
#ifndef _M_X64
#if defined(__ATOMIC_HLE_ACQUIRE) && defined(__ATOMIC_HLE_RELEASE)
static constexpr int s_hle_ack = __ATOMIC_SEQ_CST | __ATOMIC_HLE_ACQUIRE;
@ -472,7 +474,7 @@ struct atomic_storage
/* Second part: MSVC-specific */
#ifdef _MSC_VER
#ifdef _M_X64
static inline T add_fetch(T& dest, T value)
{
return atomic_storage<T>::fetch_add(dest, value) + value;
@ -529,6 +531,7 @@ struct atomic_storage
static inline bool bts(T& dest, uint bit)
{
#if defined(ARCH_X64)
uchar* dst = reinterpret_cast<uchar*>(&dest);
if constexpr (sizeof(T) < 4)
@ -539,18 +542,23 @@ struct atomic_storage
bit = bit + (ptr & 3) * 8;
dst = reinterpret_cast<T*>(ptr & -4);
}
#endif
#ifdef _MSC_VER
#ifdef _M_X64
return _interlockedbittestandset((long*)dst, bit) != 0;
#else
#elif defined(ARCH_X64)
bool result;
__asm__ volatile ("lock btsl %2, 0(%1)\n" : "=@ccc" (result) : "r" (dst), "Ir" (bit) : "cc", "memory");
return result;
#else
const T value = static_cast<T>(1) << bit;
return (__atomic_fetch_or(&dest, value, __ATOMIC_SEQ_CST) & value) != 0;
#endif
}
static inline bool btr(T& dest, uint bit)
{
#if defined(ARCH_X64)
uchar* dst = reinterpret_cast<uchar*>(&dest);
if constexpr (sizeof(T) < 4)
@ -561,18 +569,23 @@ struct atomic_storage
bit = bit + (ptr & 3) * 8;
dst = reinterpret_cast<T*>(ptr & -4);
}
#endif
#ifdef _MSC_VER
#ifdef _M_X64
return _interlockedbittestandreset((long*)dst, bit) != 0;
#else
#elif defined(ARCH_X64)
bool result;
__asm__ volatile ("lock btrl %2, 0(%1)\n" : "=@ccc" (result) : "r" (dst), "Ir" (bit) : "cc", "memory");
return result;
#else
const T value = static_cast<T>(1) << bit;
return (__atomic_fetch_and(&dest, ~value, __ATOMIC_SEQ_CST) & value) != 0;
#endif
}
static inline bool btc(T& dest, uint bit)
{
#if defined(ARCH_X64)
uchar* dst = reinterpret_cast<uchar*>(&dest);
if constexpr (sizeof(T) < 4)
@ -583,8 +596,9 @@ struct atomic_storage
bit = bit + (ptr & 3) * 8;
dst = reinterpret_cast<T*>(ptr & -4);
}
#endif
#ifdef _MSC_VER
#ifdef _M_X64
while (true)
{
// Keep trying until we actually invert desired bit
@ -593,10 +607,13 @@ struct atomic_storage
if (_interlockedbittestandreset((long*)dst, bit))
return true;
}
#else
#elif defined(ARCH_X64)
bool result;
__asm__ volatile ("lock btcl %2, 0(%1)\n" : "=@ccc" (result) : "r" (dst), "Ir" (bit) : "cc", "memory");
return result;
#else
const T value = static_cast<T>(1) << bit;
return (__atomic_fetch_xor(&dest, value, __ATOMIC_SEQ_CST) & value) != 0;
#endif
}
};
@ -606,7 +623,7 @@ struct atomic_storage
template <typename T>
struct atomic_storage<T, 1> : atomic_storage<T, 0>
{
#ifdef _MSC_VER
#ifdef _M_X64
static inline bool compare_exchange(T& dest, T& comp, T exch)
{
const char v = std::bit_cast<char>(comp);
@ -676,7 +693,7 @@ struct atomic_storage<T, 1> : atomic_storage<T, 0>
template <typename T>
struct atomic_storage<T, 2> : atomic_storage<T, 0>
{
#ifdef _MSC_VER
#ifdef _M_X64
static inline bool compare_exchange(T& dest, T& comp, T exch)
{
const short v = std::bit_cast<short>(comp);
@ -758,7 +775,7 @@ struct atomic_storage<T, 2> : atomic_storage<T, 0>
template <typename T>
struct atomic_storage<T, 4> : atomic_storage<T, 0>
{
#ifdef _MSC_VER
#ifdef _M_X64
static inline bool compare_exchange(T& dest, T& comp, T exch)
{
const long v = std::bit_cast<long>(comp);
@ -854,7 +871,7 @@ struct atomic_storage<T, 4> : atomic_storage<T, 0>
template <typename T>
struct atomic_storage<T, 8> : atomic_storage<T, 0>
{
#ifdef _MSC_VER
#ifdef _M_X64
static inline bool compare_exchange(T& dest, T& comp, T exch)
{
const llong v = std::bit_cast<llong>(comp);
@ -950,7 +967,7 @@ struct atomic_storage<T, 8> : atomic_storage<T, 0>
template <typename T>
struct atomic_storage<T, 16> : atomic_storage<T, 0>
{
#ifdef _MSC_VER
#ifdef _M_X64
static inline T load(const T& dest)
{
atomic_fence_acquire();
@ -995,7 +1012,7 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
utils::atomic_store16(&dest, std::bit_cast<u128>(value));
atomic_fence_release();
}
#else
#elif defined(ARCH_X64)
static inline T load(const T& dest)
{
alignas(16) T r;
@ -1078,6 +1095,91 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
__asm__ volatile("movdqa %0, %1;" :: "x" (val), "m" (dest) : "memory");
#endif
}
#elif defined(ARCH_ARM64)
// 16-byte load for AArch64 built from an exclusive load/store pair.
// The loop reads both 64-bit halves with LDAXP, writes the very same values back with STLXP,
// and retries for as long as the store-exclusive reports a non-zero status in tmp.
// NOTE(review): the asm stores to dest even though it is taken by const reference and listed
// only as an input operand; the "memory" clobber is the only thing announcing that write.
static inline T load(const T& dest)
{
// Status word written by STLXP (non-zero means the exclusive store failed)
u32 tmp;
// Low and high 64-bit halves read from dest
u64 data[2];
__asm__ volatile("1:\n"
"ldaxp %x[data0], %x[data1], %[dest]\n"
"stlxp %w[tmp], %x[data0], %x[data1], %[dest]\n"
"cbnz %w[tmp], 1b\n"
: [tmp] "=&r" (tmp), [data0] "=&r" (data[0]), [data1] "=&r" (data[1])
: [dest] "Q" (dest)
: "memory"
);
// Reassemble the two halves into a T without violating aliasing rules
T result;
std::memcpy(&result, data, 16);
return result;
}
// Reads the current 16-byte value; for now this simply forwards to load().
static inline T observe(const T& dest)
{
	// TODO: dedicated implementation
	T snapshot = load(dest);
	return snapshot;
}
// 16-byte compare-and-swap for AArch64 using an LDAXP/STLXP retry loop.
// Returns true when dest matched comp and exch was stored.
// Returns false otherwise, after copying the value that was observed in dest into comp.
static inline bool compare_exchange(T& dest, T& comp, T exch)
{
// Doubles as the STLXP status register inside the loop and as the final success flag
bool result;
// Expected value split into two 64-bit halves
u64 cmp[2];
std::memcpy(cmp, &comp, 16);
// Replacement value split into two 64-bit halves
u64 data[2];
std::memcpy(data, &exch, 16);
// Value observed in dest by the exclusive load
u64 prev[2];
// cmp/ccmp compare both halves; on mismatch the store is skipped (branch to 2:).
// A failed STLXP (non-zero status) restarts from 1:. The flags set by cmp/ccmp are
// still live at 2:, so cset turns "both halves equal" into result = 1, else 0.
__asm__ volatile("1:\n"
"ldaxp %x[prev0], %x[prev1], %[storage]\n"
"cmp %x[prev0], %x[cmp0]\n"
"ccmp %x[prev1], %x[cmp1], #0, eq\n"
"b.ne 2f\n"
"stlxp %w[result], %x[data0], %x[data1], %[storage]\n"
"cbnz %w[result], 1b\n"
"2:\n"
"cset %w[result], eq\n"
: [result] "=&r" (result), [storage] "+Q" (dest), [prev0] "=&r" (prev[0]), [prev1] "=&r" (prev[1])
: [data0] "r" (data[0]), [data1] "r" (data[1]), [cmp0] "r" (cmp[0]), [cmp1] "r" (cmp[1])
: "cc", "memory"
);
if (result)
{
return true;
}
// Report the value actually found in dest back to the caller
std::memcpy(&comp, prev, 16);
return false;
}
// 16-byte exchange for AArch64: stores value into dest and returns what dest held before.
// Implemented as an LDAXP/STLXP loop that retries while the exclusive store fails.
static inline T exchange(T& dest, T value)
{
// Status word written by STLXP (non-zero means the exclusive store failed)
u32 tmp;
// New value split into two 64-bit halves
u64 src[2];
// Previous contents of dest, filled in by LDAXP
u64 data[2];
std::memcpy(src, &value, 16);
__asm__ volatile("1:\n"
"ldaxp %x[data0], %x[data1], %[dest]\n"
"stlxp %w[tmp], %x[src0], %x[src1], %[dest]\n"
"cbnz %w[tmp], 1b\n"
: [tmp] "=&r" (tmp), [dest] "+Q" (dest), [data0] "=&r" (data[0]), [data1] "=&r" (data[1])
: [src0] "r" (src[0]), [src1] "r" (src[1])
: "memory"
);
// Reassemble the previous value from its halves
T result;
std::memcpy(&result, data, 16);
return result;
}
// Writes value into dest; currently routed through exchange() with the old value discarded.
static inline void store(T& dest, T value)
{
	// TODO: dedicated store sequence
	static_cast<void>(exchange(dest, value));
}
// Writes value into dest; currently routed through exchange() with the old value discarded.
static inline void release(T& dest, T value)
{
	// TODO: dedicated release-store sequence
	static_cast<void>(exchange(dest, value));
}
#endif
// TODO
@ -1562,17 +1664,50 @@ public:
// Atomically sets the requested bit and returns whether it was set before.
// The bit index is wrapped to the bit width of T.
bool bit_test_set(uint bit)
{
	if constexpr (std::is_integral<type>::value)
	{
		// Integral storage: use the dedicated bit-test-and-set primitive
		return atomic_storage<type>::bts(m_data, bit & (sizeof(T) * 8 - 1));
	}
	else
	{
		// Generic fallback through atomic_op; the mask must be built from the requested
		// bit index (it used to be hard-wired to the most significant bit)
		return atomic_op([bit](type& v)
		{
			const auto old = v;
			const auto mask = type(1) << (bit & (sizeof(T) * 8 - 1));
			v |= mask;
			return !!(old & mask);
		});
	}
}
// Atomically clears the requested bit and returns whether it was set before.
// The bit index is wrapped to the bit width of T.
bool bit_test_reset(uint bit)
{
	if constexpr (std::is_integral<type>::value)
	{
		// Integral storage: use the dedicated bit-test-and-reset primitive
		return atomic_storage<type>::btr(m_data, bit & (sizeof(T) * 8 - 1));
	}
	else
	{
		// Generic fallback through atomic_op; the mask must be built from the requested
		// bit index (it used to be hard-wired to the most significant bit)
		return atomic_op([bit](type& v)
		{
			const auto old = v;
			const auto mask = type(1) << (bit & (sizeof(T) * 8 - 1));
			v &= ~mask;
			return !!(old & mask);
		});
	}
}
// Atomically flips the requested bit and returns whether it was set before.
// The bit index is wrapped to the bit width of T.
bool bit_test_invert(uint bit)
{
	if constexpr (std::is_integral<type>::value)
	{
		// Integral storage: use the dedicated bit-test-and-complement primitive
		return atomic_storage<type>::btc(m_data, bit & (sizeof(T) * 8 - 1));
	}
	else
	{
		// Generic fallback through atomic_op; the mask must be built from the requested
		// bit index (it used to be hard-wired to the most significant bit)
		return atomic_op([bit](type& v)
		{
			const auto old = v;
			const auto mask = type(1) << (bit & (sizeof(T) * 8 - 1));
			v ^= mask;
			return !!(old & mask);
		});
	}
}
// Timeout is discouraged

24
rpcs3/util/fence.hpp Normal file
View file

@ -0,0 +1,24 @@
#pragma once
#include "util/types.hpp"
#ifdef _M_X64
extern "C" void _mm_lfence();
#endif
namespace utils
{
	// Load fence used to keep surrounding instructions from being executed out of order
	// with respect to the fence. Every supported target must provide a branch below.
	inline void lfence()
	{
#ifdef _M_X64
		_mm_lfence();
#elif defined(ARCH_X64)
		__builtin_ia32_lfence();
#elif defined(ARCH_ARM64)
		// TODO: ISB is a stand-in for a proper load fence.
		// The "memory" clobber also makes this a compiler barrier, so memory accesses
		// cannot be moved across the fence at compile time.
		__asm__ volatile("isb" ::: "memory");
#else
#error "Missing lfence() implementation"
#endif
	}
}

View file

@ -19,10 +19,10 @@ namespace stx
class atomic_ptr;
// Basic assumption of userspace pointer size
constexpr uint c_ptr_size = 47;
constexpr uint c_ptr_size = 48;
// Use lower 17 bits as atomic_ptr internal counter of borrowed refs (pointer itself is shifted)
constexpr uint c_ref_mask = 0x1ffff, c_ref_size = 17;
constexpr uint c_ref_mask = 0xffff, c_ref_size = 16;
// Remaining pointer bits
constexpr uptr c_ptr_mask = static_cast<uptr>(-1) << c_ref_size;

2143
rpcs3/util/simd.hpp Normal file

File diff suppressed because it is too large Load diff

View file

@ -19,15 +19,14 @@
#endif
#include "util/asm.hpp"
#include "util/fence.hpp"
#ifdef _MSC_VER
extern "C"
{
u64 _xgetbv(u32);
}
#ifdef _M_X64
extern "C" u64 _xgetbv(u32);
#endif
inline std::array<u32, 4> utils::get_cpuid(u32 func, u32 subfunc)
#if defined(ARCH_X64)
static inline std::array<u32, 4> get_cpuid(u32 func, u32 subfunc)
{
int regs[4];
#ifdef _MSC_VER
@ -38,7 +37,7 @@ inline std::array<u32, 4> utils::get_cpuid(u32 func, u32 subfunc)
return {0u+regs[0], 0u+regs[1], 0u+regs[2], 0u+regs[3]};
}
inline u64 utils::get_xgetbv(u32 xcr)
static inline u64 get_xgetbv(u32 xcr)
{
#ifdef _MSC_VER
return _xgetbv(xcr);
@ -48,6 +47,7 @@ inline u64 utils::get_xgetbv(u32 xcr)
return eax | (u64(edx) << 32);
#endif
}
#endif
#ifdef __APPLE__
// sysinfo_darwin.mm
@ -61,113 +61,192 @@ namespace Darwin_Version
// SSSE3 availability: flag 0x200 in word [2] of CPUID leaf 1 (x64 only, cached on first call).
bool utils::has_ssse3()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		// Leaf 1 has to exist before it is queried
		if (get_cpuid(0, 0)[0] < 0x1)
		{
			return false;
		}

		return (get_cpuid(1, 0)[2] & 0x200) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
// SSE4.1 availability: flag 0x80000 in word [2] of CPUID leaf 1 (x64 only, cached on first call).
bool utils::has_sse41()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		// Leaf 1 has to exist before it is queried
		if (get_cpuid(0, 0)[0] < 0x1)
		{
			return false;
		}

		return (get_cpuid(1, 0)[2] & 0x80000) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
// AVX availability: CPU flag 0x10000000 plus the 0x0C000000 pair in leaf 1 word [2],
// and bits 0x6 enabled in XCR0 (x64 only, cached on first call).
bool utils::has_avx()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		if (get_cpuid(0, 0)[0] < 0x1)
		{
			return false;
		}

		const auto leaf1_w2 = get_cpuid(1, 0)[2];

		// XCR0 is only read once the CPUID flags above have been confirmed
		if (!(leaf1_w2 & 0x10000000) || (leaf1_w2 & 0x0C000000) != 0x0C000000)
		{
			return false;
		}

		return (get_xgetbv(0) & 0x6) == 0x6;
	}();

	return s_supported;
#else
	return false;
#endif
}
// AVX2 availability: flag 0x20 in leaf 7 word [1], the 0x0C000000 pair in leaf 1 word [2],
// and bits 0x6 enabled in XCR0 (x64 only, cached on first call).
bool utils::has_avx2()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		if (get_cpuid(0, 0)[0] < 0x7)
		{
			return false;
		}

		if (!(get_cpuid(7, 0)[1] & 0x20))
		{
			return false;
		}

		// XCR0 is only read once the leaf 1 flags have been confirmed
		if ((get_cpuid(1, 0)[2] & 0x0C000000) != 0x0C000000)
		{
			return false;
		}

		return (get_xgetbv(0) & 0x6) == 0x6;
	}();

	return s_supported;
#else
	return false;
#endif
}
// RTM availability: flag 0x800 in word [1] of CPUID leaf 7 (x64 only, cached on first call).
bool utils::has_rtm()
{
#if defined(ARCH_X64)
	// Leaf 7 has to exist before it is queried
	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x800) == 0x800;
	return g_value;
#else
	// Every non-x64 target reports "unsupported", matching the other feature checks;
	// previously only ARM64 had a return statement and other targets fell off the end
	return false;
#endif
}
// TSX force-abort flag: 0x2000 in word [3] of CPUID leaf 7 (x64 only, cached on first call).
bool utils::has_tsx_force_abort()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		// Leaf 7 has to exist before it is queried
		if (get_cpuid(0, 0)[0] < 0x7)
		{
			return false;
		}

		return (get_cpuid(7, 0)[3] & 0x2000) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
// RTM always-abort flag: 0x800 in word [3] of CPUID leaf 7 (x64 only, cached on first call).
bool utils::has_rtm_always_abort()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		// Leaf 7 has to exist before it is queried
		if (get_cpuid(0, 0)[0] < 0x7)
		{
			return false;
		}

		return (get_cpuid(7, 0)[3] & 0x800) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
// MPX availability: flag 0x4000 in word [1] of CPUID leaf 7 (x64 only, cached on first call).
bool utils::has_mpx()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		// Leaf 7 has to exist before it is queried
		if (get_cpuid(0, 0)[0] < 0x7)
		{
			return false;
		}

		return (get_cpuid(7, 0)[1] & 0x4000) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
// Skylake-X level AVX-512 availability (x64 only, cached on first call).
bool utils::has_avx512()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		if (get_cpuid(0, 0)[0] < 0x7)
		{
			return false;
		}

		// AVX512F, AVX512CD, AVX512DQ, AVX512BW and AVX512VL must all be reported
		if ((get_cpuid(7, 0)[1] & 0xd0030000) != 0xd0030000)
		{
			return false;
		}

		// XCR0 is only read once the leaf 1 flags have been confirmed
		if ((get_cpuid(1, 0)[2] & 0x0C000000) != 0x0C000000)
		{
			return false;
		}

		return (get_xgetbv(0) & 0xe6) == 0xe6;
	}();

	return s_supported;
#else
	return false;
#endif
}
// Icelake-client level AVX-512 availability (x64 only, cached on first call).
bool utils::has_avx512_icl()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		// Builds on top of the Skylake-X level check
		if (!has_avx512())
		{
			return false;
		}

		const auto leaf7 = get_cpuid(7, 0);

		// AVX512IFMA lives in word [1]; AVX512VBMI, AVX512VBMI2, AVX512VPOPCNTDQ, AVX512BITALG,
		// AVX512VNNI, AVX512VPCLMULQDQ, AVX512GFNI and AVX512VAES are grouped in word [2]
		return (leaf7[1] & 0x00200000) == 0x00200000 && (leaf7[2] & 0x00005f42) == 0x00005f42;
	}();

	return s_supported;
#else
	return false;
#endif
}
// AVX512VNNI availability: base AVX-512 support plus flag 0x800 in leaf 7 word [2]
// (x64 only, cached on first call).
bool utils::has_avx512_vnni()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		if (!has_avx512())
		{
			return false;
		}

		return (get_cpuid(7, 0)[2] & 0x00000800) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
// XOP availability: AVX support plus flag 0x800 in word [2] of extended leaf 0x80000001
// (x64 only, cached on first call).
bool utils::has_xop()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		if (!has_avx())
		{
			return false;
		}

		return (get_cpuid(0x80000001, 0)[2] & 0x800) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
// CLWB availability: flag 0x1000000 in word [1] of CPUID leaf 7 (x64 only, cached on first call).
bool utils::has_clwb()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		// Leaf 7 has to exist before it is queried
		if (get_cpuid(0, 0)[0] < 0x7)
		{
			return false;
		}

		return (get_cpuid(7, 0)[1] & 0x1000000) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
// Reports whether the time stamp counter ticks at a constant rate.
// x64: flag 0x100 in word [3] of extended CPUID leaf 0x80000007 (cached on first call).
// ARM64: always reported as invariant.
bool utils::has_invariant_tsc()
{
#if defined(ARCH_X64)
	// The queried leaf is an extended one, so it is validated against the highest extended
	// leaf reported by 0x80000000 (same pattern as get_cpu_brand) rather than the basic leaf count
	static const bool g_value = get_cpuid(0x80000000, 0)[0] >= 0x80000007 && (get_cpuid(0x80000007, 0)[3] & 0x100) == 0x100;
	return g_value;
#elif defined(ARCH_ARM64)
	return true;
#else
	// Unknown target: make no claim instead of falling off the end of the function
	return false;
#endif
}
// Reports fused multiply-add support.
// x64: flag 0x1000 in word [2] of CPUID leaf 1 (cached on first call).
// ARM64: always reported as available.
bool utils::has_fma3()
{
#if defined(ARCH_X64)
	// Leaf 1 has to exist before it is queried
	static const bool g_value = get_cpuid(0, 0)[0] >= 0x1 && get_cpuid(1, 0)[2] & 0x1000;
	return g_value;
#elif defined(ARCH_ARM64)
	return true;
#else
	// Unknown target: report "unsupported" instead of falling off the end of the function
	return false;
#endif
}
// FMA4 availability: flag 0x10000 in word [2] of extended leaf 0x80000001
// (x64 only, cached on first call).
bool utils::has_fma4()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		// NOTE(review): the guard tests the basic leaf count although an extended leaf is read
		if (get_cpuid(0, 0)[0] < 0x7)
		{
			return false;
		}

		return (get_cpuid(0x80000001, 0)[2] & 0x10000) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
// ERMS availability: flag 0x200 in word [1] of CPUID leaf 7 (x64 only, cached on first call).
bool utils::has_erms()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		// Leaf 7 has to exist before it is queried
		if (get_cpuid(0, 0)[0] < 0x7)
		{
			return false;
		}

		return (get_cpuid(7, 0)[1] & 0x200) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
// FSRM availability: flag 0x10 in word [3] of CPUID leaf 7 (x64 only, cached on first call).
bool utils::has_fsrm()
{
#if defined(ARCH_X64)
	static const bool s_supported = []
	{
		// Leaf 7 has to exist before it is queried
		if (get_cpuid(0, 0)[0] < 0x7)
		{
			return false;
		}

		return (get_cpuid(7, 0)[3] & 0x10) != 0;
	}();

	return s_supported;
#else
	return false;
#endif
}
u32 utils::get_rep_movsb_threshold()
{
static const u32 g_value = []()
{
u32 thresh_value = 0xFFFFFFFF;
u32 thresh_value = umax;
if (has_fsrm())
{
thresh_value = 2047;
@ -187,6 +266,7 @@ std::string utils::get_cpu_brand()
{
std::string brand;
#if defined(ARCH_X64)
if (get_cpuid(0x80000000, 0)[0] >= 0x80000004)
{
for (u32 i = 0; i < 3; i++)
@ -198,6 +278,9 @@ std::string utils::get_cpu_brand()
{
brand = "Unknown CPU";
}
#else
brand = "Unidentified CPU";
#endif
brand.erase(brand.find_last_not_of('\0') + 1);
brand.erase(brand.find_last_not_of(' ') + 1);
@ -396,19 +479,6 @@ static constexpr ullong round_tsc(ullong val)
return utils::rounded_div(val, 1'000'000) * 1'000'000;
}
#ifdef _MSC_VER
extern "C" void _mm_lfence();
#endif
static inline void lfence()
{
#ifdef _MSC_VER
_mm_lfence();
#else
__builtin_ia32_lfence();
#endif
}
ullong utils::get_tsc_freq()
{
static const ullong cal_tsc = []() -> ullong
@ -449,17 +519,17 @@ ullong utils::get_tsc_freq()
{
#ifdef _WIN32
Sleep(1);
error_data[i] = (lfence(), utils::get_tsc());
error_data[i] = (utils::lfence(), utils::get_tsc());
LARGE_INTEGER ctr;
QueryPerformanceCounter(&ctr);
rdtsc_data[i] = (lfence(), utils::get_tsc());
rdtsc_data[i] = (utils::lfence(), utils::get_tsc());
timer_data[i] = ctr.QuadPart;
#else
usleep(200);
error_data[i] = (lfence(), utils::get_tsc());
error_data[i] = (utils::lfence(), utils::get_tsc());
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
rdtsc_data[i] = (lfence(), utils::get_tsc());
rdtsc_data[i] = (utils::lfence(), utils::get_tsc());
timer_data[i] = ts.tv_nsec + (ts.tv_sec - sec_base) * 1'000'000'000;
#endif
}
@ -511,6 +581,7 @@ u32 utils::get_thread_count()
u32 utils::get_cpu_family()
{
#if defined(ARCH_X64)
static const u32 g_value = []()
{
const u32 reg_value = get_cpuid(0x00000001, 0)[0]; // Processor feature info
@ -528,10 +599,14 @@ u32 utils::get_cpu_family()
}();
return g_value;
#elif defined(ARCH_ARM64)
return 0;
#endif
}
u32 utils::get_cpu_model()
{
#if defined(ARCH_X64)
static const u32 g_value = []()
{
const u32 reg_value = get_cpuid(0x00000001, 0)[0]; // Processor feature info
@ -550,16 +625,19 @@ u32 utils::get_cpu_model()
}();
return g_value;
#elif defined(ARCH_ARM64)
return 0;
#endif
}
namespace utils
{
	// Identifier of the thread that performs the dynamic initialization of this variable.
	// Each preprocessor directive appears exactly once so the conditional is well-formed.
	extern const u64 main_tid = []() -> u64
	{
#ifdef _WIN32
		return GetCurrentThreadId();
#else
		return reinterpret_cast<u64>(pthread_self());
#endif
	}();
}

View file

@ -5,10 +5,6 @@
namespace utils
{
std::array<u32, 4> get_cpuid(u32 func, u32 subfunc);
u64 get_xgetbv(u32 xcr);
bool has_ssse3();
bool has_sse41();
@ -20,7 +16,7 @@ namespace utils
bool has_rtm();
bool has_tsx_force_abort();
bool has_rtm_always_abort();
bool has_mpx();
@ -29,6 +25,8 @@ namespace utils
bool has_avx512_icl();
bool has_avx512_vnni();
bool has_xop();
bool has_clwb();

25
rpcs3/util/tsc.hpp Normal file
View file

@ -0,0 +1,25 @@
#pragma once
#include "util/types.hpp"
#ifdef _M_X64
extern "C" u64 __rdtsc();
#endif
namespace utils
{
	// Reads the platform's free-running counter: CNTVCT_EL0 on ARM64, the TSC on x64.
	// Targets without a branch below fail to compile.
	inline u64 get_tsc()
	{
#if defined(ARCH_ARM64)
		u64 counter{};
		__asm__ volatile("mrs %0, cntvct_el0" : "=r" (counter));
		return counter;
#elif defined(_M_X64)
		return __rdtsc();
#elif defined(ARCH_X64)
		return __builtin_ia32_rdtsc();
#else
#error "Missing utils::get_tsc() implementation"
#endif
	}
}

View file

@ -12,6 +12,12 @@
#include <memory>
#include <bit>
// Target architecture selection. ARCH_X64 is defined when any of the listed x86-64/SSE2
// compiler macros is present; otherwise ARCH_ARM64 is defined when an AArch64 macro is present.
// Neither macro is defined on any other target.
// NOTE(review): __SSE2__ can also be set by 32-bit x86 builds - confirm those are out of scope.
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(__amd64__)
#define ARCH_X64 1
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
#define ARCH_ARM64 1
#endif
using std::chrono::steady_clock;
using namespace std::literals;
@ -180,15 +186,15 @@ public:
}
};
#ifndef _MSC_VER
using u128 = __uint128_t;
using s128 = __int128_t;
#if defined(ARCH_X64) && !defined(_MSC_VER)
using __m128i = long long __attribute__((vector_size(16)));
using __m128d = double __attribute__((vector_size(16)));
using __m128 = float __attribute__((vector_size(16)));
#endif
#ifndef _MSC_VER
using u128 = __uint128_t;
using s128 = __int128_t;
#else
extern "C"

View file

@ -2,6 +2,9 @@
#include "util/types.hpp"
// Satisfied by any trivial type that is exactly 16 bytes in size.
template <typename T>
concept Vector128 = (sizeof(T) == 16) && (std::is_trivial_v<T>);
// 128-bit vector type
union alignas(16) v128
{
@ -58,39 +61,23 @@ union alignas(16) v128
u128 _u;
s128 _s;
#ifdef _MSC_VER
template <typename T>
struct opaque_wrapper
v128() = default;
constexpr v128(const v128&) noexcept = default;
template <Vector128 T>
constexpr v128(const T& rhs) noexcept
: v128(std::bit_cast<v128>(rhs))
{
u128 m_data;
}
opaque_wrapper() = default;
constexpr v128& operator=(const v128&) noexcept = default;
opaque_wrapper(const T& value)
: m_data(std::bit_cast<u128>(value))
{
}
opaque_wrapper& operator=(const T& value)
{
m_data = std::bit_cast<u128>(value);
return *this;
}
operator T() const
{
return std::bit_cast<T>(m_data);
}
};
opaque_wrapper<__m128> vf;
opaque_wrapper<__m128i> vi;
opaque_wrapper<__m128d> vd;
#else
__m128 vf;
__m128i vi;
__m128d vd;
#endif
template <Vector128 T>
constexpr operator T() const noexcept
{
return std::bit_cast<T>(*this);
}
using enable_bitcopy = std::true_type;
@ -107,6 +94,14 @@ union alignas(16) v128
return from64(_0, _1);
}
// Builds a vector with the same 64-bit value in both lanes.
static v128 from64p(u64 value)
{
	v128 result;

	for (usz lane = 0; lane < 2; lane++)
	{
		result._u64[lane] = value;
	}

	return result;
}
static v128 from32(u32 _0, u32 _1 = 0, u32 _2 = 0, u32 _3 = 0)
{
v128 ret;
@ -132,6 +127,16 @@ union alignas(16) v128
return ret;
}
// Builds a vector with the same 32-bit float in all four lanes.
static v128 fromf32p(f32 value)
{
	v128 result;

	for (usz lane = 0; lane < 4; lane++)
	{
		result._f[lane] = value;
	}

	return result;
}
static v128 from16p(u16 value)
{
v128 ret;
@ -153,11 +158,18 @@ union alignas(16) v128
return ret;
}
static inline v128 fromV(const __m128i& value);
static inline v128 fromF(const __m128& value);
static inline v128 fromD(const __m128d& value);
// Returns a v128 whose contents are intentionally left uninitialized.
// On GCC/Clang the -Wuninitialized diagnostic is suppressed around the body for that reason.
static v128 undef()
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
// Deliberately not initialized
v128 ret;
return ret;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
// Unaligned load with optional index offset
static v128 loadu(const void* ptr, usz index = 0)
@ -173,45 +185,13 @@ union alignas(16) v128
std::memcpy(static_cast<u8*>(ptr) + index * sizeof(v128), &value, sizeof(v128));
}
static inline v128 add8(const v128& left, const v128& right);
static inline v128 add16(const v128& left, const v128& right);
static inline v128 add32(const v128& left, const v128& right);
static inline v128 addfs(const v128& left, const v128& right);
static inline v128 addfd(const v128& left, const v128& right);
static inline v128 sub8(const v128& left, const v128& right);
static inline v128 sub16(const v128& left, const v128& right);
static inline v128 sub32(const v128& left, const v128& right);
static inline v128 subfs(const v128& left, const v128& right);
static inline v128 subfd(const v128& left, const v128& right);
static inline v128 maxu8(const v128& left, const v128& right);
static inline v128 minu8(const v128& left, const v128& right);
static inline v128 eq8(const v128& left, const v128& right);
static inline v128 eq16(const v128& left, const v128& right);
static inline v128 eq32(const v128& left, const v128& right);
static inline v128 eq32f(const v128& left, const v128& right);
static inline v128 fma32f(v128 a, const v128& b, const v128& c);
v128 operator|(const v128&) const;
v128 operator&(const v128&) const;
v128 operator^(const v128&) const;
v128 operator~() const;
bool operator==(const v128& right) const;
// result = (~left) & (right)
static inline v128 andnot(const v128& left, const v128& right);
void clear()
{
*this = {};
@ -227,3 +207,12 @@ struct offset32_array<v128::masked_array_t<T, N, M>>
return u32{sizeof(T)} * (static_cast<u32>(arg) ^ static_cast<u32>(M));
}
};
// Hash support for v128: the hash is the wrapping sum of the two 64-bit halves.
template <>
struct std::hash<v128>
{
	usz operator()(const v128& key) const
	{
		const auto low_half = key._u64[0];
		const auto high_half = key._u64[1];
		return low_half + high_half;
	}
};

View file

@ -1,178 +0,0 @@
#pragma once
#include "util/types.hpp"
#include "util/v128.hpp"
#include "util/sysinfo.hpp"
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <immintrin.h>
#include <emmintrin.h>
#include <cmath>
// Runtime switch consulted by v128::fma32f; initialized from utils::has_fma3()
inline bool v128_use_fma = utils::has_fma3();
// Wraps an integer SSE register value in a v128.
inline v128 v128::fromV(const __m128i& value)
{
	v128 wrapped;
	wrapped.vi = value;
	return wrapped;
}
// Wraps a single-precision SSE register value in a v128.
inline v128 v128::fromF(const __m128& value)
{
	v128 wrapped;
	wrapped.vf = value;
	return wrapped;
}
// Wraps a double-precision SSE register value in a v128.
inline v128 v128::fromD(const __m128d& value)
{
	v128 wrapped;
	wrapped.vd = value;
	return wrapped;
}
// Lane-wise addition of 8-bit integers.
inline v128 v128::add8(const v128& left, const v128& right)
{
	const __m128i sum = _mm_add_epi8(left.vi, right.vi);
	return fromV(sum);
}
// Lane-wise addition of 16-bit integers.
inline v128 v128::add16(const v128& left, const v128& right)
{
	const __m128i sum = _mm_add_epi16(left.vi, right.vi);
	return fromV(sum);
}
// Lane-wise addition of 32-bit integers.
inline v128 v128::add32(const v128& left, const v128& right)
{
	const __m128i sum = _mm_add_epi32(left.vi, right.vi);
	return fromV(sum);
}
// Lane-wise addition of single-precision floats.
inline v128 v128::addfs(const v128& left, const v128& right)
{
	const __m128 sum = _mm_add_ps(left.vf, right.vf);
	return fromF(sum);
}
// Lane-wise addition of double-precision floats.
inline v128 v128::addfd(const v128& left, const v128& right)
{
	const __m128d sum = _mm_add_pd(left.vd, right.vd);
	return fromD(sum);
}
// Lane-wise subtraction (left - right) of 8-bit integers.
inline v128 v128::sub8(const v128& left, const v128& right)
{
	const __m128i diff = _mm_sub_epi8(left.vi, right.vi);
	return fromV(diff);
}
// Lane-wise subtraction (left - right) of 16-bit integers.
inline v128 v128::sub16(const v128& left, const v128& right)
{
	const __m128i diff = _mm_sub_epi16(left.vi, right.vi);
	return fromV(diff);
}
// Lane-wise subtraction (left - right) of 32-bit integers.
inline v128 v128::sub32(const v128& left, const v128& right)
{
	const __m128i diff = _mm_sub_epi32(left.vi, right.vi);
	return fromV(diff);
}
// Lane-wise subtraction (left - right) of single-precision floats.
inline v128 v128::subfs(const v128& left, const v128& right)
{
	const __m128 diff = _mm_sub_ps(left.vf, right.vf);
	return fromF(diff);
}
// Lane-wise subtraction (left - right) of double-precision floats.
inline v128 v128::subfd(const v128& left, const v128& right)
{
	const __m128d diff = _mm_sub_pd(left.vd, right.vd);
	return fromD(diff);
}
// Lane-wise maximum of unsigned 8-bit integers.
inline v128 v128::maxu8(const v128& left, const v128& right)
{
	const __m128i larger = _mm_max_epu8(left.vi, right.vi);
	return fromV(larger);
}
// Lane-wise minimum of unsigned 8-bit integers.
inline v128 v128::minu8(const v128& left, const v128& right)
{
	const __m128i smaller = _mm_min_epu8(left.vi, right.vi);
	return fromV(smaller);
}
// Lane-wise equality mask for 8-bit integers.
inline v128 v128::eq8(const v128& left, const v128& right)
{
	const __m128i mask = _mm_cmpeq_epi8(left.vi, right.vi);
	return fromV(mask);
}
// Lane-wise equality mask for 16-bit integers.
inline v128 v128::eq16(const v128& left, const v128& right)
{
	const __m128i mask = _mm_cmpeq_epi16(left.vi, right.vi);
	return fromV(mask);
}
// Lane-wise equality mask for 32-bit integers.
inline v128 v128::eq32(const v128& left, const v128& right)
{
	const __m128i mask = _mm_cmpeq_epi32(left.vi, right.vi);
	return fromV(mask);
}
// Lane-wise equality mask for single-precision floats.
inline v128 v128::eq32f(const v128& left, const v128& right)
{
	const __m128 mask = _mm_cmpeq_ps(left.vf, right.vf);
	return fromF(mask);
}
// Fused multiply-add on four single-precision lanes; the result is returned in (a copy of) a.
// When the build does not guarantee FMA (__FMA__ undefined) the path is chosen at run time
// through v128_use_fma; otherwise the intrinsic is used unconditionally.
inline v128 v128::fma32f(v128 a, const v128& b, const v128& c)
{
#ifndef __FMA__
if (v128_use_fma) [[likely]]
{
#ifdef _MSC_VER
a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
return a;
#else
// Emit the instruction directly, since the translation unit is built without FMA enabled
__asm__("vfmadd213ps %[c], %[b], %[a]"
: [a] "+x" (a.vf)
: [b] "x" (b.vf)
, [c] "x" (c.vf));
return a;
#endif
}
// Scalar fallback: a[i] * b[i] + c[i] with a single rounding per lane
for (int i = 0; i < 4; i++)
{
a._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]);
}
return a;
#else
a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
return a;
#endif
}
// Full 128-bit equality: every byte of the 32-bit lane comparison mask must be set.
inline bool v128::operator==(const v128& right) const
{
	const int byte_mask = _mm_movemask_epi8(v128::eq32(*this, right).vi);
	return byte_mask == 0xffff;
}
// Computes (~left) & right over all 128 bits.
inline v128 v128::andnot(const v128& left, const v128& right)
{
	const __m128i masked = _mm_andnot_si128(left.vi, right.vi);
	return fromV(masked);
}
// Bitwise OR over all 128 bits.
inline v128 operator|(const v128& left, const v128& right)
{
	const __m128i bits = _mm_or_si128(left.vi, right.vi);
	return v128::fromV(bits);
}
// Bitwise AND over all 128 bits.
inline v128 operator&(const v128& left, const v128& right)
{
	const __m128i bits = _mm_and_si128(left.vi, right.vi);
	return v128::fromV(bits);
}
// Bitwise XOR over all 128 bits.
inline v128 operator^(const v128& left, const v128& right)
{
	const __m128i bits = _mm_xor_si128(left.vi, right.vi);
	return v128::fromV(bits);
}
// Bitwise NOT, implemented as XOR with an all-ones vector.
inline v128 operator~(const v128& other)
{
	const v128 all_ones = v128::from32p(umax);
	return other ^ all_ones;
}

View file

@ -27,7 +27,7 @@
#ifdef __NR_memfd_create
#elif __x86_64__
#define __NR_memfd_create 319
#elif __aarch64__
#elif ARCH_ARM64
#define __NR_memfd_create 279
#endif