From d8360332124ea52677b8beaf5d1b89bb2221a3ad Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Fri, 24 Dec 2021 20:33:32 +0300 Subject: [PATCH] LLVM: enable some JIT events (Intel, Perf) Made some related adjustments. Currently incomplete. --- 3rdparty/llvm.cmake | 15 +- Utilities/JIT.cpp | 54 ++++++- Utilities/JIT.h | 20 ++- Utilities/Thread.cpp | 2 +- buildfiles/msvc/rpcs3_debug.props | 3 +- buildfiles/msvc/rpcs3_release.props | 3 +- llvm | 2 +- llvm_build/llvm_build.vcxproj | 4 +- llvm_build/llvm_build_clang_cl.vcxproj | 4 +- rpcs3/Emu/Cell/PPUFunction.cpp | 6 +- rpcs3/Emu/Cell/PPUThread.cpp | 42 +++-- rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 4 + rpcs3/Emu/Cell/SPUInterpreter.cpp | 2 +- rpcs3/Emu/Cell/SPURecompiler.cpp | 20 ++- rpcs3/Emu/Cell/SPUThread.cpp | 210 ++++++++++++------------- rpcs3/Emu/RSX/Common/BufferUtils.cpp | 4 +- 16 files changed, 233 insertions(+), 162 deletions(-) diff --git a/3rdparty/llvm.cmake b/3rdparty/llvm.cmake index db748d8bd8..31032b3d4f 100644 --- a/3rdparty/llvm.cmake +++ b/3rdparty/llvm.cmake @@ -13,6 +13,15 @@ if(WITH_LLVM) option(LLVM_INCLUDE_UTILS OFF) option(LLVM_CCACHE_BUILD ON) + if(WIN32) + set(LLVM_USE_INTEL_JITEVENTS ON) + endif() + + if(CMAKE_SYSTEM MATCHES "Linux") + set(LLVM_USE_INTEL_JITEVENTS ON) + set(LLVM_USE_PERF ON) + endif() + set(CXX_FLAGS_OLD ${CMAKE_CXX_FLAGS}) if (MSVC) @@ -52,7 +61,11 @@ if(WITH_LLVM) endif() endif() - set(LLVM_LIBS LLVMMCJIT LLVMX86CodeGen LLVMX86AsmParser) + set(LLVM_LIBS LLVMMCJIT LLVMX86CodeGen LLVMX86AsmParser LLVMIntelJITEvents) + + if(CMAKE_SYSTEM MATCHES "Linux") + set(LLVM_LIBS ${LLVM_LIBS} LLVMPerfJITEvents) + endif() add_library(3rdparty_llvm INTERFACE) target_link_libraries(3rdparty_llvm INTERFACE ${LLVM_LIBS}) diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index 6413053a0e..b43e1fdd48 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -16,6 +16,15 @@ LOG_CHANNEL(jit_log, "JIT"); +void jit_announce(uptr func, usz size, std::string_view name) +{ +#ifdef __linux__ + static const fs::file s_map(fmt::format("/tmp/perf-%d.map", getpid()), fs::rewrite + fs::append); + + s_map.write(fmt::format("%x %x %s\n", func, size, name)); +#endif +} + static u8* get_jit_memory() { // Reserve 2G memory (magic static) @@ -230,7 +239,7 @@ asmjit::Runtime& asmjit::get_global_runtime() return asmjit::kErrorNoCodeGenerated; } - void* p = m_pos.fetch_add(utils::align(codeSize, 4096)); + void* p = m_pos.fetch_add(utils::align(codeSize, 64)); if (!p || m_pos > m_max) [[unlikely]] { *dst = nullptr; @@ -245,7 +254,6 @@ asmjit::Runtime& asmjit::get_global_runtime() return asmjit::kErrorInvalidState; } - utils::memory_protect(p, utils::align(codeSize, 4096), utils::protection::rx); flush(p, relocSize); *dst = p; @@ -331,6 +339,9 @@ asmjit::inline_runtime::~inline_runtime() #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/ObjectCache.h" +#include "llvm/ExecutionEngine/JITEventListener.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Object/SymbolSize.h" #ifdef _MSC_VER #pragma warning(pop) #else @@ -386,7 +397,7 @@ static u64 make_null_function(const std::string& name) using namespace asmjit; // Build a "null" function that contains its name - const auto func = build_function_asm([&](X86Assembler& c, auto& args) + const auto func = build_function_asm("NULL", [&](X86Assembler& c, auto& args) { Label data = c.newLabel(); c.lea(args[0], x86::qword_ptr(data, 0)); @@ -406,6 +417,34 @@ static u64 make_null_function(const std::string& name) } } +struct JITAnnouncer : llvm::JITEventListener +{ + void notifyObjectLoaded(u64, const llvm::object::ObjectFile& obj, const llvm::RuntimeDyld::LoadedObjectInfo& info) override + { + using namespace llvm; + + object::OwningBinary debug_obj_ = info.getObjectForDebug(obj); + const object::ObjectFile& debug_obj = *debug_obj_.getBinary(); + + for (const auto& [sym, size] : computeSymbolSizes(debug_obj)) + { + Expected type_ = sym.getType(); + if (!type_ || *type_ != object::SymbolRef::ST_Function) + continue; + + Expected name = sym.getName(); + if (!name) + continue; + + Expected addr = sym.getAddress(); + if (!addr) + continue; + + jit_announce(*addr, size, {name->data(), name->size()}); + } + } +}; + // Simple memory manager struct MemoryManager1 : llvm::RTDyldMemoryManager { @@ -429,7 +468,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager ~MemoryManager1() override { - utils::memory_release(ptr, c_max_size * 2); + // Hack: don't release to prevent reuse of address space, see jit_announce + utils::memory_decommit(ptr, c_max_size * 2); } llvm::JITSymbol findSymbol(const std::string& name) override @@ -812,6 +852,12 @@ jit_compiler::jit_compiler(const std::unordered_map& _link, co } } + if (!_link.empty() || !(flags & 0x1)) + { + m_engine->RegisterJITEventListener(llvm::JITEventListener::createIntelJITEventListener()); + m_engine->RegisterJITEventListener(new JITAnnouncer); + } + if (!m_engine) { fmt::throw_exception("LLVM: Failed to create ExecutionEngine: %s", result); diff --git a/Utilities/JIT.h b/Utilities/JIT.h index 027130d8aa..1b111eba0e 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -34,6 +34,13 @@ #include #include +void jit_announce(uptr func, usz size, std::string_view name); + +void jit_announce(auto* func, usz size, std::string_view name) +{ + jit_announce(uptr(func), size, name); +} + enum class jit_class { ppu_code, @@ -161,7 +168,7 @@ namespace asmjit // Build runtime function with asmjit::X86Assembler template -inline FT build_function_asm(F&& builder) +inline FT build_function_asm(std::string_view name, F&& builder) { using namespace asmjit; @@ -195,6 +202,7 @@ inline FT build_function_asm(F&& builder) return nullptr; } + jit_announce(result, code.getCodeSize(), name); return result; } @@ -210,8 +218,8 @@ public: built_function& operator=(const built_function&) = delete; template - built_function(F&& builder) - : m_func(ensure(build_function_asm(std::forward(builder)))) + built_function(std::string_view name, F&& builder) + : m_func(ensure(build_function_asm(name, std::forward(builder)))) { } @@ -238,7 +246,7 @@ public: built_function& operator=(const built_function&) = delete; template - built_function(F&& builder) + built_function(std::string_view name, F&& builder) { using namespace asmjit; @@ -270,6 +278,10 @@ public: { ensure(false); } + else + { + jit_announce(result, code.getCodeSize(), name); + } } operator FT() const noexcept diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp index ec453ff51c..077464845c 100644 --- a/Utilities/Thread.cpp +++ b/Utilities/Thread.cpp @@ -2190,7 +2190,7 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* _base)) { - return build_function_asm([&](asmjit::X86Assembler& c, auto& args) + return build_function_asm("thread_base_trampoline", [&](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; diff --git a/buildfiles/msvc/rpcs3_debug.props b/buildfiles/msvc/rpcs3_debug.props index a75bc968ad..d35ce58c59 100644 --- a/buildfiles/msvc/rpcs3_debug.props +++ b/buildfiles/msvc/rpcs3_debug.props @@ -1,4 +1,4 @@ - + @@ -61,6 +61,7 @@ LLVMBitWriter.lib; LLVMCoroutines.lib; LLVMObjCARCOpts.lib; + LLVMIntelJITEvents.lib; diff --git a/buildfiles/msvc/rpcs3_release.props b/buildfiles/msvc/rpcs3_release.props index d2526fefdf..038eb81d82 100644 --- a/buildfiles/msvc/rpcs3_release.props +++ b/buildfiles/msvc/rpcs3_release.props @@ -1,4 +1,4 @@ - + @@ -62,6 +62,7 @@ LLVMBitWriter.lib; LLVMCoroutines.lib; LLVMObjCARCOpts.lib; + LLVMIntelJITEvents.lib; diff --git a/llvm b/llvm index 318b8fe374..a670c459ea 160000 --- a/llvm +++ b/llvm @@ -1 +1 @@ -Subproject commit 318b8fe3746615f914522d4e177c537ce80d1d08 +Subproject commit a670c459ea782411885b1e9861c89d04609d648f diff --git a/llvm_build/llvm_build.vcxproj b/llvm_build/llvm_build.vcxproj index 7f89b0310c..6fc5889d78 100644 --- a/llvm_build/llvm_build.vcxproj +++ b/llvm_build/llvm_build.vcxproj @@ -39,9 +39,9 @@ call vsdevcmd.bat -arch=amd64 - cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm + cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm call vsdevcmd.bat -arch=amd64 - cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm + cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm echo Cleaning.. for /F "delims= eol=|" %%f in (' dir /b ^| findstr /V "[^.]*\build[^.]*\.vcxproj"') do ( diff --git a/llvm_build/llvm_build_clang_cl.vcxproj b/llvm_build/llvm_build_clang_cl.vcxproj index a0f24875d2..189bf6fee2 100644 --- a/llvm_build/llvm_build_clang_cl.vcxproj +++ b/llvm_build/llvm_build_clang_cl.vcxproj @@ -39,9 +39,9 @@ call vsdevcmd.bat -arch=amd64 - cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm + cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm call vsdevcmd.bat -arch=amd64 - cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm + cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm echo Cleaning.. for /F "delims= eol=|" %%f in (' dir /b ^| findstr /V "[^.]*\build[^.]*\.vcxproj"') do ( diff --git a/rpcs3/Emu/Cell/PPUFunction.cpp b/rpcs3/Emu/Cell/PPUFunction.cpp index 3999b10b0c..8c12152836 100644 --- a/rpcs3/Emu/Cell/PPUFunction.cpp +++ b/rpcs3/Emu/Cell/PPUFunction.cpp @@ -1910,14 +1910,14 @@ std::vector& ppu_function_manager::access(bool ghc) static std::vector list_ghc { - build_function_asm([](asmjit::X86Assembler& c, auto& args) + build_function_asm("ppu_unregistered", [](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; c.mov(args[0], x86::rbp); c.jmp(imm_ptr(list[0])); }), - build_function_asm([](asmjit::X86Assembler& c, auto& args) + build_function_asm("ppu_return", [](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -1937,7 +1937,7 @@ u32 ppu_function_manager::add_function(ppu_function_t function) list.push_back(function); // Generate trampoline - list2.push_back(build_function_asm([&](asmjit::X86Assembler& c, auto& args) + list2.push_back(build_function_asm("ppu_trampolinea", [&](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index b46b21afc0..10fefd6a2a 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -147,7 +147,7 @@ static bool ppu_break(ppu_thread& ppu, ppu_opcode_t op); extern void do_cell_atomic_128_store(u32 addr, const void* to_write); -const auto ppu_gateway = built_function([](asmjit::X86Assembler& c, auto& args) +const auto ppu_gateway = built_function("ppu_gateway", [](asmjit::X86Assembler& c, auto& args) { // Gateway for PPU, converts from native to GHC calling convention, also saves RSP value for escape using namespace asmjit; @@ -248,7 +248,7 @@ const auto ppu_gateway = built_function([](asmjit::X86Asse c.ret(); }); -const extern auto ppu_escape = build_function_asm([](asmjit::X86Assembler& c, auto& args) +const extern auto ppu_escape = build_function_asm("ppu_escape", [](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -256,12 +256,13 @@ const extern auto ppu_escape = build_function_asm([](asmji c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp))); // Return to the return location - c.jmp(x86::qword_ptr(x86::rsp, -8)); + c.sub(x86::rsp, 8); + c.ret(); }); void ppu_recompiler_fallback(ppu_thread& ppu); -const auto ppu_recompiler_fallback_ghc = build_function_asm([](asmjit::X86Assembler& c, auto& args) +const auto ppu_recompiler_fallback_ghc = build_function_asm("ppu_trampolineb", [](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -1816,7 +1817,7 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr) return ppu_load_acquire_reservation(ppu, addr); } -const auto ppu_stcx_accurate_tx = built_function([](asmjit::X86Assembler& c, auto& args) +const auto ppu_stcx_accurate_tx = built_function("ppu_stcx_accurate_tx", [](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -1832,11 +1833,7 @@ const auto ppu_stcx_accurate_tx = built_function(&vm::g_sudo_addr))); c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0])); c.and_(x86::rbp, -128); @@ -1855,11 +1852,9 @@ const auto ppu_stcx_accurate_tx = built_function(+vm::g_reservations), args[0])); - c.and_(x86::rbx, -128 / 2); - c.prefetchw(x86::byte_ptr(x86::rbx)); + c.lea(x86::r11, x86::qword_ptr(reinterpret_cast(+vm::g_reservations), args[0])); + c.and_(x86::r11, -128 / 2); c.and_(args[0].r32(), 63); - c.mov(x86::r13, args[1]); // Prepare data if (s_tsx_avx) @@ -1894,8 +1889,6 @@ const auto ppu_stcx_accurate_tx = built_function(&g_rtm_tx_limit2))); c.jae(fall); }); - c.prefetchw(x86::byte_ptr(x86::rbp, 0)); - c.prefetchw(x86::byte_ptr(x86::rbp, 64)); // Check pause flag c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast(cpu_flag::pause)); @@ -1939,7 +1932,7 @@ const auto ppu_stcx_accurate_tx = built_function(m_hash_start)))); + } // Install compiled function pointer const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn); diff --git a/rpcs3/Emu/Cell/SPUInterpreter.cpp b/rpcs3/Emu/Cell/SPUInterpreter.cpp index 56cf5b4dbf..9b413421f1 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp @@ -1733,7 +1733,7 @@ bool spu_interpreter::SHUFB(spu_thread& spu, spu_opcode_t op) return true; } -const spu_inter_func_t optimized_shufb = build_function_asm([](asmjit::X86Assembler& c, auto& /*args*/) +const spu_inter_func_t optimized_shufb = build_function_asm("spu_shufb", [](asmjit::X86Assembler& c, auto& /*args*/) { using namespace asmjit; diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 550a8103b2..0eaf6c76f3 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -160,7 +160,7 @@ DECLARE(spu_runtime::tr_all) = [] return reinterpret_cast(trptr); }(); -DECLARE(spu_runtime::g_gateway) = built_function([](asmjit::X86Assembler& c, auto& args) +DECLARE(spu_runtime::g_gateway) = built_function("spu_gateway", [](asmjit::X86Assembler& c, auto& args) { // Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape using namespace asmjit; @@ -249,7 +249,7 @@ DECLARE(spu_runtime::g_gateway) = built_function([](asmjit::X86A c.ret(); }); -DECLARE(spu_runtime::g_escape) = build_function_asm([](asmjit::X86Assembler& c, auto& args) +DECLARE(spu_runtime::g_escape) = build_function_asm("spu_escape", [](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -257,10 +257,11 @@ DECLARE(spu_runtime::g_escape) = build_function_asm([](asm c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp))); // Return to the return location - c.jmp(x86::qword_ptr(x86::rsp, -8)); + c.sub(x86::rsp, 8); + c.ret(); }); -DECLARE(spu_runtime::g_tail_escape) = build_function_asm([](asmjit::X86Assembler& c, auto& args) +DECLARE(spu_runtime::g_tail_escape) = build_function_asm("spu_tail_escape", [](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -268,14 +269,15 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm(reinterpret_cast(wxptr)); + + jit_announce(wxptr, raw - wxptr, "spu_ubertrampoline"); } if (auto _old = stuff_it->trampoline.compare_and_swap(nullptr, result)) @@ -3480,7 +3484,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator #endif // Get function chunk name - const std::string name = fmt::format("spu-chunk-0x%05x", addr); + const std::string name = fmt::format("spu-cx%05x-%s", addr, fmt::base57(be_t{m_hash_start})); llvm::Function* result = llvm::cast(m_module->getOrInsertFunction(name, chunk_type).getCallee()); // Set parameters @@ -3505,7 +3509,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // 5. $3 const auto func_type = get_ftype(); - const std::string fname = fmt::format("spu-function-0x%05x", addr); + const std::string fname = fmt::format("spu-fx%05x-%s", addr, fmt::base57(be_t{m_hash_start})); llvm::Function* fn = llvm::cast(m_module->getOrInsertFunction(fname, func_type).getCallee()); fn->setLinkage(llvm::GlobalValue::InternalLinkage); diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 360682ccbc..9e0cf8d653 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -405,7 +405,7 @@ std::array op_branch_targets(u32 pc, spu_opcode_t op) return res; } -const auto spu_putllc_tx = built_function([](asmjit::X86Assembler& c, auto& args) +const auto spu_putllc_tx = built_function("spu_putllc_tx", [](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -420,12 +420,8 @@ const auto spu_putllc_tx = built_function(&vm::g_sudo_addr))); - c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0])); - c.prefetchw(x86::byte_ptr(x86::rbp, 0)); - c.prefetchw(x86::byte_ptr(x86::rbp, 64)); + build_swap_rdx_with(c, args, x86::r10); + c.mov(args[1], x86::qword_ptr(reinterpret_cast(&vm::g_sudo_addr))); + c.lea(args[1], x86::qword_ptr(args[1], args[0])); + c.prefetchw(x86::byte_ptr(args[1], 0)); + c.prefetchw(x86::byte_ptr(args[1], 64)); c.and_(args[0].r32(), 0xff80); c.shr(args[0].r32(), 1); - c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast(+vm::g_reservations), args[0])); - c.prefetchw(x86::byte_ptr(x86::rbx)); - c.mov(x86::r13, args[1]); + c.lea(x86::r11, x86::qword_ptr(reinterpret_cast(+vm::g_reservations), args[0])); // Prepare data if (s_tsx_avx) @@ -504,8 +498,6 @@ const auto spu_putllc_tx = built_function(&g_rtm_tx_limit2))); c.jae(fall); }); - c.prefetchw(x86::byte_ptr(x86::rbp, 0)); - c.prefetchw(x86::byte_ptr(x86::rbp, 64)); // Check pause flag c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast(cpu_flag::pause)); @@ -514,10 +506,10 @@ const auto spu_putllc_tx = built_function([](asmjit::X86Assembler& c, auto& args) +const auto spu_putlluc_tx = built_function("spu_putlluc_tx", [](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -680,30 +676,20 @@ const auto spu_putlluc_tx = built_function(&vm::g_sudo_addr))); - c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0])); - c.prefetchw(x86::byte_ptr(x86::rbp, 0)); - c.prefetchw(x86::byte_ptr(x86::rbp, 64)); - c.and_(args[0].r32(), 0xff80); - c.shr(args[0].r32(), 1); - c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast(+vm::g_reservations), args[0])); - c.prefetchw(x86::byte_ptr(x86::rbx)); - c.mov(x86::r13, args[1]); + build_swap_rdx_with(c, args, x86::r10); + c.mov(x86::r11, x86::qword_ptr(reinterpret_cast(&vm::g_sudo_addr))); + c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); + c.prefetchw(x86::byte_ptr(x86::r11, 0)); + c.prefetchw(x86::byte_ptr(x86::r11, 64)); // Prepare data if (s_tsx_avx) @@ -725,6 +711,10 @@ const auto spu_putlluc_tx = built_function(+vm::g_reservations), args[0])); + // Alloc args[0] to stamp0 const auto stamp0 = args[0]; build_get_tsc(c, stamp0); @@ -739,35 +729,29 @@ const auto spu_putlluc_tx = built_function(cpu_flag::pause)); - // c.jc(fall); c.xbegin(tx1); if (s_tsx_avx) { - c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm0); - c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm1); - c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm2); - c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm3); + c.vmovaps(x86::yword_ptr(x86::r11, 0), x86::ymm0); + c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1); + c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2); + c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3); } else { - c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm0); - c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm1); - c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm2); - c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm3); - c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm4); - c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm5); - c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm6); - c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm7); + c.movaps(x86::oword_ptr(x86::r11, 0), x86::xmm0); + c.movaps(x86::oword_ptr(x86::r11, 16), x86::xmm1); + c.movaps(x86::oword_ptr(x86::r11, 32), x86::xmm2); + c.movaps(x86::oword_ptr(x86::r11, 48), x86::xmm3); + c.movaps(x86::oword_ptr(x86::r11, 64), x86::xmm4); + c.movaps(x86::oword_ptr(x86::r11, 80), x86::xmm5); + c.movaps(x86::oword_ptr(x86::r11, 96), x86::xmm6); + c.movaps(x86::oword_ptr(x86::r11, 112), x86::xmm7); } c.xend(); - c.lock().add(x86::qword_ptr(x86::rbx), 32); + c.lock().add(x86::qword_ptr(args[1]), 32); // stx++ c.add(x86::qword_ptr(args[2]), 1); build_get_tsc(c); @@ -786,6 +770,7 @@ const auto spu_putlluc_tx = built_function([](asmjit::X86Assembler& c, auto& args) +const auto spu_getllar_tx = built_function("spu_getllar_tx", [](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -815,8 +803,6 @@ const auto spu_getllar_tx = built_function(&vm::g_sudo_addr))); c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0])); c.and_(args[0].r32(), 0xff80); c.shr(args[0].r32(), 1); - c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast(+vm::g_reservations), args[0])); - c.mov(x86::r13, args[1]); + c.lea(x86::r11, x86::qword_ptr(reinterpret_cast(+vm::g_reservations), args[0])); // Alloc args[0] to stamp0 const auto stamp0 = args[0]; @@ -853,7 +838,7 @@ const auto spu_getllar_tx = built_function(cpu_flag::pause)); c.jc(fall); - c.mov(x86::rax, x86::qword_ptr(x86::rbx)); + c.mov(x86::rax, x86::qword_ptr(x86::r11)); c.and_(x86::rax, -128); c.cmp(x86::rax, args[3]); c.jne(fall); @@ -926,9 +911,16 @@ const auto spu_getllar_tx = built_function copy_data_swap_u32(&build_copy_data_swap_u32); +built_function copy_data_swap_u32("copy_data_swap_u32", &build_copy_data_swap_u32); -built_function copy_data_swap_u32_cmp(&build_copy_data_swap_u32); +built_function copy_data_swap_u32_cmp("copy_data_swap_u32_cmp", &build_copy_data_swap_u32); namespace {