LLVM: enable some JIT events (Intel, Perf)

Made some related adjustments.
Currently incomplete.
This commit is contained in:
Nekotekina 2021-12-24 20:33:32 +03:00
parent 510041a873
commit d836033212
16 changed files with 233 additions and 162 deletions

15
3rdparty/llvm.cmake vendored
View file

@ -13,6 +13,15 @@ if(WITH_LLVM)
option(LLVM_INCLUDE_UTILS OFF)
option(LLVM_CCACHE_BUILD ON)
# Intel JIT event support is switched on for Windows and Linux builds
if(WIN32 OR CMAKE_SYSTEM MATCHES "Linux")
	set(LLVM_USE_INTEL_JITEVENTS ON)
endif()
# perf JIT event support is switched on for Linux builds only
if(CMAKE_SYSTEM MATCHES "Linux")
	set(LLVM_USE_PERF ON)
endif()
set(CXX_FLAGS_OLD ${CMAKE_CXX_FLAGS})
if (MSVC)
@ -52,7 +61,11 @@ if(WITH_LLVM)
endif()
endif()
set(LLVM_LIBS LLVMMCJIT LLVMX86CodeGen LLVMX86AsmParser)
# Base set of LLVM libraries required on every platform
set(LLVM_LIBS LLVMMCJIT LLVMX86CodeGen LLVMX86AsmParser)
# Link the JIT event libraries only where the matching LLVM_USE_* option is turned on
# (Intel: Windows and Linux, perf: Linux); elsewhere these libraries are not built
if(WIN32 OR CMAKE_SYSTEM MATCHES "Linux")
	set(LLVM_LIBS ${LLVM_LIBS} LLVMIntelJITEvents)
endif()
if(CMAKE_SYSTEM MATCHES "Linux")
	set(LLVM_LIBS ${LLVM_LIBS} LLVMPerfJITEvents)
endif()
add_library(3rdparty_llvm INTERFACE)
target_link_libraries(3rdparty_llvm INTERFACE ${LLVM_LIBS})

View file

@ -16,6 +16,15 @@
LOG_CHANNEL(jit_log, "JIT");
// Announce a JIT-generated code range under a readable symbol name.
// On Linux a "<start> <size> <name>" line (start and size formatted with %x) is appended
// to /tmp/perf-<pid>.map; on other platforms this function does nothing.
void jit_announce(uptr func, usz size, std::string_view name)
{
#ifdef __linux__
	// Map file handle: created on the first call, then reused by every later call
	static const fs::file s_map(fmt::format("/tmp/perf-%d.map", getpid()), fs::rewrite + fs::append);

	// One symbol entry per line
	const std::string entry = fmt::format("%x %x %s\n", func, size, name);
	s_map.write(entry);
#endif
}
static u8* get_jit_memory()
{
// Reserve 2G memory (magic static)
@ -230,7 +239,7 @@ asmjit::Runtime& asmjit::get_global_runtime()
return asmjit::kErrorNoCodeGenerated;
}
void* p = m_pos.fetch_add(utils::align(codeSize, 4096));
void* p = m_pos.fetch_add(utils::align(codeSize, 64));
if (!p || m_pos > m_max) [[unlikely]]
{
*dst = nullptr;
@ -245,7 +254,6 @@ asmjit::Runtime& asmjit::get_global_runtime()
return asmjit::kErrorInvalidState;
}
utils::memory_protect(p, utils::align(codeSize, 4096), utils::protection::rx);
flush(p, relocSize);
*dst = p;
@ -331,6 +339,9 @@ asmjit::inline_runtime::~inline_runtime()
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
#include "llvm/ExecutionEngine/ObjectCache.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/SymbolSize.h"
#ifdef _MSC_VER
#pragma warning(pop)
#else
@ -386,7 +397,7 @@ static u64 make_null_function(const std::string& name)
using namespace asmjit;
// Build a "null" function that contains its name
const auto func = build_function_asm<void (*)()>([&](X86Assembler& c, auto& args)
const auto func = build_function_asm<void (*)()>("NULL", [&](X86Assembler& c, auto& args)
{
Label data = c.newLabel();
c.lea(args[0], x86::qword_ptr(data, 0));
@ -406,6 +417,34 @@ static u64 make_null_function(const std::string& name)
}
}
// JIT event listener that forwards every function symbol of a newly loaded object to jit_announce
struct JITAnnouncer : llvm::JITEventListener
{
	// Called when an object has been loaded; walks its symbols and announces
	// the address, size and name of each one whose type is ST_Function.
	void notifyObjectLoaded(u64, const llvm::object::ObjectFile& obj, const llvm::RuntimeDyld::LoadedObjectInfo& info) override
	{
		using namespace llvm;

		// The owning wrapper must stay alive while the symbols of its binary are in use
		object::OwningBinary<object::ObjectFile> debug_obj_ = info.getObjectForDebug(obj);
		const object::ObjectFile* debug_obj = debug_obj_.getBinary();

		if (!debug_obj)
		{
			// getObjectForDebug may hand back an empty wrapper: nothing to announce then
			return;
		}

		for (const auto& [sym, size] : computeSymbolSizes(*debug_obj))
		{
			Expected<object::SymbolRef::Type> type_ = sym.getType();

			if (!type_)
			{
				// A failed Expected has to be consumed explicitly before it is destroyed
				consumeError(type_.takeError());
				continue;
			}

			if (*type_ != object::SymbolRef::ST_Function)
				continue;

			Expected<StringRef> name = sym.getName();

			if (!name)
			{
				consumeError(name.takeError());
				continue;
			}

			Expected<u64> addr = sym.getAddress();

			if (!addr)
			{
				consumeError(addr.takeError());
				continue;
			}

			jit_announce(*addr, size, {name->data(), name->size()});
		}
	}
};
// Simple memory manager
struct MemoryManager1 : llvm::RTDyldMemoryManager
{
@ -429,7 +468,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
~MemoryManager1() override
{
utils::memory_release(ptr, c_max_size * 2);
// Hack: don't release to prevent reuse of address space, see jit_announce
utils::memory_decommit(ptr, c_max_size * 2);
}
llvm::JITSymbol findSymbol(const std::string& name) override
@ -812,6 +852,12 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
}
}
// Attach the JIT event listeners only when an engine exists;
// a failed engine creation is reported by the null check that follows
if (m_engine && (!_link.empty() || !(flags & 0x1)))
{
	m_engine->RegisterJITEventListener(llvm::JITEventListener::createIntelJITEventListener());
	m_engine->RegisterJITEventListener(new JITAnnouncer);
}
if (!m_engine)
{
fmt::throw_exception("LLVM: Failed to create ExecutionEngine: %s", result);

View file

@ -34,6 +34,13 @@
#include <string_view>
#include <unordered_map>
void jit_announce(uptr func, usz size, std::string_view name);
void jit_announce(auto* func, usz size, std::string_view name)
{
jit_announce(uptr(func), size, name);
}
enum class jit_class
{
ppu_code,
@ -161,7 +168,7 @@ namespace asmjit
// Build runtime function with asmjit::X86Assembler
template <typename FT, typename F>
inline FT build_function_asm(F&& builder)
inline FT build_function_asm(std::string_view name, F&& builder)
{
using namespace asmjit;
@ -195,6 +202,7 @@ inline FT build_function_asm(F&& builder)
return nullptr;
}
jit_announce(result, code.getCodeSize(), name);
return result;
}
@ -210,8 +218,8 @@ public:
built_function& operator=(const built_function&) = delete;
template <typename F>
built_function(F&& builder)
: m_func(ensure(build_function_asm<FT>(std::forward<F>(builder))))
built_function(std::string_view name, F&& builder)
: m_func(ensure(build_function_asm<FT>(name, std::forward<F>(builder))))
{
}
@ -238,7 +246,7 @@ public:
built_function& operator=(const built_function&) = delete;
template <typename F>
built_function(F&& builder)
built_function(std::string_view name, F&& builder)
{
using namespace asmjit;
@ -270,6 +278,10 @@ public:
{
ensure(false);
}
else
{
jit_announce(result, code.getCodeSize(), name);
}
}
operator FT() const noexcept

View file

@ -2190,7 +2190,7 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept
thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* _base))
{
return build_function_asm<native_entry>([&](asmjit::X86Assembler& c, auto& args)
return build_function_asm<native_entry>("thread_base_trampoline", [&](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;

View file

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ImportGroup Label="PropertySheets" />
<PropertyGroup Label="UserMacros" />
@ -61,6 +61,7 @@
LLVMBitWriter.lib;
LLVMCoroutines.lib;
LLVMObjCARCOpts.lib;
LLVMIntelJITEvents.lib;
</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>

View file

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ImportGroup Label="PropertySheets" />
<PropertyGroup Label="UserMacros" />
@ -62,6 +62,7 @@
LLVMBitWriter.lib;
LLVMCoroutines.lib;
LLVMObjCARCOpts.lib;
LLVMIntelJITEvents.lib;
</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>

2
llvm

@ -1 +1 @@
Subproject commit 318b8fe3746615f914522d4e177c537ce80d1d08
Subproject commit a670c459ea782411885b1e9861c89d04609d648f

View file

@ -39,9 +39,9 @@
</ImportGroup>
<PropertyGroup Label="UserMacros">
<CmakeReleaseCLI>call vsdevcmd.bat -arch=amd64
cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm</CmakeReleaseCLI>
cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm</CmakeReleaseCLI>
<CmakeDebugCLI>call vsdevcmd.bat -arch=amd64
cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm</CmakeDebugCLI>
cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm</CmakeDebugCLI>
<CmakeCleanCLI>echo Cleaning..
for /F "delims= eol=|" %%f in ('
dir /b ^| findstr /V "[^.]*\build[^.]*\.vcxproj"') do (

View file

@ -39,9 +39,9 @@
</ImportGroup>
<PropertyGroup Label="UserMacros">
<CmakeReleaseCLI>call vsdevcmd.bat -arch=amd64
cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm</CmakeReleaseCLI>
cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm</CmakeReleaseCLI>
<CmakeDebugCLI>call vsdevcmd.bat -arch=amd64
cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm</CmakeDebugCLI>
cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm</CmakeDebugCLI>
<CmakeCleanCLI>echo Cleaning..
for /F "delims= eol=|" %%f in ('
dir /b ^| findstr /V "[^.]*\build[^.]*\.vcxproj"') do (

View file

@ -1910,14 +1910,14 @@ std::vector<ppu_function_t>& ppu_function_manager::access(bool ghc)
static std::vector<ppu_function_t> list_ghc
{
build_function_asm<ppu_function_t>([](asmjit::X86Assembler& c, auto& args)
build_function_asm<ppu_function_t>("ppu_unregistered", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
c.mov(args[0], x86::rbp);
c.jmp(imm_ptr(list[0]));
}),
build_function_asm<ppu_function_t>([](asmjit::X86Assembler& c, auto& args)
build_function_asm<ppu_function_t>("ppu_return", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -1937,7 +1937,7 @@ u32 ppu_function_manager::add_function(ppu_function_t function)
list.push_back(function);
// Generate trampoline
list2.push_back(build_function_asm<ppu_function_t>([&](asmjit::X86Assembler& c, auto& args)
list2.push_back(build_function_asm<ppu_function_t>("ppu_trampolinea", [&](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;

View file

@ -147,7 +147,7 @@ static bool ppu_break(ppu_thread& ppu, ppu_opcode_t op);
extern void do_cell_atomic_128_store(u32 addr, const void* to_write);
const auto ppu_gateway = built_function<void(*)(ppu_thread*)>([](asmjit::X86Assembler& c, auto& args)
const auto ppu_gateway = built_function<void(*)(ppu_thread*)>("ppu_gateway", [](asmjit::X86Assembler& c, auto& args)
{
// Gateway for PPU, converts from native to GHC calling convention, also saves RSP value for escape
using namespace asmjit;
@ -248,7 +248,7 @@ const auto ppu_gateway = built_function<void(*)(ppu_thread*)>([](asmjit::X86Asse
c.ret();
});
const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>([](asmjit::X86Assembler& c, auto& args)
const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_escape", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -256,12 +256,13 @@ const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>([](asmji
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)));
// Return to the return location
c.jmp(x86::qword_ptr(x86::rsp, -8));
c.sub(x86::rsp, 8);
c.ret();
});
void ppu_recompiler_fallback(ppu_thread& ppu);
const auto ppu_recompiler_fallback_ghc = build_function_asm<void(*)(ppu_thread& ppu)>([](asmjit::X86Assembler& c, auto& args)
const auto ppu_recompiler_fallback_ghc = build_function_asm<void(*)(ppu_thread& ppu)>("ppu_trampolineb", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -1816,7 +1817,7 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
return ppu_load_acquire_reservation<u64>(ppu, addr);
}
const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>([](asmjit::X86Assembler& c, auto& args)
const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>("ppu_stcx_accurate_tx", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -1832,11 +1833,7 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.push(x86::r14);
c.push(x86::r15);
c.sub(x86::rsp, 40);
#ifdef _WIN32
if (!s_tsx_avx)
@ -1847,7 +1844,7 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r12);
build_swap_rdx_with(c, args, x86::r10);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.and_(x86::rbp, -128);
@ -1855,11 +1852,9 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
c.movzx(args[0].r32(), args[0].r16());
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.and_(x86::rbx, -128 / 2);
c.prefetchw(x86::byte_ptr(x86::rbx));
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.and_(x86::r11, -128 / 2);
c.and_(args[0].r32(), 63);
c.mov(x86::r13, args[1]);
// Prepare data
if (s_tsx_avx)
@ -1894,8 +1889,6 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.jae(fall);
});
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
@ -1939,7 +1932,7 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]);
c.xend();
c.lock().add(x86::qword_ptr(x86::rbx), 64);
c.lock().add(x86::qword_ptr(x86::r11), 64);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.jmp(_ret);
@ -1975,7 +1968,7 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
c.jmp(_ret);
c.bind(fail2);
c.lock().sub(x86::qword_ptr(x86::rbx), 1);
c.lock().sub(x86::qword_ptr(x86::r11), 64);
c.bind(load);
// Store previous data back to rdata
@ -2019,12 +2012,17 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
}
c.add(x86::rsp, 40);
c.pop(x86::r15);
c.pop(x86::r14);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
#ifdef __linux__
// Hack for perf profiling (TODO)
Label ret2 = c.newLabel();
c.lea(x86::rdx, x86::qword_ptr(ret2));
c.push(x86::rdx);
c.push(x86::rdx);
c.bind(ret2);
#endif
c.ret();
});

View file

@ -904,6 +904,10 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
spu_log.fatal("Failed to build a function");
}
else
{
jit_announce(fn, code.getCodeSize(), fmt::format("spu-b-%s", fmt::base57(be_t<u64>(m_hash_start))));
}
// Install compiled function pointer
const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn);

View file

@ -1733,7 +1733,7 @@ bool spu_interpreter::SHUFB(spu_thread& spu, spu_opcode_t op)
return true;
}
const spu_inter_func_t optimized_shufb = build_function_asm<spu_inter_func_t>([](asmjit::X86Assembler& c, auto& /*args*/)
const spu_inter_func_t optimized_shufb = build_function_asm<spu_inter_func_t>("spu_shufb", [](asmjit::X86Assembler& c, auto& /*args*/)
{
using namespace asmjit;

View file

@ -160,7 +160,7 @@ DECLARE(spu_runtime::tr_all) = []
return reinterpret_cast<spu_function_t>(trptr);
}();
DECLARE(spu_runtime::g_gateway) = built_function<spu_function_t>([](asmjit::X86Assembler& c, auto& args)
DECLARE(spu_runtime::g_gateway) = built_function<spu_function_t>("spu_gateway", [](asmjit::X86Assembler& c, auto& args)
{
// Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape
using namespace asmjit;
@ -249,7 +249,7 @@ DECLARE(spu_runtime::g_gateway) = built_function<spu_function_t>([](asmjit::X86A
c.ret();
});
DECLARE(spu_runtime::g_escape) = build_function_asm<void(*)(spu_thread*)>([](asmjit::X86Assembler& c, auto& args)
DECLARE(spu_runtime::g_escape) = build_function_asm<void(*)(spu_thread*)>("spu_escape", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -257,10 +257,11 @@ DECLARE(spu_runtime::g_escape) = build_function_asm<void(*)(spu_thread*)>([](asm
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)));
// Return to the return location
c.jmp(x86::qword_ptr(x86::rsp, -8));
c.sub(x86::rsp, 8);
c.ret();
});
DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, spu_function_t, u8*)>([](asmjit::X86Assembler& c, auto& args)
DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, spu_function_t, u8*)>("spu_tail_escape", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -268,14 +269,15 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, sp
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)));
// Adjust stack for initial call instruction in the gateway
c.sub(x86::rsp, 8);
c.sub(x86::rsp, 16);
// Tail call, GHC CC (second arg)
c.mov(x86::r13, args[0]);
c.mov(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::ls)));
c.mov(x86::r12, args[2]);
c.xor_(x86::ebx, x86::ebx);
c.jmp(args[1]);
c.mov(x86::qword_ptr(x86::rsp), args[1]);
c.ret();
});
DECLARE(spu_runtime::g_interpreter_table) = {};
@ -1066,6 +1068,8 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
workload.clear();
result = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
jit_announce(wxptr, raw - wxptr, "spu_ubertrampoline");
}
if (auto _old = stuff_it->trampoline.compare_and_swap(nullptr, result))
@ -3480,7 +3484,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
#endif
// Get function chunk name
const std::string name = fmt::format("spu-chunk-0x%05x", addr);
const std::string name = fmt::format("spu-cx%05x-%s", addr, fmt::base57(be_t<u64>{m_hash_start}));
llvm::Function* result = llvm::cast<llvm::Function>(m_module->getOrInsertFunction(name, chunk_type).getCallee());
// Set parameters
@ -3505,7 +3509,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
// 5. $3
const auto func_type = get_ftype<u32[4], u8*, u8*, u32, u32[4], u32[4]>();
const std::string fname = fmt::format("spu-function-0x%05x", addr);
const std::string fname = fmt::format("spu-fx%05x-%s", addr, fmt::base57(be_t<u64>{m_hash_start}));
llvm::Function* fn = llvm::cast<llvm::Function>(m_module->getOrInsertFunction(fname, func_type).getCallee());
fn->setLinkage(llvm::GlobalValue::InternalLinkage);

View file

@ -405,7 +405,7 @@ std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
return res;
}
const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>("spu_putllc_tx", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -420,12 +420,8 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.sub(x86::rsp, 168);
#ifdef _WIN32
c.sub(x86::rsp, 168);
if (s_tsx_avx)
{
c.vmovups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
@ -447,16 +443,14 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r12);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
build_swap_rdx_with(c, args, x86::r10);
c.mov(args[1], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(args[1], x86::qword_ptr(args[1], args[0]));
c.prefetchw(x86::byte_ptr(args[1], 0));
c.prefetchw(x86::byte_ptr(args[1], 64));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.prefetchw(x86::byte_ptr(x86::rbx));
c.mov(x86::r13, args[1]);
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
// Prepare data
if (s_tsx_avx)
@ -504,8 +498,6 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.jae(fall);
});
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
@ -514,10 +506,10 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
if (s_tsx_avx)
{
c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(args[1], 0));
c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(args[1], 32));
c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(args[1], 64));
c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(args[1], 96));
c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
@ -525,14 +517,14 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
}
else
{
c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
c.xorps(x86::xmm0, x86::oword_ptr(args[1], 0));
c.xorps(x86::xmm1, x86::oword_ptr(args[1], 16));
c.xorps(x86::xmm2, x86::oword_ptr(args[1], 32));
c.xorps(x86::xmm3, x86::oword_ptr(args[1], 48));
c.xorps(x86::xmm4, x86::oword_ptr(args[1], 64));
c.xorps(x86::xmm5, x86::oword_ptr(args[1], 80));
c.xorps(x86::xmm6, x86::oword_ptr(args[1], 96));
c.xorps(x86::xmm7, x86::oword_ptr(args[1], 112));
c.orps(x86::xmm0, x86::xmm1);
c.orps(x86::xmm2, x86::xmm3);
c.orps(x86::xmm4, x86::xmm5);
@ -547,25 +539,25 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
if (s_tsx_avx)
{
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm4);
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm5);
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm6);
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm7);
c.vmovaps(x86::yword_ptr(args[1], 0), x86::ymm4);
c.vmovaps(x86::yword_ptr(args[1], 32), x86::ymm5);
c.vmovaps(x86::yword_ptr(args[1], 64), x86::ymm6);
c.vmovaps(x86::yword_ptr(args[1], 96), x86::ymm7);
}
else
{
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm8);
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm9);
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm10);
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm11);
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm12);
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm13);
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm14);
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm15);
c.movaps(x86::oword_ptr(args[1], 0), x86::xmm8);
c.movaps(x86::oword_ptr(args[1], 16), x86::xmm9);
c.movaps(x86::oword_ptr(args[1], 32), x86::xmm10);
c.movaps(x86::oword_ptr(args[1], 48), x86::xmm11);
c.movaps(x86::oword_ptr(args[1], 64), x86::xmm12);
c.movaps(x86::oword_ptr(args[1], 80), x86::xmm13);
c.movaps(x86::oword_ptr(args[1], 96), x86::xmm14);
c.movaps(x86::oword_ptr(args[1], 112), x86::xmm15);
}
c.xend();
c.lock().add(x86::qword_ptr(x86::rbx), 64);
c.lock().add(x86::qword_ptr(x86::r11), 64);
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
@ -577,21 +569,21 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
// Load previous data to store back to rdata
if (s_tsx_avx)
{
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
c.vmovaps(x86::ymm0, x86::yword_ptr(args[1], 0));
c.vmovaps(x86::ymm1, x86::yword_ptr(args[1], 32));
c.vmovaps(x86::ymm2, x86::yword_ptr(args[1], 64));
c.vmovaps(x86::ymm3, x86::yword_ptr(args[1], 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
c.movaps(x86::xmm0, x86::oword_ptr(args[1], 0));
c.movaps(x86::xmm1, x86::oword_ptr(args[1], 16));
c.movaps(x86::xmm2, x86::oword_ptr(args[1], 32));
c.movaps(x86::xmm3, x86::oword_ptr(args[1], 48));
c.movaps(x86::xmm4, x86::oword_ptr(args[1], 64));
c.movaps(x86::xmm5, x86::oword_ptr(args[1], 80));
c.movaps(x86::xmm6, x86::oword_ptr(args[1], 96));
c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
}
c.xend();
@ -603,7 +595,7 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
c.jmp(_ret);
c.bind(fail2);
c.lock().sub(x86::qword_ptr(x86::rbx), 64);
c.lock().sub(x86::qword_ptr(x86::r11), 64);
c.bind(load);
// Store previous data back to rdata
@ -652,6 +644,7 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
c.movups(x86::xmm14, x86::oword_ptr(x86::rsp, 128));
c.movups(x86::xmm15, x86::oword_ptr(x86::rsp, 144));
}
c.add(x86::rsp, 168);
#endif
if (s_tsx_avx)
@ -659,15 +652,18 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
c.vzeroupper();
}
c.add(x86::rsp, 168);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
#ifdef __linux__
// Hack for perf profiling (TODO)
Label ret2 = c.newLabel();
c.lea(x86::rdx, x86::qword_ptr(ret2));
c.push(x86::rdx);
c.push(x86::rdx);
c.bind(ret2);
#endif
c.ret();
});
// NOTE(review): this excerpt is a diff rendering whose +/- markers were lost,
// so pre-change and post-change lines sit side by side (the two declarations
// below, rbp/rbx/r12/r13 lines next to r10/r11/args[1] lines, and the
// "@ -a,b +c,d @" hunk headers). Confirm which variant is live before relying
// on any single line.
//
// spu_putlluc_tx: routine emitted through asmjit with the signature
//   u64(u32 raddr, const void* rdata, u64* _stx, u64* _ftx).
// Visible behavior: vector registers are stored to (value of vm::g_sudo_addr
// + args[0]) between xbegin and xend, 128 bytes in total; afterwards a locked
// add of 32 is applied to a qword inside vm::g_reservations and the qword at
// args[2] is incremented. Parts of the body are elided by the hunk boundaries.
const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata, u64* _stx, u64* _ftx)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata, u64* _stx, u64* _ftx)>("spu_putlluc_tx", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -680,30 +676,20 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
// NOTE(review): "sub rsp, 40" shows up both outside and inside the _WIN32
// guard (two diff variants); the matching "add rsp, 40" in the epilogue has
// to follow whichever one is kept.
c.sub(x86::rsp, 40);
#ifdef _WIN32
c.sub(x86::rsp, 40);
// Without the AVX path xmm6/xmm7 are used for data, so they are saved at
// [rsp+0] and [rsp+16] here and reloaded from the same slots before returning.
if (!s_tsx_avx)
{
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
}
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r12);
// Destination pointer = value stored at vm::g_sudo_addr + args[0]
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
// Byte offset into vm::g_reservations: (args[0] & 0xff80) >> 1
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.prefetchw(x86::byte_ptr(x86::rbx));
c.mov(x86::r13, args[1]);
// Same setup in the other diff variant, using r10/r11 instead of r12/rbp
build_swap_rdx_with(c, args, x86::r10);
c.mov(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
c.prefetchw(x86::byte_ptr(x86::r11, 0));
c.prefetchw(x86::byte_ptr(x86::r11, 64));
// Prepare data
if (s_tsx_avx)
@ -725,6 +711,10 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
}
// In this variant args[1] is overwritten with the reservation address; the
// visible data load through args[1] precedes this point.
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(args[1], x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
build_get_tsc(c, stamp0);
@ -739,35 +729,29 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
c.jae(fall);
});
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
// // Check pause flag
// c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
// c.jc(fall);
// Start of the transactional region; tx1 is declared in lines not shown here
// (presumably the abort target -- TODO confirm).
c.xbegin(tx1);
if (s_tsx_avx)
{
// 4 x 32-byte stores = 128 bytes (rbp variant, then r11 variant)
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm0);
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm3);
c.vmovaps(x86::yword_ptr(x86::r11, 0), x86::ymm0);
c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3);
}
else
{
// 8 x 16-byte stores = 128 bytes (rbp variant, then r11 variant)
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm0);
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm1);
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm2);
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm3);
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm4);
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm5);
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm6);
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm7);
c.movaps(x86::oword_ptr(x86::r11, 0), x86::xmm0);
c.movaps(x86::oword_ptr(x86::r11, 16), x86::xmm1);
c.movaps(x86::oword_ptr(x86::r11, 32), x86::xmm2);
c.movaps(x86::oword_ptr(x86::r11, 48), x86::xmm3);
c.movaps(x86::oword_ptr(x86::r11, 64), x86::xmm4);
c.movaps(x86::oword_ptr(x86::r11, 80), x86::xmm5);
c.movaps(x86::oword_ptr(x86::r11, 96), x86::xmm6);
c.movaps(x86::oword_ptr(x86::r11, 112), x86::xmm7);
}
c.xend();
// Locked add of 32 to the reservation qword, issued after xend
// (address held in rbx in one variant, in args[1] in the other)
c.lock().add(x86::qword_ptr(x86::rbx), 32);
c.lock().add(x86::qword_ptr(args[1]), 32);
// stx++
c.add(x86::qword_ptr(args[2]), 1);
build_get_tsc(c);
@ -786,6 +770,7 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
// Reload xmm6/xmm7 from the stack slots written in the prologue
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
}
c.add(x86::rsp, 40);
#endif
if (s_tsx_avx)
@ -793,15 +778,18 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
c.vzeroupper();
}
// NOTE(review): the stack adjustment and pops below must mirror the prologue
// lines that survive the change; verify once the diff variants are resolved.
c.add(x86::rsp, 40);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
#ifdef __linux__
// Hack for perf profiling (TODO)
// The address of ret2 is pushed twice and ret2 is bound directly on the ret
// below, so that ret executes three times: twice returning onto itself, then
// to the real caller. rdx is overwritten by the lea.
Label ret2 = c.newLabel();
c.lea(x86::rdx, x86::qword_ptr(ret2));
c.push(x86::rdx);
c.push(x86::rdx);
c.bind(ret2);
#endif
c.ret();
});
// NOTE(review): this excerpt is a diff rendering whose +/- markers were lost,
// so pre-change and post-change lines sit side by side (the two declarations
// below, rbx/r12 lines next to r10/r11 lines, and the "@ -a,b +c,d @" hunk
// headers). Confirm which variant is live before relying on any single line.
//
// spu_getllar_tx: routine emitted through asmjit with the signature
//   u64(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime).
// Visible behavior: computes a pointer from vm::g_sudo_addr + args[0] and a
// reservation address inside vm::g_reservations, jumps to `fall` when the
// pause bit is set in the cpu_thread state reached through args[2], and
// jumps to `fall` when the reservation value (low 7 bits masked off) differs
// from args[3]. The rest of the body is elided by the hunk boundaries.
const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>("spu_getllar_tx", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -815,8 +803,6 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
// NOTE(review): it is unclear from this rendering which of the pushes below
// survive the change; they must stay paired with the pops in the epilogue.
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.sub(x86::rsp, 40);
#ifdef _WIN32
@ -828,13 +814,12 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r12);
build_swap_rdx_with(c, args, x86::r10);
// rbp = value stored at vm::g_sudo_addr + args[0]
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
// Byte offset into vm::g_reservations: (args[0] & 0xff80) >> 1
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
// Reservation address goes to rbx in one diff variant and to r11 in the other
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.mov(x86::r13, args[1]);
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
@ -853,7 +838,7 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
c.jc(fall);
// Load the reservation qword (rbx variant, then r11 variant), clear its low
// 7 bits and bail out to `fall` if the result is not equal to args[3]
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.mov(x86::rax, x86::qword_ptr(x86::r11));
c.and_(x86::rax, -128);
c.cmp(x86::rax, args[3]);
c.jne(fall);
@ -926,9 +911,16 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
c.add(x86::rsp, 40);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
#ifdef __linux__
// Hack for perf profiling (TODO)
// The address of ret2 is pushed twice and ret2 is bound directly on the ret
// below, so that ret executes three times: twice returning onto itself, then
// to the real caller. rdx is overwritten by the lea.
Label ret2 = c.newLabel();
c.lea(x86::rdx, x86::qword_ptr(ret2));
c.push(x86::rdx);
c.push(x86::rdx);
c.bind(ret2);
#endif
c.ret();
});

View file

@ -256,9 +256,9 @@ namespace
}
}
// NOTE(review): each object below appears twice because this text is a diff
// rendering without +/- markers: first the form without a name argument, then
// the form that also passes the function's name as a string.
//
// copy_data_swap_u32: void(void*, const void*, u32), generated by
// build_copy_data_swap_u32<false>.
built_function<void(*)(void*, const void*, u32)> copy_data_swap_u32(&build_copy_data_swap_u32<false>);
built_function<void(*)(void*, const void*, u32)> copy_data_swap_u32("copy_data_swap_u32", &build_copy_data_swap_u32<false>);
// copy_data_swap_u32_cmp: same parameters but returns bool, generated by
// build_copy_data_swap_u32<true> (meaning of the result is defined by that
// builder, which is outside this excerpt -- TODO confirm).
built_function<bool(*)(void*, const void*, u32)> copy_data_swap_u32_cmp(&build_copy_data_swap_u32<true>);
built_function<bool(*)(void*, const void*, u32)> copy_data_swap_u32_cmp("copy_data_swap_u32_cmp", &build_copy_data_swap_u32<true>);
namespace
{