types.hpp: remove intrinsic includes

Replace v128 with u128 in some places.
Remove some unused files.
Nekotekina 2020-12-21 17:12:05 +03:00
parent 5f618814f6
commit bd269bccaf
64 changed files with 899 additions and 2265 deletions

View file

@ -243,6 +243,18 @@ void fmt_class_string<v128>::format(std::string& out, u64 arg)
fmt::append(out, "0x%016llx%016llx", vec._u64[1], vec._u64[0]);
}
template <>
void fmt_class_string<u128>::format(std::string& out, u64 arg)
{
// TODO: it should be supported as full-fledged integral type (with %u, %d, etc, fmt)
const u128& num = get_object(arg);
#ifdef _MSC_VER
fmt::append(out, "0x%016llx%016llx", num.hi, num.lo);
#else
fmt::append(out, "0x%016llx%016llx", static_cast<u64>(num >> 64), static_cast<u64>(num));
#endif
}
template <>
void fmt_class_string<src_loc>::format(std::string& out, u64 arg)
{

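For reference, the hi/lo split used by the new u128 formatter can be reproduced standalone; a minimal sketch assuming a GCC/Clang-style native 128-bit integer (the MSVC path reads the struct members .hi and .lo instead):

#include <cstdint>
#include <cstdio>

int main()
{
	// Build a 128-bit value and print it the same way the formatter does:
	// as two zero-padded 64-bit hex halves, most significant half first.
	unsigned __int128 num = (static_cast<unsigned __int128>(0x0123456789abcdefull) << 64) | 0xfedcba9876543210ull;
	const auto hi = static_cast<std::uint64_t>(num >> 64);
	const auto lo = static_cast<std::uint64_t>(num);
	std::printf("0x%016llx%016llx\n", static_cast<unsigned long long>(hi), static_cast<unsigned long long>(lo));
	// Prints: 0x0123456789abcdeffedcba9876543210
}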
View file

@ -76,6 +76,8 @@
#include "util/vm.hpp"
#include "util/logs.hpp"
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/sysinfo.hpp"
#include "Emu/Memory/vm_locking.h"

File diff suppressed because it is too large

View file

@ -5,7 +5,6 @@
#include "Utilities/mutex.h"
#include <cmath>
#include "util/v128.hpp"
#include "util/asm.hpp"
LOG_CHANNEL(edat_log, "EDAT");
@ -138,15 +137,15 @@ std::tuple<u64, s32, s32> dec_section(unsigned char* metadata)
return std::make_tuple(offset, length, compression_end);
}
v128 get_block_key(int block, NPD_HEADER *npd)
u128 get_block_key(int block, NPD_HEADER *npd)
{
unsigned char empty_key[0x10] = {};
unsigned char *src_key = (npd->version <= 1) ? empty_key : npd->dev_hash;
v128 dest_key{};
memcpy(dest_key._bytes, src_key, 0xC);
u128 dest_key{};
std::memcpy(&dest_key, src_key, 0xC);
s32 swappedBlock = swap32(block);
memcpy(&dest_key._bytes[0xC], &swappedBlock, sizeof(swappedBlock));
std::memcpy(reinterpret_cast<uchar*>(&dest_key) + 0xC, &swappedBlock, sizeof(swappedBlock));
return dest_key;
}
@ -251,7 +250,7 @@ s64 decrypt_block(const fs::file* in, u8* out, EDAT_HEADER *edat, NPD_HEADER *np
auto b_key = get_block_key(block_num, npd);
// Encrypt the block key with the crypto key.
aesecb128_encrypt(crypt_key, b_key._bytes, key_result);
aesecb128_encrypt(crypt_key, reinterpret_cast<uchar*>(&b_key), key_result);
if ((edat->flags & EDAT_FLAG_0x10) != 0)
aesecb128_encrypt(crypt_key, key_result, hash); // If FLAG 0x10 is set, encrypt again to get the final hash.
else
@ -556,9 +555,10 @@ int validate_dev_klic(const u8* klicensee, NPD_HEADER *npd)
memcpy(dev + 0xC, &type, 4);
// Check for an empty dev_hash (can't validate if devklic is NULL);
auto klic = v128::loadu(klicensee);
u128 klic;
std::memcpy(&klic, klicensee, sizeof(klic));
if (klic == v128{})
if (!klic)
{
// Allow empty dev hash.
return 1;
@ -566,10 +566,10 @@ int validate_dev_klic(const u8* klicensee, NPD_HEADER *npd)
else
{
// Generate klicensee xor key.
auto key = klic ^ std::bit_cast<v128>(NP_OMAC_KEY_2);
u128 key = klic ^ std::bit_cast<u128>(NP_OMAC_KEY_2);
// Hash with generated key and compare with dev_hash.
return cmac_hash_compare(key._bytes, 0x10, dev, 0x60, npd->dev_hash, 0x10);
return cmac_hash_compare(reinterpret_cast<uchar*>(&key), 0x10, dev, 0x60, npd->dev_hash, 0x10);
}
}
@ -668,7 +668,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
}
// Set decryption key.
v128 key{};
u128 key{};
// Check EDAT/SDAT flag.
if ((EDAT.flags & SDAT_FLAG) == SDAT_FLAG)
@ -682,7 +682,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
}
// Generate SDAT key.
key = std::bit_cast<v128>(NPD.dev_hash) ^ std::bit_cast<v128>(SDAT_KEY);
key = std::bit_cast<u128>(NPD.dev_hash) ^ std::bit_cast<u128>(SDAT_KEY);
}
else
{
@ -715,7 +715,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
memcpy(&key, rifkey, 0x10);
// Make sure we don't have an empty RIF key.
if (key == v128{})
if (!key)
{
edat_log.error("EDAT: A valid RAP file is needed for this EDAT file! (local activation)");
return 1;
@ -726,7 +726,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
memcpy(&key, rifkey, 0x10);
// Make sure we don't have an empty RIF key.
if (key == v128{})
if (!key)
{
edat_log.error("EDAT: A valid RAP file is needed for this EDAT file! (network activation)");
return 1;
@ -735,7 +735,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
if (verbose)
{
be_t<v128> data;
be_t<u128> data;
std::memcpy(&data, devklic, sizeof(data));
edat_log.notice("DEVKLIC: %s", data);
@ -746,18 +746,18 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
if (verbose)
{
edat_log.notice("DECRYPTION KEY: %s", std::bit_cast<be_t<v128>>(key));
edat_log.notice("DECRYPTION KEY: %s", std::bit_cast<be_t<u128>>(key));
}
input->seek(0);
if (check_data(key._bytes, &EDAT, &NPD, input, verbose))
if (check_data(reinterpret_cast<uchar*>(&key), &EDAT, &NPD, input, verbose))
{
edat_log.error("EDAT: Data parsing failed!");
return 1;
}
input->seek(0);
if (decrypt_data(input, output, &EDAT, &NPD, key._bytes, verbose))
if (decrypt_data(input, output, &EDAT, &NPD, reinterpret_cast<uchar*>(&key), verbose))
{
edat_log.error("EDAT: Data decryption failed!");
return 1;
@ -766,14 +766,14 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
return 0;
}
v128 GetEdatRifKeyFromRapFile(const fs::file& rap_file)
u128 GetEdatRifKeyFromRapFile(const fs::file& rap_file)
{
v128 rapkey{};
v128 rifkey{};
u128 rapkey{};
u128 rifkey{};
rap_file.read<v128>(rapkey);
rap_file.read<u128>(rapkey);
rap_to_rif(rapkey._bytes, rifkey._bytes);
rap_to_rif(reinterpret_cast<uchar*>(&rapkey), reinterpret_cast<uchar*>(&rifkey));
return rifkey;
}
@ -824,8 +824,8 @@ fs::file DecryptEDAT(const fs::file& input, const std::string& input_file_name,
input.seek(0);
// Set keys (RIF and DEVKLIC).
v128 rifKey{};
v128 devklic{};
u128 rifKey{};
u128 devklic{};
// Select the EDAT key mode.
switch (mode)
@ -879,7 +879,7 @@ fs::file DecryptEDAT(const fs::file& input, const std::string& input_file_name,
// Delete the bad output file if any errors arise.
fs::file output = fs::make_stream<std::vector<u8>>();
if (extract_all_data(&input, &output, input_file_name.c_str(), devklic._bytes, rifKey._bytes, verbose))
if (extract_all_data(&input, &output, input_file_name.c_str(), reinterpret_cast<uchar*>(&devklic), reinterpret_cast<uchar*>(&rifKey), verbose))
{
output.release();
return fs::file{};
@ -905,12 +905,12 @@ bool EDATADecrypter::ReadHeader()
if ((edatHeader.flags & SDAT_FLAG) == SDAT_FLAG)
{
// Generate SDAT key.
dec_key = std::bit_cast<v128>(npdHeader.dev_hash) ^ std::bit_cast<v128>(SDAT_KEY);
dec_key = std::bit_cast<u128>(npdHeader.dev_hash) ^ std::bit_cast<u128>(SDAT_KEY);
}
else
{
// verify key
if (validate_dev_klic(dev_key._bytes, &npdHeader) == 0)
if (validate_dev_klic(reinterpret_cast<uchar*>(&dev_key), &npdHeader) == 0)
{
edat_log.error("EDAT: Failed validating klic");
return false;
@ -923,7 +923,7 @@ bool EDATADecrypter::ReadHeader()
{
dec_key = std::move(rif_key);
if (dec_key == v128{})
if (!dec_key)
{
edat_log.warning("EDAT: Empty Dec key for local activation!");
}
@ -932,7 +932,7 @@ bool EDATADecrypter::ReadHeader()
{
dec_key = std::move(rif_key);
if (dec_key == v128{})
if (!dec_key)
{
edat_log.warning("EDAT: Empty Dec key for network activation!");
}
@ -978,7 +978,7 @@ u64 EDATADecrypter::ReadData(u64 pos, u8* data, u64 size)
for (u32 i = starting_block; i < ending_block; ++i)
{
edata_file.seek(0);
u64 res = decrypt_block(&edata_file, &data_buf[writeOffset], &edatHeader, &npdHeader, dec_key._bytes, i, total_blocks, edatHeader.file_size);
u64 res = decrypt_block(&edata_file, &data_buf[writeOffset], &edatHeader, &npdHeader, reinterpret_cast<uchar*>(&dec_key), i, total_blocks, edatHeader.file_size);
if (res == umax)
{
edat_log.error("Error Decrypting data");

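As context for the v128 to u128 switch in this file, the EDAT code only ever treats the key as 16 raw bytes (memcpy in, XOR, pass a byte pointer to the crypto helpers), so any trivially copyable 16-byte type works. An illustrative standalone sketch of that pattern using a plain byte array as a stand-in (not the emulator's types):

#include <array>
#include <cstddef>
#include <cstdio>

using key128 = std::array<unsigned char, 16>; // stand-in for u128 used as a raw key blob

// XOR two 16-byte keys, as the SDAT path does via bit_cast<u128>(dev_hash) ^ bit_cast<u128>(SDAT_KEY).
key128 xor_key(const key128& a, const key128& b)
{
	key128 out{};
	for (std::size_t i = 0; i < out.size(); i++)
		out[i] = static_cast<unsigned char>(a[i] ^ b[i]);
	return out;
}

// Emptiness test, equivalent to the new `if (!klic)` check on u128.
bool is_empty(const key128& k)
{
	return k == key128{};
}

int main()
{
	key128 a{}, b{};
	b[0] = 0x5a;
	std::printf("%d %02x\n", is_empty(a), xor_key(a, b)[0]); // 1 5a
}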
View file

@ -6,8 +6,6 @@
#include "Utilities/File.h"
#include "util/v128.hpp"
constexpr u32 SDAT_FLAG = 0x01000000;
constexpr u32 EDAT_COMPRESSED_FLAG = 0x00000001;
constexpr u32 EDAT_FLAG_0x02 = 0x00000002;
@ -18,8 +16,8 @@ constexpr u32 EDAT_DEBUG_DATA_FLAG = 0x80000000;
struct loaded_npdrm_keys
{
atomic_t<v128> devKlic{};
atomic_t<v128> rifKey{};
atomic_t<u128> devKlic{};
atomic_t<u128> rifKey{};
atomic_t<u32> npdrm_fds{0};
};
@ -49,7 +47,7 @@ extern fs::file DecryptEDAT(const fs::file& input, const std::string& input_file
extern bool VerifyEDATHeaderWithKLicense(const fs::file& input, const std::string& input_file_name, const u8* custom_klic, std::string* contentID);
v128 GetEdatRifKeyFromRapFile(const fs::file& rap_file);
u128 GetEdatRifKeyFromRapFile(const fs::file& rap_file);
struct EDATADecrypter final : fs::file_base
{
@ -66,18 +64,20 @@ struct EDATADecrypter final : fs::file_base
std::unique_ptr<u8[]> data_buf;
u64 data_buf_size{0};
v128 dec_key{};
u128 dec_key{};
// edat usage
v128 rif_key{};
v128 dev_key{};
u128 rif_key{};
u128 dev_key{};
public:
// SdataByFd usage
EDATADecrypter(fs::file&& input)
: edata_file(std::move(input)) {}
// Edat usage
EDATADecrypter(fs::file&& input, const v128& dev_key, const v128& rif_key)
: edata_file(std::move(input)), rif_key(rif_key), dev_key(dev_key) {}
EDATADecrypter(fs::file&& input, const u128& dev_key, const u128& rif_key)
: edata_file(std::move(input))
, rif_key(rif_key)
, dev_key(dev_key) {}
~EDATADecrypter() override {}
// false if invalid

View file

@ -9,8 +9,6 @@
#include <algorithm>
#include <zlib.h>
#include "util/v128.hpp"
inline u8 Read8(const fs::file& f)
{
u8 ret;
@ -1489,7 +1487,7 @@ bool verify_npdrm_self_headers(const fs::file& self, u8* klic_key)
return true;
}
v128 get_default_self_klic()
u128 get_default_self_klic()
{
return std::bit_cast<v128>(NP_KLIC_FREE);
return std::bit_cast<u128>(NP_KLIC_FREE);
}

View file

@ -509,5 +509,4 @@ private:
fs::file decrypt_self(fs::file elf_or_self, u8* klic_key = nullptr, SelfAdditionalInfo* additional_info = nullptr);
bool verify_npdrm_self_headers(const fs::file& self, u8* klic_key = nullptr);
union v128;
v128 get_default_self_klic();
u128 get_default_self_klic();

View file

@ -33,7 +33,6 @@ target_include_directories(rpcs3_emu
# Utilities
target_sources(rpcs3_emu PRIVATE
../util/atomic.cpp
../util/atomic2.cpp
../util/fixed_typemap.cpp
../util/logs.cpp
../util/yaml.cpp

View file

@ -15,6 +15,9 @@
#include <unordered_map>
#include <map>
#include <immintrin.h>
#include <emmintrin.h>
DECLARE(cpu_thread::g_threads_created){0};
DECLARE(cpu_thread::g_threads_deleted){0};
DECLARE(cpu_thread::g_suspend_counter){0};
@ -938,7 +941,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
break;
}
_mm_pause();
utils::pause();
}
// Second increment: all threads paused

View file

@ -2,6 +2,9 @@
#include "CPUTranslator.h"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
llvm::LLVMContext g_llvm_ctx;
cpu_translator::cpu_translator(llvm::Module* _module, bool is_be)

View file

@ -5,6 +5,9 @@
#include "Emu/Cell/lv2/sys_process.h"
#include "Emu/Cell/lv2/sys_event.h"
#include "cellAudio.h"
#include "emmintrin.h"
#include "immintrin.h"
#include <cmath>
LOG_CHANNEL(cellAudio);

View file

@ -3,8 +3,6 @@
#include "stdafx.h"
#include <Emu/Memory/vm_ptr.h>
#include "util/v128.hpp"
// Return codes
enum CellSaveDataError : u32
{
@ -300,7 +298,7 @@ struct CellSaveDataFileSet
be_t<u32> fileOperation;
vm::bptr<void> reserved;
be_t<u32> fileType;
be_t<v128, 1> secureFileId;
be_t<u128, 1> secureFileId;
vm::bptr<char> fileName;
be_t<u32> fileOffset;
be_t<u32> fileSize;

View file

@ -16,6 +16,7 @@
#include "cellSpurs.h"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
LOG_CHANNEL(cellSpurs);

View file

@ -14,6 +14,7 @@
#include <mutex>
#include "util/v128.hpp"
#include "util/v128sse.hpp"
LOG_CHANNEL(cellSpurs);

View file

@ -15,8 +15,6 @@
#include "Emu/NP/np_handler.h"
#include "Emu/NP/np_contexts.h"
#include "util/v128.hpp"
LOG_CHANNEL(sceNp);
template <>
@ -447,12 +445,12 @@ error_code sceNpTerm()
error_code npDrmIsAvailable(vm::cptr<u8> k_licensee_addr, vm::cptr<char> drm_path)
{
v128 k_licensee{};
u128 k_licensee{};
if (k_licensee_addr)
{
std::memcpy(&k_licensee, k_licensee_addr.get_ptr(), sizeof(k_licensee));
sceNp.notice("npDrmIsAvailable(): KLicense key %s", std::bit_cast<be_t<v128>>(k_licensee));
sceNp.notice("npDrmIsAvailable(): KLicense key %s", std::bit_cast<be_t<u128>>(k_licensee));
}
if (Emu.GetCat() == "PE")
@ -488,7 +486,7 @@ error_code npDrmIsAvailable(vm::cptr<u8> k_licensee_addr, vm::cptr<char> drm_pat
if (!k_licensee_addr)
k_licensee = get_default_self_klic();
if (verify_npdrm_self_headers(enc_file, k_licensee._bytes))
if (verify_npdrm_self_headers(enc_file, reinterpret_cast<u8*>(&k_licensee)))
{
npdrmkeys->devKlic = k_licensee;
}
@ -504,7 +502,7 @@ error_code npDrmIsAvailable(vm::cptr<u8> k_licensee_addr, vm::cptr<char> drm_pat
std::string contentID;
if (VerifyEDATHeaderWithKLicense(enc_file, enc_drm_path_local, k_licensee._bytes, &contentID))
if (VerifyEDATHeaderWithKLicense(enc_file, enc_drm_path_local, reinterpret_cast<u8*>(&k_licensee), &contentID))
{
const std::string rap_file = rap_dir_path + contentID + ".rap";
npdrmkeys->devKlic = k_licensee;

View file

@ -12,6 +12,7 @@
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/sysinfo.hpp"
#if !defined(_MSC_VER) && defined(__clang__)

View file

@ -66,6 +66,8 @@
#include "util/asm.hpp"
#include "util/vm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/sysinfo.hpp"
const bool s_use_ssse3 = utils::has_ssse3();

View file

@ -9,6 +9,7 @@
#include "util/endian.hpp"
#include "util/logs.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include <algorithm>
using namespace llvm;

View file

@ -12,6 +12,7 @@
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/sysinfo.hpp"
#include <cmath>
@ -959,7 +960,7 @@ spu_recompiler::XmmLink spu_recompiler::XmmGet(s8 reg, XmmType type) // get xmm
return result;
}
inline asmjit::X86Mem spu_recompiler::XmmConst(v128 data)
inline asmjit::X86Mem spu_recompiler::XmmConst(const v128& data)
{
// Find existing const
auto& xmm_label = xmm_consts[std::make_pair(data._u64[0], data._u64[1])];
@ -980,12 +981,12 @@ inline asmjit::X86Mem spu_recompiler::XmmConst(v128 data)
return asmjit::x86::oword_ptr(xmm_label);
}
inline asmjit::X86Mem spu_recompiler::XmmConst(__m128 data)
inline asmjit::X86Mem spu_recompiler::XmmConst(const __m128& data)
{
return XmmConst(v128::fromF(data));
}
inline asmjit::X86Mem spu_recompiler::XmmConst(__m128i data)
inline asmjit::X86Mem spu_recompiler::XmmConst(const __m128i& data)
{
return XmmConst(v128::fromV(data));
}

View file

@ -5,7 +5,7 @@
#include <functional>
#include "util/v128.hpp"
union v128;
// SPU ASMJIT Recompiler
class spu_recompiler : public spu_recompiler_base
@ -87,9 +87,9 @@ private:
XmmLink XmmAlloc();
XmmLink XmmGet(s8 reg, XmmType type);
asmjit::X86Mem XmmConst(v128 data);
asmjit::X86Mem XmmConst(__m128 data);
asmjit::X86Mem XmmConst(__m128i data);
asmjit::X86Mem XmmConst(const v128& data);
asmjit::X86Mem XmmConst(const __m128& data);
asmjit::X86Mem XmmConst(const __m128i& data);
asmjit::X86Mem get_pc(u32 addr);
void branch_fixed(u32 target, bool absolute = false);

View file

@ -8,6 +8,7 @@ const spu_decoder<spu_itype> s_spu_itype;
const spu_decoder<spu_iflag> s_spu_iflag;
#include "util/v128.hpp"
#include "util/v128sse.hpp"
u32 SPUDisAsm::disasm(u32 pc)
{
@ -161,7 +162,7 @@ std::pair<bool, v128> SPUDisAsm::try_get_const_value(u32 reg, u32 pc) const
return {};
}
typename SPUDisAsm::insert_mask_info SPUDisAsm::try_get_insert_mask_info(v128 mask)
typename SPUDisAsm::insert_mask_info SPUDisAsm::try_get_insert_mask_info(const v128& mask)
{
if ((mask & v128::from8p(0xe0)) != v128{})
{
@ -302,3 +303,29 @@ void SPUDisAsm::IOHL(spu_opcode_t op)
DisAsm("iohl", spu_reg_name[op.rt], op.i16);
}
void SPUDisAsm::SHUFB(spu_opcode_t op)
{
const auto [is_const, value] = try_get_const_value(op.rc);
if (is_const)
{
const auto [size, dst, src] = try_get_insert_mask_info(value);
if (size)
{
if ((size >= 4u && !src) || (size == 2u && src == 1u) || (size == 1u && src == 3u))
{
// Comment insertion pattern for CWD-alike instruction
DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], fmt::format("%s #i%u[%u]", spu_reg_name[op.rc], size * 8, dst).c_str());
return;
}
// Comment insertion pattern for unknown instruction formations
DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], fmt::format("%s #i%u[%u] = [%u]", spu_reg_name[op.rc], size * 8, dst, src).c_str());
return;
}
}
DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]);
}

View file

@ -3,7 +3,7 @@
#include "PPCDisAsm.h"
#include "SPUOpcodes.h"
#include "util/v128.hpp"
union v128;
static constexpr const char* spu_reg_name[128] =
{
@ -172,7 +172,7 @@ public:
u32 src_index;
};
static insert_mask_info try_get_insert_mask_info(v128 mask);
static insert_mask_info try_get_insert_mask_info(const v128& mask);
//0 - 10
void STOP(spu_opcode_t op)
@ -972,31 +972,7 @@ public:
{
DisAsm("selb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]);
}
void SHUFB(spu_opcode_t op)
{
const auto [is_const, value] = try_get_const_value(op.rc);
if (is_const)
{
const auto [size, dst, src] = try_get_insert_mask_info(value);
if (size)
{
if ((size >= 4u && !src) || (size == 2u && src == 1u) || (size == 1u && src == 3u))
{
// Comment insertion pattern for CWD-alike instruction
DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], fmt::format("%s #i%u[%u]", spu_reg_name[op.rc], size * 8, dst).c_str());
return;
}
// Comment insertion pattern for unknown instruction formations
DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], fmt::format("%s #i%u[%u] = [%u]", spu_reg_name[op.rc], size * 8, dst, src).c_str());
return;
}
}
DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]);
}
void SHUFB(spu_opcode_t op);
void MPYA(spu_opcode_t op)
{
DisAsm("mpya", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]);

View file

@ -7,6 +7,7 @@
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/sysinfo.hpp"
#include <cmath>

View file

@ -18,6 +18,7 @@
#include <thread>
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/sysinfo.hpp"
extern atomic_t<const char*> g_progr;

View file

@ -31,6 +31,7 @@
#include "util/vm.hpp"
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/sysinfo.hpp"
using spu_rdata_t = decltype(spu_thread::rdata);
@ -1558,7 +1559,7 @@ void spu_thread::cpu_return()
for (u32 status; !thread->exit_status.try_read(status)
|| status != thread->last_exit_status;)
{
_mm_pause();
utils::pause();
}
}
}

View file

@ -504,7 +504,7 @@ struct spu_imm_table_t
public:
scale_table_t();
FORCE_INLINE __m128 operator [] (s32 scale) const
FORCE_INLINE const auto& operator [](s32 scale) const
{
return m_data[scale + 155].vf;
}

View file

@ -32,7 +32,9 @@ static error_code overlay_load_module(vm::ptr<u32> ovlmid, const std::string& vp
src = std::move(lv2_file);
}
const ppu_exec_object obj = decrypt_self(std::move(src), g_fxo->get<loaded_npdrm_keys>()->devKlic.load()._bytes);
u128 klic = g_fxo->get<loaded_npdrm_keys>()->devKlic.load();
const ppu_exec_object obj = decrypt_self(std::move(src), reinterpret_cast<u8*>(&klic));
if (obj != elf_error::ok)
{

View file

@ -403,10 +403,9 @@ void _sys_process_exit2(ppu_thread& ppu, s32 status, vm::ptr<sys_exit2_param> ar
Emu.disc = std::move(disc);
Emu.hdd1 = std::move(hdd1);
if (klic != v128{})
if (klic)
{
// TODO: Use std::optional
Emu.klic.assign(std::begin(klic._bytes), std::end(klic._bytes));
Emu.klic.emplace_back(klic);
}
Emu.SetForceBoot(true);

View file

@ -263,7 +263,9 @@ static error_code prx_load_module(const std::string& vpath, u64 flags, vm::ptr<s
src = std::move(lv2_file);
}
const ppu_prx_object obj = decrypt_self(std::move(src), g_fxo->get<loaded_npdrm_keys>()->devKlic.load()._bytes);
u128 klic = g_fxo->get<loaded_npdrm_keys>()->devKlic.load();
const ppu_prx_object obj = decrypt_self(std::move(src), reinterpret_cast<u8*>(&klic));
if (obj != elf_error::ok)
{

View file

@ -251,7 +251,9 @@ error_code sys_spu_image_open(ppu_thread& ppu, vm::ptr<sys_spu_image> img, vm::c
return {fs_error, path};
}
const fs::file elf_file = decrypt_self(std::move(file), g_fxo->get<loaded_npdrm_keys>()->devKlic.load()._bytes);
u128 klic = g_fxo->get<loaded_npdrm_keys>()->devKlic.load();
const fs::file elf_file = decrypt_self(std::move(file), reinterpret_cast<u8*>(&klic));
if (!elf_file)
{

View file

@ -323,7 +323,7 @@ namespace vm
break;
}
_mm_pause();
utils::pause();
}
}
@ -525,7 +525,7 @@ namespace vm
break;
}
_mm_pause();
utils::pause();
}
for (auto lock = g_locks.cbegin(), end = lock + g_cfg.core.ppu_threads; lock != end; lock++)
@ -533,7 +533,9 @@ namespace vm
if (auto ptr = +*lock)
{
while (!(ptr->state & cpu_flag::wait))
_mm_pause();
{
utils::pause();
}
}
}
}
@ -1606,7 +1608,7 @@ namespace vm
case 2: atomic_storage<u16>::release(*static_cast<u16*>(dst), *static_cast<u16*>(src)); break;
case 4: atomic_storage<u32>::release(*static_cast<u32*>(dst), *static_cast<u32*>(src)); break;
case 8: atomic_storage<u64>::release(*static_cast<u64*>(dst), *static_cast<u64*>(src)); break;
case 16: _mm_store_si128(static_cast<__m128i*>(dst), _mm_loadu_si128(static_cast<__m128i*>(src))); break;
case 16: atomic_storage<u128>::release(*static_cast<u128*>(dst), *static_cast<u128*>(src)); break;
}
return true;

View file

@ -8,8 +8,26 @@
extern bool g_use_rtm;
extern u64 g_rtm_tx_limit2;
#ifdef _MSC_VER
extern "C"
{
u64 __rdtsc();
u32 _xbegin();
void _xend();
}
#endif
namespace vm
{
inline u64 get_tsc()
{
#ifdef _MSC_VER
return __rdtsc();
#else
return __builtin_ia32_rdtsc();
#endif
}
enum : u64
{
rsrv_lock_mask = 127,
@ -81,28 +99,28 @@ namespace vm
const auto sptr = vm::get_super_ptr<T>(static_cast<u32>(ptr.addr()));
// Prefetch some data
_m_prefetchw(sptr);
_m_prefetchw(reinterpret_cast<char*>(sptr) + 64);
//_m_prefetchw(sptr);
//_m_prefetchw(reinterpret_cast<char*>(sptr) + 64);
// Use 128-byte aligned addr
const u32 addr = static_cast<u32>(ptr.addr()) & -128;
auto& res = vm::reservation_acquire(addr, 128);
_m_prefetchw(&res);
//_m_prefetchw(&res);
if (g_use_rtm)
{
// Stage 1: single optimistic transaction attempt
unsigned status = _XBEGIN_STARTED;
unsigned status = -1;
u64 _old = 0;
auto stamp0 = __rdtsc(), stamp1 = stamp0, stamp2 = stamp0;
auto stamp0 = get_tsc(), stamp1 = stamp0, stamp2 = stamp0;
#ifndef _MSC_VER
__asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
#else
status = _xbegin();
if (status == _XBEGIN_STARTED)
if (status == umax)
#endif
{
if (res & rsrv_unique_lock)
@ -158,16 +176,16 @@ namespace vm
#ifndef _MSC_VER
__asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory");
#endif
stamp1 = __rdtsc();
stamp1 = get_tsc();
// Stage 2: try to lock reservation first
_old = res.fetch_add(1);
// Compute stamps excluding memory touch
stamp2 = __rdtsc() - (stamp1 - stamp0);
stamp2 = get_tsc() - (stamp1 - stamp0);
// Start lightened transaction
for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = __rdtsc())
for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = get_tsc())
{
if (cpu.has_pause_flag())
{
@ -179,7 +197,7 @@ namespace vm
#else
status = _xbegin();
if (status != _XBEGIN_STARTED) [[unlikely]]
if (status != umax) [[unlikely]]
{
goto retry;
}

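One detail behind this hunk: _XBEGIN_STARTED expands to ~0u, so comparing the status word against umax is equivalent and lets the header drop the intrinsic includes. A standalone sketch of the same begin/commit/abort shape using the standard intrinsics (illustrative only; needs an RTM-capable CPU and -mrtm, otherwise xbegin faults):

#include <immintrin.h>
#include <cstdio>

int main()
{
	int value = 0;

	const unsigned status = _xbegin();
	if (status == _XBEGIN_STARTED) // i.e. status == ~0u, the umax test in the header
	{
		value = 1; // transactional body
		_xend();   // commit
	}
	else
	{
		value = -1; // aborted: status carries the abort reason, take the fallback path
	}

	std::printf("value=%d status=0x%x\n", value, status);
}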
View file

@ -3,10 +3,12 @@
#include "../rsx_methods.h"
#include "../RSXThread.h"
#include "util/v128.hpp"
#include "util/to_endian.hpp"
#include "util/sysinfo.hpp"
#include "emmintrin.h"
#include "immintrin.h"
#define DEBUG_VERTEX_STREAMING 0
#if !defined(_MSC_VER) && defined(__clang__)
@ -166,7 +168,7 @@ namespace
const u32 dword_count = size >> 2;
const u32 iterations = dword_count >> 2;
v128 bits_diff{};
__m128i bits_diff = _mm_setzero_si128();
if (s_use_ssse3) [[likely]]
{
@ -177,12 +179,12 @@ namespace
if constexpr (!unaligned)
{
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_load_si128(dst_ptr), shuffled_vector));
bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_load_si128(dst_ptr), shuffled_vector));
_mm_stream_si128(dst_ptr, shuffled_vector);
}
else
{
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_loadu_si128(dst_ptr), shuffled_vector));
bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_loadu_si128(dst_ptr), shuffled_vector));
_mm_storeu_si128(dst_ptr, shuffled_vector);
}
@ -200,12 +202,12 @@ namespace
if constexpr (!unaligned)
{
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_load_si128(dst_ptr), vec2));
bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_load_si128(dst_ptr), vec2));
_mm_stream_si128(dst_ptr, vec2);
}
else
{
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_loadu_si128(dst_ptr), vec2));
bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_loadu_si128(dst_ptr), vec2));
_mm_storeu_si128(dst_ptr, vec2);
}
@ -228,12 +230,12 @@ namespace
if (dst_ptr2[i] != data)
{
dst_ptr2[i] = data;
bits_diff._u32[0] = UINT32_MAX;
bits_diff = _mm_set1_epi64x(-1);
}
}
}
return bits_diff != v128{};
return _mm_cvtsi128_si64(_mm_packs_epi32(bits_diff, bits_diff)) != 0;
}
template bool stream_data_to_memory_swapped_and_compare_u32<false>(void *dst, const void *src, u32 size);

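The rewritten tail of this helper relies on _mm_packs_epi32 never turning a nonzero 32-bit lane into a zero 16-bit word (signed saturation clamps, it does not wrap), so the low 64 bits of the packed value cover all four lanes. A small standalone check of that property (not part of the commit):

#include <emmintrin.h>
#include <cstdio>

// True if any 32-bit lane of v is nonzero, using the same packs+cvtsi trick.
static bool any_lane_set(__m128i v)
{
	return _mm_cvtsi128_si64(_mm_packs_epi32(v, v)) != 0;
}

int main()
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i one_lane = _mm_set_epi32(0, 0, 0x10000, 0); // would become 0 if truncated instead of saturated
	std::printf("%d %d\n", any_lane_set(zero), any_lane_set(one_lane)); // 0 1
}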
View file

@ -284,7 +284,7 @@ bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const R
{
const auto inst1 = v128::loadu(instBuffer1, instIndex);
const auto inst2 = v128::loadu(instBuffer2, instIndex);
if (inst1 != inst2)
if (inst1._u ^ inst2._u)
{
return false;
}
@ -475,7 +475,7 @@ bool fragment_program_compare::operator()(const RSXFragmentProgram& binary1, con
const auto inst1 = v128::loadu(instBuffer1, instIndex);
const auto inst2 = v128::loadu(instBuffer2, instIndex);
if (inst1 != inst2)
if (inst1._u ^ inst2._u)
return false;
instIndex++;

View file

@ -397,62 +397,7 @@ public:
std::forward<Args>(args)...); // Other arguments
}
void fill_fragment_constants_buffer(gsl::span<f32> dst_buffer, const RSXFragmentProgram &fragment_program, bool sanitize = false) const
{
const auto I = m_fragment_shader_cache.find(fragment_program);
if (I == m_fragment_shader_cache.end())
return;
ensure((dst_buffer.size_bytes() >= ::narrow<int>(I->second.FragmentConstantOffsetCache.size()) * 16u));
f32* dst = dst_buffer.data();
alignas(16) f32 tmp[4];
for (usz offset_in_fragment_program : I->second.FragmentConstantOffsetCache)
{
char* data = static_cast<char*>(fragment_program.get_data()) + offset_in_fragment_program;
const __m128i vector = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8));
if (!patch_table.is_empty())
{
_mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector));
bool patched;
for (int i = 0; i < 4; ++i)
{
patched = false;
for (auto& e : patch_table.db)
{
//TODO: Use fp comparison with fabsf without hurting performance
patched = e.second.test_and_set(tmp[i], &dst[i]);
if (patched)
{
break;
}
}
if (!patched)
{
dst[i] = tmp[i];
}
}
}
else if (sanitize)
{
//Convert NaNs and Infs to 0
const auto masked = _mm_and_si128(shuffled_vector, _mm_set1_epi32(0x7fffffff));
const auto valid = _mm_cmplt_epi32(masked, _mm_set1_epi32(0x7f800000));
const auto result = _mm_and_si128(shuffled_vector, valid);
_mm_stream_si128(std::bit_cast<__m128i*>(dst), result);
}
else
{
_mm_stream_si128(std::bit_cast<__m128i*>(dst), shuffled_vector);
}
dst += 4;
}
}
void fill_fragment_constants_buffer(gsl::span<f32> dst_buffer, const RSXFragmentProgram& fragment_program, bool sanitize = false) const;
void clear()
{

View file

@ -0,0 +1,64 @@
#pragma once
#include "ProgramStateCache.h"
#include "emmintrin.h"
#include "immintrin.h"
template <typename Traits>
void program_state_cache<Traits>::fill_fragment_constants_buffer(gsl::span<f32> dst_buffer, const RSXFragmentProgram &fragment_program, bool sanitize) const
{
const auto I = m_fragment_shader_cache.find(fragment_program);
if (I == m_fragment_shader_cache.end())
return;
ensure((dst_buffer.size_bytes() >= ::narrow<int>(I->second.FragmentConstantOffsetCache.size()) * 16u));
f32* dst = dst_buffer.data();
alignas(16) f32 tmp[4];
for (usz offset_in_fragment_program : I->second.FragmentConstantOffsetCache)
{
char* data = static_cast<char*>(fragment_program.get_data()) + offset_in_fragment_program;
const __m128i vector = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8));
if (!patch_table.is_empty())
{
_mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector));
bool patched;
for (int i = 0; i < 4; ++i)
{
patched = false;
for (auto& e : patch_table.db)
{
//TODO: Use fp comparison with fabsf without hurting performance
patched = e.second.test_and_set(tmp[i], &dst[i]);
if (patched)
{
break;
}
}
if (!patched)
{
dst[i] = tmp[i];
}
}
}
else if (sanitize)
{
//Convert NaNs and Infs to 0
const auto masked = _mm_and_si128(shuffled_vector, _mm_set1_epi32(0x7fffffff));
const auto valid = _mm_cmplt_epi32(masked, _mm_set1_epi32(0x7f800000));
const auto result = _mm_and_si128(shuffled_vector, valid);
_mm_stream_si128(std::bit_cast<__m128i*>(dst), result);
}
else
{
_mm_stream_si128(std::bit_cast<__m128i*>(dst), shuffled_vector);
}
dst += 4;
}
}

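The shuffled_vector expression in this new header is a 16-bit byte swap: shifting each 16-bit lane left and right by 8 and OR-ing the results swaps its two bytes, which is how the big-endian fragment constants are converted. A minimal standalone check (illustrative only):

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
	const __m128i v = _mm_set1_epi16(0x1234);
	const __m128i swapped = _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));

	std::uint16_t out[8];
	_mm_storeu_si128(reinterpret_cast<__m128i*>(out), swapped);
	std::printf("0x%04x\n", out[0]); // 0x3412
}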
View file

@ -7,6 +7,8 @@
#include "Emu/Memory/vm_locking.h"
#include "Emu/RSX/rsx_methods.h"
#include "../Common/program_state_cache2.hpp"
#define DUMP_VERTEX_DATA 0
u64 GLGSRender::get_cycles()

View file

@ -3,6 +3,8 @@
#include "Emu/System.h"
#include "Emu/Cell/Modules/cellMsgDialog.h"
#include "util/asm.hpp"
namespace rsx
{
void shader_loading_dialog::create(const std::string& msg, const std::string& title)
@ -27,7 +29,7 @@ namespace rsx
while (ref_cnt.load() && !Emu.IsStopped())
{
_mm_pause();
utils::pause();
}
}
@ -87,7 +89,7 @@ namespace rsx
{
while (ref_cnt.load() && !Emu.IsStopped())
{
_mm_pause();
utils::pause();
}
}
}

View file

@ -6,6 +6,7 @@
#include "rsx_utils.h"
#include <thread>
#include "util/asm.hpp"
namespace rsx
{
@ -171,13 +172,13 @@ namespace rsx
while (_thr->m_enqueued_count.load() > _thr->m_processed_count.load())
{
rsxthr->on_semaphore_acquire_wait();
_mm_pause();
utils::pause();
}
}
else
{
while (_thr->m_enqueued_count.load() > _thr->m_processed_count.load())
_mm_pause();
utils::pause();
}
return true;

View file

@ -862,7 +862,7 @@ namespace rsx
for (; t == now; now = get_time_ns())
{
_mm_pause();
utils::pause();
}
timestamp_ctrl = now;
@ -2662,7 +2662,7 @@ namespace rsx
for (u32 ea = address >> 20, end = ea + (size >> 20); ea < end; ea++)
{
const u32 io = utils::ror32(iomap_table.io[ea], 20);
const u32 io = utils::rol32(iomap_table.io[ea], 32 - 20);
if (io + 1)
{
@ -2747,7 +2747,7 @@ namespace rsx
if (Emu.IsStopped())
break;
_mm_pause();
utils::pause();
}
}
@ -2771,7 +2771,7 @@ namespace rsx
while (external_interrupt_lock)
{
// TODO: Investigate non busy-spinning method
_mm_pause();
utils::pause();
}
external_interrupt_ack.store(false);

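The io lookup above now rotates left by the complementary amount because utils::ror32 was removed; rotating right by k and rotating left by 32 - k are the same bit permutation. A tiny standalone check with local helpers (the emulator's versions differ in their fallbacks):

#include <cstdint>
#include <cstdio>

static std::uint32_t rol32(std::uint32_t x, std::uint32_t n) { n &= 31; return n ? (x << n) | (x >> (32 - n)) : x; }
static std::uint32_t ror32(std::uint32_t x, std::uint32_t n) { n &= 31; return n ? (x >> n) | (x << (32 - n)) : x; }

int main()
{
	const std::uint32_t io = 0xdeadbeefu;
	std::printf("%08x %08x\n", ror32(io, 20), rol32(io, 32 - 20)); // both print the same value
}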
View file

@ -10,6 +10,8 @@
#include "Emu/RSX/rsx_methods.h"
#include "Emu/Memory/vm_locking.h"
#include "../Common/program_state_cache2.hpp"
#include "util/asm.hpp"
namespace vk
@ -679,7 +681,7 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
// Wait for deadlock to clear
while (m_queue_status & flush_queue_state::deadlock)
{
_mm_pause();
utils::pause();
}
g_fxo->get<rsx::dma_manager>()->clear_mem_fault_flag();

View file

@ -300,7 +300,11 @@ namespace vk
{
while (num_waiters.load() != 0)
{
#ifdef _MSC_VER
_mm_pause();
#else
__builtin_ia32_pause();
#endif
}
}

View file

@ -1006,7 +1006,11 @@ namespace vk
}
//std::this_thread::yield();
#ifdef _MSC_VER
_mm_pause();
#else
__builtin_ia32_pause();
#endif
}
}

View file

@ -24,6 +24,10 @@
#include "3rdparty/GPUOpen/include/vk_mem_alloc.h"
#ifdef _MSC_VER
extern "C" void _mm_pause();
#endif
#ifdef __APPLE__
#define VK_DISABLE_COMPONENT_SWIZZLE 1
#else
@ -1231,7 +1235,11 @@ private:
{
while (!flushed)
{
#ifdef _MSC_VER
_mm_pause();
#else
__builtin_ia32_pause();
#endif
}
}

View file

@ -1564,7 +1564,7 @@ game_boot_result Emulator::Load(const std::string& title_id, bool add_only, bool
elf_file.open(decrypted_path);
}
// Decrypt SELF
else if ((elf_file = decrypt_self(std::move(elf_file), klic.empty() ? nullptr : klic.data(), &g_ps3_process_info.self_info)))
else if ((elf_file = decrypt_self(std::move(elf_file), klic.empty() ? nullptr : reinterpret_cast<u8*>(&klic[0]), &g_ps3_process_info.self_info)))
{
if (true)
{

View file

@ -124,7 +124,7 @@ public:
std::vector<std::string> argv;
std::vector<std::string> envp;
std::vector<u8> data;
std::vector<u8> klic;
std::vector<u128> klic;
std::string disc;
std::string hdd1;

View file

@ -707,7 +707,7 @@ std::string vfs::unescape(std::string_view name)
std::string vfs::host::hash_path(const std::string& path, const std::string& dev_root)
{
return fmt::format(u8"%s/%s%s", dev_root, fmt::base57(std::hash<std::string>()(path)), fmt::base57(__rdtsc()));
return fmt::format(u8"%s/%s%s", dev_root, fmt::base57(std::hash<std::string>()(path)), fmt::base57(utils::get_unique_tsc()));
}
bool vfs::host::rename(const std::string& from, const std::string& to, const lv2_fs_mount_point* mp, bool overwrite)

View file

@ -1,6 +1,8 @@
#include "stdafx.h"
#include "stdafx.h"
#include "perf_meter.hpp"
#include "util/sysinfo.hpp"
#include <map>
#include <mutex>
@ -65,6 +67,36 @@ void perf_stat_base::print(const char* name) noexcept
}
}
#ifdef _MSC_VER
extern "C" void _mm_lfence();
#endif
SAFE_BUFFERS void perf_stat_base::push(u64 data[66], u64 start_time, const char* name) noexcept
{
// Event end
#ifdef _MSC_VER
const u64 end_time = (_mm_lfence(), get_tsc());
#else
const u64 end_time = (__builtin_ia32_lfence(), get_tsc());
#endif
// Compute difference in seconds
const f64 diff = (end_time - start_time) * 1. / utils::get_tsc_freq();
// Register perf stat in nanoseconds
const u64 ns = static_cast<u64>(diff * 1000'000'000.);
// Print in microseconds
if (static_cast<u64>(diff * 1000'000.) >= g_cfg.core.perf_report_threshold)
{
perf_log.notice(u8"%s: %.3fµs", name, diff * 1000'000.);
}
data[0] += ns != 0;
data[64 - std::countl_zero(ns)]++;
data[65] += ns;
}
static shared_mutex s_perf_mutex;
static std::map<std::string, perf_stat_base> s_perf_acc;

View file
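The new out-of-line push() converts a TSC delta into wall time by dividing by the calibrated TSC frequency before deriving the nanosecond and microsecond figures. A worked example of that arithmetic with made-up numbers (the real frequency comes from utils::get_tsc_freq()):

#include <cstdint>
#include <cstdio>

int main()
{
	// Hypothetical figures: 3.0 GHz invariant TSC, 4500 ticks between start and end.
	const std::uint64_t tsc_freq = 3'000'000'000ull;
	const std::uint64_t start = 1'000'000ull;
	const std::uint64_t end = start + 4'500ull;

	const double diff = (end - start) * 1. / tsc_freq;                         // 1.5e-6 s
	const std::uint64_t ns = static_cast<std::uint64_t>(diff * 1000'000'000.); // 1500 ns
	std::printf("%.3f us (%llu ns)\n", diff * 1000'000., static_cast<unsigned long long>(ns));
}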

@ -7,10 +7,22 @@
#include <array>
#include <cmath>
#include "util/sysinfo.hpp"
LOG_CHANNEL(perf_log, "PERF");
#ifdef _MSC_VER
extern "C" u64 __rdtsc();
inline u64 get_tsc()
{
return __rdtsc();
}
#else
inline u64 get_tsc()
{
return __builtin_ia32_rdtsc();
}
#endif
// TODO: constexpr with the help of bitcast
template <auto Name>
inline const auto perf_name = []
@ -32,6 +44,9 @@ protected:
// Accumulate values from a thread
void push(u64 ns[66]) noexcept;
// Get end time; accumulate value to the TLS
static void push(u64 ns[66], u64 start_time, const char* name) noexcept;
// Register TLS storage for stats
static void add(u64 ns[66], const char* name) noexcept;
@ -73,27 +88,9 @@ class perf_stat final : public perf_stat_base
} g_tls_perf_stat;
public:
static NEVER_INLINE void push(u64 start_time) noexcept
static SAFE_BUFFERS FORCE_INLINE void push(u64 start_time) noexcept
{
// Event end
const u64 end_time = (_mm_lfence(), __rdtsc());
// Compute difference in seconds
const f64 diff = (end_time - start_time) * 1. / utils::get_tsc_freq();
// Register perf stat in nanoseconds
const u64 ns = static_cast<u64>(diff * 1000'000'000.);
// Print in microseconds
if (static_cast<u64>(diff * 1000'000.) >= g_cfg.core.perf_report_threshold)
{
perf_log.notice(u8"%s: %.3fµs", perf_name<ShortName>.data(), diff * 1000'000.);
}
auto& data = g_tls_perf_stat.m_log;
data[0] += ns != 0;
data[64 - std::countl_zero(ns)]++;
data[65] += ns;
perf_stat_base::push(g_tls_perf_stat.m_log, start_time, perf_name<ShortName>.data());
}
};
@ -149,7 +146,7 @@ public:
if constexpr (std::array<bool, sizeof...(SubEvents)>{(SubEvents == Event)...}[Index])
{
// Push actual timestamp into an array
m_timestamps[Index + 1] = __rdtsc();
m_timestamps[Index + 1] = get_tsc();
}
else if constexpr (Index < sizeof...(SubEvents))
{
@ -173,7 +170,7 @@ public:
// Re-initialize first timestamp
SAFE_BUFFERS FORCE_INLINE void restart() noexcept
{
m_timestamps[0] = __rdtsc();
m_timestamps[0] = get_tsc();
std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64));
}

View file

@ -23,7 +23,7 @@ bool TRPLoader::Install(const std::string& dest, bool show)
const std::string& local_path = vfs::get(dest);
const auto temp = fmt::format(u8"%s.temp%u", local_path, __rdtsc());
const auto temp = fmt::format(u8"%s.temp%u", local_path, utils::get_unique_tsc());
if (!fs::create_dir(temp))
{

View file

@ -120,9 +120,6 @@
<ClCompile Include="util\atomic.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="util\atomic2.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="util\yaml.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<ExceptionHandling>Sync</ExceptionHandling>
@ -516,6 +513,7 @@
<ClInclude Include="Emu\system_config_types.h" />
<ClInclude Include="util\atomic.hpp" />
<ClInclude Include="util\v128.hpp" />
<ClInclude Include="util\v128sse.hpp" />
<ClInclude Include="util\to_endian.hpp" />
<ClInclude Include="..\Utilities\bin_patch.h" />
<ClInclude Include="..\Utilities\BitField.h" />
@ -531,7 +529,6 @@
<ClInclude Include="..\Utilities\mutex.h" />
<ClInclude Include="..\Utilities\sema.h" />
<ClInclude Include="..\Utilities\sync.h" />
<ClInclude Include="util\atomic2.hpp" />
<ClInclude Include="util\endian.hpp" />
<ClInclude Include="util\fixed_typemap.hpp" />
<ClInclude Include="util\init_mutex.hpp" />
@ -742,6 +739,7 @@
<ClInclude Include="Emu\RSX\Common\BufferUtils.h" />
<ClInclude Include="Emu\RSX\Common\FragmentProgramDecompiler.h" />
<ClInclude Include="Emu\RSX\Common\ProgramStateCache.h" />
<ClInclude Include="Emu\RSX\Common\program_state_cache2.hpp" />
<ClInclude Include="Emu\RSX\Common\ring_buffer_helper.h" />
<ClInclude Include="Emu\RSX\Common\ShaderParam.h" />
<ClInclude Include="Emu\RSX\Common\surface_store.h" />

View file

@ -935,9 +935,6 @@
<ClCompile Include="Emu\RSX\Overlays\overlay_osk_panel.cpp">
<Filter>Emu\GPU\RSX\Overlays</Filter>
</ClCompile>
<ClCompile Include="util\atomic2.cpp">
<Filter>Utilities</Filter>
</ClCompile>
<ClCompile Include="util\logs.cpp">
<Filter>Utilities</Filter>
</ClCompile>
@ -1072,6 +1069,9 @@
<ClInclude Include="util\v128.hpp">
<Filter>Utilities</Filter>
</ClInclude>
<ClInclude Include="util\v128sse.hpp">
<Filter>Utilities</Filter>
</ClInclude>
<ClInclude Include="util\to_endian.hpp">
<Filter>Utilities</Filter>
</ClInclude>
@ -1153,6 +1153,9 @@
<ClInclude Include="Emu\RSX\Common\ProgramStateCache.h">
<Filter>Emu\GPU\RSX\Common</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\Common\program_state_cache2.hpp">
<Filter>Emu\GPU\RSX\Common</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\Common\FragmentProgramDecompiler.h">
<Filter>Emu\GPU\RSX\Common</Filter>
</ClInclude>
@ -1819,9 +1822,6 @@
<ClInclude Include="util\yaml.hpp">
<Filter>Utilities</Filter>
</ClInclude>
<ClInclude Include="util\atomic2.hpp">
<Filter>Utilities</Filter>
</ClInclude>
<ClInclude Include="util\endian.hpp">
<Filter>Utilities</Filter>
</ClInclude>

View file

@ -44,7 +44,6 @@ DYNAMIC_IMPORT("ntdll.dll", NtSetTimerResolution, NTSTATUS(ULONG DesiredResoluti
#include <thread>
#include <charconv>
#include "util/v128.hpp"
#include "util/sysinfo.hpp"
inline std::string sstr(const QString& _in) { return _in.toStdString(); }
@ -301,8 +300,6 @@ int main(int argc, char** argv)
const u64 intro_time = (intro_stats.ru_utime.tv_sec + intro_stats.ru_stime.tv_sec) * 1000000000ull + (intro_stats.ru_utime.tv_usec + intro_stats.ru_stime.tv_usec) * 1000ull;
#endif
v128::use_fma = utils::has_fma3();
s_argv0 = argv[0]; // Save for report_fatal_error
// Only run RPCS3 to display an error

View file

@ -885,12 +885,12 @@ void main_window::DecryptSPRXLibraries()
gui_log.notice("Decrypting binaries...");
// Always start with no KLIC
std::vector<v128> klics{v128{}};
std::vector<u128> klics{u128{}};
if (const auto keys = g_fxo->get<loaded_npdrm_keys>())
{
// Second klic: get it from a running game
if (const v128 klic = keys->devKlic; klic != v128{})
if (const u128 klic = keys->devKlic)
{
klics.emplace_back(klic);
}
@ -913,7 +913,7 @@ void main_window::DecryptSPRXLibraries()
if (elf_file.open(old_path) && elf_file.size() >= 4 && elf_file.read<u32>() == "SCE\0"_u32)
{
// First KLIC is no KLIC
elf_file = decrypt_self(std::move(elf_file), key_it != 0 ? klics[key_it]._bytes : nullptr);
elf_file = decrypt_self(std::move(elf_file), key_it != 0 ? reinterpret_cast<u8*>(&klics[key_it]) : nullptr);
if (!elf_file)
{
@ -985,11 +985,15 @@ void main_window::DecryptSPRXLibraries()
ensure(text.size() == 32);
// It must succeed (only hex characters are present)
std::from_chars(&text[0], &text[16], klic._u64[1], 16); // Not a typo: on LE systems the u64[1] part will be swapped with u64[0] later
std::from_chars(&text[16], &text[32], klic._u64[0], 16); // And on BE systems it will be already swapped by index internally
u64 lo_ = 0;
u64 hi_ = 0;
std::from_chars(&text[0], &text[16], lo_, 16);
std::from_chars(&text[16], &text[32], hi_, 16);
// Needs to be in big endian because the left to right byte-order means big endian
klic = std::bit_cast<be_t<v128>>(klic);
be_t<u64> lo = std::bit_cast<be_t<u64>>(lo_);
be_t<u64> hi = std::bit_cast<be_t<u64>>(hi_);
klic = (u128{+hi} << 64) | +lo;
// Retry with specified KLIC
key_it -= +std::exchange(tried, true); // Rewind on second and above attempt

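The reworked KLIC parsing keeps the key bytes in the same left-to-right order as the typed hex string, which is why each parsed u64 goes through be_t<u64> before the halves are combined. A standalone illustration of why the swap matters on a little-endian host (illustrative only, not the dialog's code):

#include <charconv>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
	const char text[] = "0011223344556677";
	std::uint64_t value = 0;
	std::from_chars(text, text + 16, value, 16); // value == 0x0011223344556677

	unsigned char bytes[8];
	std::memcpy(bytes, &value, sizeof(bytes));

	// On a little-endian host this prints "77 66 55 44 33 22 11 00":
	// the textual byte order is only recovered after a byte swap (be_t<u64> in the emulator).
	for (unsigned char b : bytes)
		std::printf("%02x ", b);
	std::printf("\n");
}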
View file

@ -179,7 +179,7 @@ void register_editor_dialog::updateRegister(int reg)
else if (reg >= ppu_v0 && reg <= ppu_v31)
{
const auto r = ppu.vr[reg_index];
str = r == v128::from32p(r._u32[0]) ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]);
str = !r._u ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]);
}
}
else if (reg == PPU_CR) str = fmt::format("%08x", ppu.cr.pack());
@ -198,7 +198,7 @@ void register_editor_dialog::updateRegister(int reg)
{
const u32 reg_index = reg % 128;
const auto r = spu.gpr[reg_index];
str = r == v128::from32p(r._u32[0]) ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]);
str = !r._u ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]);
}
else if (reg == MFC_PEVENTS) str = fmt::format("%08x", +spu.ch_events.load().events);
else if (reg == MFC_EVENTS_MASK) str = fmt::format("%08x", +spu.ch_events.load().mask);

View file

@ -5,22 +5,54 @@
extern bool g_use_rtm;
extern u64 g_rtm_tx_limit1;
#ifdef _MSC_VER
extern "C"
{
u64 __rdtsc();
u32 _xbegin();
void _xend();
void _mm_pause();
void _mm_prefetch(const char*, int);
void _m_prefetchw(const volatile void*);
uchar _rotl8(uchar, uchar);
ushort _rotl16(ushort, uchar);
uint _rotl(uint, int);
u64 _rotl64(u64, int);
s64 __mulh(s64, s64);
u64 __umulh(u64, u64);
s64 _div128(s64, s64, s64, s64*);
u64 _udiv128(u64, u64, u64, u64*);
}
#endif
namespace utils
{
inline u64 get_tsc()
{
#ifdef _MSC_VER
return __rdtsc();
#else
return __builtin_ia32_rdtsc();
#endif
}
// Transaction helper (result = pair of success and op result, or just bool)
template <typename F, typename R = std::invoke_result_t<F>>
inline auto tx_start(F op)
{
uint status = -1;
for (auto stamp0 = __rdtsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = __rdtsc())
for (auto stamp0 = get_tsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = get_tsc())
{
#ifndef _MSC_VER
__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
#else
status = _xbegin();
if (status != _XBEGIN_STARTED) [[unlikely]]
if (status != umax) [[unlikely]]
{
goto retry;
}
@ -80,7 +112,7 @@ namespace utils
const void* ptr = reinterpret_cast<const void*>(value);
#ifdef _MSC_VER
return _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T1);
return _mm_prefetch(reinterpret_cast<const char*>(ptr), 2);
#else
return __builtin_prefetch(ptr, 0, 2);
#endif
@ -95,7 +127,7 @@ namespace utils
}
#ifdef _MSC_VER
return _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
return _mm_prefetch(reinterpret_cast<const char*>(ptr), 3);
#else
return __builtin_prefetch(ptr, 0, 3);
#endif
@ -108,7 +140,11 @@ namespace utils
return;
}
#ifdef _MSC_VER
return _m_prefetchw(ptr);
#else
return __builtin_prefetch(ptr, 1, 0);
#endif
}
constexpr u8 rol8(u8 x, u8 n)
@ -120,8 +156,10 @@ namespace utils
#ifdef _MSC_VER
return _rotl8(x, n);
#elif defined(__clang__)
return __builtin_rotateleft8(x, n);
#else
return __rolb(x, n);
return __builtin_ia32_rolqi(x, n);
#endif
}
@ -133,9 +171,11 @@ namespace utils
}
#ifdef _MSC_VER
return _rotl16(x, n);
return _rotl16(x, static_cast<uchar>(n));
#elif defined(__clang__)
return __builtin_rotateleft16(x, n);
#else
return __rolw(x, n);
return __builtin_ia32_rolhi(x, n);
#endif
}
@ -148,22 +188,10 @@ namespace utils
#ifdef _MSC_VER
return _rotl(x, n);
#elif defined(__clang__)
return __builtin_rotateleft32(x, n);
#else
return __rold(x, n);
#endif
}
constexpr u32 ror32(u32 x, u32 n)
{
if (std::is_constant_evaluated())
{
return (x >> (n & 31)) | (x << (((0 - n) & 31)));
}
#ifdef _MSC_VER
return _rotr(x, n);
#else
return __rord(x, n);
return (x << n) | (x >> (32 - n));
#endif
}
@ -176,8 +204,10 @@ namespace utils
#ifdef _MSC_VER
return _rotl64(x, static_cast<int>(n));
#elif defined(__clang__)
return __builtin_rotateleft64(x, n);
#else
return __rolq(x, static_cast<int>(n));
return (x << n) | (x >> (64 - n));
#endif
}
@ -285,12 +315,21 @@ namespace utils
#endif
}
inline void pause()
{
#ifdef _MSC_VER
_mm_pause();
#else
__builtin_ia32_pause();
#endif
}
// Synchronization helper (cache-friendly busy waiting)
inline void busy_wait(usz cycles = 3000)
{
const u64 start = __rdtsc();
do _mm_pause();
while (__rdtsc() - start < cycles);
const u64 start = get_tsc();
do pause();
while (get_tsc() - start < cycles);
}
// Align to power of 2

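Since C++20 the same rotations are also available as std::rotl and std::rotr in <bit>, which additionally handle a zero shift count; a minimal sketch for comparison with the hand-rolled fallbacks above (not used by this commit):

#include <bit>
#include <cstdint>
#include <cstdio>

int main()
{
	const std::uint32_t x = 0x12345678u;
	std::printf("%08x %08x %08x\n",
		std::rotl(x, 8),  // 0x34567812
		std::rotr(x, 8),  // 0x78123456
		std::rotl(x, 0)); // unchanged, no out-of-range shift
}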
View file

@ -6,6 +6,25 @@
#define USE_STD
#endif
#ifdef _MSC_VER
#include "emmintrin.h"
#include "immintrin.h"
namespace utils
{
u128 __vectorcall atomic_load16(const void* ptr)
{
return std::bit_cast<u128>(_mm_load_si128((__m128i*)ptr));
}
void __vectorcall atomic_store16(void* ptr, u128 value)
{
_mm_store_si128((__m128i*)ptr, std::bit_cast<__m128i>(value));
}
}
#endif
#include "Utilities/sync.h"
#include "Utilities/StrFmt.h"
@ -847,9 +866,17 @@ namespace
};
}
u64 atomic_wait::get_unique_tsc()
#ifdef _MSC_VER
extern "C" u64 __rdtsc();
#endif
u64 utils::get_unique_tsc()
{
#ifdef _MSC_VER
const u64 stamp0 = __rdtsc();
#else
const u64 stamp0 = __builtin_ia32_rdtsc();
#endif
return s_min_tsc.atomic_op([&](u64& tsc)
{
@ -1026,7 +1053,7 @@ FORCE_INLINE auto root_info::slot_search(uptr iptr, u32 size, u64 thread_id, u12
SAFE_BUFFERS void atomic_wait_engine::wait(const void* data, u32 size, u128 old_value, u64 timeout, u128 mask, atomic_wait::info* ext)
{
const auto stamp0 = atomic_wait::get_unique_tsc();
const auto stamp0 = utils::get_unique_tsc();
if (!s_tls_wait_cb(data, 0, stamp0))
{

View file

@ -7,6 +7,62 @@
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4996)
extern "C"
{
void _ReadWriteBarrier();
void* _AddressOfReturnAddress();
uchar _bittest(const long*, long);
uchar _interlockedbittestandset(volatile long*, long);
uchar _interlockedbittestandreset(volatile long*, long);
char _InterlockedCompareExchange8(volatile char*, char, char);
char _InterlockedExchange8(volatile char*, char);
char _InterlockedExchangeAdd8(volatile char*, char);
char _InterlockedAnd8(volatile char*, char);
char _InterlockedOr8(volatile char*, char);
char _InterlockedXor8(volatile char*, char);
short _InterlockedCompareExchange16(volatile short*, short, short);
short _InterlockedExchange16(volatile short*, short);
short _InterlockedExchangeAdd16(volatile short*, short);
short _InterlockedAnd16(volatile short*, short);
short _InterlockedOr16(volatile short*, short);
short _InterlockedXor16(volatile short*, short);
short _InterlockedIncrement16(volatile short*);
short _InterlockedDecrement16(volatile short*);
long _InterlockedCompareExchange(volatile long*, long, long);
long _InterlockedCompareExchange_HLEAcquire(volatile long*, long, long);
long _InterlockedExchange(volatile long*, long);
long _InterlockedExchangeAdd(volatile long*, long);
long _InterlockedExchangeAdd_HLERelease(volatile long*, long);
long _InterlockedAnd(volatile long*, long);
long _InterlockedOr(volatile long*, long);
long _InterlockedXor(volatile long*, long);
long _InterlockedIncrement(volatile long*);
long _InterlockedDecrement(volatile long*);
s64 _InterlockedCompareExchange64(volatile s64*, s64, s64);
s64 _InterlockedCompareExchange64_HLEAcquire(volatile s64*, s64, s64);
s64 _InterlockedExchange64(volatile s64*, s64);
s64 _InterlockedExchangeAdd64(volatile s64*, s64);
s64 _InterlockedExchangeAdd64_HLERelease(volatile s64*, s64);
s64 _InterlockedAnd64(volatile s64*, s64);
s64 _InterlockedOr64(volatile s64*, s64);
s64 _InterlockedXor64(volatile s64*, s64);
s64 _InterlockedIncrement64(volatile s64*);
s64 _InterlockedDecrement64(volatile s64*);
uchar _InterlockedCompareExchange128(volatile s64*, s64, s64, s64*);
}
namespace utils
{
u128 __vectorcall atomic_load16(const void*);
void __vectorcall atomic_store16(void*, u128);
}
#endif
FORCE_INLINE void atomic_fence_consume()
@ -238,7 +294,10 @@ namespace atomic_wait
template <typename... T, typename = std::void_t<decltype(std::declval<T>().template wait<op::eq>(any_value))...>>
list(T&... vars) -> list<sizeof...(T), T...>;
}
namespace utils
{
// RDTSC with adjustment for being unique
u64 get_unique_tsc();
}
@ -871,18 +930,14 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
static inline T load(const T& dest)
{
atomic_fence_acquire();
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
u128 val = utils::atomic_load16(&dest);
atomic_fence_acquire();
return std::bit_cast<T>(val);
}
static inline T observe(const T& dest)
{
// Barriers are kept intentionally
atomic_fence_acquire();
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
atomic_fence_acquire();
return std::bit_cast<T>(val);
return load(dest);
}
static inline bool compare_exchange(T& dest, T& comp, T exch)
@ -906,32 +961,31 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
static inline void store(T& dest, T value)
{
atomic_fence_acq_rel();
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
release(dest, value);
atomic_fence_seq_cst();
}
static inline void release(T& dest, T value)
{
atomic_fence_release();
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
utils::atomic_store16(&dest, std::bit_cast<u128>(value));
atomic_fence_release();
}
#else
static inline T load(const T& dest)
{
__atomic_thread_fence(__ATOMIC_ACQUIRE);
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
__atomic_thread_fence(__ATOMIC_ACQUIRE);
return std::bit_cast<T>(val);
__m128i r;
#ifdef __AVX__
__asm__ volatile("vmovdqa %1, %0;" : "=x" (r) : "m" (dest) : "memory");
#else
__asm__ volatile("movdqa %1, %0;" : "=x" (r) : "m" (dest) : "memory");
#endif
return std::bit_cast<T>(r);
}
static inline T observe(const T& dest)
{
// Barriers are kept intentionally
__atomic_thread_fence(__ATOMIC_ACQUIRE);
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
__atomic_thread_fence(__ATOMIC_ACQUIRE);
return std::bit_cast<T>(val);
return load(dest);
}
static inline bool compare_exchange(T& dest, T& comp, T exch)
@ -987,16 +1041,17 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
static inline void store(T& dest, T value)
{
__atomic_thread_fence(__ATOMIC_ACQ_REL);
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
release(dest, value);
atomic_fence_seq_cst();
}
static inline void release(T& dest, T value)
{
__atomic_thread_fence(__ATOMIC_RELEASE);
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
__atomic_thread_fence(__ATOMIC_RELEASE);
#ifdef __AVX__
__asm__ volatile("vmovdqa %0, %1;" :: "x" (reinterpret_cast<__m128i&>(value)), "m" (dest) : "memory");
#else
__asm__ volatile("movdqa %0, %1;" :: "x" (reinterpret_cast<__m128i&>(value)), "m" (dest) : "memory");
#endif
}
#endif

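The non-MSVC path above hand-rolls the 16-byte loads and stores with (v)movdqa, relying on aligned SSE accesses being performed as a single memory operation (documented for AVX-capable CPUs and assumed here for older ones). For comparison, the portable GCC/Clang spelling is the generic __atomic builtin family, which may fall back to cmpxchg16b or libatomic; a hedged standalone sketch assuming unsigned __int128 is available (may need -mcx16 or -latomic):

#include <cstdio>

int main()
{
	alignas(16) unsigned __int128 cell = 42;

	unsigned __int128 v;
	__atomic_load(&cell, &v, __ATOMIC_ACQUIRE);  // 16-byte atomic load
	v += 1;
	__atomic_store(&cell, &v, __ATOMIC_RELEASE); // 16-byte atomic store

	std::printf("%llu\n", static_cast<unsigned long long>(cell)); // 43
}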
View file

@ -1,532 +0,0 @@
#include "atomic2.hpp"
#include "Utilities/JIT.h"
#include "util/sysinfo.hpp"
//
static const bool s_use_rtm = utils::has_rtm();
template <unsigned Count>
static const auto commit_tx = build_function_asm<s32(*)(const stx::multi_cas_item*)>([](asmjit::X86Assembler& c, auto& args)
{
static_assert(Count <= 8);
using namespace asmjit;
// Fill registers with item data
c.lea(x86::rax, x86::qword_ptr(args[0], 120));
if constexpr (Count >= 1)
{
c.mov(x86::rcx, x86::qword_ptr(x86::rax, -120));
c.mov(x86::rdx, x86::qword_ptr(x86::rax, -112));
c.mov(x86::r8, x86::qword_ptr(x86::rax, -104));
}
if constexpr (Count >= 2)
{
c.mov(x86::r9, x86::qword_ptr(x86::rax, -96));
c.mov(x86::r10, x86::qword_ptr(x86::rax, -88));
c.mov(x86::r11, x86::qword_ptr(x86::rax, -80));
}
if constexpr (Count >= 3)
{
if (utils::has_avx())
{
c.vzeroupper();
}
#ifdef _WIN32
c.push(x86::rsi);
#endif
c.mov(x86::rsi, x86::qword_ptr(x86::rax, -72));
c.movups(x86::xmm0, x86::oword_ptr(x86::rax, -64));
}
if constexpr (Count >= 4)
{
#ifdef _WIN32
c.push(x86::rdi);
#endif
c.mov(x86::rdi, x86::qword_ptr(x86::rax, -48));
c.movups(x86::xmm1, x86::oword_ptr(x86::rax, -40));
}
if constexpr (Count >= 5)
{
c.push(x86::rbx);
c.mov(x86::rbx, x86::qword_ptr(x86::rax, -24));
c.movups(x86::xmm2, x86::oword_ptr(x86::rax, -16));
}
if constexpr (Count >= 6)
{
c.push(x86::rbp);
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
c.movups(x86::xmm3, x86::oword_ptr(x86::rax, 8));
}
if constexpr (Count >= 7)
{
c.push(x86::r12);
c.mov(x86::r12, x86::qword_ptr(x86::rax, 24));
c.movups(x86::xmm4, x86::oword_ptr(x86::rax, 32));
}
if constexpr (Count >= 8)
{
c.push(x86::r13);
c.mov(x86::r13, x86::qword_ptr(x86::rax, 48));
c.movups(x86::xmm5, x86::oword_ptr(x86::rax, 56));
}
// Begin transaction
Label begin = c.newLabel();
Label fall = c.newLabel();
Label stop = c.newLabel();
Label wait = c.newLabel();
Label ret = c.newLabel();
c.bind(begin);
c.xbegin(fall);
// Compare phase
if constexpr (Count >= 1)
{
c.cmp(x86::qword_ptr(x86::rcx), x86::rdx);
c.jne(stop);
}
if constexpr (Count >= 2)
{
c.cmp(x86::qword_ptr(x86::r9), x86::r10);
c.jne(stop);
}
if constexpr (Count >= 3)
{
c.movq(x86::rax, x86::xmm0);
c.cmp(x86::qword_ptr(x86::rsi), x86::rax);
c.jne(stop);
}
if constexpr (Count >= 4)
{
c.movq(x86::rax, x86::xmm1);
c.cmp(x86::qword_ptr(x86::rdi), x86::rax);
c.jne(stop);
}
if constexpr (Count >= 5)
{
c.movq(x86::rax, x86::xmm2);
c.cmp(x86::qword_ptr(x86::rbx), x86::rax);
c.jne(stop);
}
if constexpr (Count >= 6)
{
c.movq(x86::rax, x86::xmm3);
c.cmp(x86::qword_ptr(x86::rbp), x86::rax);
c.jne(stop);
}
if constexpr (Count >= 7)
{
c.movq(x86::rax, x86::xmm4);
c.cmp(x86::qword_ptr(x86::r12), x86::rax);
c.jne(stop);
}
if constexpr (Count >= 8)
{
c.movq(x86::rax, x86::xmm5);
c.cmp(x86::qword_ptr(x86::r13), x86::rax);
c.jne(stop);
}
// Check for transactions in progress
if constexpr (Count >= 1)
{
c.cmp(x86::qword_ptr(x86::rcx, 8), 0);
c.jne(wait);
}
if constexpr (Count >= 2)
{
c.cmp(x86::qword_ptr(x86::r9, 8), 0);
c.jne(wait);
}
if constexpr (Count >= 3)
{
c.cmp(x86::qword_ptr(x86::rsi, 8), 0);
c.jne(wait);
}
if constexpr (Count >= 4)
{
c.cmp(x86::qword_ptr(x86::rdi, 8), 0);
c.jne(wait);
}
if constexpr (Count >= 5)
{
c.cmp(x86::qword_ptr(x86::rbx, 8), 0);
c.jne(wait);
}
if constexpr (Count >= 6)
{
c.cmp(x86::qword_ptr(x86::rbp, 8), 0);
c.jne(wait);
}
if constexpr (Count >= 7)
{
c.cmp(x86::qword_ptr(x86::r12, 8), 0);
c.jne(wait);
}
if constexpr (Count >= 8)
{
c.cmp(x86::qword_ptr(x86::r13, 8), 0);
c.jne(wait);
}
// Write phase
if constexpr (Count >= 1)
c.mov(x86::qword_ptr(x86::rcx), x86::r8);
if constexpr (Count >= 2)
c.mov(x86::qword_ptr(x86::r9), x86::r11);
if constexpr (Count >= 3)
c.movhps(x86::qword_ptr(x86::rsi), x86::xmm0);
if constexpr (Count >= 4)
c.movhps(x86::qword_ptr(x86::rdi), x86::xmm1);
if constexpr (Count >= 5)
c.movhps(x86::qword_ptr(x86::rbx), x86::xmm2);
if constexpr (Count >= 6)
c.movhps(x86::qword_ptr(x86::rbp), x86::xmm3);
if constexpr (Count >= 7)
c.movhps(x86::qword_ptr(x86::r12), x86::xmm4);
if constexpr (Count >= 8)
c.movhps(x86::qword_ptr(x86::r13), x86::xmm5);
// End transaction (success)
c.xend();
c.mov(x86::eax, 1);
c.bind(ret);
if constexpr (Count >= 8)
c.pop(x86::r13);
if constexpr (Count >= 7)
c.pop(x86::r12);
if constexpr (Count >= 6)
c.pop(x86::rbp);
if constexpr (Count >= 5)
c.pop(x86::rbx);
#ifdef _WIN32
if constexpr (Count >= 4)
c.pop(x86::rdi);
if constexpr (Count >= 3)
c.pop(x86::rsi);
#endif
c.ret();
// Comparison failed: end the transaction and return 0 so the caller falls back to the software path
c.bind(stop);
c.xend();
c.xor_(x86::eax, x86::eax);
c.jmp(fall);
// Give up this attempt (a slot is owned by another record); the multi-CAS may still succeed when retried
c.bind(wait);
c.xend();
c.mov(x86::eax, 0xffu << 24);
c.jmp(fall);
// Transaction fallback: retry if the abort status carries the RETRY hint, otherwise return eax >> 24 (zero for a plain abort)
c.bind(fall);
c.test(x86::eax, _XABORT_RETRY);
c.jnz(begin);
c.sar(x86::eax, 24);
c.jmp(ret);
});
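For orientation, the code generated above behaves roughly like the following RTM sketch (illustrative only: commit_tx_sketch and item_sketch are invented names, it assumes a build where the RTM intrinsics are usable, and the return codes mirror the asm).
#include <immintrin.h>
// Each atomic2 slot is a 16-byte pair: [0] = value, [1] = owning record id (0 if free).
struct item_sketch { s64* addr; s64 old_value; s64 new_value; };
template <u32 N>
s32 commit_tx_sketch(const item_sketch* list)
{
	while (true)
	{
		const unsigned status = _xbegin();
		if (status == _XBEGIN_STARTED)
		{
			for (u32 i = 0; i < N; i++)
				if (list[i].addr[0] != list[i].old_value) { _xend(); return 0; } // mismatch: defer to the software path
			for (u32 i = 0; i < N; i++)
				if (list[i].addr[1] != 0) { _xend(); return -1; } // slot owned by another record: fail this attempt
			for (u32 i = 0; i < N; i++)
				list[i].addr[0] = list[i].new_value; // write phase
			_xend();
			return 1; // all values updated atomically
		}
		if (!(status & _XABORT_RETRY))
			return 0; // plain hardware abort: defer to the software path
	}
}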
// 4095 usable records (record 0 is reserved)
static constexpr u64 s_rec_gcount = 4096 / 64;
// Global record pool
static stx::multi_cas_record s_records[s_rec_gcount * 64]{};
// Allocation bitmap; bit 0 is pre-set so record 0 is never handed out
static atomic_t<u64> s_rec_bits[s_rec_gcount]{1};
static constexpr u64 s_state_mask = 3;
static constexpr u64 s_state_undef = 0;
static constexpr u64 s_state_failure = 1;
static constexpr u64 s_state_success = 2;
static constexpr u64 s_ref_mask = ~s_state_mask;
static constexpr u64 s_ref_one = s_state_mask + 1;
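Reading aid for the commit logic below (my paraphrase of the code that follows, not part of the source):
// m_state layout, per the constants above:
//   bits 0-1 : transaction state (s_state_undef -> s_state_failure or s_state_success, never back)
//   bits 2+  : reference count, in units of s_ref_one
// s_state_undef   - record is still being installed; rec_try_abort may force it to failure
// s_state_failure - some compare failed; installed record ids are rolled back to the old values
// s_state_success - every item carries this record id; any thread may finish the writes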
static u64 rec_alloc()
{
const u32 start = static_cast<u32>(__rdtsc());
for (u32 i = 0;; i++)
{
const u32 group = (i + start) % s_rec_gcount;
const auto [bits, ok] = s_rec_bits[group].fetch_op([](u64& bits)
{
if (~bits)
{
// Set lowest clear bit
bits |= bits + 1;
return true;
}
return false;
});
if (ok)
{
// Find lowest clear bit
return group * 64 + std::countr_one(bits);
}
}
// TODO: mark as unreachable (the loop above only exits by returning)
std::abort();
return 0;
}
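The two bit tricks in rec_alloc deserve a worked example (standalone illustration, not project code):
#include <bit>
// bits            = 0b0101'1111  (records 0-4 and 6 of this group are taken)
// bits + 1        = 0b0110'0000  (the +1 carries through the trailing ones)
// bits |= bits+1  = 0b0111'1111  (bit 5, the lowest clear bit, is now set)
// std::countr_one counts those trailing ones, i.e. the index of the bit just claimed:
static_assert(std::countr_one(0b0101'1111ull) == 5);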
static bool cmpxchg16(s64(&dest)[2], s64(&cmp_res)[2], s64 exch_high, s64 exch_low)
{
#ifdef _MSC_VER
return !!_InterlockedCompareExchange128(dest, exch_high, exch_low, cmp_res);
#else
s64 exch[2]{exch_low, exch_high};
return __atomic_compare_exchange(&dest, &cmp_res, &exch, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
#endif
}
bool stx::multi_cas_record::commit() const noexcept
{
// Transaction cancelled
if (m_count == 0)
{
return true;
}
// Helper function to drop a reference and free the record once the count reaches zero
static auto rec_unref = [](u64 id)
{
if (id && id < s_rec_gcount * 64)
{
auto [_, ok] = s_records[id].m_state.fetch_op([](u64& state)
{
if (state < s_ref_one)
{
return 0;
}
state -= s_ref_one;
if (state < s_ref_one)
{
state = 0;
return 2;
}
return 1;
});
if (ok > 1)
{
s_rec_bits[id / 64] &= ~(u64{1} << (id % 64));
}
}
};
// Helper function to complete successful transaction
static auto rec_complete = [](u64 id)
{
for (u32 i = 0; i < s_records[id].m_count; i++)
{
auto& item = s_records[id].m_list[i];
atomic2 cmp;
cmp.m_data[0] = item.m_old;
cmp.m_data[1] = id;
if (item.m_addr->load() == item.m_old && atomic_storage<s64>::load(item.m_addr->m_data[1]) == static_cast<s64>(id))
{
// If this CAS fails, another helper thread has already finished this item
if (cmpxchg16(item.m_addr->m_data, cmp.m_data, 0, item.m_new))
{
}
}
}
};
// Helper function to deal with existing transaction
static auto rec_try_abort = [](u64 id) -> u64
{
if (id >= s_rec_gcount * 64)
{
std::abort();
}
auto [_old, ok] = s_records[id].m_state.fetch_op([](u64& state)
{
if (state < s_ref_one)
{
// Don't reference if no references
return false;
}
if ((state & s_state_mask) == s_state_undef)
{
// Break transaction if possible
state |= s_state_failure;
}
state += s_ref_one;
return true;
});
if (!ok)
{
return 0;
}
if ((_old & s_state_mask) != s_state_success)
{
// Allow to overwrite failing transaction
return id;
}
// Help to complete
rec_complete(id);
rec_unref(id);
return 0;
};
// Single CAS path
if (m_count == 1)
{
atomic2 cmp;
while (auto ptr = m_list[0].m_addr)
{
if (ptr->load() != m_list[0].m_old)
{
return false;
}
cmp.m_data[0] = m_list[0].m_old;
cmp.m_data[1] = atomic_storage<s64>::load(ptr->m_data[1]);
if (!cmp.m_data[1] && cmpxchg16(ptr->m_data, cmp.m_data, 0, m_list[0].m_new))
{
return true;
}
else if (cmp.m_data[0] != static_cast<s64>(m_list[0].m_old))
{
return false;
}
else if (cmp.m_data[1])
{
if (u64 _id = rec_try_abort(cmp.m_data[1]))
{
if (cmpxchg16(ptr->m_data, cmp.m_data, 0, m_list[0].m_new))
{
rec_unref(_id);
return true;
}
rec_unref(_id);
}
}
}
// Unreachable
std::abort();
}
// Try TSX if available
if (s_use_rtm)
{
switch (m_count)
{
case 2: if (s32 r = commit_tx<2>(m_list)) return r > 0; break;
case 3: if (s32 r = commit_tx<3>(m_list)) return r > 0; break;
case 4: if (s32 r = commit_tx<4>(m_list)) return r > 0; break;
case 5: if (s32 r = commit_tx<5>(m_list)) return r > 0; break;
case 6: if (s32 r = commit_tx<6>(m_list)) return r > 0; break;
case 7: if (s32 r = commit_tx<7>(m_list)) return r > 0; break;
case 8: if (s32 r = commit_tx<8>(m_list)) return r > 0; break;
}
}
// Allocate global record and copy data
const u64 id = rec_alloc();
for (u32 i = 0; i < (m_count + 1) / 2; i++)
{
std::memcpy(s_records[id].m_list + i * 2, m_list + i * 2, sizeof(multi_cas_item) * 2);
}
s_records[id].m_count = m_count;
s_records[id].m_state = s_ref_one;
// Try to install CAS items
for (u32 i = 0; i < m_count && (s_records[id].m_state & s_state_mask) == s_state_undef; i++)
{
atomic2 cmp;
while (auto ptr = m_list[i].m_addr)
{
if (ptr->load() != m_list[i].m_old)
{
s_records[id].m_state |= s_state_failure;
break;
}
cmp.m_data[0] = m_list[i].m_old;
cmp.m_data[1] = atomic_storage<s64>::load(ptr->m_data[1]);
if (!cmp.m_data[1] && cmpxchg16(ptr->m_data, cmp.m_data, id, m_list[i].m_old))
{
break;
}
else if (cmp.m_data[0] != static_cast<s64>(m_list[i].m_old))
{
s_records[id].m_state |= s_state_failure;
break;
}
else if (cmp.m_data[1])
{
if (u64 _id = rec_try_abort(cmp.m_data[1]))
{
if (cmpxchg16(ptr->m_data, cmp.m_data, id, m_list[i].m_old))
{
rec_unref(_id);
break;
}
rec_unref(_id);
}
}
}
}
// Try to acknowledge transaction success
auto [_, ok] = s_records[id].m_state.fetch_op([](u64& state)
{
if (state & s_state_failure)
{
return false;
}
state |= s_state_success;
return true;
});
// Complete transaction on success, or cleanup on failure
for (u32 i = 0; i < m_count; i++)
{
auto& item = m_list[i];
atomic2 cmp;
cmp.m_data[0] = item.m_old;
cmp.m_data[1] = id;
if (item.m_addr->load() == item.m_old && atomic_storage<s64>::load(item.m_addr->m_data[1]) == static_cast<s64>(id))
{
// Restore the old value (failure) or install the new one (success); a failed CAS means another thread already did it
if (cmpxchg16(item.m_addr->m_data, cmp.m_data, 0, ok ? item.m_new : item.m_old))
{
}
}
}
rec_unref(id);
return ok;
}

View file

@ -1,156 +0,0 @@
#pragma once
#include <cstdint>
#include "util/atomic.hpp"
namespace stx
{
// Unsigned 64-bit atomic for multi-cas (occupies 128 bits)
class alignas(16) atomic2
{
// The first 64-bit word holds the actual value; the second holds a control block pointer when one is allocated (zero otherwise)
s64 m_data[2]{};
friend class multi_cas_record;
public:
// Must not be left truly uninitialized, or the control block pointer would be garbage
constexpr atomic2() noexcept = default;
atomic2(const atomic2&) = delete;
atomic2& operator=(const atomic2&) = delete;
constexpr atomic2(u64 value) noexcept
: m_data{static_cast<s64>(value), s64{0}}
{
}
// Simply observe the state
u64 load() const noexcept
{
return atomic_storage<u64>::load(m_data[0]);
}
// void wait(u64 old_value) const noexcept;
// void notify_one() noexcept;
// void notify_all() noexcept;
};
// Atomic CAS item
class multi_cas_item
{
atomic2* m_addr;
u64 m_old;
u64 m_new;
friend class multi_cas_record;
public:
multi_cas_item() noexcept = default;
multi_cas_item(const multi_cas_item&) = delete;
multi_cas_item& operator=(const multi_cas_item&) = delete;
u64 get_old() const noexcept
{
return m_old;
}
operator u64() const noexcept
{
return m_new;
}
void operator=(u64 value) noexcept
{
m_new = value;
}
};
// An object passed to the multi_cas lambda
class alignas(64) multi_cas_record
{
// Ref counter and Multi-CAS state
atomic_t<u64> m_state;
// Total number of CASes
u64 m_count;
// Support up to 10 CASes
multi_cas_item m_list[10];
public:
// Read atomic value and allocate "writable" item
multi_cas_item& load(atomic2& atom) noexcept
{
if (m_count >= std::size(m_list))
{
std::abort();
}
auto& r = m_list[m_count++];
r.m_addr = &atom;
r.m_old = atom.load();
r.m_new = r.m_old;
return r;
}
// Reset transaction (invalidates item references)
void cancel() noexcept
{
m_count = 0;
}
// Try to commit the accumulated CAS list (internal: invoked by multi_cas, don't call directly)
bool commit() const noexcept;
};
template <typename T>
struct multi_cas_result
{
static constexpr bool is_void = false;
T ret;
};
template <>
struct multi_cas_result<void>
{
static constexpr bool is_void = true;
};
template <typename Context>
class multi_cas final : Context, multi_cas_record, public multi_cas_result<std::invoke_result_t<Context, multi_cas_record&>>
{
using result = multi_cas_result<std::invoke_result_t<Context, multi_cas_record&>>;
using record = multi_cas_record;
public:
// Implicit deduction guide candidate constructor (for lambda)
multi_cas(Context&& f) noexcept
: Context(std::forward<Context>(f))
{
while (true)
{
multi_cas_record& rec = *this;
record::cancel();
if constexpr (result::is_void)
{
Context::operator()(rec);
}
else
{
result::ret = Context::operator()(rec);
}
if (record::commit())
{
return;
}
}
}
};
}
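Since this header is removed by the commit, a short reminder of how the interface above was meant to be used (illustrative sketch; the counter names are invented, and the lambda may run several times until commit() succeeds):
stx::atomic2 credits{100};
stx::atomic2 debits{0};
void move_one()
{
	stx::multi_cas([&](stx::multi_cas_record& rec)
	{
		auto& c = rec.load(credits);
		auto& d = rec.load(debits);
		if (c.get_old() == 0)
		{
			rec.cancel(); // nothing to move; commit() treats an empty record as success
			return;
		}
		c = c.get_old() - 1; // both stores become visible atomically, or not at all
		d = d.get_old() + 1;
	});
}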

View file

@ -17,6 +17,13 @@
#include "util/asm.hpp"
#ifdef _MSC_VER
extern "C"
{
u64 _xgetbv(u32);
}
#endif
inline std::array<u32, 4> utils::get_cpuid(u32 func, u32 subfunc)
{
int regs[4];
@ -303,6 +310,19 @@ static constexpr ullong round_tsc(ullong val)
return utils::rounded_div(val, 1'000'000) * 1'000'000;
}
#ifdef _MSC_VER
extern "C" void _mm_lfence();
#endif
static inline void lfence()
{
#ifdef _MSC_VER
_mm_lfence();
#else
__builtin_ia32_lfence();
#endif
}
ullong utils::get_tsc_freq()
{
static const ullong cal_tsc = []() -> ullong
@ -343,17 +363,17 @@ ullong utils::get_tsc_freq()
{
#ifdef _WIN32
Sleep(1);
error_data[i] = (_mm_lfence(), __rdtsc());
error_data[i] = (lfence(), utils::get_tsc());
LARGE_INTEGER ctr;
QueryPerformanceCounter(&ctr);
rdtsc_data[i] = (_mm_lfence(), __rdtsc());
rdtsc_data[i] = (lfence(), utils::get_tsc());
timer_data[i] = ctr.QuadPart;
#else
usleep(200);
error_data[i] = (_mm_lfence(), __rdtsc());
error_data[i] = (lfence(), utils::get_tsc());
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
rdtsc_data[i] = (_mm_lfence(), __rdtsc());
rdtsc_data[i] = (lfence(), utils::get_tsc());
timer_data[i] = ts.tv_nsec + (ts.tv_sec - sec_base) * 1'000'000'000;
#endif
}
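The arithmetic that turns these samples into a frequency lies outside this hunk; conceptually it is just the ratio of the two deltas (a sketch under that assumption; how error_data is used is not visible here):
// Estimate TSC ticks per second from matched (TSC, nanosecond) samples (Linux branch).
static ullong tsc_freq_estimate(const ullong* tsc, const u64* ns, usz n)
{
	const ullong dt = tsc[n - 1] - tsc[0]; // TSC ticks across the sampling window
	const u64 dns = ns[n - 1] - ns[0];     // elapsed wall-clock time in nanoseconds
	return static_cast<ullong>(dt / (dns / 1e9));
}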

View file

@ -1,13 +1,5 @@
#pragma once // No BOM and only basic ASCII in this header, or a neko will die
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <immintrin.h>
#include <emmintrin.h>
#include <cstdint>
#include <cstddef>
#include <cstring>
@ -278,10 +270,28 @@ public:
};
#ifndef _MSC_VER
using u128 = __uint128_t;
using s128 = __int128_t;
using __m128i = long long __attribute__((vector_size(16)));
using __m128d = double __attribute__((vector_size(16)));
using __m128 = float __attribute__((vector_size(16)));
#else
extern "C"
{
union __m128;
union __m128i;
struct __m128d;
uchar _addcarry_u64(uchar, u64, u64, u64*);
uchar _subborrow_u64(uchar, u64, u64, u64*);
u64 __shiftleft128(u64, u64, uchar);
u64 __shiftright128(u64, u64, uchar);
}
// Unsigned 128-bit integer implementation (TODO)
struct alignas(16) u128
{

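The rest of the MSVC u128 definition is cut off in this view; as an illustration of why those intrinsics are declared, 128-bit addition can be built on _addcarry_u64 roughly as follows (a sketch assuming the struct exposes u64 lo and hi members):
inline u128 u128_add_sketch(const u128& a, const u128& b)
{
	u128 r{};
	const uchar carry = _addcarry_u64(0, a.lo, b.lo, &r.lo); // low halves, carry out
	_addcarry_u64(carry, a.hi, b.hi, &r.hi);                 // high halves, carry in
	return r;
}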
View file

@ -1,7 +1,6 @@
#pragma once // No BOM and only basic ASCII in this header, or a neko will die
#include "util/types.hpp"
#include <cmath>
// 128-bit vector type
union alignas(16) v128
@ -12,17 +11,17 @@ union alignas(16) v128
template <typename T, usz N, usz M>
struct masked_array_t // array type accessed as (index ^ M)
{
char m_data[16];
T m_data[N];
public:
T& operator[](usz index)
{
return reinterpret_cast<T*>(m_data)[index ^ M];
return m_data[index ^ M];
}
const T& operator[](usz index) const
{
return reinterpret_cast<const T*>(m_data)[index ^ M];
return m_data[index ^ M];
}
};
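A tiny worked example of the (index ^ M) addressing (illustrative values):
// With N = 4 and M = 3, logical index i maps to storage slot i ^ 3,
// so the array is presented in reverse storage order without any copying:
static_assert((0 ^ 3) == 3 && (1 ^ 3) == 2 && (2 ^ 3) == 1 && (3 ^ 3) == 0);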
@ -56,88 +55,55 @@ union alignas(16) v128
reversed_array_t<f32> fr;
reversed_array_t<f64> dr;
u128 _u;
//s128 _s;
#ifdef _MSC_VER
template <typename T>
struct opaque_wrapper
{
u128 m_data;
opaque_wrapper() = default;
opaque_wrapper(const T& value)
: m_data(std::bit_cast<u128>(value))
{
}
opaque_wrapper& operator=(const T& value)
{
m_data = std::bit_cast<u128>(value);
return *this;
}
operator T() const
{
return std::bit_cast<T>(m_data);
}
};
opaque_wrapper<__m128> vf;
opaque_wrapper<__m128i> vi;
opaque_wrapper<__m128d> vd;
#else
__m128 vf;
__m128i vi;
__m128d vd;
#endif
struct bit_array_128
{
char m_data[16];
public:
class bit_element
{
u64& data;
const u64 mask;
public:
bit_element(u64& data, const u64 mask)
: data(data)
, mask(mask)
{
}
operator bool() const
{
return (data & mask) != 0;
}
bit_element& operator=(const bool right)
{
if (right)
{
data |= mask;
}
else
{
data &= ~mask;
}
return *this;
}
bit_element& operator=(const bit_element& right)
{
if (right)
{
data |= mask;
}
else
{
data &= ~mask;
}
return *this;
}
};
class bit_element;
// Index 0 returns the MSB and index 127 returns the LSB
bit_element operator[](u32 index)
{
const auto data_ptr = reinterpret_cast<u64*>(m_data);
if constexpr (std::endian::little == std::endian::native)
{
return bit_element(data_ptr[1 - (index >> 6)], 0x8000000000000000ull >> (index & 0x3F));
}
else
{
return bit_element(data_ptr[index >> 6], 0x8000000000000000ull >> (index & 0x3F));
}
}
[[deprecated]] bit_element operator[](u32 index);
// Index 0 returns the MSB and index 127 returns the LSB
bool operator[](u32 index) const
{
const auto data_ptr = reinterpret_cast<const u64*>(m_data);
if constexpr (std::endian::little == std::endian::native)
{
return (data_ptr[1 - (index >> 6)] & (0x8000000000000000ull >> (index & 0x3F))) != 0;
}
else
{
return (data_ptr[index >> 6] & (0x8000000000000000ull >> (index & 0x3F))) != 0;
}
}
[[deprecated]] bool operator[](u32 index) const;
} _bit;
static v128 from64(u64 _0, u64 _1 = 0)
@ -171,51 +137,39 @@ union alignas(16) v128
static v128 from32p(u32 value)
{
v128 ret;
ret.vi = _mm_set1_epi32(static_cast<s32>(value));
ret._u32[0] = value;
ret._u32[1] = value;
ret._u32[2] = value;
ret._u32[3] = value;
return ret;
}
static v128 from16p(u16 value)
{
v128 ret;
ret.vi = _mm_set1_epi16(static_cast<s16>(value));
ret._u16[0] = value;
ret._u16[1] = value;
ret._u16[2] = value;
ret._u16[3] = value;
ret._u16[4] = value;
ret._u16[5] = value;
ret._u16[6] = value;
ret._u16[7] = value;
return ret;
}
static v128 from8p(u8 value)
{
v128 ret;
ret.vi = _mm_set1_epi8(static_cast<s8>(value));
std::memset(&ret, value, sizeof(ret));
return ret;
}
static v128 fromBit(u32 bit)
{
v128 ret = {};
ret._bit[bit] = true;
return ret;
}
static inline v128 fromV(const __m128i& value);
static v128 fromV(__m128i value)
{
v128 ret;
ret.vi = value;
return ret;
}
static inline v128 fromF(const __m128& value);
static v128 fromF(__m128 value)
{
v128 ret;
ret.vf = value;
return ret;
}
static v128 fromD(__m128d value)
{
v128 ret;
ret.vd = value;
return ret;
}
static inline v128 fromD(const __m128d& value);
// Unaligned load with optional index offset
static v128 loadu(const void* ptr, usz index = 0)
@ -231,136 +185,46 @@ union alignas(16) v128
std::memcpy(static_cast<u8*>(ptr) + index * sizeof(v128), &value, sizeof(v128));
}
static inline v128 add8(const v128& left, const v128& right)
{
return fromV(_mm_add_epi8(left.vi, right.vi));
}
static inline v128 add8(const v128& left, const v128& right);
static inline v128 add16(const v128& left, const v128& right)
{
return fromV(_mm_add_epi16(left.vi, right.vi));
}
static inline v128 add16(const v128& left, const v128& right);
static inline v128 add32(const v128& left, const v128& right)
{
return fromV(_mm_add_epi32(left.vi, right.vi));
}
static inline v128 add32(const v128& left, const v128& right);
static inline v128 addfs(const v128& left, const v128& right)
{
return fromF(_mm_add_ps(left.vf, right.vf));
}
static inline v128 addfs(const v128& left, const v128& right);
static inline v128 addfd(const v128& left, const v128& right)
{
return fromD(_mm_add_pd(left.vd, right.vd));
}
static inline v128 addfd(const v128& left, const v128& right);
static inline v128 sub8(const v128& left, const v128& right)
{
return fromV(_mm_sub_epi8(left.vi, right.vi));
}
static inline v128 sub8(const v128& left, const v128& right);
static inline v128 sub16(const v128& left, const v128& right)
{
return fromV(_mm_sub_epi16(left.vi, right.vi));
}
static inline v128 sub16(const v128& left, const v128& right);
static inline v128 sub32(const v128& left, const v128& right)
{
return fromV(_mm_sub_epi32(left.vi, right.vi));
}
static inline v128 sub32(const v128& left, const v128& right);
static inline v128 subfs(const v128& left, const v128& right)
{
return fromF(_mm_sub_ps(left.vf, right.vf));
}
static inline v128 subfs(const v128& left, const v128& right);
static inline v128 subfd(const v128& left, const v128& right)
{
return fromD(_mm_sub_pd(left.vd, right.vd));
}
static inline v128 subfd(const v128& left, const v128& right);
static inline v128 maxu8(const v128& left, const v128& right)
{
return fromV(_mm_max_epu8(left.vi, right.vi));
}
static inline v128 maxu8(const v128& left, const v128& right);
static inline v128 minu8(const v128& left, const v128& right)
{
return fromV(_mm_min_epu8(left.vi, right.vi));
}
static inline v128 minu8(const v128& left, const v128& right);
static inline v128 eq8(const v128& left, const v128& right)
{
return fromV(_mm_cmpeq_epi8(left.vi, right.vi));
}
static inline v128 eq8(const v128& left, const v128& right);
static inline v128 eq16(const v128& left, const v128& right)
{
return fromV(_mm_cmpeq_epi16(left.vi, right.vi));
}
static inline v128 eq16(const v128& left, const v128& right);
static inline v128 eq32(const v128& left, const v128& right)
{
return fromV(_mm_cmpeq_epi32(left.vi, right.vi));
}
static inline v128 eq32(const v128& left, const v128& right);
static inline v128 eq32f(const v128& left, const v128& right)
{
return fromF(_mm_cmpeq_ps(left.vf, right.vf));
}
static inline v128 eq32f(const v128& left, const v128& right);
static inline v128 eq64f(const v128& left, const v128& right)
{
return fromD(_mm_cmpeq_pd(left.vd, right.vd));
}
static inline v128 fma32f(v128 a, const v128& b, const v128& c);
static inline bool use_fma = false;
bool operator==(const v128& right) const;
static inline v128 fma32f(v128 a, const v128& b, const v128& c)
{
#ifndef __FMA__
if (use_fma) [[likely]]
{
#ifdef _MSC_VER
a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
return a;
#else
__asm__("vfmadd213ps %[c], %[b], %[a]"
: [a] "+x" (a.vf)
: [b] "x" (b.vf)
, [c] "x" (c.vf));
return a;
#endif
}
for (int i = 0; i < 4; i++)
{
a._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]);
}
return a;
#else
a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
return a;
#endif
}
bool operator==(const v128& right) const
{
return _mm_movemask_epi8(v128::eq32(*this, right).vi) == 0xffff;
}
bool operator!=(const v128& right) const
{
return !operator==(right);
}
bool operator!=(const v128& right) const;
// result = (~left) & (right)
static inline v128 andnot(const v128& left, const v128& right)
{
return fromV(_mm_andnot_si128(left.vi, right.vi));
}
static inline v128 andnot(const v128& left, const v128& right);
void clear()
{
@ -377,23 +241,3 @@ struct offset32_array<v128::masked_array_t<T, N, M>>
return u32{sizeof(T)} * (static_cast<u32>(arg) ^ static_cast<u32>(M));
}
};
inline v128 operator|(const v128& left, const v128& right)
{
return v128::fromV(_mm_or_si128(left.vi, right.vi));
}
inline v128 operator&(const v128& left, const v128& right)
{
return v128::fromV(_mm_and_si128(left.vi, right.vi));
}
inline v128 operator^(const v128& left, const v128& right)
{
return v128::fromV(_mm_xor_si128(left.vi, right.vi));
}
inline v128 operator~(const v128& other)
{
return other ^ v128::from32p(UINT32_MAX); // XOR with ones
}

rpcs3/util/v128sse.hpp Normal file (255 lines)
View file

@ -0,0 +1,255 @@
#pragma once
#include "util/types.hpp"
#include "util/v128.hpp"
#include "util/sysinfo.hpp"
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <immintrin.h>
#include <emmintrin.h>
#include <cmath>
inline bool v128_use_fma = utils::has_fma3();
class v128::bit_array_128::bit_element
{
u64& data;
const u64 mask;
public:
bit_element(u64& data, const u64 mask)
: data(data)
, mask(mask)
{
}
operator bool() const
{
return (data & mask) != 0;
}
bit_element& operator=(const bool right)
{
if (right)
{
data |= mask;
}
else
{
data &= ~mask;
}
return *this;
}
bit_element& operator=(const bit_element& right)
{
if (right)
{
data |= mask;
}
else
{
data &= ~mask;
}
return *this;
}
};
[[deprecated]] inline v128::bit_array_128::bit_element v128::bit_array_128::operator[](u32 index)
{
const auto data_ptr = reinterpret_cast<u64*>(m_data);
if constexpr (std::endian::little == std::endian::native)
{
return bit_element(data_ptr[1 - (index >> 6)], 0x8000000000000000ull >> (index & 0x3F));
}
else
{
return bit_element(data_ptr[index >> 6], 0x8000000000000000ull >> (index & 0x3F));
}
}
[[deprecated]] inline bool v128::bit_array_128::operator[](u32 index) const
{
const auto data_ptr = reinterpret_cast<const u64*>(m_data);
if constexpr (std::endian::little == std::endian::native)
{
return (data_ptr[1 - (index >> 6)] & (0x8000000000000000ull >> (index & 0x3F))) != 0;
}
else
{
return (data_ptr[index >> 6] & (0x8000000000000000ull >> (index & 0x3F))) != 0;
}
}
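Illustration of the bit order these accessors implement (index 0 is the MSB of the whole 128-bit value; the _u64 expectations below assume a little-endian host):
// v128::fromBit(0)   -> _u64[1] == 0x8000000000000000, _u64[0] == 0
// v128::fromBit(127) -> _u64[1] == 0,                  _u64[0] == 1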
inline v128 v128::fromV(const __m128i& value)
{
v128 ret;
ret.vi = value;
return ret;
}
inline v128 v128::fromF(const __m128& value)
{
v128 ret;
ret.vf = value;
return ret;
}
inline v128 v128::fromD(const __m128d& value)
{
v128 ret;
ret.vd = value;
return ret;
}
inline v128 v128::add8(const v128& left, const v128& right)
{
return fromV(_mm_add_epi8(left.vi, right.vi));
}
inline v128 v128::add16(const v128& left, const v128& right)
{
return fromV(_mm_add_epi16(left.vi, right.vi));
}
inline v128 v128::add32(const v128& left, const v128& right)
{
return fromV(_mm_add_epi32(left.vi, right.vi));
}
inline v128 v128::addfs(const v128& left, const v128& right)
{
return fromF(_mm_add_ps(left.vf, right.vf));
}
inline v128 v128::addfd(const v128& left, const v128& right)
{
return fromD(_mm_add_pd(left.vd, right.vd));
}
inline v128 v128::sub8(const v128& left, const v128& right)
{
return fromV(_mm_sub_epi8(left.vi, right.vi));
}
inline v128 v128::sub16(const v128& left, const v128& right)
{
return fromV(_mm_sub_epi16(left.vi, right.vi));
}
inline v128 v128::sub32(const v128& left, const v128& right)
{
return fromV(_mm_sub_epi32(left.vi, right.vi));
}
inline v128 v128::subfs(const v128& left, const v128& right)
{
return fromF(_mm_sub_ps(left.vf, right.vf));
}
inline v128 v128::subfd(const v128& left, const v128& right)
{
return fromD(_mm_sub_pd(left.vd, right.vd));
}
inline v128 v128::maxu8(const v128& left, const v128& right)
{
return fromV(_mm_max_epu8(left.vi, right.vi));
}
inline v128 v128::minu8(const v128& left, const v128& right)
{
return fromV(_mm_min_epu8(left.vi, right.vi));
}
inline v128 v128::eq8(const v128& left, const v128& right)
{
return fromV(_mm_cmpeq_epi8(left.vi, right.vi));
}
inline v128 v128::eq16(const v128& left, const v128& right)
{
return fromV(_mm_cmpeq_epi16(left.vi, right.vi));
}
inline v128 v128::eq32(const v128& left, const v128& right)
{
return fromV(_mm_cmpeq_epi32(left.vi, right.vi));
}
inline v128 v128::eq32f(const v128& left, const v128& right)
{
return fromF(_mm_cmpeq_ps(left.vf, right.vf));
}
inline v128 v128::fma32f(v128 a, const v128& b, const v128& c)
{
#ifndef __FMA__
if (v128_use_fma) [[likely]]
{
#ifdef _MSC_VER
a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
return a;
#else
__asm__("vfmadd213ps %[c], %[b], %[a]"
: [a] "+x" (a.vf)
: [b] "x" (b.vf)
, [c] "x" (c.vf));
return a;
#endif
}
for (int i = 0; i < 4; i++)
{
a._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]);
}
return a;
#else
a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
return a;
#endif
}
inline bool v128::operator==(const v128& right) const
{
return _mm_movemask_epi8(v128::eq32(*this, right).vi) == 0xffff;
}
inline bool v128::operator!=(const v128& right) const
{
return !operator==(right);
}
// result = (~left) & (right)
inline v128 v128::andnot(const v128& left, const v128& right)
{
return fromV(_mm_andnot_si128(left.vi, right.vi));
}
inline v128 operator|(const v128& left, const v128& right)
{
return v128::fromV(_mm_or_si128(left.vi, right.vi));
}
inline v128 operator&(const v128& left, const v128& right)
{
return v128::fromV(_mm_and_si128(left.vi, right.vi));
}
inline v128 operator^(const v128& left, const v128& right)
{
return v128::fromV(_mm_xor_si128(left.vi, right.vi));
}
inline v128 operator~(const v128& other)
{
return other ^ v128::from32p(UINT32_MAX); // XOR with ones
}
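With the intrinsic-dependent pieces moved into this header, a translation unit picks whichever header matches its needs; a minimal sketch of the intended split (my reading of this commit, not something it documents):
// Only stores or passes v128 around: including "util/v128.hpp" alone is enough.
// Actually computes with the SSE helpers: include the implementation header too.
#include "util/v128sse.hpp"
inline v128 add_and_mask(const v128& a, const v128& b, const v128& mask)
{
	return v128::add32(a, b) & mask; // both defined out of line in this file
}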