SPU: Implement execution wake-up delay

This commit is contained in:
Eladash 2020-06-23 16:41:16 +03:00 committed by Ivan
parent 149c593d89
commit cf0fcf5a2a
7 changed files with 83 additions and 64 deletions

View file

@ -2322,14 +2322,14 @@ thread_state thread_ctrl::state()
return static_cast<thread_state>(_this->m_sync & 3);
}
void thread_ctrl::_wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
{
auto _this = g_tls_this_thread;
#ifdef __linux__
static thread_local struct linux_timer_handle_t
{
// Allocate timer only if needed (i.e. someone calls _wait_for with alert and short period)
// Allocate timer only if needed (i.e. someone calls wait_for with alert and short period)
const int m_timer = timerfd_create(CLOCK_MONOTONIC, 0);
linux_timer_handle_t() noexcept
@ -2383,6 +2383,58 @@ void thread_ctrl::_wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
list.wait(atomic_wait_timeout{usec <= 0xffff'ffff'ffff'ffff / 1000 ? usec * 1000 : 0xffff'ffff'ffff'ffff});
}
// Wait for approximately `usec` microseconds, looping until a steady_clock deadline
// computed on entry has been reached. Returns immediately when `usec` is zero.
// Long remainders are handed to wait_for() (with alert disabled); short remainders
// are covered by yielding or by busy_wait().
void thread_ctrl::wait_for_accurate(u64 usec)
{
	if (!usec)
	{
		return;
	}

	using namespace std::chrono_literals;

	// Absolute deadline; the loop only exits once the clock has reached it
	const auto until = std::chrono::steady_clock::now() + 1us * usec;

	while (true)
	{
#ifdef __linux__
		// NOTE: Assumption that timer initialization has succeeded
		u64 host_min_quantum = usec <= 1000 ? 10 : 50;
#else
		// Host scheduler quantum for windows (worst case)
		// NOTE: On ps3 this function has very high accuracy
		constexpr u64 host_min_quantum = 500;
#endif
		if (usec >= host_min_quantum)
		{
#ifdef __linux__
			// Do not wait for the last quantum to avoid loss of accuracy
			wait_for(usec - ((usec % host_min_quantum) + host_min_quantum), false);
#else
			// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
			wait_for(usec - (usec % host_min_quantum), false);
#endif
		}
		// TODO: Determine best value for yield delay
		else if (usec >= host_min_quantum / 2)
		{
			std::this_thread::yield();
		}
		else
		{
			// NOTE(review): busy_wait is defined elsewhere; presumably a short spin - confirm its unit
			busy_wait(100);
		}

		const auto current = std::chrono::steady_clock::now();

		if (current >= until)
		{
			break;
		}

		// The tick period of steady_clock is implementation-defined (commonly nanoseconds),
		// so the remaining time must be converted to microseconds explicitly. Using the raw
		// tick count here would inflate the remaining duration passed to wait_for().
		usec = static_cast<u64>(std::chrono::duration_cast<std::chrono::microseconds>(until - current).count());
	}
}
std::string thread_ctrl::get_name_cached()
{
auto _this = thread_ctrl::g_tls_this_thread;

View file

@ -201,9 +201,6 @@ class thread_ctrl final
// Target cpu core layout
static atomic_t<native_core_arrangement> g_native_core_layout;
// Internal waiting function, may throw. Infinite value is -1.
static void _wait_for(u64 usec, bool alert);
friend class thread_base;
// Optimized get_name() for logging
@ -263,16 +260,16 @@ public:
// Read current state, possibly executing some tasks
static thread_state state();
// Wait once with timeout. May spuriously return false.
static inline void wait_for(u64 usec, bool alert = true)
{
_wait_for(usec, alert);
}
// Wait once with timeout. Infinite value is -1.
static void wait_for(u64 usec, bool alert = true);
// Waiting with accurate timeout
static void wait_for_accurate(u64 usec);
// Wait.
static inline void wait()
{
_wait_for(-1, true);
wait_for(-1, true);
}
// Wait for both thread sync var and provided atomic var

View file

@ -3849,6 +3849,12 @@ s64 spu_thread::get_ch_value(u32 ch)
}
const s64 out = channel.pop_wait(*this);
if (state & cpu_flag::wait)
{
wakeup_delay();
}
static_cast<void>(test_stopped());
return out;
};
@ -4068,6 +4074,7 @@ s64 spu_thread::get_ch_value(u32 ch)
thread_ctrl::wait_on(state, old, 100);
}
wakeup_delay();
check_state();
return events.events & mask1;
}
@ -4114,6 +4121,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
}
int_ctrl[2].set(SPU_INT2_STAT_MAILBOX_INT);
wakeup_delay();
check_state();
return true;
}
@ -4680,6 +4688,7 @@ bool spu_thread::stop_and_signal(u32 code)
thread_ctrl::wait_on(state, old);
}
wakeup_delay();
return true;
}
@ -5000,6 +5009,12 @@ bool spu_thread::capture_local_storage() const
return true;
}
// Apply the configured SPU wake-up delay to this thread, scaled down by `div`.
// The delay is only applied when bit number `index` is set in the configured thread mask.
void spu_thread::wakeup_delay(u32 div) const
{
	// Bit for this thread is clear in the mask: no delay requested
	if (!(g_cfg.core.spu_wakeup_delay_mask & (1u << index)))
	{
		return;
	}

	// Divide the configured delay by `div` via utils::aligned_div before waiting
	const auto delay = utils::aligned_div(+g_cfg.core.spu_wakeup_delay, div);
	thread_ctrl::wait_for_accurate(delay);
}
spu_function_logger::spu_function_logger(spu_thread& spu, const char* func)
: spu(spu)
{

View file

@ -872,6 +872,7 @@ public:
void fast_call(u32 ls_addr);
bool capture_local_storage() const;
void wakeup_delay(u32 div = 1) const;
// Convert specified SPU LS address to a pointer of specified (possibly converted to BE) type
template<typename T>

View file

@ -2780,59 +2780,10 @@ namespace rsx
return result;
}
void thread::fifo_wake_delay(u64 div)
void thread::fifo_wake_delay(u32 div)
{
// TODO: Nanoseconds accuracy
u64 remaining = g_cfg.video.driver_wakeup_delay;
if (!remaining)
{
return;
}
// Some cases do not need full delay
remaining = utils::aligned_div(remaining, div);
const u64 until = rsx::uclock() + remaining;
while (true)
{
#ifdef __linux__
// NOTE: Assumption that timer initialization has succeeded
u64 host_min_quantum = remaining <= 1000 ? 10 : 50;
#else
// Host scheduler quantum for windows (worst case)
// NOTE: On ps3 this function has very high accuracy
constexpr u64 host_min_quantum = 500;
#endif
if (remaining >= host_min_quantum)
{
#ifdef __linux__
// Do not wait for the last quantum to avoid loss of accuracy
thread_ctrl::wait_for(remaining - ((remaining % host_min_quantum) + host_min_quantum), false);
#else
// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
thread_ctrl::wait_for(remaining - (remaining % host_min_quantum), false);
#endif
}
// TODO: Determine best value for yield delay
else if (remaining >= host_min_quantum / 2)
{
std::this_thread::yield();
}
else
{
busy_wait(100);
}
const u64 current = rsx::uclock();
if (current >= until)
{
break;
}
remaining = until - current;
}
thread_ctrl::wait_for_accurate(utils::aligned_div(+g_cfg.video.driver_wakeup_delay, div));
}
u32 thread::get_fifo_cmd() const

View file

@ -24,6 +24,7 @@
#include "Capture/rsx_trace.h"
#include "Capture/rsx_replay.h"
#include "Emu/system_config.h"
#include "Emu/Cell/lv2/sys_rsx.h"
#include "Emu/IdManager.h"
#include "Emu/system_config.h"
@ -518,7 +519,7 @@ namespace rsx
const char* file = __builtin_FILE(),
const char* func = __builtin_FUNCTION());
static void fifo_wake_delay(u64 div = 1);
static void fifo_wake_delay(u32 div = 1);
u32 get_fifo_cmd() const;
void dump_regs(std::string&) const override;

View file

@ -87,6 +87,8 @@ struct cfg_root : cfg::node
cfg::uint64 tx_limit2_ns{this, "TSX Transaction Second Limit", 2000}; // In nanoseconds
cfg::_int<10, 3000> clocks_scale{ this, "Clocks scale", 100 }; // Changing this from 100 (percentage) may affect game speed in unexpected ways
cfg::uint<0, 3000> spu_wakeup_delay{ this, "SPU Wake-Up Delay", 0, true };
cfg::uint<0, (1 << 6) - 1> spu_wakeup_delay_mask{ this, "SPU Wake-Up Delay Thread Mask", (1 << 6) - 1, true };
#if defined (__linux__) || defined (__APPLE__)
cfg::_enum<sleep_timers_accuracy_level> sleep_timers_accuracy{ this, "Sleep Timers Accuracy", sleep_timers_accuracy_level::_as_host, true };
#else
@ -168,7 +170,7 @@ struct cfg_root : cfg::node
cfg::_int<1, 1024> min_scalable_dimension{ this, "Minimum Scalable Dimension", 16 };
cfg::_int<0, 16> shader_compiler_threads_count{ this, "Shader Compiler Threads", 0 };
cfg::_int<0, 30000000> driver_recovery_timeout{ this, "Driver Recovery Timeout", 1000000, true };
cfg::_int<0, 16667> driver_wakeup_delay{ this, "Driver Wake-Up Delay", 1, true };
cfg::uint<0, 16667> driver_wakeup_delay{ this, "Driver Wake-Up Delay", 1, true };
cfg::_int<1, 1800> vblank_rate{ this, "Vblank Rate", 60, true }; // Changing this from 60 may affect game speed in unexpected ways
cfg::_bool vblank_ntsc{ this, "Vblank NTSC Fixup", false, true };
cfg::_bool decr_memory_layout{ this, "DECR memory layout", false}; // Force enable increased allowed main memory range as DECR console