Fix QueryPerformanceCounter eating more than half the CPU on AMD Win-x86_64
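The issue was in the fast timers (LLFastTimer), making the naming highly ironic: the RDTSC code path was written as inline assembly and was disabled under _WIN64, so 64-bit Windows builds fell back to the QueryPerformanceCounter-based implementation. Rewrote the RDTSC path to use the __rdtsc() intrinsic (from <intrin.h>) and enabled USE_RDTSC for all Windows builds. Doubled the framerate.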
This commit is contained in:
@@ -157,6 +157,7 @@
|
||||
|
||||
#if LL_WINDOWS
|
||||
#include "lltimer.h"
|
||||
#include <intrin.h>
|
||||
#elif LL_LINUX || LL_SOLARIS
|
||||
#include <sys/time.h>
|
||||
#include <sched.h>
|
||||
@@ -184,7 +185,7 @@ LLMutex* LLFastTimer::sLogLock = NULL;
|
||||
std::queue<LLSD> LLFastTimer::sLogQueue;
|
||||
const int LLFastTimer::NamedTimer::HISTORY_NUM = 300;
|
||||
|
||||
#if defined(LL_WINDOWS) && !defined(_WIN64)
|
||||
#if defined(LL_WINDOWS)
|
||||
#define USE_RDTSC 1
|
||||
#endif
|
||||
|
||||
@@ -952,34 +953,15 @@ LLFastTimer::LLFastTimer(LLFastTimer::FrameState* state)
|
||||
#if USE_RDTSC
|
||||
U32 LLFastTimer::getCPUClockCount32()
|
||||
{
|
||||
U32 ret_val;
|
||||
__asm
|
||||
{
|
||||
_emit 0x0f
|
||||
_emit 0x31
|
||||
shr eax,8
|
||||
shl edx,24
|
||||
or eax, edx
|
||||
mov dword ptr [ret_val], eax
|
||||
}
|
||||
return ret_val;
|
||||
return (U32)(__rdtsc()>>8);
|
||||
}
|
||||
|
||||
// return full timer value, *not* shifted by 8 bits
|
||||
U64 LLFastTimer::getCPUClockCount64()
|
||||
{
|
||||
U64 ret_val;
|
||||
__asm
|
||||
{
|
||||
_emit 0x0f
|
||||
_emit 0x31
|
||||
mov eax,eax
|
||||
mov edx,edx
|
||||
mov dword ptr [ret_val+4], edx
|
||||
mov dword ptr [ret_val], eax
|
||||
}
|
||||
return ret_val;
|
||||
return (U64)__rdtsc();
|
||||
}
|
||||
|
||||
#else
|
||||
//LL_COMMON_API U64 get_clock_count(); // in lltimer.cpp
|
||||
// These use QueryPerformanceCounter, which is arguably fine and also works on AMD architectures.
|
||||
|
||||
@@ -81,10 +81,10 @@ void ms_sleep(U32 ms)
|
||||
|
||||
U32 micro_sleep(U64 us, U32 max_yields)
|
||||
{
|
||||
// max_yields is unused; just fiddle with it to avoid warnings.
|
||||
max_yields = 0;
|
||||
ms_sleep(us / 1000);
|
||||
return 0;
|
||||
// max_yields is unused; just fiddle with it to avoid warnings.
|
||||
max_yields = 0;
|
||||
ms_sleep(us / 1000);
|
||||
return 0;
|
||||
}
|
||||
#elif LL_LINUX || LL_SOLARIS || LL_DARWIN
|
||||
static void _sleep_loop(struct timespec& thiswait)
|
||||
@@ -103,8 +103,8 @@ static void _sleep_loop(struct timespec& thiswait)
|
||||
if (sleep_more)
|
||||
{
|
||||
if ( nextwait.tv_sec > thiswait.tv_sec ||
|
||||
(nextwait.tv_sec == thiswait.tv_sec &&
|
||||
nextwait.tv_nsec >= thiswait.tv_nsec) )
|
||||
(nextwait.tv_sec == thiswait.tv_sec &&
|
||||
nextwait.tv_nsec >= thiswait.tv_nsec) )
|
||||
{
|
||||
// if the remaining time isn't actually going
|
||||
// down then we're being shafted by low clock
|
||||
@@ -130,31 +130,31 @@ static void _sleep_loop(struct timespec& thiswait)
|
||||
|
||||
U32 micro_sleep(U64 us, U32 max_yields)
|
||||
{
|
||||
U64 start = get_clock_count();
|
||||
// This is kernel dependent. Currently, our kernel generates software clock
|
||||
// interrupts at 250 Hz (every 4,000 microseconds).
|
||||
const U64 KERNEL_SLEEP_INTERVAL_US = 4000;
|
||||
U64 start = get_clock_count();
|
||||
// This is kernel dependent. Currently, our kernel generates software clock
|
||||
// interrupts at 250 Hz (every 4,000 microseconds).
|
||||
const U64 KERNEL_SLEEP_INTERVAL_US = 4000;
|
||||
|
||||
S32 num_sleep_intervals = (us - (KERNEL_SLEEP_INTERVAL_US >> 1)) / KERNEL_SLEEP_INTERVAL_US;
|
||||
if (num_sleep_intervals > 0)
|
||||
{
|
||||
U64 sleep_time = (num_sleep_intervals * KERNEL_SLEEP_INTERVAL_US) - (KERNEL_SLEEP_INTERVAL_US >> 1);
|
||||
struct timespec thiswait;
|
||||
thiswait.tv_sec = sleep_time / 1000000;
|
||||
thiswait.tv_nsec = (sleep_time % 1000000) * 1000l;
|
||||
_sleep_loop(thiswait);
|
||||
}
|
||||
S32 num_sleep_intervals = (us - (KERNEL_SLEEP_INTERVAL_US >> 1)) / KERNEL_SLEEP_INTERVAL_US;
|
||||
if (num_sleep_intervals > 0)
|
||||
{
|
||||
U64 sleep_time = (num_sleep_intervals * KERNEL_SLEEP_INTERVAL_US) - (KERNEL_SLEEP_INTERVAL_US >> 1);
|
||||
struct timespec thiswait;
|
||||
thiswait.tv_sec = sleep_time / 1000000;
|
||||
thiswait.tv_nsec = (sleep_time % 1000000) * 1000l;
|
||||
_sleep_loop(thiswait);
|
||||
}
|
||||
|
||||
U64 current_clock = get_clock_count();
|
||||
U32 yields = 0;
|
||||
while ( (yields < max_yields)
|
||||
&& (current_clock - start < us) )
|
||||
{
|
||||
sched_yield();
|
||||
++yields;
|
||||
current_clock = get_clock_count();
|
||||
}
|
||||
return yields;
|
||||
U64 current_clock = get_clock_count();
|
||||
U32 yields = 0;
|
||||
while ( (yields < max_yields)
|
||||
&& (current_clock - start < us) )
|
||||
{
|
||||
sched_yield();
|
||||
++yields;
|
||||
current_clock = get_clock_count();
|
||||
}
|
||||
return yields;
|
||||
}
|
||||
|
||||
void ms_sleep(U32 ms)
|
||||
@@ -163,7 +163,7 @@ void ms_sleep(U32 ms)
|
||||
struct timespec thiswait;
|
||||
thiswait.tv_sec = ms / 1000;
|
||||
thiswait.tv_nsec = (mslong % 1000) * 1000000l;
|
||||
_sleep_loop(thiswait);
|
||||
_sleep_loop(thiswait);
|
||||
}
|
||||
#else
|
||||
# error "architecture not supported"
|
||||
@@ -411,15 +411,15 @@ BOOL LLTimer::knownBadTimer()
|
||||
|
||||
#if LL_WINDOWS
|
||||
WCHAR bad_pci_list[][10] = {L"1039:0530",
|
||||
L"1039:0620",
|
||||
L"10B9:0533",
|
||||
L"10B9:1533",
|
||||
L"1106:0596",
|
||||
L"1106:0686",
|
||||
L"1166:004F",
|
||||
L"1166:0050",
|
||||
L"8086:7110",
|
||||
L"\0"
|
||||
L"1039:0620",
|
||||
L"10B9:0533",
|
||||
L"10B9:1533",
|
||||
L"1106:0596",
|
||||
L"1106:0686",
|
||||
L"1166:004F",
|
||||
L"1166:0050",
|
||||
L"8086:7110",
|
||||
L"\0"
|
||||
};
|
||||
|
||||
HKEY hKey = NULL;
|
||||
|
||||
Reference in New Issue
Block a user