Fix QueryPerformanceCounter eating more than half the CPU on AMD Win-x86_64

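The issue was in the fast timers (LLFastTimer), making the naming highly ironic: the RDTSC path was written as 32-bit inline assembly and was disabled under _WIN64, so 64-bit Windows builds fell back to QueryPerformanceCounter. Rewrote it to use the __rdtsc() intrinsic from <intrin.h> and enabled USE_RDTSC for all Windows builds. Doubled the framerate.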
2015-11-18 00:55:11 +01:00
parent 62a7704a4f
commit c358ffe8e2
2 changed files with 44 additions and 62 deletions
--- a/indra/llcommon/llfasttimer_class.cpp
+++ b/indra/llcommon/llfasttimer_class.cpp
@@ -157,6 +157,7 @@

 #if LL_WINDOWS
 #include "lltimer.h"
+#include <intrin.h>
 #elif LL_LINUX || LL_SOLARIS
 #include <sys/time.h>
 #include <sched.h>
@@ -184,7 +185,7 @@ LLMutex* LLFastTimer::sLogLock = NULL;
 std::queue<LLSD> LLFastTimer::sLogQueue;
 const int LLFastTimer::NamedTimer::HISTORY_NUM = 300;

-#if defined(LL_WINDOWS) && !defined(_WIN64)
+#if defined(LL_WINDOWS)
 #define USE_RDTSC 1
 #endif

@@ -952,34 +953,15 @@ LLFastTimer::LLFastTimer(LLFastTimer::FrameState* state)
 #if USE_RDTSC
 U32 LLFastTimer::getCPUClockCount32()
 {
-	U32 ret_val;
-	__asm
-	{
-        _emit   0x0f
-        _emit   0x31
-		shr eax,8
-		shl edx,24
-		or eax, edx
-		mov dword ptr [ret_val], eax
-	}
-    return ret_val;
+	return (U32)(__rdtsc()>>8);
 }

 // return full timer value, *not* shifted by 8 bits
 U64 LLFastTimer::getCPUClockCount64()
 {
-	U64 ret_val;
-	__asm
-	{
-        _emit   0x0f
-        _emit   0x31
-		mov eax,eax
-		mov edx,edx
-		mov dword ptr [ret_val+4], edx
-		mov dword ptr [ret_val], eax
-	}
-    return ret_val;
+	return (U64)__rdtsc();
 }
+
 #else
 //LL_COMMON_API U64 get_clock_count(); // in lltimer.cpp
 // These use QueryPerformanceCounter, which is arguably fine and also works on AMD architectures.
--- a/indra/llcommon/lltimer.cpp
+++ b/indra/llcommon/lltimer.cpp
@@ -81,10 +81,10 @@ void ms_sleep(U32 ms)

 U32 micro_sleep(U64 us, U32 max_yields)
 {
-    // max_yields is unused; just fiddle with it to avoid warnings.
-    max_yields = 0;
-    ms_sleep(us / 1000);
-    return 0;
+	// max_yields is unused; just fiddle with it to avoid warnings.
+	max_yields = 0;
+	ms_sleep(us / 1000);
+	return 0;
 }
 #elif LL_LINUX || LL_SOLARIS || LL_DARWIN
 static void _sleep_loop(struct timespec& thiswait)
@@ -103,8 +103,8 @@ static void _sleep_loop(struct timespec& thiswait)
 		if (sleep_more)
 		{
 			if ( nextwait.tv_sec > thiswait.tv_sec ||
-			     (nextwait.tv_sec == thiswait.tv_sec &&
-			      nextwait.tv_nsec >= thiswait.tv_nsec) )
+				 (nextwait.tv_sec == thiswait.tv_sec &&
+				  nextwait.tv_nsec >= thiswait.tv_nsec) )
 			{
 				// if the remaining time isn't actually going
 				// down then we're being shafted by low clock
@@ -130,31 +130,31 @@ static void _sleep_loop(struct timespec& thiswait)

 U32 micro_sleep(U64 us, U32 max_yields)
 {
-    U64 start = get_clock_count();
-    // This is kernel dependent.  Currently, our kernel generates software clock
-    // interrupts at 250 Hz (every 4,000 microseconds).
-    const U64 KERNEL_SLEEP_INTERVAL_US = 4000;
+	U64 start = get_clock_count();
+	// This is kernel dependent.  Currently, our kernel generates software clock
+	// interrupts at 250 Hz (every 4,000 microseconds).
+	const U64 KERNEL_SLEEP_INTERVAL_US = 4000;

-    S32 num_sleep_intervals = (us - (KERNEL_SLEEP_INTERVAL_US >> 1)) / KERNEL_SLEEP_INTERVAL_US;
-    if (num_sleep_intervals > 0)
-    {
-        U64 sleep_time = (num_sleep_intervals * KERNEL_SLEEP_INTERVAL_US) - (KERNEL_SLEEP_INTERVAL_US >> 1);
-        struct timespec thiswait;
-        thiswait.tv_sec = sleep_time / 1000000;
-        thiswait.tv_nsec = (sleep_time % 1000000) * 1000l;
-        _sleep_loop(thiswait);
-    }
+	S32 num_sleep_intervals = (us - (KERNEL_SLEEP_INTERVAL_US >> 1)) / KERNEL_SLEEP_INTERVAL_US;
+	if (num_sleep_intervals > 0)
+	{
+		U64 sleep_time = (num_sleep_intervals * KERNEL_SLEEP_INTERVAL_US) - (KERNEL_SLEEP_INTERVAL_US >> 1);
+		struct timespec thiswait;
+		thiswait.tv_sec = sleep_time / 1000000;
+		thiswait.tv_nsec = (sleep_time % 1000000) * 1000l;
+		_sleep_loop(thiswait);
+	}

-    U64 current_clock = get_clock_count();
-    U32 yields = 0;
-    while (    (yields < max_yields)
-            && (current_clock - start < us) )
-    {
-        sched_yield();
-        ++yields;
-        current_clock = get_clock_count();
-    }
-    return yields;
+	U64 current_clock = get_clock_count();
+	U32 yields = 0;
+	while (    (yields < max_yields)
+			&& (current_clock - start < us) )
+	{
+		sched_yield();
+		++yields;
+		current_clock = get_clock_count();
+	}
+	return yields;
 }

 void ms_sleep(U32 ms)
@@ -163,7 +163,7 @@ void ms_sleep(U32 ms)
 	struct timespec thiswait;
 	thiswait.tv_sec = ms / 1000;
 	thiswait.tv_nsec = (mslong % 1000) * 1000000l;
-    _sleep_loop(thiswait);
+	_sleep_loop(thiswait);
 }
 #else
 # error "architecture not supported"
@@ -411,15 +411,15 @@ BOOL LLTimer::knownBadTimer()

 #if LL_WINDOWS
 	WCHAR bad_pci_list[][10] = {L"1039:0530",
-						        L"1039:0620",
-							    L"10B9:0533",
-							    L"10B9:1533",
-							    L"1106:0596",
-							    L"1106:0686",
-							    L"1166:004F",
-							    L"1166:0050",
- 							    L"8086:7110",
-							    L"\0"
+								L"1039:0620",
+								L"10B9:0533",
+								L"10B9:1533",
+								L"1106:0596",
+								L"1106:0686",
+								L"1166:004F",
+								L"1166:0050",
+								L"8086:7110",
+								L"\0"
 	};

 	HKEY hKey = NULL;