Unstaged changes cleanup. Further vectorization. Change in binormal/bitangent calculation.

Shyotl
2013-10-09 14:47:06 -05:00
parent b473661cf4
commit f25eb07fab
51 changed files with 1987 additions and 1895 deletions


@@ -42,27 +42,77 @@ class LLMutex ;
#define LL_CHECK_MEMORY
#endif
LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
#ifdef SHOW_ASSERT
#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment))
#else
#define ll_assert_aligned(ptr,alignment)
#endif
#include <xmmintrin.h>
template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
{
return reinterpret_cast<T*>(
(reinterpret_cast<uintptr_t>(address) + 0xF) & ~0xF);
}
template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
{
return reinterpret_cast<T*>(
(reinterpret_cast<uintptr_t>(address) + 0x3F) & ~0x3F);
}
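// Editor's illustration (not part of this commit): both helpers round an address up to the
// next multiple of the alignment with the usual (addr + (A-1)) & ~(A-1) trick, returning
// already-aligned addresses unchanged, e.g.
//   char buf[256];
//   char* p16 = LL_NEXT_ALIGNED_ADDRESS(buf);    // 0x1004 would round up to 0x1010
//   char* p64 = LL_NEXT_ALIGNED_ADDRESS_64(buf); // 0x1004 would round up to 0x1040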
#if LL_LINUX || LL_DARWIN
#define LL_ALIGN_PREFIX(x)
#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x)))
#elif LL_WINDOWS
#define LL_ALIGN_PREFIX(x) __declspec(align(x))
#define LL_ALIGN_POSTFIX(x)
#else
#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
#endif
#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
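// Editor's illustration (not part of this commit): LL_ALIGN_16 wraps a declaration in the
// compiler-specific prefix/postfix attributes so the same line works under MSVC
// (__declspec(align(16))) and GCC/Clang (__attribute__((aligned(16)))), e.g.
//   LL_ALIGN_16(F32 scratch[4]);   // hypothetical 16-byte aligned array, safe for _mm_load_ps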
inline void* ll_aligned_malloc( size_t size, int align )
{
#if defined(LL_WINDOWS)
return _aligned_malloc(size, align);
#else
void* mem = malloc( size + (align - 1) + sizeof(void*) );
char* aligned = ((char*)mem) + sizeof(void*);
aligned += align - ((uintptr_t)aligned & (align - 1));
((void**)aligned)[-1] = mem;
return aligned;
#endif
}
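// Editor's note on the non-Windows path above: it over-allocates by (align - 1) + sizeof(void*)
// bytes, skips past a pointer-sized slot, rounds the result up to the requested alignment, and
// stores the original malloc() pointer in the slot immediately before the aligned block so that
// ll_aligned_free() below can recover it and pass the right address to free().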
inline void ll_aligned_free( void* ptr )
{
free( ((void**)ptr)[-1] );
#if defined(LL_WINDOWS)
_aligned_free(ptr);
#else
if (ptr)
{
free( ((void**)ptr)[-1] );
}
#endif
}
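// Editor's illustration (not part of this commit): the two calls must be paired, because on
// non-Windows builds the aligned pointer is not the address malloc() returned and must not be
// handed to plain free(), e.g.
//   void* buf = ll_aligned_malloc(1024, 64);   // 64-byte aligned scratch block
//   /* ... use buf ... */
//   ll_aligned_free(buf);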
#if !LL_USE_TCMALLOC
inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed with ll_aligned_free_16().
{
#if (LL_DARWIN || LL_USE_TCMALLOC)
return malloc(size); // default osx malloc is 16 byte aligned.
#elif LL_WINDOWS
#if defined(LL_WINDOWS)
return _aligned_malloc(size, 16);
#elif defined(LL_DARWIN)
return malloc(size); // default osx malloc is 16 byte aligned.
#else
void *rtn;
if (LL_LIKELY(0 == posix_memalign(&rtn, 16, size)))
@@ -74,10 +124,10 @@ inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed wi
inline void ll_aligned_free_16(void *p)
{
#if (LL_DARWIN || LL_USE_TCMALLOC)
free(p);
#elif LL_WINDOWS
#if defined(LL_WINDOWS)
_aligned_free(p);
#elif defined(LL_DARWIN)
return free(p);
#else
free(p); // posix_memalign() is compatible with heap deallocator
#endif
@@ -85,10 +135,10 @@ inline void ll_aligned_free_16(void *p)
inline void* ll_aligned_realloc_16(void* ptr, size_t size, size_t old_size) // returned hunk MUST be freed with ll_aligned_free_16().
{
#if (LL_DARWIN || LL_USE_TCMALLOC)
return realloc(ptr,size); // default osx malloc is 16 byte aligned.
#elif LL_WINDOWS
#if defined(LL_WINDOWS)
return _aligned_realloc(ptr, size, 16);
#elif defined(LL_DARWIN)
return realloc(ptr,size); // default osx malloc is 16 byte aligned.
#else
//FIXME: memcpy is SLOW
void* ret = ll_aligned_malloc_16(size);
@@ -105,11 +155,18 @@ inline void* ll_aligned_realloc_16(void* ptr, size_t size, size_t old_size) // r
#endif
}
#else // USE_TCMALLOC
// ll_aligned_foo_16 are not needed with tcmalloc
#define ll_aligned_malloc_16 malloc
#define ll_aligned_realloc_16(a,b,c) realloc(a,b)
#define ll_aligned_free_16 free
#endif // USE_TCMALLOC
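// Editor's note on the tcmalloc branch above: the realloc macro can drop the old_size argument
// because plain realloc() preserves the existing contents itself, whereas the hand-rolled
// ll_aligned_realloc_16() needs old_size to know how many bytes to copy into the fresh block.
//   void* p = ll_aligned_malloc_16(64);
//   p = ll_aligned_realloc_16(p, 128, 64);   // old_size = 64 lets the fallback path copy correctly
//   ll_aligned_free_16(p);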
inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed with ll_aligned_free_32().
{
#if LL_WINDOWS
#if defined(LL_WINDOWS)
return _aligned_malloc(size, 32);
#elif LL_DARWIN
#elif defined(LL_DARWIN)
return ll_aligned_malloc( size, 32 );
#else
void *rtn;
@@ -122,15 +179,87 @@ inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed wi
inline void ll_aligned_free_32(void *p)
{
#if LL_WINDOWS
#if defined(LL_WINDOWS)
_aligned_free(p);
#elif LL_DARWIN
#elif defined(LL_DARWIN)
ll_aligned_free( p );
#else
free(p); // posix_memalign() is compatible with heap deallocator
#endif
}
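// Editor's note (not part of this commit): the 32-byte variants cover data that needs wider
// alignment than the 16 bytes SSE requires (e.g. AVX-sized loads); on Darwin they fall back to
// the generic ll_aligned_malloc()/ll_aligned_free() pair above, since the system malloc() only
// guarantees 16-byte alignment.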
// Copy in 16-byte blocks from src to dst. Source and destination MUST NOT OVERLAP.
// Source and dest must be 16-byte aligned and size must be a multiple of 16.
//
inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
{
assert(src != NULL);
assert(dst != NULL);
assert(bytes > 0);
assert((bytes % sizeof(F32))== 0);
ll_assert_aligned(src,16);
ll_assert_aligned(dst,16);
assert((src < dst) ? ((src + bytes) < dst) : ((dst + bytes) < src));
assert(bytes%16==0);
char* end = dst + bytes;
if (bytes > 64)
{
// Find start of 64b aligned area within block
//
void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
// end_64 sits at least 64 bytes before the end of the destination; past it, drop back to 16-byte copies
void* end_64 = end-64;
// Prefetch the head of the 64b area now
//
_mm_prefetch((char*)begin_64, _MM_HINT_NTA);
_mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
_mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
_mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
// Copy 16b chunks until we're 64b aligned
//
while (dst < begin_64)
{
_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
dst += 16;
src += 16;
}
// Copy 64-byte chunks up to the tail
//
// might be good to shmoo the 512b prefetch offset
// (characterize performance for various values)
//
while (dst < end_64)
{
_mm_prefetch((char*)src + 512, _MM_HINT_NTA);
_mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
_mm_store_ps((F32*)(dst + 16), _mm_load_ps((F32*)(src + 16)));
_mm_store_ps((F32*)(dst + 32), _mm_load_ps((F32*)(src + 32)));
_mm_store_ps((F32*)(dst + 48), _mm_load_ps((F32*)(src + 48)));
dst += 64;
src += 64;
}
}
// Copy remainder 16b tail chunks (or ALL 16b chunks for sub-64b copies)
//
while (dst < end)
{
_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
dst += 16;
src += 16;
}
}
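// Editor's illustration (not part of this commit): a minimal use of the copy above; both
// buffers must be 16-byte aligned and non-overlapping, and the byte count a multiple of 16,
// or the asserts (and the aligned SSE loads/stores) will trip.
//   const size_t bytes = 4096;                       // multiple of 16
//   char* src = (char*)ll_aligned_malloc_16(bytes);
//   char* dst = (char*)ll_aligned_malloc_16(bytes);
//   /* ... fill src ... */
//   ll_memcpy_nonaliased_aligned_16(dst, src, bytes);
//   ll_aligned_free_16(src);
//   ll_aligned_free_16(dst);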
#ifndef __DEBUG_PRIVATE_MEM__
#define __DEBUG_PRIVATE_MEM__ 0
#endif