Unstaged changes cleanup. Further vectorization. Change in binormal/bitangent calculation.
This commit is contained in:
@@ -116,6 +116,7 @@ set(llcommon_HEADER_FILES
|
||||
linden_common.h
|
||||
linked_lists.h
|
||||
llaccountingcost.h
|
||||
llalignedarray.h
|
||||
llagentconstants.h
|
||||
llallocator.h
|
||||
llallocator_heap_profile.h
|
||||
|
||||
139
indra/llcommon/llalignedarray.h
Normal file
139
indra/llcommon/llalignedarray.h
Normal file
@@ -0,0 +1,139 @@
|
||||
/**
|
||||
* @file llalignedarray.h
|
||||
* @brief A static array which obeys alignment restrictions and mimics std::vector accessors.
|
||||
*
|
||||
* $LicenseInfo:firstyear=2002&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#ifndef LL_LLALIGNEDARRAY_H
|
||||
#define LL_LLALIGNEDARRAY_H
|
||||
|
||||
#include "llmemory.h"
|
||||
|
||||
template <class T, U32 alignment>
|
||||
class LLAlignedArray
|
||||
{
|
||||
public:
|
||||
T* mArray;
|
||||
U32 mElementCount;
|
||||
U32 mCapacity;
|
||||
|
||||
LLAlignedArray();
|
||||
~LLAlignedArray();
|
||||
|
||||
void push_back(const T& elem);
|
||||
U32 size() const { return mElementCount; }
|
||||
void resize(U32 size);
|
||||
T* append(S32 N);
|
||||
T& operator[](int idx);
|
||||
const T& operator[](int idx) const;
|
||||
};
|
||||
|
||||
template <class T, U32 alignment>
|
||||
LLAlignedArray<T, alignment>::LLAlignedArray()
|
||||
{
|
||||
llassert(alignment >= 16);
|
||||
mArray = NULL;
|
||||
mElementCount = 0;
|
||||
mCapacity = 0;
|
||||
}
|
||||
|
||||
template <class T, U32 alignment>
|
||||
LLAlignedArray<T, alignment>::~LLAlignedArray()
|
||||
{
|
||||
ll_aligned_free(mArray);
|
||||
mArray = NULL;
|
||||
mElementCount = 0;
|
||||
mCapacity = 0;
|
||||
}
|
||||
|
||||
template <class T, U32 alignment>
|
||||
void LLAlignedArray<T, alignment>::push_back(const T& elem)
|
||||
{
|
||||
T* old_buf = NULL;
|
||||
if (mCapacity <= mElementCount)
|
||||
{
|
||||
mCapacity++;
|
||||
mCapacity *= 2;
|
||||
T* new_buf = (T*) ll_aligned_malloc(mCapacity*sizeof(T), alignment);
|
||||
if (mArray)
|
||||
{
|
||||
ll_memcpy_nonaliased_aligned_16((char*)new_buf, (char*)mArray, sizeof(T)*mElementCount);
|
||||
}
|
||||
old_buf = mArray;
|
||||
mArray = new_buf;
|
||||
}
|
||||
|
||||
mArray[mElementCount++] = elem;
|
||||
|
||||
//delete old array here to prevent error on a.push_back(a[0])
|
||||
ll_aligned_free(old_buf);
|
||||
}
|
||||
|
||||
template <class T, U32 alignment>
|
||||
void LLAlignedArray<T, alignment>::resize(U32 size)
|
||||
{
|
||||
if (mCapacity < size)
|
||||
{
|
||||
mCapacity = size+mCapacity*2;
|
||||
T* new_buf = mCapacity > 0 ? (T*) ll_aligned_malloc(mCapacity*sizeof(T), alignment) : NULL;
|
||||
if (mArray)
|
||||
{
|
||||
ll_memcpy_nonaliased_aligned_16((char*) new_buf, (char*) mArray, sizeof(T)*mElementCount);
|
||||
ll_aligned_free(mArray);
|
||||
}
|
||||
|
||||
/*for (U32 i = mElementCount; i < mCapacity; ++i)
|
||||
{
|
||||
new(new_buf+i) T();
|
||||
}*/
|
||||
mArray = new_buf;
|
||||
}
|
||||
|
||||
mElementCount = size;
|
||||
}
|
||||
|
||||
|
||||
template <class T, U32 alignment>
|
||||
T& LLAlignedArray<T, alignment>::operator[](int idx)
|
||||
{
|
||||
llassert(idx < mElementCount);
|
||||
return mArray[idx];
|
||||
}
|
||||
|
||||
template <class T, U32 alignment>
|
||||
const T& LLAlignedArray<T, alignment>::operator[](int idx) const
|
||||
{
|
||||
llassert(idx < mElementCount);
|
||||
return mArray[idx];
|
||||
}
|
||||
|
||||
template <class T, U32 alignment>
|
||||
T* LLAlignedArray<T, alignment>::append(S32 N)
|
||||
{
|
||||
U32 sz = size();
|
||||
resize(sz+N);
|
||||
return &((*this)[sz]);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -42,27 +42,77 @@ class LLMutex ;
|
||||
#define LL_CHECK_MEMORY
|
||||
#endif
|
||||
|
||||
// Out-of-line helper behind ll_assert_aligned(); declared here, defined elsewhere.
// NOTE(review): presumably reports when ptr is not a multiple of alignment -- confirm
// against the definition.
LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);

#ifdef SHOW_ASSERT
// The alignment argument is parenthesised before the cast so that an expression
// such as `base + extra` is converted as a whole rather than only its first operand.
#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)(alignment)))
#else
// Expands to nothing when assertions are not shown.
#define ll_assert_aligned(ptr,alignment)
#endif
|
||||
|
||||
#include <xmmintrin.h>
|
||||
|
||||
// Rounds `address` up to the next multiple of 16 bytes; an address that is
// already a multiple of 16 is returned unchanged.
template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
{
	const uintptr_t mask = 0xF;
	const uintptr_t raw = reinterpret_cast<uintptr_t>(address);
	return reinterpret_cast<T*>((raw + mask) & ~mask);
}
|
||||
|
||||
// Rounds `address` up to the next multiple of 64 bytes; an address that is
// already a multiple of 64 is returned unchanged.
template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
{
	const uintptr_t mask = 0x3F;
	const uintptr_t raw = reinterpret_cast<uintptr_t>(address);
	return reinterpret_cast<T*>((raw + mask) & ~mask);
}
|
||||
|
||||
#if LL_LINUX || LL_DARWIN
|
||||
|
||||
#define LL_ALIGN_PREFIX(x)
|
||||
#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x)))
|
||||
|
||||
#elif LL_WINDOWS
|
||||
|
||||
#define LL_ALIGN_PREFIX(x) __declspec(align(x))
|
||||
#define LL_ALIGN_POSTFIX(x)
|
||||
|
||||
#else
|
||||
#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
|
||||
#endif
|
||||
|
||||
#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
|
||||
|
||||
// Allocates `size` bytes whose address is a multiple of `align`.
//
// size  - number of usable bytes requested.
// align - required alignment in bytes. The mask arithmetic below relies on it
//         being a power of two.
//
// Returns the aligned block, or NULL when the request cannot be satisfied.
// The block must be released with ll_aligned_free().
//
// On non-Windows builds the block is carved out of a larger malloc() region and
// the address malloc() returned is stored in the pointer-sized slot immediately
// before the aligned block, so the release side can find it.
inline void* ll_aligned_malloc( size_t size, int align )
{
#if defined(LL_WINDOWS)
	return _aligned_malloc(size, align);
#else
	const size_t mask = (size_t)align - 1;
	// Worst case: sizeof(void*) for the bookkeeping slot plus (align - 1) bytes of padding.
	const size_t overhead = mask + sizeof(void*);
	if (size > (size_t)-1 - overhead)
	{
		return NULL;	// size + overhead would wrap around
	}

	void* mem = malloc( size + overhead );
	if (!mem)
	{
		return NULL;
	}

	// Round (mem + sizeof(void*)) UP to the next multiple of align. An address that
	// is already aligned is left where it is; advancing it by a further `align`
	// would leave only size - 1 usable bytes inside the region reserved above.
	const uintptr_t base = (uintptr_t)mem;
	const uintptr_t user = (base + sizeof(void*) + mask) & ~(uintptr_t)mask;
	char* aligned = ((char*)mem) + (user - base);

	((void**)aligned)[-1] = mem;
	return aligned;
#endif
}
|
||||
|
||||
// Releases a block obtained from ll_aligned_malloc().
//
// ptr - block to release. On non-Windows builds a NULL pointer is ignored.
inline void ll_aligned_free( void* ptr )
{
#if defined(LL_WINDOWS)
	_aligned_free(ptr);
#else
	if (ptr)
	{
		// ll_aligned_malloc() keeps the address returned by malloc() in the
		// pointer-sized slot immediately before the aligned block.
		free( ((void**)ptr)[-1] );
	}
#endif
}
|
||||
|
||||
#if !LL_USE_TCMALLOC
|
||||
inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed with ll_aligned_free_16().
|
||||
{
|
||||
#if (LL_DARWIN || LL_USE_TCMALLOC)
|
||||
return malloc(size); // default osx malloc is 16 byte aligned.
|
||||
#elif LL_WINDOWS
|
||||
#if defined(LL_WINDOWS)
|
||||
return _aligned_malloc(size, 16);
|
||||
#elif defined(LL_DARWIN)
|
||||
return malloc(size); // default osx malloc is 16 byte aligned.
|
||||
#else
|
||||
void *rtn;
|
||||
if (LL_LIKELY(0 == posix_memalign(&rtn, 16, size)))
|
||||
@@ -74,10 +124,10 @@ inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed wi
|
||||
|
||||
inline void ll_aligned_free_16(void *p)
|
||||
{
|
||||
#if (LL_DARWIN || LL_USE_TCMALLOC)
|
||||
free(p);
|
||||
#elif LL_WINDOWS
|
||||
#if defined(LL_WINDOWS)
|
||||
_aligned_free(p);
|
||||
#elif defined(LL_DARWIN)
|
||||
return free(p);
|
||||
#else
|
||||
free(p); // posix_memalign() is compatible with heap deallocator
|
||||
#endif
|
||||
@@ -85,10 +135,10 @@ inline void ll_aligned_free_16(void *p)
|
||||
|
||||
inline void* ll_aligned_realloc_16(void* ptr, size_t size, size_t old_size) // returned hunk MUST be freed with ll_aligned_free_16().
|
||||
{
|
||||
#if (LL_DARWIN || LL_USE_TCMALLOC)
|
||||
return realloc(ptr,size); // default osx malloc is 16 byte aligned.
|
||||
#elif LL_WINDOWS
|
||||
#if defined(LL_WINDOWS)
|
||||
return _aligned_realloc(ptr, size, 16);
|
||||
#elif defined(LL_DARWIN)
|
||||
return realloc(ptr,size); // default osx malloc is 16 byte aligned.
|
||||
#else
|
||||
//FIXME: memcpy is SLOW
|
||||
void* ret = ll_aligned_malloc_16(size);
|
||||
@@ -105,11 +155,18 @@ inline void* ll_aligned_realloc_16(void* ptr, size_t size, size_t old_size) // r
|
||||
#endif
|
||||
}
|
||||
|
||||
#else // USE_TCMALLOC
|
||||
// ll_aligned_foo_16 are not needed with tcmalloc
|
||||
#define ll_aligned_malloc_16 malloc
|
||||
#define ll_aligned_realloc_16(a,b,c) realloc(a,b)
|
||||
#define ll_aligned_free_16 free
|
||||
#endif // USE_TCMALLOC
|
||||
|
||||
inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed with ll_aligned_free_32().
|
||||
{
|
||||
#if LL_WINDOWS
|
||||
#if defined(LL_WINDOWS)
|
||||
return _aligned_malloc(size, 32);
|
||||
#elif LL_DARWIN
|
||||
#elif defined(LL_DARWIN)
|
||||
return ll_aligned_malloc( size, 32 );
|
||||
#else
|
||||
void *rtn;
|
||||
@@ -122,15 +179,87 @@ inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed wi
|
||||
|
||||
// Releases a block obtained from ll_aligned_malloc_32(), using the deallocator
// that matches the platform branch used for the allocation.
inline void ll_aligned_free_32(void *p)
{
#if defined(LL_WINDOWS)
	_aligned_free(p);
#elif defined(LL_DARWIN)
	ll_aligned_free( p );
#else
	free(p); // posix_memalign() is compatible with heap deallocator
#endif
}
|
||||
|
||||
|
||||
// Copy words 16-byte blocks from src to dst. Source and destination MUST NOT OVERLAP.
|
||||
// Source and dest must be 16-byte aligned and size must be multiple of 16.
|
||||
//
|
||||
inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
|
||||
{
|
||||
assert(src != NULL);
|
||||
assert(dst != NULL);
|
||||
assert(bytes > 0);
|
||||
assert((bytes % sizeof(F32))== 0);
|
||||
ll_assert_aligned(src,16);
|
||||
ll_assert_aligned(dst,16);
|
||||
assert((src < dst) ? ((src + bytes) < dst) : ((dst + bytes) < src));
|
||||
assert(bytes%16==0);
|
||||
|
||||
char* end = dst + bytes;
|
||||
|
||||
if (bytes > 64)
|
||||
{
|
||||
|
||||
// Find start of 64b aligned area within block
|
||||
//
|
||||
void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
|
||||
|
||||
//at least 64 bytes before the end of the destination, switch to 16 byte copies
|
||||
void* end_64 = end-64;
|
||||
|
||||
// Prefetch the head of the 64b area now
|
||||
//
|
||||
_mm_prefetch((char*)begin_64, _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
|
||||
|
||||
// Copy 16b chunks until we're 64b aligned
|
||||
//
|
||||
while (dst < begin_64)
|
||||
{
|
||||
|
||||
_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
|
||||
dst += 16;
|
||||
src += 16;
|
||||
}
|
||||
|
||||
// Copy 64b chunks up to your tail
|
||||
//
|
||||
// might be good to shmoo the 512b prefetch offset
|
||||
// (characterize performance for various values)
|
||||
//
|
||||
while (dst < end_64)
|
||||
{
|
||||
_mm_prefetch((char*)src + 512, _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
|
||||
_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
|
||||
_mm_store_ps((F32*)(dst + 16), _mm_load_ps((F32*)(src + 16)));
|
||||
_mm_store_ps((F32*)(dst + 32), _mm_load_ps((F32*)(src + 32)));
|
||||
_mm_store_ps((F32*)(dst + 48), _mm_load_ps((F32*)(src + 48)));
|
||||
dst += 64;
|
||||
src += 64;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy remainder 16b tail chunks (or ALL 16b chunks for sub-64b copies)
|
||||
//
|
||||
while (dst < end)
|
||||
{
|
||||
_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
|
||||
dst += 16;
|
||||
src += 16;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef __DEBUG_PRIVATE_MEM__
|
||||
#define __DEBUG_PRIVATE_MEM__ 0
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user