Unstaged changes cleanup. Further vectorization. Change in binormal/bitangent calculation.

2013-10-09 14:47:06 -05:00
parent b473661cf4
commit f25eb07fab
51 changed files with 1987 additions and 1895 deletions
--- a/indra/llcommon/CMakeLists.txt
+++ b/indra/llcommon/CMakeLists.txt
@@ -116,6 +116,7 @@ set(llcommon_HEADER_FILES
    linden_common.h
    linked_lists.h
    llaccountingcost.h
+    llalignedarray.h
    llagentconstants.h
    llallocator.h
    llallocator_heap_profile.h
--- a/indra/llcommon/llalignedarray.h
+++ b/indra/llcommon/llalignedarray.h
@@ -0,0 +1,139 @@
+/** 
+ * @file llalignedarray.h
+ * @brief A static array which obeys alignment restrictions and mimics std::vector accessors.
+ *
+ * $LicenseInfo:firstyear=2002&license=viewerlgpl$
+ * Second Life Viewer Source Code
+ * Copyright (C) 2010, Linden Research, Inc.
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License only.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * 
+ * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
+ * $/LicenseInfo$
+ */
+
+#ifndef LL_LLALIGNEDARRAY_H
+#define LL_LLALIGNEDARRAY_H
+
+#include "llmemory.h"
+
+/**
+ * Growable array of T whose buffer comes from ll_aligned_malloc() with the
+ * requested alignment. Exposes a small subset of std::vector-style accessors.
+ *
+ * Buffers are moved with ll_memcpy_nonaliased_aligned_16() and elements are
+ * only ever assigned to; they are not constructed or destroyed individually.
+ *
+ * NOTE(review): no copy constructor or assignment operator is declared, so a
+ * copied instance would share mArray and both destructors would free it --
+ * confirm that no caller copies these objects.
+ */
+template <class T, U32 alignment>
+class LLAlignedArray
+{
+public:
+	// Aligned element buffer; NULL until the first allocation.
+	T* mArray;
+	// Number of elements currently in use.
+	U32 mElementCount;
+	// Number of elements the current buffer can hold.
+	U32 mCapacity;
+
+	LLAlignedArray();
+	~LLAlignedArray();
+
+	// Appends a copy of elem, growing the buffer when it is full.
+	void push_back(const T& elem);
+	// Returns the number of elements in use.
+	U32 size() const { return mElementCount; }
+	// Sets the element count to size, growing the buffer when needed.
+	void resize(U32 size);
+	// Grows the array by N elements and returns a pointer to the first new one.
+	T* append(S32 N);
+	// Element access; idx is checked against the element count with llassert.
+	T& operator[](int idx);
+	const T& operator[](int idx) const;
+};
+
+// Builds an empty array; no buffer is allocated until elements are added.
+template <class T, U32 alignment>
+LLAlignedArray<T, alignment>::LLAlignedArray()
+:	mArray(NULL),
+	mElementCount(0),
+	mCapacity(0)
+{
+	// The buffer copy helper used on growth works on 16-byte aligned blocks.
+	llassert(alignment >= 16);
+}
+
+// Releases the buffer (if any) and puts the members back in the empty state.
+template <class T, U32 alignment>
+LLAlignedArray<T, alignment>::~LLAlignedArray()
+{
+	ll_aligned_free(mArray);
+
+	mCapacity = 0;
+	mElementCount = 0;
+	mArray = NULL;
+}
+
+// Appends a copy of elem, moving to a larger buffer first when the current
+// one is full.
+template <class T, U32 alignment>
+void LLAlignedArray<T, alignment>::push_back(const T& elem)
+{
+	// Buffer being replaced, if any. It is released only after elem has been
+	// stored, because elem may refer to an element that lives in it.
+	T* stale = NULL;
+
+	const bool full = !(mElementCount < mCapacity);
+	if (full)
+	{
+		// New capacity is twice (old capacity + 1).
+		mCapacity = (mCapacity + 1) * 2;
+
+		T* grown = (T*) ll_aligned_malloc(mCapacity*sizeof(T), alignment);
+		if (mArray != NULL)
+		{
+			ll_memcpy_nonaliased_aligned_16((char*)grown, (char*)mArray, sizeof(T)*mElementCount);
+		}
+
+		stale = mArray;
+		mArray = grown;
+	}
+
+	mArray[mElementCount] = elem;
+	++mElementCount;
+
+	//delete old array here to prevent error on a.push_back(a[0])
+	ll_aligned_free(stale);
+}
+
+/**
+ * Sets the element count to size, moving to a larger buffer when size exceeds
+ * the current capacity. Existing elements are carried over to the new buffer;
+ * newly exposed elements are left uninitialized. Shrinking only lowers the
+ * count and keeps the current buffer.
+ *
+ * @param size new number of elements
+ */
+template <class T, U32 alignment>
+void LLAlignedArray<T, alignment>::resize(U32 size)
+{
+	if (mCapacity < size)
+	{
+		mCapacity = size+mCapacity*2;
+		T* new_buf = mCapacity > 0 ? (T*) ll_aligned_malloc(mCapacity*sizeof(T), alignment) : NULL;
+		if (mArray)
+		{
+			// The array can own a buffer while holding zero elements (after
+			// being resized down to 0). ll_memcpy_nonaliased_aligned_16
+			// asserts bytes > 0, so only copy when there is data to carry over.
+			if (mElementCount > 0)
+			{
+				ll_memcpy_nonaliased_aligned_16((char*) new_buf, (char*) mArray, sizeof(T)*mElementCount);
+			}
+			ll_aligned_free(mArray);
+		}
+
+		/*for (U32 i = mElementCount; i < mCapacity; ++i)
+		{
+			new(new_buf+i) T();
+		}*/
+		mArray = new_buf;
+	}
+
+	mElementCount = size;
+}
+
+
+// Mutable element access; idx is checked against the element count by llassert.
+template <class T, U32 alignment>
+T& LLAlignedArray<T, alignment>::operator[](int idx)
+{
+	llassert(idx < mElementCount);
+	T& slot = *(mArray + idx);
+	return slot;
+}
+
+// Read-only element access; idx is checked against the element count by llassert.
+template <class T, U32 alignment>
+const T& LLAlignedArray<T, alignment>::operator[](int idx) const
+{
+	llassert(idx < mElementCount);
+	const T& slot = *(mArray + idx);
+	return slot;
+}
+
+/**
+ * Grows the array by N elements and returns a pointer to the first of them.
+ * The added elements are not initialized.
+ *
+ * With N == 0 nothing is added and the returned pointer is one past the last
+ * element (NULL if no buffer was ever allocated); it must not be dereferenced.
+ *
+ * @param N number of elements to add; must not be negative
+ * @return pointer to the element at the previous size()
+ */
+template <class T, U32 alignment>
+T* LLAlignedArray<T, alignment>::append(S32 N)
+{
+	llassert(N >= 0);
+	U32 sz = size();
+	resize(sz+N);
+	// Plain pointer arithmetic instead of operator[]: for N == 0 the index sz
+	// equals the element count, so the bounds llassert there would fire.
+	return mArray + sz;
+}
+
+#endif
+
--- a/indra/llcommon/llmemory.h
+++ b/indra/llcommon/llmemory.h
@@ -42,27 +42,77 @@ class LLMutex ;
 #define LL_CHECK_MEMORY
 #endif

+// Alignment check used by ll_assert_aligned(); only declared here.
+LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
+
+// ll_assert_aligned(ptr, alignment) forwards to ll_assert_aligned_func() when
+// SHOW_ASSERT is defined and expands to nothing otherwise.
+#ifdef SHOW_ASSERT
+#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment))
+#else
+#define ll_assert_aligned(ptr,alignment)
+#endif
+
+#include <xmmintrin.h>
+
+// Rounds address up to the next 16-byte boundary; an address that already
+// sits on a 16-byte boundary is returned unchanged.
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
+{
+	const uintptr_t mask = 0xF;
+	const uintptr_t raw = reinterpret_cast<uintptr_t>(address);
+	const uintptr_t rounded = (raw + mask) & ~mask;
+	return reinterpret_cast<T*>(rounded);
+}
+
+// Rounds address up to the next 64-byte boundary; an address that already
+// sits on a 64-byte boundary is returned unchanged.
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
+{
+	const uintptr_t mask = 0x3F;
+	const uintptr_t raw = reinterpret_cast<uintptr_t>(address);
+	const uintptr_t rounded = (raw + mask) & ~mask;
+	return reinterpret_cast<T*>(rounded);
+}
+
+// Alignment attribute helpers. The attribute is spelled after the declaration
+// on Linux/Darwin (__attribute__((aligned))) and before it on Windows
+// (__declspec(align)), so each platform defines one macro and leaves the
+// other empty.
+#if LL_LINUX || LL_DARWIN
+
+#define			LL_ALIGN_PREFIX(x)
+#define			LL_ALIGN_POSTFIX(x)		__attribute__((aligned(x)))
+
+#elif LL_WINDOWS
+
+#define			LL_ALIGN_PREFIX(x)		__declspec(align(x))
+#define			LL_ALIGN_POSTFIX(x)
+
+#else
+#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
+#endif
+
+// Wraps a declaration so that it is 16-byte aligned on every supported platform.
+#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
+
+// Allocates size bytes aligned to align. Windows forwards to _aligned_malloc.
+// Elsewhere the block is over-allocated with malloc, the returned address is
+// moved past a void* slot up to an alignment boundary, and the pointer malloc
+// returned is stored in the slot just before it for ll_aligned_free().
+// The mask arithmetic only works when align is a power of two.
+// NOTE(review): the malloc result is not checked for NULL, and the padding
+// step advances by a full `align` when the address is already aligned, one
+// byte more than the (align - 1) slack requested -- confirm whether either
+// case is reachable.
 inline void* ll_aligned_malloc( size_t size, int align )
 {
+#if defined(LL_WINDOWS)
+	return _aligned_malloc(size, align);
+#else
 	void* mem = malloc( size + (align - 1) + sizeof(void*) );
 	char* aligned = ((char*)mem) + sizeof(void*);
 	aligned += align - ((uintptr_t)aligned & (align - 1));

 	((void**)aligned)[-1] = mem;
 	return aligned;
+#endif
 }

+// Releases a block obtained from ll_aligned_malloc(). Windows forwards to
+// _aligned_free. Elsewhere a NULL ptr is ignored; otherwise the pointer saved
+// in the slot just before ptr is handed to free().
 inline void ll_aligned_free( void* ptr )
 {
-	free( ((void**)ptr)[-1] );
+#if defined(LL_WINDOWS)
+	_aligned_free(ptr);
+#else
+	if (ptr)
+	{
+		free( ((void**)ptr)[-1] );
+	}
+#endif
 }

+#if !LL_USE_TCMALLOC
 inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed with ll_aligned_free_16().
 {
-#if (LL_DARWIN || LL_USE_TCMALLOC)
-	return malloc(size); // default osx malloc is 16 byte aligned.
-#elif LL_WINDOWS
+#if defined(LL_WINDOWS)
 	return _aligned_malloc(size, 16);
+#elif defined(LL_DARWIN)
+	return malloc(size); // default osx malloc is 16 byte aligned.
 #else
 	void *rtn;
 	if (LL_LIKELY(0 == posix_memalign(&rtn, 16, size)))
@@ -74,10 +124,10 @@ inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed wi

 inline void ll_aligned_free_16(void *p)
 {
-#if (LL_DARWIN || LL_USE_TCMALLOC)
-	free(p);
-#elif LL_WINDOWS
+#if defined(LL_WINDOWS)
 	_aligned_free(p);
+#elif defined(LL_DARWIN)
+	return free(p);
 #else
 	free(p); // posix_memalign() is compatible with heap deallocator
 #endif
@@ -85,10 +135,10 @@ inline void ll_aligned_free_16(void *p)

 inline void* ll_aligned_realloc_16(void* ptr, size_t size, size_t old_size) // returned hunk MUST be freed with ll_aligned_free_16().
 {
-#if (LL_DARWIN || LL_USE_TCMALLOC)
-	return realloc(ptr,size); // default osx malloc is 16 byte aligned.
-#elif LL_WINDOWS
+#if defined(LL_WINDOWS)
 	return _aligned_realloc(ptr, size, 16);
+#elif defined(LL_DARWIN)
+	return realloc(ptr,size); // default osx malloc is 16 byte aligned.
 #else
 	//FIXME: memcpy is SLOW
 	void* ret = ll_aligned_malloc_16(size);
@@ -105,11 +155,18 @@ inline void* ll_aligned_realloc_16(void* ptr, size_t size, size_t old_size) // r
 #endif
 }

+#else // USE_TCMALLOC
+// ll_aligned_foo_16 are not needed with tcmalloc
+// The 16-byte helpers become plain malloc/realloc/free; the realloc alias
+// discards its third (old size) argument.
+#define ll_aligned_malloc_16 malloc
+#define ll_aligned_realloc_16(a,b,c) realloc(a,b)
+#define ll_aligned_free_16 free
+#endif // USE_TCMALLOC
+
 inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed with ll_aligned_free_32().
 {
-#if LL_WINDOWS
+#if defined(LL_WINDOWS)
 	return _aligned_malloc(size, 32);
-#elif LL_DARWIN
+#elif defined(LL_DARWIN)
 	return ll_aligned_malloc( size, 32 );
 #else
 	void *rtn;
@@ -122,15 +179,87 @@ inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed wi

+// Releases a block obtained from ll_aligned_malloc_32(): _aligned_free on
+// Windows, ll_aligned_free on Darwin, plain free elsewhere.
 inline void ll_aligned_free_32(void *p)
 {
-#if LL_WINDOWS
+#if defined(LL_WINDOWS)
 	_aligned_free(p);
-#elif LL_DARWIN
+#elif defined(LL_DARWIN)
 	ll_aligned_free( p );
 #else
 	free(p); // posix_memalign() is compatible with heap deallocator
 #endif
 }

+
+// Copies `bytes` bytes from src to dst in 16-byte blocks using SSE loads and
+// stores. Source and destination MUST NOT OVERLAP.
+// Source and dest must be 16-byte aligned and bytes must be a non-zero
+// multiple of 16.
+//
+// Copies larger than 64 bytes first advance in 16-byte steps until dst is
+// 64-byte aligned, then move 64 bytes per iteration while prefetching ahead;
+// whatever remains is finished in 16-byte steps.
+//
+inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
+{
+	assert(src != NULL);
+	assert(dst != NULL);
+	assert(bytes > 0);
+	assert((bytes % sizeof(F32))== 0); 
+	ll_assert_aligned(src,16);
+	ll_assert_aligned(dst,16);
+	// Ranges that merely touch (one ends exactly where the other begins) do
+	// not overlap, hence <= rather than <.
+	assert((src < dst) ? ((src + bytes) <= dst) : ((dst + bytes) <= src));
+	assert(bytes%16==0);
+
+	char* end = dst + bytes;
+
+	if (bytes > 64)
+	{
+
+		// Find start of 64b aligned area within block
+		//
+		void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
+		
+		//at least 64 bytes before the end of the destination, switch to 16 byte copies
+		void* end_64 = end-64;
+	
+		// Prefetch the head of the 64b area now
+		//
+		_mm_prefetch((char*)begin_64, _MM_HINT_NTA);
+		_mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
+		_mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
+		_mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
+	
+		// Copy 16b chunks until we're 64b aligned
+		//
+		while (dst < begin_64)
+		{
+
+			_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+			dst += 16;
+			src += 16;
+		}
+	
+		// Copy 64b chunks up to your tail
+		//
+		// might be good to shmoo the 512b prefetch offset
+		// (characterize performance for various values)
+		//
+		while (dst < end_64)
+		{
+			_mm_prefetch((char*)src + 512, _MM_HINT_NTA);
+			_mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
+			_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+			_mm_store_ps((F32*)(dst + 16), _mm_load_ps((F32*)(src + 16)));
+			_mm_store_ps((F32*)(dst + 32), _mm_load_ps((F32*)(src + 32)));
+			_mm_store_ps((F32*)(dst + 48), _mm_load_ps((F32*)(src + 48)));
+			dst += 64;
+			src += 64;
+		}
+	}
+
+	// Copy remainder 16b tail chunks (or ALL 16b chunks for sub-64b copies)
+	//
+	while (dst < end)
+	{
+		_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+		dst += 16;
+		src += 16;
+	}
+}
+
 #ifndef __DEBUG_PRIVATE_MEM__
 #define __DEBUG_PRIVATE_MEM__  0
 #endif